From 56ee95e58473c1cbeb0184d2e32b45d5e1c2bf37 Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Mon, 8 Jun 2026 23:15:56 -0400 Subject: [PATCH 01/11] [TON-582] Add post-setup Agent installation add-on template Extract the permission-attachment custom resource out of datadog_integration_role.yaml into a new nested datadog_integration_permissions.yaml, gated by a ManageBasePermissions flag, so it can be reused with zero duplication by a new standalone add-on, main_agent_installation.yaml. The add-on attaches only the instrumentation IAM policies to an existing integration role and deploys the EventBridge forwarding pipeline, letting customers who declined the Agent installation option during initial setup enable it later. The role template now nests the permissions stack with ManageBasePermissions: true, preserving behavior for all four main_* templates. The add-on nests it with ManageBasePermissions: false so it never touches the standard/resource-collection policies the role stack owns. Also keep the tested attach_integration_permissions.py copy in sync, add ManageBasePermissions gating tests, wire that test suite into CI, add the two templates to release.sh placeholder substitution, and bump the version to v4.14.0. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/python-test.yml | 4 + aws_quickstart/CHANGELOG.md | 4 + .../attach_integration_permissions.py | 14 +- .../attach_integration_permissions_test.py | 81 +++++ .../datadog_integration_permissions.yaml | 317 ++++++++++++++++++ aws_quickstart/datadog_integration_role.yaml | 284 +--------------- aws_quickstart/main_agent_installation.yaml | 113 +++++++ aws_quickstart/release.sh | 2 +- aws_quickstart/version.txt | 2 +- 9 files changed, 542 insertions(+), 279 deletions(-) create mode 100644 aws_quickstart/datadog_integration_permissions.yaml create mode 100644 aws_quickstart/main_agent_installation.yaml diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 0191c36d..18382cf5 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -21,3 +21,7 @@ jobs: run: | cd aws_quickstart python -B -S -m unittest datadog_agentless_api_call_test.py -v + - name: Run integration permissions unit tests + run: | + cd aws_quickstart + python -B -S -m unittest attach_integration_permissions_test.py -v diff --git a/aws_quickstart/CHANGELOG.md b/aws_quickstart/CHANGELOG.md index c5b31e76..16620c96 100644 --- a/aws_quickstart/CHANGELOG.md +++ b/aws_quickstart/CHANGELOG.md @@ -1,3 +1,7 @@ +# 4.14.0 (June 8, 2026) + +- Add `main_agent_installation.yaml`, a standalone add-on template that lets customers who declined the Agent installation option during initial AWS integration setup enable it later against an existing integration role. It attaches the instrumentation IAM permissions and deploys the EventBridge forwarding pipeline without touching the standard or resource-collection policies owned by the role stack. To keep this DRY, the permission-attachment custom resource was extracted out of `datadog_integration_role.yaml` into a new nested `datadog_integration_permissions.yaml` (gated by a `ManageBasePermissions` flag) that both the role template and the add-on consume. Behavior of `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml` is unchanged. Affects `datadog_integration_role.yaml`, `datadog_integration_permissions.yaml`, `main_agent_installation.yaml`, and `release.sh`. + # 4.13.0 (May 29, 2026) - Add `uk1.datadoghq.com` site support. Affects `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml`. diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index f9ea665c..532969aa 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -171,9 +171,11 @@ def handle_delete(event, context): role_name = props['DatadogIntegrationRole'] account_id = props['AccountId'] partition = props.get('Partition', 'aws') + manage_base_permissions = str(props.get('ManageBasePermissions', 'true')).lower() == 'true' iam_client = boto3.client('iam') try: - cleanup_existing_policies(iam_client, role_name, account_id, partition) + if manage_base_permissions: + cleanup_existing_policies(iam_client, role_name, account_id, partition) cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: @@ -186,6 +188,7 @@ def handle_create_update(event, context): role_name = props['DatadogIntegrationRole'] account_id = props['AccountId'] partition = props.get('Partition', 'aws') + manage_base_permissions = str(props.get('ManageBasePermissions', 'true')).lower() == 'true' should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true' datadog_site = props.get('DatadogSite') or 'datadoghq.com' instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes')) @@ -195,10 +198,11 @@ def handle_create_update(event, context): try: iam_client = boto3.client('iam') - cleanup_existing_policies(iam_client, role_name, account_id, partition) - attach_standard_permissions(iam_client, role_name) - if should_install_security_audit_policy: - attach_resource_collection_permissions(iam_client, role_name) + if manage_base_permissions: + cleanup_existing_policies(iam_client, role_name, account_id, partition) + attach_standard_permissions(iam_client, role_name) + if should_install_security_audit_policy: + attach_resource_collection_permissions(iam_client, role_name) attach_instrumentation_permissions( iam_client, role_name, account_id, partition, datadog_site, instrumentation_resource_types, previous_instrumentation_resource_types, diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py index aca49602..310d58f7 100644 --- a/aws_quickstart/attach_integration_permissions_test.py +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -19,6 +19,8 @@ attach_instrumentation_permissions, cleanup_existing_policies, cleanup_instrumentation_policies, + handle_create_update, + handle_delete, BASE_POLICY_PREFIX_INSTRUMENTATION, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, ) @@ -196,5 +198,84 @@ def test_cleanup_instrumentation_targets_only_instrumentation_prefix(self): self.assertTrue(all(BASE_POLICY_PREFIX_INSTRUMENTATION in arn for arn in detached)) +class TestManageBasePermissions(unittest.TestCase): + # ManageBasePermissions gates the standard + resource-collection policies. The role-creation + # path sets it true (manage everything); the post-setup add-on sets it false so it manages only + # the instrumentation policies and never touches the standard/resource-collection policies the + # role stack owns. + def setUp(self): + self.iam = MagicMock() + self.iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) + self.iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) + + def _props(self, **overrides): + props = { + "DatadogIntegrationRole": "DatadogIntegrationRole", + "AccountId": "123456789012", + "Partition": "aws", + "ResourceCollectionPermissions": "true", + "InstrumentationResourceTypes": "", + "DatadogSite": "datadoghq.com", + } + props.update(overrides) + return {"RequestType": "Create", "ResourceProperties": props} + + @patch("attach_integration_permissions.boto3.client") + @patch("attach_integration_permissions.attach_instrumentation_permissions") + @patch("attach_integration_permissions.attach_resource_collection_permissions") + @patch("attach_integration_permissions.attach_standard_permissions") + @patch("attach_integration_permissions.cleanup_existing_policies") + def test_create_manage_base_true_attaches_base( + self, mock_cleanup, mock_standard, mock_rc, mock_instr, mock_client + ): + mock_client.return_value = self.iam + handle_create_update(self._props(ManageBasePermissions="true"), None) + mock_cleanup.assert_called_once() + mock_standard.assert_called_once() + mock_rc.assert_called_once() + mock_instr.assert_called_once() + + @patch("attach_integration_permissions.boto3.client") + @patch("attach_integration_permissions.attach_instrumentation_permissions") + @patch("attach_integration_permissions.attach_resource_collection_permissions") + @patch("attach_integration_permissions.attach_standard_permissions") + @patch("attach_integration_permissions.cleanup_existing_policies") + def test_create_manage_base_false_only_instrumentation( + self, mock_cleanup, mock_standard, mock_rc, mock_instr, mock_client + ): + mock_client.return_value = self.iam + handle_create_update(self._props(ManageBasePermissions="false"), None) + mock_cleanup.assert_not_called() + mock_standard.assert_not_called() + mock_rc.assert_not_called() + mock_instr.assert_called_once() + + @patch("attach_integration_permissions.boto3.client") + @patch("attach_integration_permissions.cleanup_instrumentation_policies") + @patch("attach_integration_permissions.cleanup_existing_policies") + def test_delete_manage_base_false_only_instrumentation( + self, mock_cleanup_base, mock_cleanup_instr, mock_client + ): + mock_client.return_value = self.iam + event = self._props(ManageBasePermissions="false") + event["RequestType"] = "Delete" + handle_delete(event, None) + mock_cleanup_base.assert_not_called() + mock_cleanup_instr.assert_called_once() + + @patch("attach_integration_permissions.boto3.client") + @patch("attach_integration_permissions.cleanup_instrumentation_policies") + @patch("attach_integration_permissions.cleanup_existing_policies") + def test_delete_manage_base_true_cleans_both( + self, mock_cleanup_base, mock_cleanup_instr, mock_client + ): + mock_client.return_value = self.iam + event = self._props(ManageBasePermissions="true") + event["RequestType"] = "Delete" + handle_delete(event, None) + mock_cleanup_base.assert_called_once() + mock_cleanup_instr.assert_called_once() + + if __name__ == "__main__": unittest.main() diff --git a/aws_quickstart/datadog_integration_permissions.yaml b/aws_quickstart/datadog_integration_permissions.yaml new file mode 100644 index 00000000..91bb904d --- /dev/null +++ b/aws_quickstart/datadog_integration_permissions.yaml @@ -0,0 +1,317 @@ +AWSTemplateFormatVersion: 2010-09-09 +Description: Datadog AWS Integration - attach IAM permissions to the integration role +Parameters: + IAMRoleName: + Description: Name of the IAM role to attach the Datadog integration permissions to. + Type: String + Default: DatadogIntegrationRole + ResourceCollectionPermissions: + Type: String + Default: false + AllowedValues: + - true + - false + Description: >- + Set this value to "true" to add permissions for Datadog to collect resource configuration data. + InstrumentationResourceTypes: + Type: String + Default: "" + Description: >- + Comma-separated list of AWS resource types (UDM form, e.g. aws:ec2:instance, aws:ecs:cluster, + aws:eks:cluster) that the Datadog integration role should be granted the IAM permissions + required to instrument with the Datadog Agent. Leave blank to skip. + DatadogSite: + Type: String + Default: "datadoghq.com" + Description: >- + Datadog site the integration is being installed against. Used to call the Datadog API that + returns the IAM permissions required to instrument the selected resource types. + ManageBasePermissions: + Type: String + Default: true + AllowedValues: + - true + - false + Description: >- + Set this value to "true" to manage the standard and resource-collection permissions on the role + (the role-creation case). Set it to "false" to manage only the instrumentation permissions and + leave the standard and resource-collection policies untouched (the post-setup add-on case). +Resources: + DatadogAttachIntegrationPermissionsLambdaExecutionRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Principal: + Service: + - lambda.amazonaws.com + Action: + - sts:AssumeRole + Path: "/" + ManagedPolicyArns: + - !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" + Policies: + - PolicyName: !Sub "datadog-aws-integration-iam-permissions-${IAMRoleName}" + PolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - iam:CreatePolicy + - iam:DeletePolicy + - iam:DeleteRolePolicy + - iam:AttachRolePolicy + - iam:DetachRolePolicy + - iam:PutRolePolicy + Resource: + - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${IAMRoleName} + - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-resource-collection-permissions-* + - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-instrumentation-permissions-* + - !Sub "arn:${AWS::Partition}:iam::aws:policy/SecurityAudit" + DatadogAttachIntegrationPermissionsFunction: + Type: AWS::Lambda::Function + Properties: + Description: "A function to attach Datadog AWS integration permissions to an IAM role." + Role: !GetAtt DatadogAttachIntegrationPermissionsLambdaExecutionRole.Arn + Handler: "index.handler" + LoggingConfig: + ApplicationLogLevel: "INFO" + LogFormat: "JSON" + Runtime: "python3.14" + Timeout: 300 + Code: + ZipFile: | + import json + import logging + from urllib.request import Request + import urllib.error + import urllib.parse + import urllib.request + import cfnresponse + import boto3 + + LOGGER = logging.getLogger() + LOGGER.setLevel(logging.INFO) + API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" + POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" + BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" + BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" + STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" + RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" + INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" + + + class DatadogAPIError(Exception): + pass + + + def fetch_permissions_from_datadog(api_url): + headers = { + "Dd-Aws-Api-Call-Source": API_CALL_SOURCE_HEADER_VALUE, + } + request = Request(api_url, headers=headers) + request.get_method = lambda: "GET" + + try: + response = urllib.request.urlopen(request) + except urllib.error.HTTPError as e: + error_body = json.loads(e.read()) + error_message = error_body.get('errors', ['Unknown error'])[0] + raise DatadogAPIError(f"Datadog API error: {error_message}") from e + + return json.loads(response.read())["data"]["attributes"]["permissions"] + + + def parse_resource_types(raw): + # CFN forwards CommaDelimitedList parameters as JSON arrays to custom resources, + # while String parameters arrive as comma-delimited strings; accept both. + if raw is None: + return [] + items = raw.split(",") if isinstance(raw, str) else list(raw) + return [t.strip() for t in items if t and t.strip()] + + + def build_instrumentation_permissions_url(datadog_site, resource_types): + query = urllib.parse.urlencode( + [("resource_type", t) for t in resource_types] + [("chunked", "true")] + ) + return f"https://api.{datadog_site}{INSTRUMENTATION_PERMISSIONS_API_PATH}?{query}" + + + def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name): + # Detach + delete are both no-ops if the entity is already gone, so callers can blindly + # iterate the policy-name space without first checking what actually exists. + try: + iam_client.detach_role_policy(RoleName=role_name, PolicyArn=policy_arn) + except iam_client.exceptions.NoSuchEntityException: + pass + except Exception as e: + LOGGER.error(f"Error detaching policy {policy_name}: {str(e)}") + + try: + iam_client.delete_policy(PolicyArn=policy_arn) + except iam_client.exceptions.NoSuchEntityException: + pass + except iam_client.exceptions.DeleteConflictException: + LOGGER.warning(f"Policy {policy_name} still attached, skipping delete") + except Exception as e: + LOGGER.error(f"Error deleting policy {policy_name}: {str(e)}") + + + def _cleanup_chunked_policies(iam_client, role_name, account_id, partition, prefix, max_policies=10): + for i in range(max_policies): + policy_name = f"{prefix}-{role_name}-{i+1}" + policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" + _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) + + + def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, max_policies) + + try: + iam_client.delete_role_policy(RoleName=role_name, PolicyName=POLICY_NAME_STANDARD) + except iam_client.exceptions.NoSuchEntityException: + pass + except Exception as e: + LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}") + + + def cleanup_instrumentation_policies(iam_client, role_name, account_id, partition, max_policies=10): + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_INSTRUMENTATION, max_policies) + + + def attach_standard_permissions(iam_client, role_name): + permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) + policy_document = { + "Version": "2012-10-17", + "Statement": [{"Effect": "Allow", "Action": permissions, "Resource": "*"}], + } + iam_client.put_role_policy( + RoleName=role_name, + PolicyName=POLICY_NAME_STANDARD, + PolicyDocument=json.dumps(policy_document, separators=(',', ':')), + ) + + + def _create_and_attach_policy(iam_client, role_name, policy_name, actions): + policy_json = json.dumps( + { + "Version": "2012-10-17", + "Statement": [{"Effect": "Allow", "Action": actions, "Resource": "*"}], + }, + separators=(',', ':'), + ) + LOGGER.info(f"Creating policy {policy_name} with {len(actions)} permissions ({len(policy_json)} characters)") + policy = iam_client.create_policy(PolicyName=policy_name, PolicyDocument=policy_json) + iam_client.attach_role_policy(RoleName=role_name, PolicyArn=policy['Policy']['Arn']) + + + def attach_resource_collection_permissions(iam_client, role_name): + permission_chunks = fetch_permissions_from_datadog(RESOURCE_COLLECTION_PERMISSIONS_API_URL) + for i, chunk in enumerate(permission_chunks): + _create_and_attach_policy( + iam_client, + role_name, + f"{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-{role_name}-{i+1}", + chunk, + ) + + + def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types): + # Best-effort: instrumentation permissions are additive convenience on top of the + # integration, so any failure here is logged and swallowed rather than blocking install. + # Fetch before cleanup so that a transient API failure on an Update leaves the + # previously-attached policies in place instead of silently revoking them. + if not resource_types: + # Only clean up if the previous Update had instrumentation enabled — avoids running + # delete calls on stacks that never opted in to instrumentation in the first place. + if previous_resource_types: + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) + return + + try: + url = build_instrumentation_permissions_url(datadog_site, resource_types) + LOGGER.info(f"Fetching instrumentation permissions for {resource_types} from {url}") + permission_chunks = fetch_permissions_from_datadog(url) + except Exception as e: + LOGGER.warning( + f"Failed to fetch instrumentation permissions for {resource_types}: {e}. " + "Leaving any previously-attached instrumentation policies in place." + ) + return + + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) + for i, chunk in enumerate(permission_chunks): + policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" + try: + _create_and_attach_policy(iam_client, role_name, policy_name, chunk) + except Exception as e: + LOGGER.warning(f"Failed to create/attach instrumentation policy {policy_name}: {e}. Continuing.") + + + def handle_delete(event, context): + props = event['ResourceProperties'] + role_name = props['DatadogIntegrationRole'] + account_id = props['AccountId'] + partition = props.get('Partition', 'aws') + manage_base_permissions = str(props.get('ManageBasePermissions', 'true')).lower() == 'true' + iam_client = boto3.client('iam') + try: + if manage_base_permissions: + cleanup_existing_policies(iam_client, role_name, account_id, partition) + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) + cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) + except Exception as e: + LOGGER.error(f"Error deleting policy: {str(e)}") + cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) + + + def handle_create_update(event, context): + props = event['ResourceProperties'] + role_name = props['DatadogIntegrationRole'] + account_id = props['AccountId'] + partition = props.get('Partition', 'aws') + manage_base_permissions = str(props.get('ManageBasePermissions', 'true')).lower() == 'true' + should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true' + datadog_site = props.get('DatadogSite') or 'datadoghq.com' + instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes')) + previous_instrumentation_resource_types = parse_resource_types( + event.get('OldResourceProperties', {}).get('InstrumentationResourceTypes') + ) + + try: + iam_client = boto3.client('iam') + if manage_base_permissions: + cleanup_existing_policies(iam_client, role_name, account_id, partition) + attach_standard_permissions(iam_client, role_name) + if should_install_security_audit_policy: + attach_resource_collection_permissions(iam_client, role_name) + attach_instrumentation_permissions( + iam_client, role_name, account_id, partition, + datadog_site, instrumentation_resource_types, previous_instrumentation_resource_types, + ) + cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) + except Exception as e: + LOGGER.error(f"Error creating/attaching policy: {str(e)}") + cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) + + + def handler(event, context): + LOGGER.info("Event received: %s", json.dumps(event)) + if event['RequestType'] == 'Delete': + handle_delete(event, context) + else: + handle_create_update(event, context) + DatadogAttachIntegrationPermissionsFunctionTrigger: + Type: Custom::DatadogAttachIntegrationPermissionsFunctionTrigger + Properties: + ServiceToken: !GetAtt DatadogAttachIntegrationPermissionsFunction.Arn + DatadogIntegrationRole: !Ref IAMRoleName + AccountId: !Ref AWS::AccountId + Partition: !Sub "${AWS::Partition}" + ResourceCollectionPermissions: !Ref ResourceCollectionPermissions + InstrumentationResourceTypes: !Ref InstrumentationResourceTypes + DatadogSite: !Ref DatadogSite + ManageBasePermissions: !Ref ManageBasePermissions diff --git a/aws_quickstart/datadog_integration_role.yaml b/aws_quickstart/datadog_integration_role.yaml index 8ebd27b4..0a3cf14e 100644 --- a/aws_quickstart/datadog_integration_role.yaml +++ b/aws_quickstart/datadog_integration_role.yaml @@ -74,280 +74,20 @@ Resources: [!Sub "arn:${AWS::Partition}:iam::aws:policy/SecurityAudit"], !Ref AWS::NoValue, ] - DatadogAttachIntegrationPermissionsLambdaExecutionRole: - Type: AWS::IAM::Role - Properties: - AssumeRolePolicyDocument: - Version: '2012-10-17' - Statement: - - Effect: Allow - Principal: - Service: - - lambda.amazonaws.com - Action: - - sts:AssumeRole - Path: "/" - ManagedPolicyArns: - - !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" - Policies: - - PolicyName: !Sub "datadog-aws-integration-iam-permissions-${IAMRoleName}" - PolicyDocument: - Version: "2012-10-17" - Statement: - - Effect: Allow - Action: - - iam:CreatePolicy - - iam:DeletePolicy - - iam:DeleteRolePolicy - - iam:AttachRolePolicy - - iam:DetachRolePolicy - - iam:PutRolePolicy - Resource: - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${IAMRoleName} - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-resource-collection-permissions-* - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-instrumentation-permissions-* - - !Sub "arn:${AWS::Partition}:iam::aws:policy/SecurityAudit" - DatadogAttachIntegrationPermissionsFunction: - Type: AWS::Lambda::Function - Properties: - Description: "A function to attach Datadog AWS integration permissions to an IAM role." - Role: !GetAtt DatadogAttachIntegrationPermissionsLambdaExecutionRole.Arn - Handler: "index.handler" - LoggingConfig: - ApplicationLogLevel: "INFO" - LogFormat: "JSON" - Runtime: "python3.14" - Timeout: 300 - Code: - ZipFile: | - import json - import logging - from urllib.request import Request - import urllib.error - import urllib.parse - import urllib.request - import cfnresponse - import boto3 - - LOGGER = logging.getLogger() - LOGGER.setLevel(logging.INFO) - API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" - POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" - BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" - BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" - STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" - RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" - INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" - - - class DatadogAPIError(Exception): - pass - - - def fetch_permissions_from_datadog(api_url): - headers = { - "Dd-Aws-Api-Call-Source": API_CALL_SOURCE_HEADER_VALUE, - } - request = Request(api_url, headers=headers) - request.get_method = lambda: "GET" - - try: - response = urllib.request.urlopen(request) - except urllib.error.HTTPError as e: - error_body = json.loads(e.read()) - error_message = error_body.get('errors', ['Unknown error'])[0] - raise DatadogAPIError(f"Datadog API error: {error_message}") from e - - return json.loads(response.read())["data"]["attributes"]["permissions"] - - - def parse_resource_types(raw): - # CFN forwards CommaDelimitedList parameters as JSON arrays to custom resources, - # while String parameters arrive as comma-delimited strings; accept both. - if raw is None: - return [] - items = raw.split(",") if isinstance(raw, str) else list(raw) - return [t.strip() for t in items if t and t.strip()] - - - def build_instrumentation_permissions_url(datadog_site, resource_types): - query = urllib.parse.urlencode( - [("resource_type", t) for t in resource_types] + [("chunked", "true")] - ) - return f"https://api.{datadog_site}{INSTRUMENTATION_PERMISSIONS_API_PATH}?{query}" - - - def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name): - # Detach + delete are both no-ops if the entity is already gone, so callers can blindly - # iterate the policy-name space without first checking what actually exists. - try: - iam_client.detach_role_policy(RoleName=role_name, PolicyArn=policy_arn) - except iam_client.exceptions.NoSuchEntityException: - pass - except Exception as e: - LOGGER.error(f"Error detaching policy {policy_name}: {str(e)}") - - try: - iam_client.delete_policy(PolicyArn=policy_arn) - except iam_client.exceptions.NoSuchEntityException: - pass - except iam_client.exceptions.DeleteConflictException: - LOGGER.warning(f"Policy {policy_name} still attached, skipping delete") - except Exception as e: - LOGGER.error(f"Error deleting policy {policy_name}: {str(e)}") - - - def _cleanup_chunked_policies(iam_client, role_name, account_id, partition, prefix, max_policies=10): - for i in range(max_policies): - policy_name = f"{prefix}-{role_name}-{i+1}" - policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" - _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) - - - def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, max_policies) - - try: - iam_client.delete_role_policy(RoleName=role_name, PolicyName=POLICY_NAME_STANDARD) - except iam_client.exceptions.NoSuchEntityException: - pass - except Exception as e: - LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}") - - - def cleanup_instrumentation_policies(iam_client, role_name, account_id, partition, max_policies=10): - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_INSTRUMENTATION, max_policies) - - - def attach_standard_permissions(iam_client, role_name): - permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) - policy_document = { - "Version": "2012-10-17", - "Statement": [{"Effect": "Allow", "Action": permissions, "Resource": "*"}], - } - iam_client.put_role_policy( - RoleName=role_name, - PolicyName=POLICY_NAME_STANDARD, - PolicyDocument=json.dumps(policy_document, separators=(',', ':')), - ) - - - def _create_and_attach_policy(iam_client, role_name, policy_name, actions): - policy_json = json.dumps( - { - "Version": "2012-10-17", - "Statement": [{"Effect": "Allow", "Action": actions, "Resource": "*"}], - }, - separators=(',', ':'), - ) - LOGGER.info(f"Creating policy {policy_name} with {len(actions)} permissions ({len(policy_json)} characters)") - policy = iam_client.create_policy(PolicyName=policy_name, PolicyDocument=policy_json) - iam_client.attach_role_policy(RoleName=role_name, PolicyArn=policy['Policy']['Arn']) - - - def attach_resource_collection_permissions(iam_client, role_name): - permission_chunks = fetch_permissions_from_datadog(RESOURCE_COLLECTION_PERMISSIONS_API_URL) - for i, chunk in enumerate(permission_chunks): - _create_and_attach_policy( - iam_client, - role_name, - f"{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-{role_name}-{i+1}", - chunk, - ) - - - def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types): - # Best-effort: instrumentation permissions are additive convenience on top of the - # integration, so any failure here is logged and swallowed rather than blocking install. - # Fetch before cleanup so that a transient API failure on an Update leaves the - # previously-attached policies in place instead of silently revoking them. - if not resource_types: - # Only clean up if the previous Update had instrumentation enabled — avoids running - # delete calls on stacks that never opted in to instrumentation in the first place. - if previous_resource_types: - cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) - return - - try: - url = build_instrumentation_permissions_url(datadog_site, resource_types) - LOGGER.info(f"Fetching instrumentation permissions for {resource_types} from {url}") - permission_chunks = fetch_permissions_from_datadog(url) - except Exception as e: - LOGGER.warning( - f"Failed to fetch instrumentation permissions for {resource_types}: {e}. " - "Leaving any previously-attached instrumentation policies in place." - ) - return - - cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) - for i, chunk in enumerate(permission_chunks): - policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" - try: - _create_and_attach_policy(iam_client, role_name, policy_name, chunk) - except Exception as e: - LOGGER.warning(f"Failed to create/attach instrumentation policy {policy_name}: {e}. Continuing.") - - - def handle_delete(event, context): - props = event['ResourceProperties'] - role_name = props['DatadogIntegrationRole'] - account_id = props['AccountId'] - partition = props.get('Partition', 'aws') - iam_client = boto3.client('iam') - try: - cleanup_existing_policies(iam_client, role_name, account_id, partition) - cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) - cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) - except Exception as e: - LOGGER.error(f"Error deleting policy: {str(e)}") - cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) - - - def handle_create_update(event, context): - props = event['ResourceProperties'] - role_name = props['DatadogIntegrationRole'] - account_id = props['AccountId'] - partition = props.get('Partition', 'aws') - should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true' - datadog_site = props.get('DatadogSite') or 'datadoghq.com' - instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes')) - previous_instrumentation_resource_types = parse_resource_types( - event.get('OldResourceProperties', {}).get('InstrumentationResourceTypes') - ) - - try: - iam_client = boto3.client('iam') - cleanup_existing_policies(iam_client, role_name, account_id, partition) - attach_standard_permissions(iam_client, role_name) - if should_install_security_audit_policy: - attach_resource_collection_permissions(iam_client, role_name) - attach_instrumentation_permissions( - iam_client, role_name, account_id, partition, - datadog_site, instrumentation_resource_types, previous_instrumentation_resource_types, - ) - cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) - except Exception as e: - LOGGER.error(f"Error creating/attaching policy: {str(e)}") - cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) - - - def handler(event, context): - LOGGER.info("Event received: %s", json.dumps(event)) - if event['RequestType'] == 'Delete': - handle_delete(event, context) - else: - handle_create_update(event, context) - DatadogAttachIntegrationPermissionsFunctionTrigger: - Type: Custom::DatadogAttachIntegrationPermissionsFunctionTrigger + # Attaches the standard, resource-collection, and instrumentation IAM policies to the role + # above. Extracted into a nested template so the same custom resource can be reused by the + # post-setup Agent installation add-on (main_agent_installation.yaml). + DatadogIntegrationPermissionsStack: + Type: AWS::CloudFormation::Stack DependsOn: DatadogIntegrationRole Properties: - ServiceToken: !GetAtt DatadogAttachIntegrationPermissionsFunction.Arn - DatadogIntegrationRole: !Ref IAMRoleName - AccountId: !Ref AWS::AccountId - Partition: !Sub "${AWS::Partition}" - ResourceCollectionPermissions: !Ref ResourceCollectionPermissions - InstrumentationResourceTypes: !Ref InstrumentationResourceTypes - DatadogSite: !Ref DatadogSite + TemplateURL: "https://.s3.amazonaws.com/aws//datadog_integration_permissions.yaml" + Parameters: + IAMRoleName: !Ref IAMRoleName + ResourceCollectionPermissions: !Ref ResourceCollectionPermissions + InstrumentationResourceTypes: !Ref InstrumentationResourceTypes + DatadogSite: !Ref DatadogSite + ManageBasePermissions: true Metadata: AWS::CloudFormation::Interface: ParameterGroups: diff --git a/aws_quickstart/main_agent_installation.yaml b/aws_quickstart/main_agent_installation.yaml new file mode 100644 index 00000000..0c03cd55 --- /dev/null +++ b/aws_quickstart/main_agent_installation.yaml @@ -0,0 +1,113 @@ +# version: +# +# Post-setup Agent installation add-on. Lets customers who declined the Agent installation option +# during initial AWS integration setup enable it later, against an existing integration role: it +# attaches the instrumentation IAM permissions and deploys the EventBridge forwarding pipeline, +# without touching the standard or resource-collection policies owned by the role stack. +# +AWSTemplateFormatVersion: 2010-09-09 +Description: Datadog AWS Integration - Agent installation add-on +Parameters: + APIKey: + Description: >- + API key for the Datadog account (find at https://app.datadoghq.com/organization-settings/api-keys). + Type: String + NoEcho: true + APPKey: + Description: >- + APP key for the Datadog account (find at https://app.datadoghq.com/organization-settings/application-keys). + Type: String + NoEcho: true + DatadogSite: + Type: String + Default: datadoghq.com + Description: >- + Define your Datadog Site to send data to. + Allowed values: datadoghq.com, datadoghq.eu, us3.datadoghq.com, us5.datadoghq.com, + ap1.datadoghq.com, ap2.datadoghq.com, uk1.datadoghq.com, ddog-gov.com (GovCloud), us2.ddog-gov.com (GovCloud). + IAMRoleName: + Description: Name of the existing IAM role used by the Datadog AWS integration. + Type: String + Default: DatadogIntegrationRole + AccountId: + Type: String + Description: The AWS account ID of the account integrated in Datadog. + InstrumentationResourceTypes: + Type: CommaDelimitedList + Description: >- + Comma-separated list of AWS resource types (UDM form, e.g. aws:ec2:instance, aws:ecs:cluster, aws:eks:cluster) + to enable Datadog Agent installation for. The integration role is granted the IAM permissions required to + instrument these resources, and CloudTrail events for them are forwarded to Datadog. +Rules: + ValidateAccountId: + Assertions: + - Assert: !Equals [!Ref AccountId, !Ref "AWS::AccountId"] + AssertDescription: "The AWS Account Id of the account integrated in Datadog does not match the AWS Account Id of the account where this stack will be created." +Conditions: + ShouldForwardEvents: + Fn::Not: + - Fn::Equals: + - !Join ["", !Ref InstrumentationResourceTypes] + - "" +Resources: + # Attaches only the instrumentation IAM policies to the existing integration role. ManageBasePermissions + # is false so the standard and resource-collection policies owned by the role stack are left untouched. + DatadogIntegrationPermissionsStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: "https://.s3.amazonaws.com/aws//datadog_integration_permissions.yaml" + Parameters: + IAMRoleName: !Ref IAMRoleName + ResourceCollectionPermissions: false + InstrumentationResourceTypes: !Join [",", !Ref InstrumentationResourceTypes] + DatadogSite: !Ref DatadogSite + ManageBasePermissions: false + # EventBridge pipeline forwarding CloudTrail events to the Datadog resource update intake. + # Deployed only when at least one InstrumentationResourceTypes value is set; single-region + # (covers the region this stack is deployed in). + DatadogAgentResourceUpdateForwardingStack: + Type: AWS::CloudFormation::Stack + Condition: ShouldForwardEvents + Properties: + TemplateURL: "https://.s3.amazonaws.com/aws//datadog_agent_resource_update_forwarding.yaml" + Parameters: + APIKey: !Ref APIKey + APPKey: !Ref APPKey + DatadogSite: !Ref DatadogSite + InstrumentationResourceTypes: !Join [",", !Ref InstrumentationResourceTypes] +Outputs: + IAMRoleName: + Description: AWS IAM Role named to be used with the DataDog AWS Integration + Value: !Ref IAMRoleName + AccountId: + Description: AWS Account number + Value: !Ref "AWS::AccountId" + Region: + Description: AWS Region + Value: !Ref "AWS::Region" +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: Required + Parameters: + - APIKey + - APPKey + - DatadogSite + - AccountId + - InstrumentationResourceTypes + - Label: + default: Advanced + Parameters: + - IAMRoleName + ParameterLabels: + APIKey: + default: "DatadogApiKey *" + APPKey: + default: "DatadogAppKey *" + DatadogSite: + default: "DatadogSite *" + AccountId: + default: "AccountId *" + InstrumentationResourceTypes: + default: "InstrumentationResourceTypes *" diff --git a/aws_quickstart/release.sh b/aws_quickstart/release.sh index 3b8728f2..d26ac07a 100755 --- a/aws_quickstart/release.sh +++ b/aws_quickstart/release.sh @@ -116,7 +116,7 @@ cp datadog_agentless_api_call.py "${TEMP_DIR}/" cd "${TEMP_DIR}" # Update placeholder -for template in main_workflow.yaml main_extended_workflow.yaml main_v2.yaml main_extended.yaml; do +for template in main_workflow.yaml main_extended_workflow.yaml main_v2.yaml main_extended.yaml datadog_integration_role.yaml main_agent_installation.yaml; do perl -pi -e "s//${BUCKET}/g" $template perl -pi -e "s//${VERSION}/g" $template done diff --git a/aws_quickstart/version.txt b/aws_quickstart/version.txt index c4475d31..cabad0ce 100644 --- a/aws_quickstart/version.txt +++ b/aws_quickstart/version.txt @@ -1 +1 @@ -v4.13.0 +v4.14.0 From 0d3587a81dbf372e54f97ba06131adb1c662bd03 Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Mon, 8 Jun 2026 23:49:08 -0400 Subject: [PATCH 02/11] Use -v2 policy names in extracted permissions template for safe upgrades Extracting the permission-attach custom resource out of the role template means an in-place upgrade from a release that still had the inline trigger (<= v4.13) removes that old trigger, whose Delete handler deletes the standard/resource-collection/instrumentation policies by their un-suffixed names. CloudFormation can run that delete after the new nested stack has re-attached them, leaving the integration role with no permissions. Give the attached policies a -v2 generation name so the old handler only ever cleans up the old-named policies and never touches the new ones; existing roles get a gap-free, one-time policy replacement on upgrade. The exposure was on the standard + resource-collection policies (shipped since v4.4.0), so it affects existing integrations, not just the unreleased Agent-installation instrumentation policies. Add a test locking in that the new names are disjoint from the legacy ones. Co-Authored-By: Claude Opus 4.8 (1M context) --- aws_quickstart/CHANGELOG.md | 2 +- .../attach_integration_permissions.py | 12 ++++++-- .../attach_integration_permissions_test.py | 28 +++++++++++++++++++ .../datadog_integration_permissions.yaml | 16 +++++++---- 4 files changed, 49 insertions(+), 9 deletions(-) diff --git a/aws_quickstart/CHANGELOG.md b/aws_quickstart/CHANGELOG.md index 16620c96..41e739a5 100644 --- a/aws_quickstart/CHANGELOG.md +++ b/aws_quickstart/CHANGELOG.md @@ -1,6 +1,6 @@ # 4.14.0 (June 8, 2026) -- Add `main_agent_installation.yaml`, a standalone add-on template that lets customers who declined the Agent installation option during initial AWS integration setup enable it later against an existing integration role. It attaches the instrumentation IAM permissions and deploys the EventBridge forwarding pipeline without touching the standard or resource-collection policies owned by the role stack. To keep this DRY, the permission-attachment custom resource was extracted out of `datadog_integration_role.yaml` into a new nested `datadog_integration_permissions.yaml` (gated by a `ManageBasePermissions` flag) that both the role template and the add-on consume. Behavior of `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml` is unchanged. Affects `datadog_integration_role.yaml`, `datadog_integration_permissions.yaml`, `main_agent_installation.yaml`, and `release.sh`. +- Add `main_agent_installation.yaml`, a standalone add-on template that lets customers who declined the Agent installation option during initial AWS integration setup enable it later against an existing integration role. It attaches the instrumentation IAM permissions and deploys the EventBridge forwarding pipeline without touching the standard or resource-collection policies owned by the role stack. To keep this DRY, the permission-attachment custom resource was extracted out of `datadog_integration_role.yaml` into a new nested `datadog_integration_permissions.yaml` (gated by a `ManageBasePermissions` flag) that both the role template and the add-on consume. The managed/inline policies the custom resource attaches were given a `-v2` generation name (e.g. `datadog-aws-integration-resource-collection-permissions-v2--N`, `DatadogAWSIntegrationPolicyV2`) so that an in-place upgrade from a prior version (whose now-removed inline trigger deletes the un-suffixed policy names on teardown) cannot wipe the policies the new nested stack re-attaches; existing roles see a one-time, gap-free policy replacement on upgrade. Behavior of `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml` is otherwise unchanged. Affects `datadog_integration_role.yaml`, `datadog_integration_permissions.yaml`, `main_agent_installation.yaml`, and `release.sh`. # 4.13.0 (May 29, 2026) diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index 532969aa..9a59906d 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -10,9 +10,15 @@ LOGGER = logging.getLogger() LOGGER.setLevel(logging.INFO) API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" -POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" -BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" -BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" +# The "-v2" generation suffix is load-bearing, not cosmetic. Extracting this custom resource +# out of datadog_integration_role.yaml means an in-place upgrade from a release that still had +# the inline trigger (<= v4.13) removes that old trigger, whose Delete handler deletes policies +# by the un-suffixed names. CloudFormation can run that delete after this nested stack has +# re-attached them, which would leave the role with no permissions. Using distinct names here +# means the old handler only ever cleans up the old-named policies and never touches these. +POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicyV2" +BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions-v2" +BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions-v2" STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py index 310d58f7..05908fad 100644 --- a/aws_quickstart/attach_integration_permissions_test.py +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -21,11 +21,21 @@ cleanup_instrumentation_policies, handle_create_update, handle_delete, + POLICY_NAME_STANDARD, BASE_POLICY_PREFIX_INSTRUMENTATION, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, ) +# Policy names the pre-extraction inline trigger (<= v4.13) created and, crucially, deletes by name +# in its Delete handler. The extracted custom resource must NOT reuse these, or an in-place upgrade +# could let the old handler wipe the freshly re-attached policies. See the comment on the constants +# in datadog_integration_permissions.yaml. +LEGACY_POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" +LEGACY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" +LEGACY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" + + class TestParseResourceTypes(unittest.TestCase): def test_none(self): self.assertEqual(parse_resource_types(None), []) @@ -277,5 +287,23 @@ def test_delete_manage_base_true_cleans_both( mock_cleanup_instr.assert_called_once() +class TestUpgradeSafePolicyNames(unittest.TestCase): + # Guards the invariant that makes in-place upgrades from the inline-trigger era safe: the names + # this template manages must be disjoint from the names the legacy (<= v4.13) Delete handler + # removes, so the old handler can never wipe a policy this stack just attached. + role = "DatadogIntegrationRole" + + def test_standard_policy_name_differs_from_legacy(self): + self.assertNotEqual(POLICY_NAME_STANDARD, LEGACY_POLICY_NAME_STANDARD) + + def test_managed_policy_names_disjoint_from_legacy(self): + def names(prefix): + return {f"{prefix}-{self.role}-{i+1}" for i in range(10)} + + new_names = names(BASE_POLICY_PREFIX_RESOURCE_COLLECTION) | names(BASE_POLICY_PREFIX_INSTRUMENTATION) + legacy_names = names(LEGACY_PREFIX_RESOURCE_COLLECTION) | names(LEGACY_PREFIX_INSTRUMENTATION) + self.assertEqual(new_names & legacy_names, set()) + + if __name__ == "__main__": unittest.main() diff --git a/aws_quickstart/datadog_integration_permissions.yaml b/aws_quickstart/datadog_integration_permissions.yaml index 91bb904d..17aaa2e6 100644 --- a/aws_quickstart/datadog_integration_permissions.yaml +++ b/aws_quickstart/datadog_integration_permissions.yaml @@ -67,8 +67,8 @@ Resources: - iam:PutRolePolicy Resource: - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${IAMRoleName} - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-resource-collection-permissions-* - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-instrumentation-permissions-* + - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-resource-collection-permissions-v2-* + - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-instrumentation-permissions-v2-* - !Sub "arn:${AWS::Partition}:iam::aws:policy/SecurityAudit" DatadogAttachIntegrationPermissionsFunction: Type: AWS::Lambda::Function @@ -95,9 +95,15 @@ Resources: LOGGER = logging.getLogger() LOGGER.setLevel(logging.INFO) API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" - POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" - BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" - BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" + # The "-v2" generation suffix is load-bearing, not cosmetic. Extracting this custom resource + # out of datadog_integration_role.yaml means an in-place upgrade from a release that still had + # the inline trigger (<= v4.13) removes that old trigger, whose Delete handler deletes policies + # by the un-suffixed names. CloudFormation can run that delete after this nested stack has + # re-attached them, which would leave the role with no permissions. Using distinct names here + # means the old handler only ever cleans up the old-named policies and never touches these. + POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicyV2" + BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions-v2" + BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions-v2" STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" From ca2517956c93f703ec32722eb2375f58adee978c Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Tue, 9 Jun 2026 00:24:58 -0400 Subject: [PATCH 03/11] Fail the Agent installation add-on when instrumentation attach fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In add-on mode the only work the permissions stack does is attach the instrumentation policies, which are best-effort by design — so a bad IAMRoleName or transient Datadog API/IAM error would let the stack report SUCCESS while attaching nothing, leaving Agent installation silently nonfunctional. Add a FailOnInstrumentationError flag (default false, so the role-creation path keeps the ticket-mandated best-effort/non-blocking semantics) that the add-on sets to true. When set, instrumentation fetch and per-policy attach failures propagate and fail the stack; CloudFormation rolls back and the Delete handler cleans up any partial attachments. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../attach_integration_permissions.py | 14 ++++-- .../attach_integration_permissions_test.py | 46 +++++++++++++++++++ .../datadog_integration_permissions.yaml | 26 +++++++++-- aws_quickstart/main_agent_installation.yaml | 1 + 4 files changed, 81 insertions(+), 6 deletions(-) diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index 9a59906d..e785f712 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -140,9 +140,11 @@ def attach_resource_collection_permissions(iam_client, role_name): ) -def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types): - # Best-effort: instrumentation permissions are additive convenience on top of the - # integration, so any failure here is logged and swallowed rather than blocking install. +def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types, fail_on_error=False): + # Best-effort by default: instrumentation permissions are additive convenience on top of the + # integration, so any failure is logged and swallowed rather than blocking install. The + # post-setup add-on passes fail_on_error=True because attaching these policies is the stack's + # whole purpose, so a silent SUCCESS that attached nothing would be worse than a visible failure. # Fetch before cleanup so that a transient API failure on an Update leaves the # previously-attached policies in place instead of silently revoking them. if not resource_types: @@ -157,6 +159,8 @@ def attach_instrumentation_permissions(iam_client, role_name, account_id, partit LOGGER.info(f"Fetching instrumentation permissions for {resource_types} from {url}") permission_chunks = fetch_permissions_from_datadog(url) except Exception as e: + if fail_on_error: + raise LOGGER.warning( f"Failed to fetch instrumentation permissions for {resource_types}: {e}. " "Leaving any previously-attached instrumentation policies in place." @@ -169,6 +173,8 @@ def attach_instrumentation_permissions(iam_client, role_name, account_id, partit try: _create_and_attach_policy(iam_client, role_name, policy_name, chunk) except Exception as e: + if fail_on_error: + raise LOGGER.warning(f"Failed to create/attach instrumentation policy {policy_name}: {e}. Continuing.") @@ -195,6 +201,7 @@ def handle_create_update(event, context): account_id = props['AccountId'] partition = props.get('Partition', 'aws') manage_base_permissions = str(props.get('ManageBasePermissions', 'true')).lower() == 'true' + fail_on_instrumentation_error = str(props.get('FailOnInstrumentationError', 'false')).lower() == 'true' should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true' datadog_site = props.get('DatadogSite') or 'datadoghq.com' instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes')) @@ -212,6 +219,7 @@ def handle_create_update(event, context): attach_instrumentation_permissions( iam_client, role_name, account_id, partition, datadog_site, instrumentation_resource_types, previous_instrumentation_resource_types, + fail_on_error=fail_on_instrumentation_error, ) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py index 05908fad..d02bf1a6 100644 --- a/aws_quickstart/attach_integration_permissions_test.py +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -184,6 +184,29 @@ def test_per_chunk_failure_is_swallowed_and_others_continue(self, mock_urlopen): self.assertEqual(self.iam.create_policy.call_count, 3) self.assertEqual(self.iam.attach_role_policy.call_count, 2) + @patch("attach_integration_permissions.urllib.request.urlopen") + def test_fail_on_error_raises_on_fetch_failure(self, mock_urlopen): + # Add-on mode (fail_on_error=True): a fetch failure must propagate so the stack fails + # instead of silently reporting SUCCESS with nothing attached. + mock_urlopen.side_effect = HTTPError( + "u", 500, "boom", {}, BytesIO(b'{"errors":["upstream down"]}') + ) + with self.assertRaises(Exception): + attach_instrumentation_permissions( + self.iam, self.role_name, self.account_id, self.partition, self.site, + ["aws:ec2:instance"], (), fail_on_error=True, + ) + + @patch("attach_integration_permissions.urllib.request.urlopen") + def test_fail_on_error_raises_on_attach_failure(self, mock_urlopen): + mock_urlopen.return_value = self._mock_chunks_response([["chunk1:Action"]]) + self.iam.create_policy.side_effect = Exception("AccessDenied") + with self.assertRaises(Exception): + attach_instrumentation_permissions( + self.iam, self.role_name, self.account_id, self.partition, self.site, + ["aws:ec2:instance"], (), fail_on_error=True, + ) + class TestCleanup(unittest.TestCase): def setUp(self): @@ -286,6 +309,29 @@ def test_delete_manage_base_true_cleans_both( mock_cleanup_base.assert_called_once() mock_cleanup_instr.assert_called_once() + @patch("attach_integration_permissions.boto3.client") + @patch("attach_integration_permissions.attach_instrumentation_permissions") + def test_create_threads_fail_on_instrumentation_error(self, mock_instr, mock_client): + mock_client.return_value = self.iam + handle_create_update( + self._props(ManageBasePermissions="false", FailOnInstrumentationError="true"), None + ) + self.assertTrue(mock_instr.call_args.kwargs["fail_on_error"]) + + @patch("attach_integration_permissions.cfnresponse") + @patch("attach_integration_permissions.boto3.client") + @patch("attach_integration_permissions.attach_instrumentation_permissions") + def test_create_reports_failed_when_instrumentation_raises( + self, mock_instr, mock_client, mock_cfn + ): + # Add-on mode: a propagated instrumentation failure must surface as a FAILED response. + mock_client.return_value = self.iam + mock_instr.side_effect = Exception("AccessDenied") + handle_create_update( + self._props(ManageBasePermissions="false", FailOnInstrumentationError="true"), None + ) + self.assertEqual(mock_cfn.send.call_args.args[2], mock_cfn.FAILED) + class TestUpgradeSafePolicyNames(unittest.TestCase): # Guards the invariant that makes in-place upgrades from the inline-trigger era safe: the names diff --git a/aws_quickstart/datadog_integration_permissions.yaml b/aws_quickstart/datadog_integration_permissions.yaml index 17aaa2e6..20eabf06 100644 --- a/aws_quickstart/datadog_integration_permissions.yaml +++ b/aws_quickstart/datadog_integration_permissions.yaml @@ -36,6 +36,17 @@ Parameters: Set this value to "true" to manage the standard and resource-collection permissions on the role (the role-creation case). Set it to "false" to manage only the instrumentation permissions and leave the standard and resource-collection policies untouched (the post-setup add-on case). + FailOnInstrumentationError: + Type: String + Default: false + AllowedValues: + - true + - false + Description: >- + Set this value to "true" to fail the stack when the instrumentation permissions cannot be fetched + or attached. Defaults to "false" (best-effort) for the role-creation case, where instrumentation is + an optional add-on to the broader install. The post-setup add-on sets this to "true" because + attaching the instrumentation permissions is the stack's only purpose. Resources: DatadogAttachIntegrationPermissionsLambdaExecutionRole: Type: AWS::IAM::Role @@ -225,9 +236,11 @@ Resources: ) - def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types): - # Best-effort: instrumentation permissions are additive convenience on top of the - # integration, so any failure here is logged and swallowed rather than blocking install. + def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types, fail_on_error=False): + # Best-effort by default: instrumentation permissions are additive convenience on top of the + # integration, so any failure is logged and swallowed rather than blocking install. The + # post-setup add-on passes fail_on_error=True because attaching these policies is the stack's + # whole purpose, so a silent SUCCESS that attached nothing would be worse than a visible failure. # Fetch before cleanup so that a transient API failure on an Update leaves the # previously-attached policies in place instead of silently revoking them. if not resource_types: @@ -242,6 +255,8 @@ Resources: LOGGER.info(f"Fetching instrumentation permissions for {resource_types} from {url}") permission_chunks = fetch_permissions_from_datadog(url) except Exception as e: + if fail_on_error: + raise LOGGER.warning( f"Failed to fetch instrumentation permissions for {resource_types}: {e}. " "Leaving any previously-attached instrumentation policies in place." @@ -254,6 +269,8 @@ Resources: try: _create_and_attach_policy(iam_client, role_name, policy_name, chunk) except Exception as e: + if fail_on_error: + raise LOGGER.warning(f"Failed to create/attach instrumentation policy {policy_name}: {e}. Continuing.") @@ -280,6 +297,7 @@ Resources: account_id = props['AccountId'] partition = props.get('Partition', 'aws') manage_base_permissions = str(props.get('ManageBasePermissions', 'true')).lower() == 'true' + fail_on_instrumentation_error = str(props.get('FailOnInstrumentationError', 'false')).lower() == 'true' should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true' datadog_site = props.get('DatadogSite') or 'datadoghq.com' instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes')) @@ -297,6 +315,7 @@ Resources: attach_instrumentation_permissions( iam_client, role_name, account_id, partition, datadog_site, instrumentation_resource_types, previous_instrumentation_resource_types, + fail_on_error=fail_on_instrumentation_error, ) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: @@ -321,3 +340,4 @@ Resources: InstrumentationResourceTypes: !Ref InstrumentationResourceTypes DatadogSite: !Ref DatadogSite ManageBasePermissions: !Ref ManageBasePermissions + FailOnInstrumentationError: !Ref FailOnInstrumentationError diff --git a/aws_quickstart/main_agent_installation.yaml b/aws_quickstart/main_agent_installation.yaml index 0c03cd55..08f03e20 100644 --- a/aws_quickstart/main_agent_installation.yaml +++ b/aws_quickstart/main_agent_installation.yaml @@ -62,6 +62,7 @@ Resources: InstrumentationResourceTypes: !Join [",", !Ref InstrumentationResourceTypes] DatadogSite: !Ref DatadogSite ManageBasePermissions: false + FailOnInstrumentationError: true # EventBridge pipeline forwarding CloudTrail events to the Datadog resource update intake. # Deployed only when at least one InstrumentationResourceTypes value is set; single-region # (covers the region this stack is deployed in). From 0d9518dd7c66e6a11d9edd28466e6a0dd32f7925 Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Tue, 9 Jun 2026 00:39:47 -0400 Subject: [PATCH 04/11] Clean up legacy policies before attaching v2 to stay under IAM limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The -v2 rename deliberately lets the old and new policies coexist so the legacy Delete handler can't wipe the new ones. But IAM caps managed policies per role (default 10), so on an upgrade of a role that already had resource-collection and/or instrumentation policies, attaching the full v2 set on top of the legacy set could hit LimitExceeded and roll the upgrade back. Have the custom resource remove the legacy un-suffixed policies itself, in the same invocation, before attaching the v2 policies — so the two generations never sit attached at once, while the v2 naming still prevents the old handler (running later) from touching the new policies. Gated by include_base so add-on mode leaves the role stack's standard/resource- collection policies alone. Exec-role wildcards already cover both name generations. Also clarify the add-on's InstrumentationResourceTypes help: event forwarding currently covers EC2/EKS; other types get IAM permissions but no forwarding rule. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../attach_integration_permissions.py | 25 ++++++++ .../attach_integration_permissions_test.py | 58 +++++++++++++++---- .../datadog_integration_permissions.yaml | 31 +++++++++- aws_quickstart/main_agent_installation.yaml | 7 ++- 4 files changed, 105 insertions(+), 16 deletions(-) diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index e785f712..0b911c5f 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -19,6 +19,13 @@ POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicyV2" BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions-v2" BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions-v2" +# Un-suffixed names created by the pre-extraction inline trigger (<= v4.13). We clean these up +# ourselves before attaching the v2 policies so the two generations never sit attached at the +# same time (IAM caps managed policies per role, default 10); the old trigger's own Delete +# handler then no-ops against names that are already gone. +LEGACY_POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" +LEGACY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" +LEGACY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" @@ -103,6 +110,23 @@ def cleanup_instrumentation_policies(iam_client, role_name, account_id, partitio _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_INSTRUMENTATION, max_policies) +def cleanup_legacy_policies(iam_client, role_name, account_id, partition, include_base, max_policies=10): + # Remove the un-suffixed policies left by the inline trigger of releases that predate this + # nested template, so they are gone before the v2 policies are attached (avoids piling both + # generations against the IAM managed-policy limit during an in-place upgrade). include_base + # is False in add-on mode, where the role stack still owns the standard/resource-collection + # policies and this stack must not touch them. + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_INSTRUMENTATION, max_policies) + if include_base: + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, max_policies) + try: + iam_client.delete_role_policy(RoleName=role_name, PolicyName=LEGACY_POLICY_NAME_STANDARD) + except iam_client.exceptions.NoSuchEntityException: + pass + except Exception as e: + LOGGER.error(f"Error deleting legacy inline policy {LEGACY_POLICY_NAME_STANDARD}: {str(e)}") + + def attach_standard_permissions(iam_client, role_name): permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) policy_document = { @@ -211,6 +235,7 @@ def handle_create_update(event, context): try: iam_client = boto3.client('iam') + cleanup_legacy_policies(iam_client, role_name, account_id, partition, include_base=manage_base_permissions) if manage_base_permissions: cleanup_existing_policies(iam_client, role_name, account_id, partition) attach_standard_permissions(iam_client, role_name) diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py index d02bf1a6..ad6a8408 100644 --- a/aws_quickstart/attach_integration_permissions_test.py +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -19,23 +19,18 @@ attach_instrumentation_permissions, cleanup_existing_policies, cleanup_instrumentation_policies, + cleanup_legacy_policies, handle_create_update, handle_delete, POLICY_NAME_STANDARD, BASE_POLICY_PREFIX_INSTRUMENTATION, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, + LEGACY_POLICY_NAME_STANDARD, + LEGACY_PREFIX_RESOURCE_COLLECTION, + LEGACY_PREFIX_INSTRUMENTATION, ) -# Policy names the pre-extraction inline trigger (<= v4.13) created and, crucially, deletes by name -# in its Delete handler. The extracted custom resource must NOT reuse these, or an in-place upgrade -# could let the old handler wipe the freshly re-attached policies. See the comment on the constants -# in datadog_integration_permissions.yaml. -LEGACY_POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" -LEGACY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" -LEGACY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" - - class TestParseResourceTypes(unittest.TestCase): def test_none(self): self.assertEqual(parse_resource_types(None), []) @@ -231,6 +226,42 @@ def test_cleanup_instrumentation_targets_only_instrumentation_prefix(self): self.assertTrue(all(BASE_POLICY_PREFIX_INSTRUMENTATION in arn for arn in detached)) +class TestCleanupLegacyPolicies(unittest.TestCase): + # Removing the old un-suffixed policies before attaching the v2 ones is what keeps both + # generations from sitting attached at once during an in-place upgrade (IAM managed-policy limit). + def setUp(self): + self.iam = MagicMock() + self.iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) + self.iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) + self.iam.detach_role_policy.side_effect = self.iam.exceptions.NoSuchEntityException + self.iam.delete_policy.side_effect = self.iam.exceptions.NoSuchEntityException + + def _detached_arns(self): + return [c.kwargs["PolicyArn"] for c in self.iam.detach_role_policy.call_args_list] + + def test_only_targets_legacy_names_not_v2(self): + cleanup_legacy_policies(self.iam, "MyRole", "123456789012", "aws", include_base=True, max_policies=3) + for arn in self._detached_arns(): + # Legacy managed-policy ARNs must never carry the -v2 generation segment. + self.assertNotIn("-permissions-v2-", arn) + + def test_include_base_true_cleans_base_and_instrumentation(self): + cleanup_legacy_policies(self.iam, "MyRole", "123456789012", "aws", include_base=True, max_policies=3) + arns = self._detached_arns() + self.assertTrue(any(LEGACY_PREFIX_RESOURCE_COLLECTION + "-MyRole" in a for a in arns)) + self.assertTrue(any(LEGACY_PREFIX_INSTRUMENTATION + "-MyRole" in a for a in arns)) + self.iam.delete_role_policy.assert_called_once_with( + RoleName="MyRole", PolicyName=LEGACY_POLICY_NAME_STANDARD + ) + + def test_include_base_false_leaves_base_policies(self): + cleanup_legacy_policies(self.iam, "MyRole", "123456789012", "aws", include_base=False, max_policies=3) + arns = self._detached_arns() + self.assertTrue(all(LEGACY_PREFIX_RESOURCE_COLLECTION + "-MyRole" not in a for a in arns)) + self.assertTrue(any(LEGACY_PREFIX_INSTRUMENTATION + "-MyRole" in a for a in arns)) + self.iam.delete_role_policy.assert_not_called() + + class TestManageBasePermissions(unittest.TestCase): # ManageBasePermissions gates the standard + resource-collection policies. The role-creation # path sets it true (manage everything); the post-setup add-on sets it false so it manages only @@ -253,13 +284,14 @@ def _props(self, **overrides): props.update(overrides) return {"RequestType": "Create", "ResourceProperties": props} + @patch("attach_integration_permissions.cleanup_legacy_policies") @patch("attach_integration_permissions.boto3.client") @patch("attach_integration_permissions.attach_instrumentation_permissions") @patch("attach_integration_permissions.attach_resource_collection_permissions") @patch("attach_integration_permissions.attach_standard_permissions") @patch("attach_integration_permissions.cleanup_existing_policies") def test_create_manage_base_true_attaches_base( - self, mock_cleanup, mock_standard, mock_rc, mock_instr, mock_client + self, mock_cleanup, mock_standard, mock_rc, mock_instr, mock_client, mock_legacy ): mock_client.return_value = self.iam handle_create_update(self._props(ManageBasePermissions="true"), None) @@ -267,14 +299,16 @@ def test_create_manage_base_true_attaches_base( mock_standard.assert_called_once() mock_rc.assert_called_once() mock_instr.assert_called_once() + self.assertTrue(mock_legacy.call_args.kwargs["include_base"]) + @patch("attach_integration_permissions.cleanup_legacy_policies") @patch("attach_integration_permissions.boto3.client") @patch("attach_integration_permissions.attach_instrumentation_permissions") @patch("attach_integration_permissions.attach_resource_collection_permissions") @patch("attach_integration_permissions.attach_standard_permissions") @patch("attach_integration_permissions.cleanup_existing_policies") def test_create_manage_base_false_only_instrumentation( - self, mock_cleanup, mock_standard, mock_rc, mock_instr, mock_client + self, mock_cleanup, mock_standard, mock_rc, mock_instr, mock_client, mock_legacy ): mock_client.return_value = self.iam handle_create_update(self._props(ManageBasePermissions="false"), None) @@ -282,6 +316,8 @@ def test_create_manage_base_false_only_instrumentation( mock_standard.assert_not_called() mock_rc.assert_not_called() mock_instr.assert_called_once() + # Add-on mode must not touch the role stack's standard/resource-collection policies. + self.assertFalse(mock_legacy.call_args.kwargs["include_base"]) @patch("attach_integration_permissions.boto3.client") @patch("attach_integration_permissions.cleanup_instrumentation_policies") diff --git a/aws_quickstart/datadog_integration_permissions.yaml b/aws_quickstart/datadog_integration_permissions.yaml index 20eabf06..22029f95 100644 --- a/aws_quickstart/datadog_integration_permissions.yaml +++ b/aws_quickstart/datadog_integration_permissions.yaml @@ -77,9 +77,11 @@ Resources: - iam:DetachRolePolicy - iam:PutRolePolicy Resource: + # Wildcards cover both the v2 names this template creates and the un-suffixed legacy + # names it cleans up on an in-place upgrade. - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${IAMRoleName} - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-resource-collection-permissions-v2-* - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-instrumentation-permissions-v2-* + - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-resource-collection-permissions-* + - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-instrumentation-permissions-* - !Sub "arn:${AWS::Partition}:iam::aws:policy/SecurityAudit" DatadogAttachIntegrationPermissionsFunction: Type: AWS::Lambda::Function @@ -115,6 +117,13 @@ Resources: POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicyV2" BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions-v2" BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions-v2" + # Un-suffixed names created by the pre-extraction inline trigger (<= v4.13). We clean these up + # ourselves before attaching the v2 policies so the two generations never sit attached at the + # same time (IAM caps managed policies per role, default 10); the old trigger's own Delete + # handler then no-ops against names that are already gone. + LEGACY_POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" + LEGACY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" + LEGACY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" @@ -199,6 +208,23 @@ Resources: _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_INSTRUMENTATION, max_policies) + def cleanup_legacy_policies(iam_client, role_name, account_id, partition, include_base, max_policies=10): + # Remove the un-suffixed policies left by the inline trigger of releases that predate this + # nested template, so they are gone before the v2 policies are attached (avoids piling both + # generations against the IAM managed-policy limit during an in-place upgrade). include_base + # is False in add-on mode, where the role stack still owns the standard/resource-collection + # policies and this stack must not touch them. + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_INSTRUMENTATION, max_policies) + if include_base: + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, max_policies) + try: + iam_client.delete_role_policy(RoleName=role_name, PolicyName=LEGACY_POLICY_NAME_STANDARD) + except iam_client.exceptions.NoSuchEntityException: + pass + except Exception as e: + LOGGER.error(f"Error deleting legacy inline policy {LEGACY_POLICY_NAME_STANDARD}: {str(e)}") + + def attach_standard_permissions(iam_client, role_name): permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) policy_document = { @@ -307,6 +333,7 @@ Resources: try: iam_client = boto3.client('iam') + cleanup_legacy_policies(iam_client, role_name, account_id, partition, include_base=manage_base_permissions) if manage_base_permissions: cleanup_existing_policies(iam_client, role_name, account_id, partition) attach_standard_permissions(iam_client, role_name) diff --git a/aws_quickstart/main_agent_installation.yaml b/aws_quickstart/main_agent_installation.yaml index 08f03e20..9ea6a8eb 100644 --- a/aws_quickstart/main_agent_installation.yaml +++ b/aws_quickstart/main_agent_installation.yaml @@ -35,9 +35,10 @@ Parameters: InstrumentationResourceTypes: Type: CommaDelimitedList Description: >- - Comma-separated list of AWS resource types (UDM form, e.g. aws:ec2:instance, aws:ecs:cluster, aws:eks:cluster) - to enable Datadog Agent installation for. The integration role is granted the IAM permissions required to - instrument these resources, and CloudTrail events for them are forwarded to Datadog. + Comma-separated list of AWS resource types (UDM form, e.g. aws:ec2:instance, aws:eks:cluster) to enable + Datadog Agent installation for. The integration role is granted the IAM permissions required to instrument + these resources. CloudTrail update events are forwarded to Datadog for the supported resource types + (currently aws:ec2:instance and aws:eks:cluster); other types receive IAM permissions but no event forwarding. Rules: ValidateAccountId: Assertions: From 97e3688abfdc33e8b36f2c5e56d72527da716a25 Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Tue, 9 Jun 2026 00:51:51 -0400 Subject: [PATCH 05/11] Defer legacy instrumentation cleanup until the v2 fetch succeeds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit's legacy cleanup deleted the legacy instrumentation policies up front, before attach_instrumentation_permissions fetches the v2 replacements. On the role path that attach is best-effort, so a transient Datadog API failure would return SUCCESS having deleted the old policies and attached nothing — silently revoking the role's instrumentation permissions, the exact failure the fetch-before-cleanup design avoids. Split the legacy cleanup: standard/resource-collection (which fail loud on error) are still removed up front in cleanup_legacy_base_policies, but the legacy instrumentation policies are now removed inside attach_instrumentation_permissions, only after a successful fetch. The IAM managed-policy-limit math still holds — legacy RC is gone before v2 RC is attached, so peak attachments stay around the role's original count. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../attach_integration_permissions.py | 37 +++++++++------- .../attach_integration_permissions_test.py | 44 +++++++++++-------- .../datadog_integration_permissions.yaml | 37 +++++++++------- 3 files changed, 68 insertions(+), 50 deletions(-) diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index 0b911c5f..01db2881 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -110,21 +110,21 @@ def cleanup_instrumentation_policies(iam_client, role_name, account_id, partitio _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_INSTRUMENTATION, max_policies) -def cleanup_legacy_policies(iam_client, role_name, account_id, partition, include_base, max_policies=10): - # Remove the un-suffixed policies left by the inline trigger of releases that predate this - # nested template, so they are gone before the v2 policies are attached (avoids piling both - # generations against the IAM managed-policy limit during an in-place upgrade). include_base - # is False in add-on mode, where the role stack still owns the standard/resource-collection - # policies and this stack must not touch them. - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_INSTRUMENTATION, max_policies) - if include_base: - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, max_policies) - try: - iam_client.delete_role_policy(RoleName=role_name, PolicyName=LEGACY_POLICY_NAME_STANDARD) - except iam_client.exceptions.NoSuchEntityException: - pass - except Exception as e: - LOGGER.error(f"Error deleting legacy inline policy {LEGACY_POLICY_NAME_STANDARD}: {str(e)}") +def cleanup_legacy_base_policies(iam_client, role_name, account_id, partition, max_policies=10): + # Remove the un-suffixed standard + resource-collection policies left by the inline trigger of + # releases that predate this nested template, so they are gone before the v2 standard/RC + # policies are attached (keeps both generations from piling up against the IAM managed-policy + # limit during an in-place upgrade). Only the role-creation path calls this; the add-on must + # not touch the standard/RC policies the role stack owns. The legacy *instrumentation* policies + # are handled in attach_instrumentation_permissions instead, where cleanup is deferred until a + # successful fetch so a transient API failure can't leave the role with no instrumentation. + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, max_policies) + try: + iam_client.delete_role_policy(RoleName=role_name, PolicyName=LEGACY_POLICY_NAME_STANDARD) + except iam_client.exceptions.NoSuchEntityException: + pass + except Exception as e: + LOGGER.error(f"Error deleting legacy inline policy {LEGACY_POLICY_NAME_STANDARD}: {str(e)}") def attach_standard_permissions(iam_client, role_name): @@ -191,7 +191,12 @@ def attach_instrumentation_permissions(iam_client, role_name, account_id, partit ) return + # Only now that the fetch has succeeded do we remove the existing instrumentation policies — + # both this template's v2 names and any legacy un-suffixed ones from a pre-extraction upgrade. + # Deferring until here is what keeps a transient fetch failure above from leaving the role + # with no instrumentation permissions. cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_INSTRUMENTATION) for i, chunk in enumerate(permission_chunks): policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" try: @@ -235,8 +240,8 @@ def handle_create_update(event, context): try: iam_client = boto3.client('iam') - cleanup_legacy_policies(iam_client, role_name, account_id, partition, include_base=manage_base_permissions) if manage_base_permissions: + cleanup_legacy_base_policies(iam_client, role_name, account_id, partition) cleanup_existing_policies(iam_client, role_name, account_id, partition) attach_standard_permissions(iam_client, role_name) if should_install_security_audit_policy: diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py index ad6a8408..06338f3b 100644 --- a/aws_quickstart/attach_integration_permissions_test.py +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -19,7 +19,7 @@ attach_instrumentation_permissions, cleanup_existing_policies, cleanup_instrumentation_policies, - cleanup_legacy_policies, + cleanup_legacy_base_policies, handle_create_update, handle_delete, POLICY_NAME_STANDARD, @@ -147,11 +147,21 @@ def test_happy_path_attaches_each_chunk(self, mock_urlopen): sent_request = mock_urlopen.call_args[0][0] self.assertEqual(sent_request.headers.get("Dd-aws-api-call-source"), "cfn-quickstart") + @patch("attach_integration_permissions.urllib.request.urlopen") + def test_cleans_legacy_instrumentation_after_successful_fetch(self, mock_urlopen): + # Upgrade case: once the v2 permissions are fetched, the legacy un-suffixed instrumentation + # policies are removed (so they don't linger), but only after the fetch succeeds. + mock_urlopen.return_value = self._mock_chunks_response([["ec2:Describe*"]]) + self._attach(["aws:ec2:instance"]) + detached = [c.kwargs["PolicyArn"] for c in self.iam.detach_role_policy.call_args_list] + self.assertTrue(any(LEGACY_PREFIX_INSTRUMENTATION + "-" + self.role_name in a for a in detached)) + @patch("attach_integration_permissions.urllib.request.urlopen") def test_fetch_failure_preserves_existing_policies(self, mock_urlopen): # Regression: a transient API failure on Update must not silently revoke the - # previously-attached instrumentation policies. The function must neither - # call detach_role_policy / delete_policy nor raise. + # previously-attached instrumentation policies — including the legacy un-suffixed + # ones during an upgrade, since their cleanup is deferred until after a successful + # fetch. The function must neither call detach_role_policy / delete_policy nor raise. mock_urlopen.side_effect = HTTPError( "u", 500, "boom", {}, BytesIO(b'{"errors":["upstream down"]}') ) @@ -226,8 +236,8 @@ def test_cleanup_instrumentation_targets_only_instrumentation_prefix(self): self.assertTrue(all(BASE_POLICY_PREFIX_INSTRUMENTATION in arn for arn in detached)) -class TestCleanupLegacyPolicies(unittest.TestCase): - # Removing the old un-suffixed policies before attaching the v2 ones is what keeps both +class TestCleanupLegacyBasePolicies(unittest.TestCase): + # Removing the old un-suffixed base policies before attaching the v2 ones is what keeps both # generations from sitting attached at once during an in-place upgrade (IAM managed-policy limit). def setUp(self): self.iam = MagicMock() @@ -240,26 +250,24 @@ def _detached_arns(self): return [c.kwargs["PolicyArn"] for c in self.iam.detach_role_policy.call_args_list] def test_only_targets_legacy_names_not_v2(self): - cleanup_legacy_policies(self.iam, "MyRole", "123456789012", "aws", include_base=True, max_policies=3) + cleanup_legacy_base_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=3) for arn in self._detached_arns(): # Legacy managed-policy ARNs must never carry the -v2 generation segment. self.assertNotIn("-permissions-v2-", arn) - def test_include_base_true_cleans_base_and_instrumentation(self): - cleanup_legacy_policies(self.iam, "MyRole", "123456789012", "aws", include_base=True, max_policies=3) + def test_cleans_legacy_resource_collection_and_standard(self): + cleanup_legacy_base_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=3) arns = self._detached_arns() self.assertTrue(any(LEGACY_PREFIX_RESOURCE_COLLECTION + "-MyRole" in a for a in arns)) - self.assertTrue(any(LEGACY_PREFIX_INSTRUMENTATION + "-MyRole" in a for a in arns)) self.iam.delete_role_policy.assert_called_once_with( RoleName="MyRole", PolicyName=LEGACY_POLICY_NAME_STANDARD ) - def test_include_base_false_leaves_base_policies(self): - cleanup_legacy_policies(self.iam, "MyRole", "123456789012", "aws", include_base=False, max_policies=3) + def test_does_not_touch_instrumentation(self): + # Legacy instrumentation cleanup is deferred to attach_instrumentation_permissions (post-fetch). + cleanup_legacy_base_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=3) arns = self._detached_arns() - self.assertTrue(all(LEGACY_PREFIX_RESOURCE_COLLECTION + "-MyRole" not in a for a in arns)) - self.assertTrue(any(LEGACY_PREFIX_INSTRUMENTATION + "-MyRole" in a for a in arns)) - self.iam.delete_role_policy.assert_not_called() + self.assertTrue(all(LEGACY_PREFIX_INSTRUMENTATION + "-MyRole" not in a for a in arns)) class TestManageBasePermissions(unittest.TestCase): @@ -284,7 +292,7 @@ def _props(self, **overrides): props.update(overrides) return {"RequestType": "Create", "ResourceProperties": props} - @patch("attach_integration_permissions.cleanup_legacy_policies") + @patch("attach_integration_permissions.cleanup_legacy_base_policies") @patch("attach_integration_permissions.boto3.client") @patch("attach_integration_permissions.attach_instrumentation_permissions") @patch("attach_integration_permissions.attach_resource_collection_permissions") @@ -299,9 +307,9 @@ def test_create_manage_base_true_attaches_base( mock_standard.assert_called_once() mock_rc.assert_called_once() mock_instr.assert_called_once() - self.assertTrue(mock_legacy.call_args.kwargs["include_base"]) + mock_legacy.assert_called_once() - @patch("attach_integration_permissions.cleanup_legacy_policies") + @patch("attach_integration_permissions.cleanup_legacy_base_policies") @patch("attach_integration_permissions.boto3.client") @patch("attach_integration_permissions.attach_instrumentation_permissions") @patch("attach_integration_permissions.attach_resource_collection_permissions") @@ -317,7 +325,7 @@ def test_create_manage_base_false_only_instrumentation( mock_rc.assert_not_called() mock_instr.assert_called_once() # Add-on mode must not touch the role stack's standard/resource-collection policies. - self.assertFalse(mock_legacy.call_args.kwargs["include_base"]) + mock_legacy.assert_not_called() @patch("attach_integration_permissions.boto3.client") @patch("attach_integration_permissions.cleanup_instrumentation_policies") diff --git a/aws_quickstart/datadog_integration_permissions.yaml b/aws_quickstart/datadog_integration_permissions.yaml index 22029f95..11b5e14c 100644 --- a/aws_quickstart/datadog_integration_permissions.yaml +++ b/aws_quickstart/datadog_integration_permissions.yaml @@ -208,21 +208,21 @@ Resources: _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_INSTRUMENTATION, max_policies) - def cleanup_legacy_policies(iam_client, role_name, account_id, partition, include_base, max_policies=10): - # Remove the un-suffixed policies left by the inline trigger of releases that predate this - # nested template, so they are gone before the v2 policies are attached (avoids piling both - # generations against the IAM managed-policy limit during an in-place upgrade). include_base - # is False in add-on mode, where the role stack still owns the standard/resource-collection - # policies and this stack must not touch them. - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_INSTRUMENTATION, max_policies) - if include_base: - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, max_policies) - try: - iam_client.delete_role_policy(RoleName=role_name, PolicyName=LEGACY_POLICY_NAME_STANDARD) - except iam_client.exceptions.NoSuchEntityException: - pass - except Exception as e: - LOGGER.error(f"Error deleting legacy inline policy {LEGACY_POLICY_NAME_STANDARD}: {str(e)}") + def cleanup_legacy_base_policies(iam_client, role_name, account_id, partition, max_policies=10): + # Remove the un-suffixed standard + resource-collection policies left by the inline trigger of + # releases that predate this nested template, so they are gone before the v2 standard/RC + # policies are attached (keeps both generations from piling up against the IAM managed-policy + # limit during an in-place upgrade). Only the role-creation path calls this; the add-on must + # not touch the standard/RC policies the role stack owns. The legacy *instrumentation* policies + # are handled in attach_instrumentation_permissions instead, where cleanup is deferred until a + # successful fetch so a transient API failure can't leave the role with no instrumentation. + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, max_policies) + try: + iam_client.delete_role_policy(RoleName=role_name, PolicyName=LEGACY_POLICY_NAME_STANDARD) + except iam_client.exceptions.NoSuchEntityException: + pass + except Exception as e: + LOGGER.error(f"Error deleting legacy inline policy {LEGACY_POLICY_NAME_STANDARD}: {str(e)}") def attach_standard_permissions(iam_client, role_name): @@ -289,7 +289,12 @@ Resources: ) return + # Only now that the fetch has succeeded do we remove the existing instrumentation policies — + # both this template's v2 names and any legacy un-suffixed ones from a pre-extraction upgrade. + # Deferring until here is what keeps a transient fetch failure above from leaving the role + # with no instrumentation permissions. cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_INSTRUMENTATION) for i, chunk in enumerate(permission_chunks): policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" try: @@ -333,8 +338,8 @@ Resources: try: iam_client = boto3.client('iam') - cleanup_legacy_policies(iam_client, role_name, account_id, partition, include_base=manage_base_permissions) if manage_base_permissions: + cleanup_legacy_base_policies(iam_client, role_name, account_id, partition) cleanup_existing_policies(iam_client, role_name, account_id, partition) attach_standard_permissions(iam_client, role_name) if should_install_security_audit_policy: From 42f7f6f994f9d46f4ecc2c3ac354d50f2d720a4e Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Tue, 9 Jun 2026 11:11:21 -0400 Subject: [PATCH 06/11] Revert legacy-instrumentation handling now that instrumentation is unreleased MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Agent-installation/instrumentation feature is not exposed in any released flow (InstrumentationResourceTypes defaults empty and no released launch sets it), so no field stack carries legacy instrumentation policies. The defensive handling added for that case is therefore dead weight: - Instrumentation policies revert to the un-suffixed name (no -v2). The -v2 generation only matters for standard/resource-collection, which shipped in v4.4.0 and must survive the removed inline trigger's destructive Delete on an in-place upgrade. Instrumentation has no such field history. - Drop the deferred legacy-instrumentation cleanup in attach_instrumentation_permissions and the LEGACY_PREFIX_INSTRUMENTATION constant; cleanup_legacy_base_policies now covers only standard/RC. The standard/resource-collection upgrade fix (v2 names + legacy base cleanup before attach to stay under the IAM managed-policy limit) is retained — that exposure is real and released. Tests and CHANGELOG updated to match. Co-Authored-By: Claude Opus 4.8 (1M context) --- aws_quickstart/CHANGELOG.md | 4 +-- .../attach_integration_permissions.py | 33 +++++++++-------- .../attach_integration_permissions_test.py | 36 ++++++++----------- .../datadog_integration_permissions.yaml | 33 +++++++++-------- 4 files changed, 48 insertions(+), 58 deletions(-) diff --git a/aws_quickstart/CHANGELOG.md b/aws_quickstart/CHANGELOG.md index 41e739a5..16338464 100644 --- a/aws_quickstart/CHANGELOG.md +++ b/aws_quickstart/CHANGELOG.md @@ -1,6 +1,6 @@ -# 4.14.0 (June 8, 2026) +# 4.14.0 (June 9, 2026) -- Add `main_agent_installation.yaml`, a standalone add-on template that lets customers who declined the Agent installation option during initial AWS integration setup enable it later against an existing integration role. It attaches the instrumentation IAM permissions and deploys the EventBridge forwarding pipeline without touching the standard or resource-collection policies owned by the role stack. To keep this DRY, the permission-attachment custom resource was extracted out of `datadog_integration_role.yaml` into a new nested `datadog_integration_permissions.yaml` (gated by a `ManageBasePermissions` flag) that both the role template and the add-on consume. The managed/inline policies the custom resource attaches were given a `-v2` generation name (e.g. `datadog-aws-integration-resource-collection-permissions-v2--N`, `DatadogAWSIntegrationPolicyV2`) so that an in-place upgrade from a prior version (whose now-removed inline trigger deletes the un-suffixed policy names on teardown) cannot wipe the policies the new nested stack re-attaches; existing roles see a one-time, gap-free policy replacement on upgrade. Behavior of `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml` is otherwise unchanged. Affects `datadog_integration_role.yaml`, `datadog_integration_permissions.yaml`, `main_agent_installation.yaml`, and `release.sh`. +- Add `main_agent_installation.yaml`, a standalone add-on template that lets customers who declined the Agent installation option during initial AWS integration setup enable it later against an existing integration role. It attaches the instrumentation IAM permissions and deploys the EventBridge forwarding pipeline without touching the standard or resource-collection policies owned by the role stack. To keep this DRY, the permission-attachment custom resource was extracted out of `datadog_integration_role.yaml` into a new nested `datadog_integration_permissions.yaml` (gated by a `ManageBasePermissions` flag) that both the role template and the add-on consume. The standard and resource-collection policies the custom resource attaches were given a `-v2` generation name (e.g. `datadog-aws-integration-resource-collection-permissions-v2--N`, `DatadogAWSIntegrationPolicyV2`) so that an in-place upgrade from a prior version (whose now-removed inline trigger deletes the un-suffixed policy names on teardown) cannot wipe the policies the new nested stack re-attaches; existing roles see a one-time, gap-free policy replacement on upgrade, and the legacy-named policies are cleaned up before the v2 ones are attached so they don't pile up against the IAM managed-policy limit. Behavior of `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml` is otherwise unchanged. Affects `datadog_integration_role.yaml`, `datadog_integration_permissions.yaml`, `main_agent_installation.yaml`, and `release.sh`. # 4.13.0 (May 29, 2026) diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index 01db2881..1bcf52ec 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -10,22 +10,24 @@ LOGGER = logging.getLogger() LOGGER.setLevel(logging.INFO) API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" -# The "-v2" generation suffix is load-bearing, not cosmetic. Extracting this custom resource -# out of datadog_integration_role.yaml means an in-place upgrade from a release that still had -# the inline trigger (<= v4.13) removes that old trigger, whose Delete handler deletes policies -# by the un-suffixed names. CloudFormation can run that delete after this nested stack has -# re-attached them, which would leave the role with no permissions. Using distinct names here -# means the old handler only ever cleans up the old-named policies and never touches these. +# The "-v2" suffix on the standard and resource-collection names is load-bearing, not cosmetic. +# Extracting this custom resource out of datadog_integration_role.yaml means an in-place upgrade +# from a release that still had the inline trigger (<= v4.13) removes that old trigger, whose +# Delete handler deletes the standard/resource-collection policies by their un-suffixed names. +# CloudFormation can run that delete after this nested stack has re-attached them, which would +# leave the role without those permissions. The v2 names dodge that: the old handler only ever +# touches the old-named policies. Instrumentation needs no such suffix — that feature is +# unreleased, so no field stack carries instrumentation policies for the old handler to collide +# with, and the inline trigger is gone for good from v4.14 onward. POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicyV2" BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions-v2" -BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions-v2" -# Un-suffixed names created by the pre-extraction inline trigger (<= v4.13). We clean these up -# ourselves before attaching the v2 policies so the two generations never sit attached at the -# same time (IAM caps managed policies per role, default 10); the old trigger's own Delete -# handler then no-ops against names that are already gone. +BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" +# Un-suffixed standard/resource-collection names created by the pre-extraction inline trigger +# (<= v4.13). We clean these up ourselves before attaching the v2 policies so the two generations +# never sit attached at once (IAM caps managed policies per role, default 10); the old trigger's +# own Delete handler then no-ops against names that are already gone. LEGACY_POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" LEGACY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" -LEGACY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" @@ -191,12 +193,9 @@ def attach_instrumentation_permissions(iam_client, role_name, account_id, partit ) return - # Only now that the fetch has succeeded do we remove the existing instrumentation policies — - # both this template's v2 names and any legacy un-suffixed ones from a pre-extraction upgrade. - # Deferring until here is what keeps a transient fetch failure above from leaving the role - # with no instrumentation permissions. + # Only now that the fetch has succeeded do we remove the existing instrumentation policies, so + # a transient fetch failure above leaves the previously-attached policies in place. cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_INSTRUMENTATION) for i, chunk in enumerate(permission_chunks): policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" try: diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py index 06338f3b..26e5827f 100644 --- a/aws_quickstart/attach_integration_permissions_test.py +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -27,7 +27,6 @@ BASE_POLICY_PREFIX_RESOURCE_COLLECTION, LEGACY_POLICY_NAME_STANDARD, LEGACY_PREFIX_RESOURCE_COLLECTION, - LEGACY_PREFIX_INSTRUMENTATION, ) @@ -147,21 +146,11 @@ def test_happy_path_attaches_each_chunk(self, mock_urlopen): sent_request = mock_urlopen.call_args[0][0] self.assertEqual(sent_request.headers.get("Dd-aws-api-call-source"), "cfn-quickstart") - @patch("attach_integration_permissions.urllib.request.urlopen") - def test_cleans_legacy_instrumentation_after_successful_fetch(self, mock_urlopen): - # Upgrade case: once the v2 permissions are fetched, the legacy un-suffixed instrumentation - # policies are removed (so they don't linger), but only after the fetch succeeds. - mock_urlopen.return_value = self._mock_chunks_response([["ec2:Describe*"]]) - self._attach(["aws:ec2:instance"]) - detached = [c.kwargs["PolicyArn"] for c in self.iam.detach_role_policy.call_args_list] - self.assertTrue(any(LEGACY_PREFIX_INSTRUMENTATION + "-" + self.role_name in a for a in detached)) - @patch("attach_integration_permissions.urllib.request.urlopen") def test_fetch_failure_preserves_existing_policies(self, mock_urlopen): # Regression: a transient API failure on Update must not silently revoke the - # previously-attached instrumentation policies — including the legacy un-suffixed - # ones during an upgrade, since their cleanup is deferred until after a successful - # fetch. The function must neither call detach_role_policy / delete_policy nor raise. + # previously-attached instrumentation policies. The function must neither + # call detach_role_policy / delete_policy nor raise. mock_urlopen.side_effect = HTTPError( "u", 500, "boom", {}, BytesIO(b'{"errors":["upstream down"]}') ) @@ -264,10 +253,10 @@ def test_cleans_legacy_resource_collection_and_standard(self): ) def test_does_not_touch_instrumentation(self): - # Legacy instrumentation cleanup is deferred to attach_instrumentation_permissions (post-fetch). + # Base cleanup only handles standard/resource-collection; instrumentation is managed separately. cleanup_legacy_base_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=3) arns = self._detached_arns() - self.assertTrue(all(LEGACY_PREFIX_INSTRUMENTATION + "-MyRole" not in a for a in arns)) + self.assertTrue(all("instrumentation" not in a for a in arns)) class TestManageBasePermissions(unittest.TestCase): @@ -378,21 +367,24 @@ def test_create_reports_failed_when_instrumentation_raises( class TestUpgradeSafePolicyNames(unittest.TestCase): - # Guards the invariant that makes in-place upgrades from the inline-trigger era safe: the names - # this template manages must be disjoint from the names the legacy (<= v4.13) Delete handler - # removes, so the old handler can never wipe a policy this stack just attached. + # Guards the invariant that makes in-place upgrades from the inline-trigger era safe: the standard + # and resource-collection names this template manages must be disjoint from the names the legacy + # (<= v4.13) Delete handler removes, so the old handler can never wipe a policy this stack just + # attached. Instrumentation is intentionally excluded — that feature is unreleased, so no field + # stack has legacy instrumentation policies, and it keeps the un-suffixed name. role = "DatadogIntegrationRole" def test_standard_policy_name_differs_from_legacy(self): self.assertNotEqual(POLICY_NAME_STANDARD, LEGACY_POLICY_NAME_STANDARD) - def test_managed_policy_names_disjoint_from_legacy(self): + def test_resource_collection_names_disjoint_from_legacy(self): def names(prefix): return {f"{prefix}-{self.role}-{i+1}" for i in range(10)} - new_names = names(BASE_POLICY_PREFIX_RESOURCE_COLLECTION) | names(BASE_POLICY_PREFIX_INSTRUMENTATION) - legacy_names = names(LEGACY_PREFIX_RESOURCE_COLLECTION) | names(LEGACY_PREFIX_INSTRUMENTATION) - self.assertEqual(new_names & legacy_names, set()) + self.assertEqual( + names(BASE_POLICY_PREFIX_RESOURCE_COLLECTION) & names(LEGACY_PREFIX_RESOURCE_COLLECTION), + set(), + ) if __name__ == "__main__": diff --git a/aws_quickstart/datadog_integration_permissions.yaml b/aws_quickstart/datadog_integration_permissions.yaml index 11b5e14c..20932849 100644 --- a/aws_quickstart/datadog_integration_permissions.yaml +++ b/aws_quickstart/datadog_integration_permissions.yaml @@ -108,22 +108,24 @@ Resources: LOGGER = logging.getLogger() LOGGER.setLevel(logging.INFO) API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" - # The "-v2" generation suffix is load-bearing, not cosmetic. Extracting this custom resource - # out of datadog_integration_role.yaml means an in-place upgrade from a release that still had - # the inline trigger (<= v4.13) removes that old trigger, whose Delete handler deletes policies - # by the un-suffixed names. CloudFormation can run that delete after this nested stack has - # re-attached them, which would leave the role with no permissions. Using distinct names here - # means the old handler only ever cleans up the old-named policies and never touches these. + # The "-v2" suffix on the standard and resource-collection names is load-bearing, not cosmetic. + # Extracting this custom resource out of datadog_integration_role.yaml means an in-place upgrade + # from a release that still had the inline trigger (<= v4.13) removes that old trigger, whose + # Delete handler deletes the standard/resource-collection policies by their un-suffixed names. + # CloudFormation can run that delete after this nested stack has re-attached them, which would + # leave the role without those permissions. The v2 names dodge that: the old handler only ever + # touches the old-named policies. Instrumentation needs no such suffix — that feature is + # unreleased, so no field stack carries instrumentation policies for the old handler to collide + # with, and the inline trigger is gone for good from v4.14 onward. POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicyV2" BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions-v2" - BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions-v2" - # Un-suffixed names created by the pre-extraction inline trigger (<= v4.13). We clean these up - # ourselves before attaching the v2 policies so the two generations never sit attached at the - # same time (IAM caps managed policies per role, default 10); the old trigger's own Delete - # handler then no-ops against names that are already gone. + BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" + # Un-suffixed standard/resource-collection names created by the pre-extraction inline trigger + # (<= v4.13). We clean these up ourselves before attaching the v2 policies so the two generations + # never sit attached at once (IAM caps managed policies per role, default 10); the old trigger's + # own Delete handler then no-ops against names that are already gone. LEGACY_POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" LEGACY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" - LEGACY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" @@ -289,12 +291,9 @@ Resources: ) return - # Only now that the fetch has succeeded do we remove the existing instrumentation policies — - # both this template's v2 names and any legacy un-suffixed ones from a pre-extraction upgrade. - # Deferring until here is what keeps a transient fetch failure above from leaving the role - # with no instrumentation permissions. + # Only now that the fetch has succeeded do we remove the existing instrumentation policies, so + # a transient fetch failure above leaves the previously-attached policies in place. cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_INSTRUMENTATION) for i, chunk in enumerate(permission_chunks): policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" try: From e439e0d858cc41f3f5e5ce96cdba473b7d71c07d Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Tue, 9 Jun 2026 11:27:16 -0400 Subject: [PATCH 07/11] Fix stale comment on cleanup_legacy_base_policies The docstring still described the deferred legacy-instrumentation cleanup that was reverted, which no longer happens. Trim it to what the function actually does. Co-Authored-By: Claude Opus 4.8 (1M context) --- aws_quickstart/attach_integration_permissions.py | 11 ++++------- aws_quickstart/datadog_integration_permissions.yaml | 11 ++++------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index 1bcf52ec..b3798e4e 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -113,13 +113,10 @@ def cleanup_instrumentation_policies(iam_client, role_name, account_id, partitio def cleanup_legacy_base_policies(iam_client, role_name, account_id, partition, max_policies=10): - # Remove the un-suffixed standard + resource-collection policies left by the inline trigger of - # releases that predate this nested template, so they are gone before the v2 standard/RC - # policies are attached (keeps both generations from piling up against the IAM managed-policy - # limit during an in-place upgrade). Only the role-creation path calls this; the add-on must - # not touch the standard/RC policies the role stack owns. The legacy *instrumentation* policies - # are handled in attach_instrumentation_permissions instead, where cleanup is deferred until a - # successful fetch so a transient API failure can't leave the role with no instrumentation. + # Remove the un-suffixed standard + resource-collection policies left by the pre-extraction + # inline trigger before the v2 policies are attached, so the two generations don't pile up + # against the IAM managed-policy limit during an in-place upgrade. Only the role-creation path + # calls this; the add-on must not touch the policies the role stack owns. _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, max_policies) try: iam_client.delete_role_policy(RoleName=role_name, PolicyName=LEGACY_POLICY_NAME_STANDARD) diff --git a/aws_quickstart/datadog_integration_permissions.yaml b/aws_quickstart/datadog_integration_permissions.yaml index 20932849..bc7f7b38 100644 --- a/aws_quickstart/datadog_integration_permissions.yaml +++ b/aws_quickstart/datadog_integration_permissions.yaml @@ -211,13 +211,10 @@ Resources: def cleanup_legacy_base_policies(iam_client, role_name, account_id, partition, max_policies=10): - # Remove the un-suffixed standard + resource-collection policies left by the inline trigger of - # releases that predate this nested template, so they are gone before the v2 standard/RC - # policies are attached (keeps both generations from piling up against the IAM managed-policy - # limit during an in-place upgrade). Only the role-creation path calls this; the add-on must - # not touch the standard/RC policies the role stack owns. The legacy *instrumentation* policies - # are handled in attach_instrumentation_permissions instead, where cleanup is deferred until a - # successful fetch so a transient API failure can't leave the role with no instrumentation. + # Remove the un-suffixed standard + resource-collection policies left by the pre-extraction + # inline trigger before the v2 policies are attached, so the two generations don't pile up + # against the IAM managed-policy limit during an in-place upgrade. Only the role-creation path + # calls this; the add-on must not touch the policies the role stack owns. _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, max_policies) try: iam_client.delete_role_policy(RoleName=role_name, PolicyName=LEGACY_POLICY_NAME_STANDARD) From adc2628ef31e890763fc229227bf0e487bf309a7 Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Tue, 9 Jun 2026 12:05:37 -0400 Subject: [PATCH 08/11] Restore -v2 name on instrumentation policies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pre-extraction inline trigger (<= v4.13) deletes instrumentation policies by their un-suffixed prefix unconditionally on teardown, and that teardown runs whenever the old trigger is removed — including an in-place upgrade of a role stack off <= v4.13. So this sequence silently breaks: a customer installs the add-on against an existing v4.13 role (attaching instrumentation policies), then later upgrades that role stack to v4.14+; removing the old trigger deletes the add-on's instrumentation policies while the add-on stack stays green. Give instrumentation the same -v2 generation name as the standard/resource- collection policies so the old trigger's prefix-delete can't match them. The legacy-instrumentation *cleanup* stays removed — that's still moot (no field stack carries legacy instrumentation policies), only the naming matters here. Co-Authored-By: Claude Opus 4.8 (1M context) --- aws_quickstart/CHANGELOG.md | 2 +- .../attach_integration_permissions.py | 28 ++++++++++--------- .../attach_integration_permissions_test.py | 25 +++++++++++------ .../datadog_integration_permissions.yaml | 28 ++++++++++--------- 4 files changed, 48 insertions(+), 35 deletions(-) diff --git a/aws_quickstart/CHANGELOG.md b/aws_quickstart/CHANGELOG.md index 16338464..a2f9ebe7 100644 --- a/aws_quickstart/CHANGELOG.md +++ b/aws_quickstart/CHANGELOG.md @@ -1,6 +1,6 @@ # 4.14.0 (June 9, 2026) -- Add `main_agent_installation.yaml`, a standalone add-on template that lets customers who declined the Agent installation option during initial AWS integration setup enable it later against an existing integration role. It attaches the instrumentation IAM permissions and deploys the EventBridge forwarding pipeline without touching the standard or resource-collection policies owned by the role stack. To keep this DRY, the permission-attachment custom resource was extracted out of `datadog_integration_role.yaml` into a new nested `datadog_integration_permissions.yaml` (gated by a `ManageBasePermissions` flag) that both the role template and the add-on consume. The standard and resource-collection policies the custom resource attaches were given a `-v2` generation name (e.g. `datadog-aws-integration-resource-collection-permissions-v2--N`, `DatadogAWSIntegrationPolicyV2`) so that an in-place upgrade from a prior version (whose now-removed inline trigger deletes the un-suffixed policy names on teardown) cannot wipe the policies the new nested stack re-attaches; existing roles see a one-time, gap-free policy replacement on upgrade, and the legacy-named policies are cleaned up before the v2 ones are attached so they don't pile up against the IAM managed-policy limit. Behavior of `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml` is otherwise unchanged. Affects `datadog_integration_role.yaml`, `datadog_integration_permissions.yaml`, `main_agent_installation.yaml`, and `release.sh`. +- Add `main_agent_installation.yaml`, a standalone add-on template that lets customers who declined the Agent installation option during initial AWS integration setup enable it later against an existing integration role. It attaches the instrumentation IAM permissions and deploys the EventBridge forwarding pipeline without touching the standard or resource-collection policies owned by the role stack. To keep this DRY, the permission-attachment custom resource was extracted out of `datadog_integration_role.yaml` into a new nested `datadog_integration_permissions.yaml` (gated by a `ManageBasePermissions` flag) that both the role template and the add-on consume. The policies the custom resource attaches were given a `-v2` generation name (e.g. `datadog-aws-integration-resource-collection-permissions-v2--N`, `DatadogAWSIntegrationPolicyV2`) so that the prior version's now-removed inline trigger — which deletes the un-suffixed policy names by prefix whenever it is torn down, i.e. on any role-stack upgrade off the old version — cannot wipe the policies this stack attaches. For the standard/resource-collection policies the legacy-named ones are also cleaned up before the v2 ones are attached, so existing roles get a one-time, gap-free replacement on upgrade without piling both generations against the IAM managed-policy limit; instrumentation policies (attached by the add-on) use `-v2` so a later upgrade of the target role's stack can't delete them. Behavior of `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml` is otherwise unchanged. Affects `datadog_integration_role.yaml`, `datadog_integration_permissions.yaml`, `main_agent_installation.yaml`, and `release.sh`. # 4.13.0 (May 29, 2026) diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index b3798e4e..ac4aa64e 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -10,22 +10,24 @@ LOGGER = logging.getLogger() LOGGER.setLevel(logging.INFO) API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" -# The "-v2" suffix on the standard and resource-collection names is load-bearing, not cosmetic. -# Extracting this custom resource out of datadog_integration_role.yaml means an in-place upgrade -# from a release that still had the inline trigger (<= v4.13) removes that old trigger, whose -# Delete handler deletes the standard/resource-collection policies by their un-suffixed names. -# CloudFormation can run that delete after this nested stack has re-attached them, which would -# leave the role without those permissions. The v2 names dodge that: the old handler only ever -# touches the old-named policies. Instrumentation needs no such suffix — that feature is -# unreleased, so no field stack carries instrumentation policies for the old handler to collide -# with, and the inline trigger is gone for good from v4.14 onward. +# The "-v2" suffix on these policy names is load-bearing, not cosmetic. The pre-extraction +# inline trigger (<= v4.13) deletes policies by their un-suffixed names on teardown, and that +# teardown runs whenever the old trigger is removed — i.e. when a role stack is upgraded off +# <= v4.13. Distinct v2 names ensure that destructive delete can never hit the policies this +# template attaches: +# - standard / resource-collection: an in-place role-stack upgrade removes the old trigger +# after this nested stack has re-attached them; v2 names keep them from being wiped. +# - instrumentation: the add-on attaches these against an existing role; if that role's stack +# is later upgraded off <= v4.13, the old trigger's unconditional instrumentation cleanup +# would wipe them unless they sit under a name it doesn't know. POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicyV2" BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions-v2" -BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" +BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions-v2" # Un-suffixed standard/resource-collection names created by the pre-extraction inline trigger -# (<= v4.13). We clean these up ourselves before attaching the v2 policies so the two generations -# never sit attached at once (IAM caps managed policies per role, default 10); the old trigger's -# own Delete handler then no-ops against names that are already gone. +# (<= v4.13). The role-creation path cleans these up before attaching the v2 policies so the two +# generations never sit attached at once (IAM caps managed policies per role, default 10); the +# old trigger's own Delete handler then no-ops against names that are already gone. Legacy +# instrumentation policies need no such cleanup — that feature is unreleased, so none exist. LEGACY_POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" LEGACY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py index 26e5827f..6a9b7597 100644 --- a/aws_quickstart/attach_integration_permissions_test.py +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -367,22 +367,31 @@ def test_create_reports_failed_when_instrumentation_raises( class TestUpgradeSafePolicyNames(unittest.TestCase): - # Guards the invariant that makes in-place upgrades from the inline-trigger era safe: the standard - # and resource-collection names this template manages must be disjoint from the names the legacy - # (<= v4.13) Delete handler removes, so the old handler can never wipe a policy this stack just - # attached. Instrumentation is intentionally excluded — that feature is unreleased, so no field - # stack has legacy instrumentation policies, and it keeps the un-suffixed name. + # Guards the invariant that makes the inline-trigger era safe: every policy name this template + # attaches must be disjoint from the un-suffixed names the legacy (<= v4.13) Delete handler removes, + # so the old handler can never wipe a policy this stack attached. This covers instrumentation too — + # the add-on attaches instrumentation policies against an existing role, and a later upgrade of that + # role's stack removes the old trigger, whose unconditional instrumentation cleanup would otherwise + # delete them. role = "DatadogIntegrationRole" + # Un-suffixed prefix the legacy trigger deletes instrumentation policies by. + LEGACY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" + + def _names(self, prefix): + return {f"{prefix}-{self.role}-{i+1}" for i in range(10)} def test_standard_policy_name_differs_from_legacy(self): self.assertNotEqual(POLICY_NAME_STANDARD, LEGACY_POLICY_NAME_STANDARD) def test_resource_collection_names_disjoint_from_legacy(self): - def names(prefix): - return {f"{prefix}-{self.role}-{i+1}" for i in range(10)} + self.assertEqual( + self._names(BASE_POLICY_PREFIX_RESOURCE_COLLECTION) & self._names(LEGACY_PREFIX_RESOURCE_COLLECTION), + set(), + ) + def test_instrumentation_names_disjoint_from_legacy(self): self.assertEqual( - names(BASE_POLICY_PREFIX_RESOURCE_COLLECTION) & names(LEGACY_PREFIX_RESOURCE_COLLECTION), + self._names(BASE_POLICY_PREFIX_INSTRUMENTATION) & self._names(self.LEGACY_PREFIX_INSTRUMENTATION), set(), ) diff --git a/aws_quickstart/datadog_integration_permissions.yaml b/aws_quickstart/datadog_integration_permissions.yaml index bc7f7b38..4affc019 100644 --- a/aws_quickstart/datadog_integration_permissions.yaml +++ b/aws_quickstart/datadog_integration_permissions.yaml @@ -108,22 +108,24 @@ Resources: LOGGER = logging.getLogger() LOGGER.setLevel(logging.INFO) API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" - # The "-v2" suffix on the standard and resource-collection names is load-bearing, not cosmetic. - # Extracting this custom resource out of datadog_integration_role.yaml means an in-place upgrade - # from a release that still had the inline trigger (<= v4.13) removes that old trigger, whose - # Delete handler deletes the standard/resource-collection policies by their un-suffixed names. - # CloudFormation can run that delete after this nested stack has re-attached them, which would - # leave the role without those permissions. The v2 names dodge that: the old handler only ever - # touches the old-named policies. Instrumentation needs no such suffix — that feature is - # unreleased, so no field stack carries instrumentation policies for the old handler to collide - # with, and the inline trigger is gone for good from v4.14 onward. + # The "-v2" suffix on these policy names is load-bearing, not cosmetic. The pre-extraction + # inline trigger (<= v4.13) deletes policies by their un-suffixed names on teardown, and that + # teardown runs whenever the old trigger is removed — i.e. when a role stack is upgraded off + # <= v4.13. Distinct v2 names ensure that destructive delete can never hit the policies this + # template attaches: + # - standard / resource-collection: an in-place role-stack upgrade removes the old trigger + # after this nested stack has re-attached them; v2 names keep them from being wiped. + # - instrumentation: the add-on attaches these against an existing role; if that role's stack + # is later upgraded off <= v4.13, the old trigger's unconditional instrumentation cleanup + # would wipe them unless they sit under a name it doesn't know. POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicyV2" BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions-v2" - BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" + BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions-v2" # Un-suffixed standard/resource-collection names created by the pre-extraction inline trigger - # (<= v4.13). We clean these up ourselves before attaching the v2 policies so the two generations - # never sit attached at once (IAM caps managed policies per role, default 10); the old trigger's - # own Delete handler then no-ops against names that are already gone. + # (<= v4.13). The role-creation path cleans these up before attaching the v2 policies so the two + # generations never sit attached at once (IAM caps managed policies per role, default 10); the + # old trigger's own Delete handler then no-ops against names that are already gone. Legacy + # instrumentation policies need no such cleanup — that feature is unreleased, so none exist. LEGACY_POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" LEGACY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" From f65775feaf4bcc42d32c254a170a255fc7a0e48a Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Tue, 9 Jun 2026 12:24:21 -0400 Subject: [PATCH 09/11] Soften CHANGELOG "gap-free" wording The base-permission attach still deletes the old policies before fetching their replacements (pre-existing behavior, left as-is), so the upgrade isn't strictly gap-free. Drop the overclaim. Co-Authored-By: Claude Opus 4.8 (1M context) --- aws_quickstart/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws_quickstart/CHANGELOG.md b/aws_quickstart/CHANGELOG.md index a2f9ebe7..9c78a5db 100644 --- a/aws_quickstart/CHANGELOG.md +++ b/aws_quickstart/CHANGELOG.md @@ -1,6 +1,6 @@ # 4.14.0 (June 9, 2026) -- Add `main_agent_installation.yaml`, a standalone add-on template that lets customers who declined the Agent installation option during initial AWS integration setup enable it later against an existing integration role. It attaches the instrumentation IAM permissions and deploys the EventBridge forwarding pipeline without touching the standard or resource-collection policies owned by the role stack. To keep this DRY, the permission-attachment custom resource was extracted out of `datadog_integration_role.yaml` into a new nested `datadog_integration_permissions.yaml` (gated by a `ManageBasePermissions` flag) that both the role template and the add-on consume. The policies the custom resource attaches were given a `-v2` generation name (e.g. `datadog-aws-integration-resource-collection-permissions-v2--N`, `DatadogAWSIntegrationPolicyV2`) so that the prior version's now-removed inline trigger — which deletes the un-suffixed policy names by prefix whenever it is torn down, i.e. on any role-stack upgrade off the old version — cannot wipe the policies this stack attaches. For the standard/resource-collection policies the legacy-named ones are also cleaned up before the v2 ones are attached, so existing roles get a one-time, gap-free replacement on upgrade without piling both generations against the IAM managed-policy limit; instrumentation policies (attached by the add-on) use `-v2` so a later upgrade of the target role's stack can't delete them. Behavior of `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml` is otherwise unchanged. Affects `datadog_integration_role.yaml`, `datadog_integration_permissions.yaml`, `main_agent_installation.yaml`, and `release.sh`. +- Add `main_agent_installation.yaml`, a standalone add-on template that lets customers who declined the Agent installation option during initial AWS integration setup enable it later against an existing integration role. It attaches the instrumentation IAM permissions and deploys the EventBridge forwarding pipeline without touching the standard or resource-collection policies owned by the role stack. To keep this DRY, the permission-attachment custom resource was extracted out of `datadog_integration_role.yaml` into a new nested `datadog_integration_permissions.yaml` (gated by a `ManageBasePermissions` flag) that both the role template and the add-on consume. The policies the custom resource attaches were given a `-v2` generation name (e.g. `datadog-aws-integration-resource-collection-permissions-v2--N`, `DatadogAWSIntegrationPolicyV2`) so that the prior version's now-removed inline trigger — which deletes the un-suffixed policy names by prefix whenever it is torn down, i.e. on any role-stack upgrade off the old version — cannot wipe the policies this stack attaches. For the standard/resource-collection policies the legacy-named ones are also cleaned up before the v2 ones are attached, so existing roles get a one-time replacement on upgrade without piling both generations against the IAM managed-policy limit; instrumentation policies (attached by the add-on) use `-v2` so a later upgrade of the target role's stack can't delete them. Behavior of `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml` is otherwise unchanged. Affects `datadog_integration_role.yaml`, `datadog_integration_permissions.yaml`, `main_agent_installation.yaml`, and `release.sh`. # 4.13.0 (May 29, 2026) From 1665955afd7b1340bc94bb69e92d2b774aa490be Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Tue, 9 Jun 2026 12:34:27 -0400 Subject: [PATCH 10/11] Simplify: share base-cleanup helper and test IAM-mock setup - cleanup_existing_policies and cleanup_legacy_base_policies were the same body parameterized by v2-vs-legacy constants; both now delegate to a private _cleanup_base_policies, removing the duplicated try/except. Public names kept (handle_delete and the tests use them). - Drop a comment that restated the function's own fetch-before-cleanup note. - Collapse the four near-identical test setUp IAM mocks and the duplicated detached-arns list-comp into make_iam_mock()/detached_arns() helpers. No behavior change. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../attach_integration_permissions.py | 23 ++++----- .../attach_integration_permissions_test.py | 49 +++++++++---------- .../datadog_integration_permissions.yaml | 23 ++++----- 3 files changed, 41 insertions(+), 54 deletions(-) diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index ac4aa64e..7102f889 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -99,15 +99,18 @@ def _cleanup_chunked_policies(iam_client, role_name, account_id, partition, pref _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) -def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, max_policies) - +def _cleanup_base_policies(iam_client, role_name, account_id, partition, rc_prefix, standard_name, max_policies=10): + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, rc_prefix, max_policies) try: - iam_client.delete_role_policy(RoleName=role_name, PolicyName=POLICY_NAME_STANDARD) + iam_client.delete_role_policy(RoleName=role_name, PolicyName=standard_name) except iam_client.exceptions.NoSuchEntityException: pass except Exception as e: - LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}") + LOGGER.error(f"Error deleting inline policy {standard_name}: {str(e)}") + + +def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): + _cleanup_base_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, POLICY_NAME_STANDARD, max_policies) def cleanup_instrumentation_policies(iam_client, role_name, account_id, partition, max_policies=10): @@ -119,13 +122,7 @@ def cleanup_legacy_base_policies(iam_client, role_name, account_id, partition, m # inline trigger before the v2 policies are attached, so the two generations don't pile up # against the IAM managed-policy limit during an in-place upgrade. Only the role-creation path # calls this; the add-on must not touch the policies the role stack owns. - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, max_policies) - try: - iam_client.delete_role_policy(RoleName=role_name, PolicyName=LEGACY_POLICY_NAME_STANDARD) - except iam_client.exceptions.NoSuchEntityException: - pass - except Exception as e: - LOGGER.error(f"Error deleting legacy inline policy {LEGACY_POLICY_NAME_STANDARD}: {str(e)}") + _cleanup_base_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, LEGACY_POLICY_NAME_STANDARD, max_policies) def attach_standard_permissions(iam_client, role_name): @@ -192,8 +189,6 @@ def attach_instrumentation_permissions(iam_client, role_name, account_id, partit ) return - # Only now that the fetch has succeeded do we remove the existing instrumentation policies, so - # a transient fetch failure above leaves the previously-attached policies in place. cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) for i, chunk in enumerate(permission_chunks): policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py index 6a9b7597..191ef671 100644 --- a/aws_quickstart/attach_integration_permissions_test.py +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -30,6 +30,20 @@ ) +def make_iam_mock(cleanup_side_effects=True): + iam = MagicMock() + iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) + iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) + if cleanup_side_effects: + iam.detach_role_policy.side_effect = iam.exceptions.NoSuchEntityException + iam.delete_policy.side_effect = iam.exceptions.NoSuchEntityException + return iam + + +def detached_arns(iam): + return [c.kwargs["PolicyArn"] for c in iam.detach_role_policy.call_args_list] + + class TestParseResourceTypes(unittest.TestCase): def test_none(self): self.assertEqual(parse_resource_types(None), []) @@ -84,12 +98,8 @@ def test_repeated_resource_type_and_chunked(self): class TestAttachInstrumentationPermissions(unittest.TestCase): def setUp(self): - self.iam = MagicMock() - self.iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) - self.iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) + self.iam = make_iam_mock() self.iam.create_policy.return_value = {"Policy": {"Arn": "arn:aws:iam::123:policy/X"}} - self.iam.detach_role_policy.side_effect = self.iam.exceptions.NoSuchEntityException - self.iam.delete_policy.side_effect = self.iam.exceptions.NoSuchEntityException self.role_name = "DatadogIntegrationRole" self.account_id = "123456789012" self.partition = "aws" @@ -204,23 +214,19 @@ def test_fail_on_error_raises_on_attach_failure(self, mock_urlopen): class TestCleanup(unittest.TestCase): def setUp(self): - self.iam = MagicMock() - self.iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) - self.iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) - self.iam.detach_role_policy.side_effect = self.iam.exceptions.NoSuchEntityException - self.iam.delete_policy.side_effect = self.iam.exceptions.NoSuchEntityException + self.iam = make_iam_mock() def test_cleanup_existing_does_not_touch_instrumentation(self): cleanup_existing_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=2) - detached = [c.kwargs["PolicyArn"] for c in self.iam.detach_role_policy.call_args_list] + detached = detached_arns(self.iam) self.assertTrue(all(BASE_POLICY_PREFIX_INSTRUMENTATION not in arn for arn in detached)) self.assertTrue(any(BASE_POLICY_PREFIX_RESOURCE_COLLECTION in arn for arn in detached)) def test_cleanup_instrumentation_targets_only_instrumentation_prefix(self): cleanup_instrumentation_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=2) - detached = [c.kwargs["PolicyArn"] for c in self.iam.detach_role_policy.call_args_list] + detached = detached_arns(self.iam) self.assertEqual(len(detached), 2) self.assertTrue(all(BASE_POLICY_PREFIX_INSTRUMENTATION in arn for arn in detached)) @@ -229,24 +235,17 @@ class TestCleanupLegacyBasePolicies(unittest.TestCase): # Removing the old un-suffixed base policies before attaching the v2 ones is what keeps both # generations from sitting attached at once during an in-place upgrade (IAM managed-policy limit). def setUp(self): - self.iam = MagicMock() - self.iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) - self.iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) - self.iam.detach_role_policy.side_effect = self.iam.exceptions.NoSuchEntityException - self.iam.delete_policy.side_effect = self.iam.exceptions.NoSuchEntityException - - def _detached_arns(self): - return [c.kwargs["PolicyArn"] for c in self.iam.detach_role_policy.call_args_list] + self.iam = make_iam_mock() def test_only_targets_legacy_names_not_v2(self): cleanup_legacy_base_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=3) - for arn in self._detached_arns(): + for arn in detached_arns(self.iam): # Legacy managed-policy ARNs must never carry the -v2 generation segment. self.assertNotIn("-permissions-v2-", arn) def test_cleans_legacy_resource_collection_and_standard(self): cleanup_legacy_base_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=3) - arns = self._detached_arns() + arns = detached_arns(self.iam) self.assertTrue(any(LEGACY_PREFIX_RESOURCE_COLLECTION + "-MyRole" in a for a in arns)) self.iam.delete_role_policy.assert_called_once_with( RoleName="MyRole", PolicyName=LEGACY_POLICY_NAME_STANDARD @@ -255,7 +254,7 @@ def test_cleans_legacy_resource_collection_and_standard(self): def test_does_not_touch_instrumentation(self): # Base cleanup only handles standard/resource-collection; instrumentation is managed separately. cleanup_legacy_base_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=3) - arns = self._detached_arns() + arns = detached_arns(self.iam) self.assertTrue(all("instrumentation" not in a for a in arns)) @@ -265,9 +264,7 @@ class TestManageBasePermissions(unittest.TestCase): # the instrumentation policies and never touches the standard/resource-collection policies the # role stack owns. def setUp(self): - self.iam = MagicMock() - self.iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) - self.iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) + self.iam = make_iam_mock(cleanup_side_effects=False) def _props(self, **overrides): props = { diff --git a/aws_quickstart/datadog_integration_permissions.yaml b/aws_quickstart/datadog_integration_permissions.yaml index 4affc019..5bf11c80 100644 --- a/aws_quickstart/datadog_integration_permissions.yaml +++ b/aws_quickstart/datadog_integration_permissions.yaml @@ -197,15 +197,18 @@ Resources: _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) - def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, max_policies) - + def _cleanup_base_policies(iam_client, role_name, account_id, partition, rc_prefix, standard_name, max_policies=10): + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, rc_prefix, max_policies) try: - iam_client.delete_role_policy(RoleName=role_name, PolicyName=POLICY_NAME_STANDARD) + iam_client.delete_role_policy(RoleName=role_name, PolicyName=standard_name) except iam_client.exceptions.NoSuchEntityException: pass except Exception as e: - LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}") + LOGGER.error(f"Error deleting inline policy {standard_name}: {str(e)}") + + + def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): + _cleanup_base_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, POLICY_NAME_STANDARD, max_policies) def cleanup_instrumentation_policies(iam_client, role_name, account_id, partition, max_policies=10): @@ -217,13 +220,7 @@ Resources: # inline trigger before the v2 policies are attached, so the two generations don't pile up # against the IAM managed-policy limit during an in-place upgrade. Only the role-creation path # calls this; the add-on must not touch the policies the role stack owns. - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, max_policies) - try: - iam_client.delete_role_policy(RoleName=role_name, PolicyName=LEGACY_POLICY_NAME_STANDARD) - except iam_client.exceptions.NoSuchEntityException: - pass - except Exception as e: - LOGGER.error(f"Error deleting legacy inline policy {LEGACY_POLICY_NAME_STANDARD}: {str(e)}") + _cleanup_base_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, LEGACY_POLICY_NAME_STANDARD, max_policies) def attach_standard_permissions(iam_client, role_name): @@ -290,8 +287,6 @@ Resources: ) return - # Only now that the fetch has succeeded do we remove the existing instrumentation policies, so - # a transient fetch failure above leaves the previously-attached policies in place. cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) for i, chunk in enumerate(permission_chunks): policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" From e7ed7ac6b6a48a6c2639ce7c2f21d29d414831ef Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Thu, 11 Jun 2026 09:31:55 -0400 Subject: [PATCH 11/11] Condense the 4.14.0 changelog entry Drop the internal implementation detail (policy-name generations, flags, IAM-limit mechanics) and keep a short, customer-facing note in line with recent entries. Co-Authored-By: Claude Opus 4.8 (1M context) --- aws_quickstart/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws_quickstart/CHANGELOG.md b/aws_quickstart/CHANGELOG.md index 9c78a5db..c2499359 100644 --- a/aws_quickstart/CHANGELOG.md +++ b/aws_quickstart/CHANGELOG.md @@ -1,6 +1,6 @@ # 4.14.0 (June 9, 2026) -- Add `main_agent_installation.yaml`, a standalone add-on template that lets customers who declined the Agent installation option during initial AWS integration setup enable it later against an existing integration role. It attaches the instrumentation IAM permissions and deploys the EventBridge forwarding pipeline without touching the standard or resource-collection policies owned by the role stack. To keep this DRY, the permission-attachment custom resource was extracted out of `datadog_integration_role.yaml` into a new nested `datadog_integration_permissions.yaml` (gated by a `ManageBasePermissions` flag) that both the role template and the add-on consume. The policies the custom resource attaches were given a `-v2` generation name (e.g. `datadog-aws-integration-resource-collection-permissions-v2--N`, `DatadogAWSIntegrationPolicyV2`) so that the prior version's now-removed inline trigger — which deletes the un-suffixed policy names by prefix whenever it is torn down, i.e. on any role-stack upgrade off the old version — cannot wipe the policies this stack attaches. For the standard/resource-collection policies the legacy-named ones are also cleaned up before the v2 ones are attached, so existing roles get a one-time replacement on upgrade without piling both generations against the IAM managed-policy limit; instrumentation policies (attached by the add-on) use `-v2` so a later upgrade of the target role's stack can't delete them. Behavior of `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml` is otherwise unchanged. Affects `datadog_integration_role.yaml`, `datadog_integration_permissions.yaml`, `main_agent_installation.yaml`, and `release.sh`. +- Add `main_agent_installation.yaml`, a standalone template that enables Datadog Agent installation against an existing AWS integration role. Customers who skipped Agent installation during initial setup can deploy it later without recreating the integration. # 4.13.0 (May 29, 2026)