diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 0191c36d..18382cf5 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -21,3 +21,7 @@ jobs: run: | cd aws_quickstart python -B -S -m unittest datadog_agentless_api_call_test.py -v + - name: Run integration permissions unit tests + run: | + cd aws_quickstart + python -B -S -m unittest attach_integration_permissions_test.py -v diff --git a/aws_quickstart/CHANGELOG.md b/aws_quickstart/CHANGELOG.md index c5b31e76..c2499359 100644 --- a/aws_quickstart/CHANGELOG.md +++ b/aws_quickstart/CHANGELOG.md @@ -1,3 +1,7 @@ +# 4.14.0 (June 9, 2026) + +- Add `main_agent_installation.yaml`, a standalone template that enables Datadog Agent installation against an existing AWS integration role. Customers who skipped Agent installation during initial setup can deploy it later without recreating the integration. + # 4.13.0 (May 29, 2026) - Add `uk1.datadoghq.com` site support. Affects `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml`. diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index f9ea665c..7102f889 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -10,9 +10,26 @@ LOGGER = logging.getLogger() LOGGER.setLevel(logging.INFO) API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" -POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" -BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" -BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" +# The "-v2" suffix on these policy names is load-bearing, not cosmetic. The pre-extraction +# inline trigger (<= v4.13) deletes policies by their un-suffixed names on teardown, and that +# teardown runs whenever the old trigger is removed — i.e. 
when a role stack is upgraded off +# <= v4.13. Distinct v2 names ensure that destructive delete can never hit the policies this +# template attaches: +# - standard / resource-collection: an in-place role-stack upgrade removes the old trigger +# after this nested stack has re-attached them; v2 names keep them from being wiped. +# - instrumentation: the add-on attaches these against an existing role; if that role's stack +# is later upgraded off <= v4.13, the old trigger's unconditional instrumentation cleanup +# would wipe them unless they sit under a name it doesn't know. +POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicyV2" +BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions-v2" +BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions-v2" +# Un-suffixed standard/resource-collection names created by the pre-extraction inline trigger +# (<= v4.13). The role-creation path cleans these up before attaching the v2 policies so the two +# generations never sit attached at once (IAM caps managed policies per role, default 10); the +# old trigger's own Delete handler then no-ops against names that are already gone. Legacy +# instrumentation policies need no such cleanup — that feature is unreleased, so none exist. 
+LEGACY_POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" +LEGACY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" @@ -82,21 +99,32 @@ def _cleanup_chunked_policies(iam_client, role_name, account_id, partition, pref _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) -def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, max_policies) - +def _cleanup_base_policies(iam_client, role_name, account_id, partition, rc_prefix, standard_name, max_policies=10): + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, rc_prefix, max_policies) try: - iam_client.delete_role_policy(RoleName=role_name, PolicyName=POLICY_NAME_STANDARD) + iam_client.delete_role_policy(RoleName=role_name, PolicyName=standard_name) except iam_client.exceptions.NoSuchEntityException: pass except Exception as e: - LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}") + LOGGER.error(f"Error deleting inline policy {standard_name}: {str(e)}") + + +def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): + _cleanup_base_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, POLICY_NAME_STANDARD, max_policies) def cleanup_instrumentation_policies(iam_client, role_name, account_id, partition, max_policies=10): _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_INSTRUMENTATION, max_policies) +def 
cleanup_legacy_base_policies(iam_client, role_name, account_id, partition, max_policies=10): + # Remove the un-suffixed standard + resource-collection policies left by the pre-extraction + # inline trigger before the v2 policies are attached, so the two generations don't pile up + # against the IAM managed-policy limit during an in-place upgrade. Only the role-creation path + # calls this; the add-on must not touch the policies the role stack owns. + _cleanup_base_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, LEGACY_POLICY_NAME_STANDARD, max_policies) + + def attach_standard_permissions(iam_client, role_name): permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) policy_document = { @@ -134,9 +162,11 @@ def attach_resource_collection_permissions(iam_client, role_name): ) -def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types): - # Best-effort: instrumentation permissions are additive convenience on top of the - # integration, so any failure here is logged and swallowed rather than blocking install. +def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types, fail_on_error=False): + # Best-effort by default: instrumentation permissions are additive convenience on top of the + # integration, so any failure is logged and swallowed rather than blocking install. The + # post-setup add-on passes fail_on_error=True because attaching these policies is the stack's + # whole purpose, so a silent SUCCESS that attached nothing would be worse than a visible failure. # Fetch before cleanup so that a transient API failure on an Update leaves the # previously-attached policies in place instead of silently revoking them. 
if not resource_types: @@ -151,6 +181,8 @@ def attach_instrumentation_permissions(iam_client, role_name, account_id, partit LOGGER.info(f"Fetching instrumentation permissions for {resource_types} from {url}") permission_chunks = fetch_permissions_from_datadog(url) except Exception as e: + if fail_on_error: + raise LOGGER.warning( f"Failed to fetch instrumentation permissions for {resource_types}: {e}. " "Leaving any previously-attached instrumentation policies in place." @@ -163,6 +195,8 @@ def attach_instrumentation_permissions(iam_client, role_name, account_id, partit try: _create_and_attach_policy(iam_client, role_name, policy_name, chunk) except Exception as e: + if fail_on_error: + raise LOGGER.warning(f"Failed to create/attach instrumentation policy {policy_name}: {e}. Continuing.") @@ -171,9 +205,11 @@ def handle_delete(event, context): role_name = props['DatadogIntegrationRole'] account_id = props['AccountId'] partition = props.get('Partition', 'aws') + manage_base_permissions = str(props.get('ManageBasePermissions', 'true')).lower() == 'true' iam_client = boto3.client('iam') try: - cleanup_existing_policies(iam_client, role_name, account_id, partition) + if manage_base_permissions: + cleanup_existing_policies(iam_client, role_name, account_id, partition) cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: @@ -186,6 +222,8 @@ def handle_create_update(event, context): role_name = props['DatadogIntegrationRole'] account_id = props['AccountId'] partition = props.get('Partition', 'aws') + manage_base_permissions = str(props.get('ManageBasePermissions', 'true')).lower() == 'true' + fail_on_instrumentation_error = str(props.get('FailOnInstrumentationError', 'false')).lower() == 'true' should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true' datadog_site = props.get('DatadogSite') or 'datadoghq.com' 
instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes')) @@ -195,13 +233,16 @@ def handle_create_update(event, context): try: iam_client = boto3.client('iam') - cleanup_existing_policies(iam_client, role_name, account_id, partition) - attach_standard_permissions(iam_client, role_name) - if should_install_security_audit_policy: - attach_resource_collection_permissions(iam_client, role_name) + if manage_base_permissions: + cleanup_legacy_base_policies(iam_client, role_name, account_id, partition) + cleanup_existing_policies(iam_client, role_name, account_id, partition) + attach_standard_permissions(iam_client, role_name) + if should_install_security_audit_policy: + attach_resource_collection_permissions(iam_client, role_name) attach_instrumentation_permissions( iam_client, role_name, account_id, partition, datadog_site, instrumentation_resource_types, previous_instrumentation_resource_types, + fail_on_error=fail_on_instrumentation_error, ) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py index aca49602..191ef671 100644 --- a/aws_quickstart/attach_integration_permissions_test.py +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -19,11 +19,31 @@ attach_instrumentation_permissions, cleanup_existing_policies, cleanup_instrumentation_policies, + cleanup_legacy_base_policies, + handle_create_update, + handle_delete, + POLICY_NAME_STANDARD, BASE_POLICY_PREFIX_INSTRUMENTATION, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, + LEGACY_POLICY_NAME_STANDARD, + LEGACY_PREFIX_RESOURCE_COLLECTION, ) +def make_iam_mock(cleanup_side_effects=True): + iam = MagicMock() + iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) + iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) + if cleanup_side_effects: + iam.detach_role_policy.side_effect = 
iam.exceptions.NoSuchEntityException + iam.delete_policy.side_effect = iam.exceptions.NoSuchEntityException + return iam + + +def detached_arns(iam): + return [c.kwargs["PolicyArn"] for c in iam.detach_role_policy.call_args_list] + + class TestParseResourceTypes(unittest.TestCase): def test_none(self): self.assertEqual(parse_resource_types(None), []) @@ -78,12 +98,8 @@ def test_repeated_resource_type_and_chunked(self): class TestAttachInstrumentationPermissions(unittest.TestCase): def setUp(self): - self.iam = MagicMock() - self.iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) - self.iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) + self.iam = make_iam_mock() self.iam.create_policy.return_value = {"Policy": {"Arn": "arn:aws:iam::123:policy/X"}} - self.iam.detach_role_policy.side_effect = self.iam.exceptions.NoSuchEntityException - self.iam.delete_policy.side_effect = self.iam.exceptions.NoSuchEntityException self.role_name = "DatadogIntegrationRole" self.account_id = "123456789012" self.partition = "aws" @@ -172,29 +188,210 @@ def test_per_chunk_failure_is_swallowed_and_others_continue(self, mock_urlopen): self.assertEqual(self.iam.create_policy.call_count, 3) self.assertEqual(self.iam.attach_role_policy.call_count, 2) + @patch("attach_integration_permissions.urllib.request.urlopen") + def test_fail_on_error_raises_on_fetch_failure(self, mock_urlopen): + # Add-on mode (fail_on_error=True): a fetch failure must propagate so the stack fails + # instead of silently reporting SUCCESS with nothing attached. 
+ mock_urlopen.side_effect = HTTPError( + "u", 500, "boom", {}, BytesIO(b'{"errors":["upstream down"]}') + ) + with self.assertRaises(Exception): + attach_instrumentation_permissions( + self.iam, self.role_name, self.account_id, self.partition, self.site, + ["aws:ec2:instance"], (), fail_on_error=True, + ) + + @patch("attach_integration_permissions.urllib.request.urlopen") + def test_fail_on_error_raises_on_attach_failure(self, mock_urlopen): + mock_urlopen.return_value = self._mock_chunks_response([["chunk1:Action"]]) + self.iam.create_policy.side_effect = Exception("AccessDenied") + with self.assertRaises(Exception): + attach_instrumentation_permissions( + self.iam, self.role_name, self.account_id, self.partition, self.site, + ["aws:ec2:instance"], (), fail_on_error=True, + ) + class TestCleanup(unittest.TestCase): def setUp(self): - self.iam = MagicMock() - self.iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) - self.iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) - self.iam.detach_role_policy.side_effect = self.iam.exceptions.NoSuchEntityException - self.iam.delete_policy.side_effect = self.iam.exceptions.NoSuchEntityException + self.iam = make_iam_mock() def test_cleanup_existing_does_not_touch_instrumentation(self): cleanup_existing_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=2) - detached = [c.kwargs["PolicyArn"] for c in self.iam.detach_role_policy.call_args_list] + detached = detached_arns(self.iam) self.assertTrue(all(BASE_POLICY_PREFIX_INSTRUMENTATION not in arn for arn in detached)) self.assertTrue(any(BASE_POLICY_PREFIX_RESOURCE_COLLECTION in arn for arn in detached)) def test_cleanup_instrumentation_targets_only_instrumentation_prefix(self): cleanup_instrumentation_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=2) - detached = [c.kwargs["PolicyArn"] for c in self.iam.detach_role_policy.call_args_list] + detached = detached_arns(self.iam) self.assertEqual(len(detached), 
2) self.assertTrue(all(BASE_POLICY_PREFIX_INSTRUMENTATION in arn for arn in detached)) +class TestCleanupLegacyBasePolicies(unittest.TestCase): + # Removing the old un-suffixed base policies before attaching the v2 ones is what keeps both + # generations from sitting attached at once during an in-place upgrade (IAM managed-policy limit). + def setUp(self): + self.iam = make_iam_mock() + + def test_only_targets_legacy_names_not_v2(self): + cleanup_legacy_base_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=3) + for arn in detached_arns(self.iam): + # Legacy managed-policy ARNs must never carry the -v2 generation segment. + self.assertNotIn("-permissions-v2-", arn) + + def test_cleans_legacy_resource_collection_and_standard(self): + cleanup_legacy_base_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=3) + arns = detached_arns(self.iam) + self.assertTrue(any(LEGACY_PREFIX_RESOURCE_COLLECTION + "-MyRole" in a for a in arns)) + self.iam.delete_role_policy.assert_called_once_with( + RoleName="MyRole", PolicyName=LEGACY_POLICY_NAME_STANDARD + ) + + def test_does_not_touch_instrumentation(self): + # Base cleanup only handles standard/resource-collection; instrumentation is managed separately. + cleanup_legacy_base_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=3) + arns = detached_arns(self.iam) + self.assertTrue(all("instrumentation" not in a for a in arns)) + + +class TestManageBasePermissions(unittest.TestCase): + # ManageBasePermissions gates the standard + resource-collection policies. The role-creation + # path sets it true (manage everything); the post-setup add-on sets it false so it manages only + # the instrumentation policies and never touches the standard/resource-collection policies the + # role stack owns. 
+ def setUp(self): + self.iam = make_iam_mock(cleanup_side_effects=False) + + def _props(self, **overrides): + props = { + "DatadogIntegrationRole": "DatadogIntegrationRole", + "AccountId": "123456789012", + "Partition": "aws", + "ResourceCollectionPermissions": "true", + "InstrumentationResourceTypes": "", + "DatadogSite": "datadoghq.com", + } + props.update(overrides) + return {"RequestType": "Create", "ResourceProperties": props} + + @patch("attach_integration_permissions.cleanup_legacy_base_policies") + @patch("attach_integration_permissions.boto3.client") + @patch("attach_integration_permissions.attach_instrumentation_permissions") + @patch("attach_integration_permissions.attach_resource_collection_permissions") + @patch("attach_integration_permissions.attach_standard_permissions") + @patch("attach_integration_permissions.cleanup_existing_policies") + def test_create_manage_base_true_attaches_base( + self, mock_cleanup, mock_standard, mock_rc, mock_instr, mock_client, mock_legacy + ): + mock_client.return_value = self.iam + handle_create_update(self._props(ManageBasePermissions="true"), None) + mock_cleanup.assert_called_once() + mock_standard.assert_called_once() + mock_rc.assert_called_once() + mock_instr.assert_called_once() + mock_legacy.assert_called_once() + + @patch("attach_integration_permissions.cleanup_legacy_base_policies") + @patch("attach_integration_permissions.boto3.client") + @patch("attach_integration_permissions.attach_instrumentation_permissions") + @patch("attach_integration_permissions.attach_resource_collection_permissions") + @patch("attach_integration_permissions.attach_standard_permissions") + @patch("attach_integration_permissions.cleanup_existing_policies") + def test_create_manage_base_false_only_instrumentation( + self, mock_cleanup, mock_standard, mock_rc, mock_instr, mock_client, mock_legacy + ): + mock_client.return_value = self.iam + handle_create_update(self._props(ManageBasePermissions="false"), None) + 
mock_cleanup.assert_not_called() + mock_standard.assert_not_called() + mock_rc.assert_not_called() + mock_instr.assert_called_once() + # Add-on mode must not touch the role stack's standard/resource-collection policies. + mock_legacy.assert_not_called() + + @patch("attach_integration_permissions.boto3.client") + @patch("attach_integration_permissions.cleanup_instrumentation_policies") + @patch("attach_integration_permissions.cleanup_existing_policies") + def test_delete_manage_base_false_only_instrumentation( + self, mock_cleanup_base, mock_cleanup_instr, mock_client + ): + mock_client.return_value = self.iam + event = self._props(ManageBasePermissions="false") + event["RequestType"] = "Delete" + handle_delete(event, None) + mock_cleanup_base.assert_not_called() + mock_cleanup_instr.assert_called_once() + + @patch("attach_integration_permissions.boto3.client") + @patch("attach_integration_permissions.cleanup_instrumentation_policies") + @patch("attach_integration_permissions.cleanup_existing_policies") + def test_delete_manage_base_true_cleans_both( + self, mock_cleanup_base, mock_cleanup_instr, mock_client + ): + mock_client.return_value = self.iam + event = self._props(ManageBasePermissions="true") + event["RequestType"] = "Delete" + handle_delete(event, None) + mock_cleanup_base.assert_called_once() + mock_cleanup_instr.assert_called_once() + + @patch("attach_integration_permissions.boto3.client") + @patch("attach_integration_permissions.attach_instrumentation_permissions") + def test_create_threads_fail_on_instrumentation_error(self, mock_instr, mock_client): + mock_client.return_value = self.iam + handle_create_update( + self._props(ManageBasePermissions="false", FailOnInstrumentationError="true"), None + ) + self.assertTrue(mock_instr.call_args.kwargs["fail_on_error"]) + + @patch("attach_integration_permissions.cfnresponse") + @patch("attach_integration_permissions.boto3.client") + @patch("attach_integration_permissions.attach_instrumentation_permissions") + 
def test_create_reports_failed_when_instrumentation_raises( + self, mock_instr, mock_client, mock_cfn + ): + # Add-on mode: a propagated instrumentation failure must surface as a FAILED response. + mock_client.return_value = self.iam + mock_instr.side_effect = Exception("AccessDenied") + handle_create_update( + self._props(ManageBasePermissions="false", FailOnInstrumentationError="true"), None + ) + self.assertEqual(mock_cfn.send.call_args.args[2], mock_cfn.FAILED) + + +class TestUpgradeSafePolicyNames(unittest.TestCase): + # Guards the invariant that makes the inline-trigger era safe: every policy name this template + # attaches must be disjoint from the un-suffixed names the legacy (<= v4.13) Delete handler removes, + # so the old handler can never wipe a policy this stack attached. This covers instrumentation too — + # the add-on attaches instrumentation policies against an existing role, and a later upgrade of that + # role's stack removes the old trigger, whose unconditional instrumentation cleanup would otherwise + # delete them. + role = "DatadogIntegrationRole" + # Un-suffixed prefix the legacy trigger deletes instrumentation policies by. 
+ LEGACY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" + + def _names(self, prefix): + return {f"{prefix}-{self.role}-{i+1}" for i in range(10)} + + def test_standard_policy_name_differs_from_legacy(self): + self.assertNotEqual(POLICY_NAME_STANDARD, LEGACY_POLICY_NAME_STANDARD) + + def test_resource_collection_names_disjoint_from_legacy(self): + self.assertEqual( + self._names(BASE_POLICY_PREFIX_RESOURCE_COLLECTION) & self._names(LEGACY_PREFIX_RESOURCE_COLLECTION), + set(), + ) + + def test_instrumentation_names_disjoint_from_legacy(self): + self.assertEqual( + self._names(BASE_POLICY_PREFIX_INSTRUMENTATION) & self._names(self.LEGACY_PREFIX_INSTRUMENTATION), + set(), + ) + + if __name__ == "__main__": unittest.main() diff --git a/aws_quickstart/datadog_integration_permissions.yaml b/aws_quickstart/datadog_integration_permissions.yaml new file mode 100644 index 00000000..5bf11c80 --- /dev/null +++ b/aws_quickstart/datadog_integration_permissions.yaml @@ -0,0 +1,368 @@ +AWSTemplateFormatVersion: 2010-09-09 +Description: Datadog AWS Integration - attach IAM permissions to the integration role +Parameters: + IAMRoleName: + Description: Name of the IAM role to attach the Datadog integration permissions to. + Type: String + Default: DatadogIntegrationRole + ResourceCollectionPermissions: + Type: String + Default: false + AllowedValues: + - true + - false + Description: >- + Set this value to "true" to add permissions for Datadog to collect resource configuration data. + InstrumentationResourceTypes: + Type: String + Default: "" + Description: >- + Comma-separated list of AWS resource types (UDM form, e.g. aws:ec2:instance, aws:ecs:cluster, + aws:eks:cluster) that the Datadog integration role should be granted the IAM permissions + required to instrument with the Datadog Agent. Leave blank to skip. + DatadogSite: + Type: String + Default: "datadoghq.com" + Description: >- + Datadog site the integration is being installed against. 
Used to call the Datadog API that + returns the IAM permissions required to instrument the selected resource types. + ManageBasePermissions: + Type: String + Default: true + AllowedValues: + - true + - false + Description: >- + Set this value to "true" to manage the standard and resource-collection permissions on the role + (the role-creation case). Set it to "false" to manage only the instrumentation permissions and + leave the standard and resource-collection policies untouched (the post-setup add-on case). + FailOnInstrumentationError: + Type: String + Default: false + AllowedValues: + - true + - false + Description: >- + Set this value to "true" to fail the stack when the instrumentation permissions cannot be fetched + or attached. Defaults to "false" (best-effort) for the role-creation case, where instrumentation is + an optional add-on to the broader install. The post-setup add-on sets this to "true" because + attaching the instrumentation permissions is the stack's only purpose. +Resources: + DatadogAttachIntegrationPermissionsLambdaExecutionRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Principal: + Service: + - lambda.amazonaws.com + Action: + - sts:AssumeRole + Path: "/" + ManagedPolicyArns: + - !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" + Policies: + - PolicyName: !Sub "datadog-aws-integration-iam-permissions-${IAMRoleName}" + PolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - iam:CreatePolicy + - iam:DeletePolicy + - iam:DeleteRolePolicy + - iam:AttachRolePolicy + - iam:DetachRolePolicy + - iam:PutRolePolicy + Resource: + # Wildcards cover both the v2 names this template creates and the un-suffixed legacy + # names it cleans up on an in-place upgrade. 
+ - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${IAMRoleName} + - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-resource-collection-permissions-* + - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-instrumentation-permissions-* + - !Sub "arn:${AWS::Partition}:iam::aws:policy/SecurityAudit" + DatadogAttachIntegrationPermissionsFunction: + Type: AWS::Lambda::Function + Properties: + Description: "A function to attach Datadog AWS integration permissions to an IAM role." + Role: !GetAtt DatadogAttachIntegrationPermissionsLambdaExecutionRole.Arn + Handler: "index.handler" + LoggingConfig: + ApplicationLogLevel: "INFO" + LogFormat: "JSON" + Runtime: "python3.14" + Timeout: 300 + Code: + ZipFile: | + import json + import logging + from urllib.request import Request + import urllib.error + import urllib.parse + import urllib.request + import cfnresponse + import boto3 + + LOGGER = logging.getLogger() + LOGGER.setLevel(logging.INFO) + API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" + # The "-v2" suffix on these policy names is load-bearing, not cosmetic. The pre-extraction + # inline trigger (<= v4.13) deletes policies by their un-suffixed names on teardown, and that + # teardown runs whenever the old trigger is removed — i.e. when a role stack is upgraded off + # <= v4.13. Distinct v2 names ensure that destructive delete can never hit the policies this + # template attaches: + # - standard / resource-collection: an in-place role-stack upgrade removes the old trigger + # after this nested stack has re-attached them; v2 names keep them from being wiped. + # - instrumentation: the add-on attaches these against an existing role; if that role's stack + # is later upgraded off <= v4.13, the old trigger's unconditional instrumentation cleanup + # would wipe them unless they sit under a name it doesn't know. 
+ POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicyV2" + BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions-v2" + BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions-v2" + # Un-suffixed standard/resource-collection names created by the pre-extraction inline trigger + # (<= v4.13). The role-creation path cleans these up before attaching the v2 policies so the two + # generations never sit attached at once (IAM caps managed policies per role, default 10); the + # old trigger's own Delete handler then no-ops against names that are already gone. Legacy + # instrumentation policies need no such cleanup — that feature is unreleased, so none exist. + LEGACY_POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" + LEGACY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" + STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" + RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" + INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" + + + class DatadogAPIError(Exception): + pass + + + def fetch_permissions_from_datadog(api_url): + headers = { + "Dd-Aws-Api-Call-Source": API_CALL_SOURCE_HEADER_VALUE, + } + request = Request(api_url, headers=headers) + request.get_method = lambda: "GET" + + try: + response = urllib.request.urlopen(request) + except urllib.error.HTTPError as e: + error_body = json.loads(e.read()) + error_message = error_body.get('errors', ['Unknown error'])[0] + raise DatadogAPIError(f"Datadog API error: {error_message}") from e + + return json.loads(response.read())["data"]["attributes"]["permissions"] + + + def parse_resource_types(raw): + # CFN forwards CommaDelimitedList parameters as JSON arrays to custom resources, + # while String parameters arrive as 
comma-delimited strings; accept both. + if raw is None: + return [] + items = raw.split(",") if isinstance(raw, str) else list(raw) + return [t.strip() for t in items if t and t.strip()] + + + def build_instrumentation_permissions_url(datadog_site, resource_types): + query = urllib.parse.urlencode( + [("resource_type", t) for t in resource_types] + [("chunked", "true")] + ) + return f"https://api.{datadog_site}{INSTRUMENTATION_PERMISSIONS_API_PATH}?{query}" + + + def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name): + # Detach + delete are both no-ops if the entity is already gone, so callers can blindly + # iterate the policy-name space without first checking what actually exists. + try: + iam_client.detach_role_policy(RoleName=role_name, PolicyArn=policy_arn) + except iam_client.exceptions.NoSuchEntityException: + pass + except Exception as e: + LOGGER.error(f"Error detaching policy {policy_name}: {str(e)}") + + try: + iam_client.delete_policy(PolicyArn=policy_arn) + except iam_client.exceptions.NoSuchEntityException: + pass + except iam_client.exceptions.DeleteConflictException: + LOGGER.warning(f"Policy {policy_name} still attached, skipping delete") + except Exception as e: + LOGGER.error(f"Error deleting policy {policy_name}: {str(e)}") + + + def _cleanup_chunked_policies(iam_client, role_name, account_id, partition, prefix, max_policies=10): + for i in range(max_policies): + policy_name = f"{prefix}-{role_name}-{i+1}" + policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" + _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) + + + def _cleanup_base_policies(iam_client, role_name, account_id, partition, rc_prefix, standard_name, max_policies=10): + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, rc_prefix, max_policies) + try: + iam_client.delete_role_policy(RoleName=role_name, PolicyName=standard_name) + except iam_client.exceptions.NoSuchEntityException: + pass + except 
Exception as e: + LOGGER.error(f"Error deleting inline policy {standard_name}: {str(e)}") + + + def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): + _cleanup_base_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, POLICY_NAME_STANDARD, max_policies) + + + def cleanup_instrumentation_policies(iam_client, role_name, account_id, partition, max_policies=10): + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_INSTRUMENTATION, max_policies) + + + def cleanup_legacy_base_policies(iam_client, role_name, account_id, partition, max_policies=10): + # Remove the un-suffixed standard + resource-collection policies left by the pre-extraction + # inline trigger before the v2 policies are attached, so the two generations don't pile up + # against the IAM managed-policy limit during an in-place upgrade. Only the role-creation path + # calls this; the add-on must not touch the policies the role stack owns. 
+ _cleanup_base_policies(iam_client, role_name, account_id, partition, LEGACY_PREFIX_RESOURCE_COLLECTION, LEGACY_POLICY_NAME_STANDARD, max_policies) + + + def attach_standard_permissions(iam_client, role_name): + permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) + policy_document = { + "Version": "2012-10-17", + "Statement": [{"Effect": "Allow", "Action": permissions, "Resource": "*"}], + } + iam_client.put_role_policy( + RoleName=role_name, + PolicyName=POLICY_NAME_STANDARD, + PolicyDocument=json.dumps(policy_document, separators=(',', ':')), + ) + + + def _create_and_attach_policy(iam_client, role_name, policy_name, actions): + policy_json = json.dumps( + { + "Version": "2012-10-17", + "Statement": [{"Effect": "Allow", "Action": actions, "Resource": "*"}], + }, + separators=(',', ':'), + ) + LOGGER.info(f"Creating policy {policy_name} with {len(actions)} permissions ({len(policy_json)} characters)") + policy = iam_client.create_policy(PolicyName=policy_name, PolicyDocument=policy_json) + iam_client.attach_role_policy(RoleName=role_name, PolicyArn=policy['Policy']['Arn']) + + + def attach_resource_collection_permissions(iam_client, role_name): + permission_chunks = fetch_permissions_from_datadog(RESOURCE_COLLECTION_PERMISSIONS_API_URL) + for i, chunk in enumerate(permission_chunks): + _create_and_attach_policy( + iam_client, + role_name, + f"{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-{role_name}-{i+1}", + chunk, + ) + + + def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types, fail_on_error=False): + # Best-effort by default: instrumentation permissions are additive convenience on top of the + # integration, so any failure is logged and swallowed rather than blocking install. 
The + # post-setup add-on passes fail_on_error=True because attaching these policies is the stack's + # whole purpose, so a silent SUCCESS that attached nothing would be worse than a visible failure. + # Fetch before cleanup so that a transient API failure on an Update leaves the + # previously-attached policies in place instead of silently revoking them. + if not resource_types: + # Only clean up if the previous Update had instrumentation enabled — avoids running + # delete calls on stacks that never opted in to instrumentation in the first place. + if previous_resource_types: + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) + return + + try: + url = build_instrumentation_permissions_url(datadog_site, resource_types) + LOGGER.info(f"Fetching instrumentation permissions for {resource_types} from {url}") + permission_chunks = fetch_permissions_from_datadog(url) + except Exception as e: + if fail_on_error: + raise + LOGGER.warning( + f"Failed to fetch instrumentation permissions for {resource_types}: {e}. " + "Leaving any previously-attached instrumentation policies in place." + ) + return + + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) + for i, chunk in enumerate(permission_chunks): + policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" + try: + _create_and_attach_policy(iam_client, role_name, policy_name, chunk) + except Exception as e: + if fail_on_error: + raise + LOGGER.warning(f"Failed to create/attach instrumentation policy {policy_name}: {e}. 
Continuing.") + + + def handle_delete(event, context): + props = event['ResourceProperties'] + role_name = props['DatadogIntegrationRole'] + account_id = props['AccountId'] + partition = props.get('Partition', 'aws') + manage_base_permissions = str(props.get('ManageBasePermissions', 'true')).lower() == 'true' + iam_client = boto3.client('iam') + try: + if manage_base_permissions: + cleanup_existing_policies(iam_client, role_name, account_id, partition) + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) + cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) + except Exception as e: + LOGGER.error(f"Error deleting policy: {str(e)}") + cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) + + + def handle_create_update(event, context): + props = event['ResourceProperties'] + role_name = props['DatadogIntegrationRole'] + account_id = props['AccountId'] + partition = props.get('Partition', 'aws') + manage_base_permissions = str(props.get('ManageBasePermissions', 'true')).lower() == 'true' + fail_on_instrumentation_error = str(props.get('FailOnInstrumentationError', 'false')).lower() == 'true' + should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true' + datadog_site = props.get('DatadogSite') or 'datadoghq.com' + instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes')) + previous_instrumentation_resource_types = parse_resource_types( + event.get('OldResourceProperties', {}).get('InstrumentationResourceTypes') + ) + + try: + iam_client = boto3.client('iam') + if manage_base_permissions: + cleanup_legacy_base_policies(iam_client, role_name, account_id, partition) + cleanup_existing_policies(iam_client, role_name, account_id, partition) + attach_standard_permissions(iam_client, role_name) + if should_install_security_audit_policy: + attach_resource_collection_permissions(iam_client, role_name) + 
attach_instrumentation_permissions( + iam_client, role_name, account_id, partition, + datadog_site, instrumentation_resource_types, previous_instrumentation_resource_types, + fail_on_error=fail_on_instrumentation_error, + ) + cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) + except Exception as e: + LOGGER.error(f"Error creating/attaching policy: {str(e)}") + cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) + + + def handler(event, context): + LOGGER.info("Event received: %s", json.dumps(event)) + if event['RequestType'] == 'Delete': + handle_delete(event, context) + else: + handle_create_update(event, context) + DatadogAttachIntegrationPermissionsFunctionTrigger: + Type: Custom::DatadogAttachIntegrationPermissionsFunctionTrigger + Properties: + ServiceToken: !GetAtt DatadogAttachIntegrationPermissionsFunction.Arn + DatadogIntegrationRole: !Ref IAMRoleName + AccountId: !Ref AWS::AccountId + Partition: !Sub "${AWS::Partition}" + ResourceCollectionPermissions: !Ref ResourceCollectionPermissions + InstrumentationResourceTypes: !Ref InstrumentationResourceTypes + DatadogSite: !Ref DatadogSite + ManageBasePermissions: !Ref ManageBasePermissions + FailOnInstrumentationError: !Ref FailOnInstrumentationError diff --git a/aws_quickstart/datadog_integration_role.yaml b/aws_quickstart/datadog_integration_role.yaml index 8ebd27b4..0a3cf14e 100644 --- a/aws_quickstart/datadog_integration_role.yaml +++ b/aws_quickstart/datadog_integration_role.yaml @@ -74,280 +74,20 @@ Resources: [!Sub "arn:${AWS::Partition}:iam::aws:policy/SecurityAudit"], !Ref AWS::NoValue, ] - DatadogAttachIntegrationPermissionsLambdaExecutionRole: - Type: AWS::IAM::Role - Properties: - AssumeRolePolicyDocument: - Version: '2012-10-17' - Statement: - - Effect: Allow - Principal: - Service: - - lambda.amazonaws.com - Action: - - sts:AssumeRole - Path: "/" - ManagedPolicyArns: - - !Sub 
"arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" - Policies: - - PolicyName: !Sub "datadog-aws-integration-iam-permissions-${IAMRoleName}" - PolicyDocument: - Version: "2012-10-17" - Statement: - - Effect: Allow - Action: - - iam:CreatePolicy - - iam:DeletePolicy - - iam:DeleteRolePolicy - - iam:AttachRolePolicy - - iam:DetachRolePolicy - - iam:PutRolePolicy - Resource: - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${IAMRoleName} - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-resource-collection-permissions-* - - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-instrumentation-permissions-* - - !Sub "arn:${AWS::Partition}:iam::aws:policy/SecurityAudit" - DatadogAttachIntegrationPermissionsFunction: - Type: AWS::Lambda::Function - Properties: - Description: "A function to attach Datadog AWS integration permissions to an IAM role." - Role: !GetAtt DatadogAttachIntegrationPermissionsLambdaExecutionRole.Arn - Handler: "index.handler" - LoggingConfig: - ApplicationLogLevel: "INFO" - LogFormat: "JSON" - Runtime: "python3.14" - Timeout: 300 - Code: - ZipFile: | - import json - import logging - from urllib.request import Request - import urllib.error - import urllib.parse - import urllib.request - import cfnresponse - import boto3 - - LOGGER = logging.getLogger() - LOGGER.setLevel(logging.INFO) - API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" - POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" - BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" - BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" - STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" - RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" - 
INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" - - - class DatadogAPIError(Exception): - pass - - - def fetch_permissions_from_datadog(api_url): - headers = { - "Dd-Aws-Api-Call-Source": API_CALL_SOURCE_HEADER_VALUE, - } - request = Request(api_url, headers=headers) - request.get_method = lambda: "GET" - - try: - response = urllib.request.urlopen(request) - except urllib.error.HTTPError as e: - error_body = json.loads(e.read()) - error_message = error_body.get('errors', ['Unknown error'])[0] - raise DatadogAPIError(f"Datadog API error: {error_message}") from e - - return json.loads(response.read())["data"]["attributes"]["permissions"] - - - def parse_resource_types(raw): - # CFN forwards CommaDelimitedList parameters as JSON arrays to custom resources, - # while String parameters arrive as comma-delimited strings; accept both. - if raw is None: - return [] - items = raw.split(",") if isinstance(raw, str) else list(raw) - return [t.strip() for t in items if t and t.strip()] - - - def build_instrumentation_permissions_url(datadog_site, resource_types): - query = urllib.parse.urlencode( - [("resource_type", t) for t in resource_types] + [("chunked", "true")] - ) - return f"https://api.{datadog_site}{INSTRUMENTATION_PERMISSIONS_API_PATH}?{query}" - - - def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name): - # Detach + delete are both no-ops if the entity is already gone, so callers can blindly - # iterate the policy-name space without first checking what actually exists. 
- try: - iam_client.detach_role_policy(RoleName=role_name, PolicyArn=policy_arn) - except iam_client.exceptions.NoSuchEntityException: - pass - except Exception as e: - LOGGER.error(f"Error detaching policy {policy_name}: {str(e)}") - - try: - iam_client.delete_policy(PolicyArn=policy_arn) - except iam_client.exceptions.NoSuchEntityException: - pass - except iam_client.exceptions.DeleteConflictException: - LOGGER.warning(f"Policy {policy_name} still attached, skipping delete") - except Exception as e: - LOGGER.error(f"Error deleting policy {policy_name}: {str(e)}") - - - def _cleanup_chunked_policies(iam_client, role_name, account_id, partition, prefix, max_policies=10): - for i in range(max_policies): - policy_name = f"{prefix}-{role_name}-{i+1}" - policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" - _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) - - - def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, max_policies) - - try: - iam_client.delete_role_policy(RoleName=role_name, PolicyName=POLICY_NAME_STANDARD) - except iam_client.exceptions.NoSuchEntityException: - pass - except Exception as e: - LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}") - - - def cleanup_instrumentation_policies(iam_client, role_name, account_id, partition, max_policies=10): - _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_INSTRUMENTATION, max_policies) - - - def attach_standard_permissions(iam_client, role_name): - permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) - policy_document = { - "Version": "2012-10-17", - "Statement": [{"Effect": "Allow", "Action": permissions, "Resource": "*"}], - } - iam_client.put_role_policy( - RoleName=role_name, - PolicyName=POLICY_NAME_STANDARD, - 
PolicyDocument=json.dumps(policy_document, separators=(',', ':')), - ) - - - def _create_and_attach_policy(iam_client, role_name, policy_name, actions): - policy_json = json.dumps( - { - "Version": "2012-10-17", - "Statement": [{"Effect": "Allow", "Action": actions, "Resource": "*"}], - }, - separators=(',', ':'), - ) - LOGGER.info(f"Creating policy {policy_name} with {len(actions)} permissions ({len(policy_json)} characters)") - policy = iam_client.create_policy(PolicyName=policy_name, PolicyDocument=policy_json) - iam_client.attach_role_policy(RoleName=role_name, PolicyArn=policy['Policy']['Arn']) - - - def attach_resource_collection_permissions(iam_client, role_name): - permission_chunks = fetch_permissions_from_datadog(RESOURCE_COLLECTION_PERMISSIONS_API_URL) - for i, chunk in enumerate(permission_chunks): - _create_and_attach_policy( - iam_client, - role_name, - f"{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-{role_name}-{i+1}", - chunk, - ) - - - def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types): - # Best-effort: instrumentation permissions are additive convenience on top of the - # integration, so any failure here is logged and swallowed rather than blocking install. - # Fetch before cleanup so that a transient API failure on an Update leaves the - # previously-attached policies in place instead of silently revoking them. - if not resource_types: - # Only clean up if the previous Update had instrumentation enabled — avoids running - # delete calls on stacks that never opted in to instrumentation in the first place. 
- if previous_resource_types: - cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) - return - - try: - url = build_instrumentation_permissions_url(datadog_site, resource_types) - LOGGER.info(f"Fetching instrumentation permissions for {resource_types} from {url}") - permission_chunks = fetch_permissions_from_datadog(url) - except Exception as e: - LOGGER.warning( - f"Failed to fetch instrumentation permissions for {resource_types}: {e}. " - "Leaving any previously-attached instrumentation policies in place." - ) - return - - cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) - for i, chunk in enumerate(permission_chunks): - policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" - try: - _create_and_attach_policy(iam_client, role_name, policy_name, chunk) - except Exception as e: - LOGGER.warning(f"Failed to create/attach instrumentation policy {policy_name}: {e}. Continuing.") - - - def handle_delete(event, context): - props = event['ResourceProperties'] - role_name = props['DatadogIntegrationRole'] - account_id = props['AccountId'] - partition = props.get('Partition', 'aws') - iam_client = boto3.client('iam') - try: - cleanup_existing_policies(iam_client, role_name, account_id, partition) - cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) - cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) - except Exception as e: - LOGGER.error(f"Error deleting policy: {str(e)}") - cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) - - - def handle_create_update(event, context): - props = event['ResourceProperties'] - role_name = props['DatadogIntegrationRole'] - account_id = props['AccountId'] - partition = props.get('Partition', 'aws') - should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true' - datadog_site = props.get('DatadogSite') or 'datadoghq.com' - 
instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes')) - previous_instrumentation_resource_types = parse_resource_types( - event.get('OldResourceProperties', {}).get('InstrumentationResourceTypes') - ) - - try: - iam_client = boto3.client('iam') - cleanup_existing_policies(iam_client, role_name, account_id, partition) - attach_standard_permissions(iam_client, role_name) - if should_install_security_audit_policy: - attach_resource_collection_permissions(iam_client, role_name) - attach_instrumentation_permissions( - iam_client, role_name, account_id, partition, - datadog_site, instrumentation_resource_types, previous_instrumentation_resource_types, - ) - cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) - except Exception as e: - LOGGER.error(f"Error creating/attaching policy: {str(e)}") - cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) - - - def handler(event, context): - LOGGER.info("Event received: %s", json.dumps(event)) - if event['RequestType'] == 'Delete': - handle_delete(event, context) - else: - handle_create_update(event, context) - DatadogAttachIntegrationPermissionsFunctionTrigger: - Type: Custom::DatadogAttachIntegrationPermissionsFunctionTrigger + # Attaches the standard, resource-collection, and instrumentation IAM policies to the role + # above. Extracted into a nested template so the same custom resource can be reused by the + # post-setup Agent installation add-on (main_agent_installation.yaml). 
+ DatadogIntegrationPermissionsStack: + Type: AWS::CloudFormation::Stack DependsOn: DatadogIntegrationRole Properties: - ServiceToken: !GetAtt DatadogAttachIntegrationPermissionsFunction.Arn - DatadogIntegrationRole: !Ref IAMRoleName - AccountId: !Ref AWS::AccountId - Partition: !Sub "${AWS::Partition}" - ResourceCollectionPermissions: !Ref ResourceCollectionPermissions - InstrumentationResourceTypes: !Ref InstrumentationResourceTypes - DatadogSite: !Ref DatadogSite + TemplateURL: "https://<BUCKET_PLACEHOLDER>.s3.amazonaws.com/aws/<VERSION_PLACEHOLDER>/datadog_integration_permissions.yaml" + Parameters: + IAMRoleName: !Ref IAMRoleName + ResourceCollectionPermissions: !Ref ResourceCollectionPermissions + InstrumentationResourceTypes: !Ref InstrumentationResourceTypes + DatadogSite: !Ref DatadogSite + ManageBasePermissions: true Metadata: AWS::CloudFormation::Interface: ParameterGroups: diff --git a/aws_quickstart/main_agent_installation.yaml b/aws_quickstart/main_agent_installation.yaml new file mode 100644 index 00000000..9ea6a8eb --- /dev/null +++ b/aws_quickstart/main_agent_installation.yaml @@ -0,0 +1,115 @@ +# version: <VERSION_PLACEHOLDER> +# +# Post-setup Agent installation add-on. Lets customers who declined the Agent installation option +# during initial AWS integration setup enable it later, against an existing integration role: it +# attaches the instrumentation IAM permissions and deploys the EventBridge forwarding pipeline, +# without touching the standard or resource-collection policies owned by the role stack. +# +AWSTemplateFormatVersion: 2010-09-09 +Description: Datadog AWS Integration - Agent installation add-on +Parameters: + APIKey: + Description: >- + API key for the Datadog account (find at https://app.datadoghq.com/organization-settings/api-keys). + Type: String + NoEcho: true + APPKey: + Description: >- + APP key for the Datadog account (find at https://app.datadoghq.com/organization-settings/application-keys).
+ Type: String + NoEcho: true + DatadogSite: + Type: String + Default: datadoghq.com + Description: >- + Define your Datadog Site to send data to. + Allowed values: datadoghq.com, datadoghq.eu, us3.datadoghq.com, us5.datadoghq.com, + ap1.datadoghq.com, ap2.datadoghq.com, uk1.datadoghq.com, ddog-gov.com (GovCloud), us2.ddog-gov.com (GovCloud). + IAMRoleName: + Description: Name of the existing IAM role used by the Datadog AWS integration. + Type: String + Default: DatadogIntegrationRole + AccountId: + Type: String + Description: The AWS account ID of the account integrated in Datadog. + InstrumentationResourceTypes: + Type: CommaDelimitedList + Description: >- + Comma-separated list of AWS resource types (UDM form, e.g. aws:ec2:instance, aws:eks:cluster) to enable + Datadog Agent installation for. The integration role is granted the IAM permissions required to instrument + these resources. CloudTrail update events are forwarded to Datadog for the supported resource types + (currently aws:ec2:instance and aws:eks:cluster); other types receive IAM permissions but no event forwarding. +Rules: + ValidateAccountId: + Assertions: + - Assert: !Equals [!Ref AccountId, !Ref "AWS::AccountId"] + AssertDescription: "The AWS Account Id of the account integrated in Datadog does not match the AWS Account Id of the account where this stack will be created." +Conditions: + ShouldForwardEvents: + Fn::Not: + - Fn::Equals: + - !Join ["", !Ref InstrumentationResourceTypes] + - "" +Resources: + # Attaches only the instrumentation IAM policies to the existing integration role. ManageBasePermissions + # is false so the standard and resource-collection policies owned by the role stack are left untouched. 
+ DatadogIntegrationPermissionsStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: "https://<BUCKET_PLACEHOLDER>.s3.amazonaws.com/aws/<VERSION_PLACEHOLDER>/datadog_integration_permissions.yaml" + Parameters: + IAMRoleName: !Ref IAMRoleName + ResourceCollectionPermissions: false + InstrumentationResourceTypes: !Join [",", !Ref InstrumentationResourceTypes] + DatadogSite: !Ref DatadogSite + ManageBasePermissions: false + FailOnInstrumentationError: true + # EventBridge pipeline forwarding CloudTrail events to the Datadog resource update intake. + # Deployed only when at least one InstrumentationResourceTypes value is set; single-region + # (covers the region this stack is deployed in). + DatadogAgentResourceUpdateForwardingStack: + Type: AWS::CloudFormation::Stack + Condition: ShouldForwardEvents + Properties: + TemplateURL: "https://<BUCKET_PLACEHOLDER>.s3.amazonaws.com/aws/<VERSION_PLACEHOLDER>/datadog_agent_resource_update_forwarding.yaml" + Parameters: + APIKey: !Ref APIKey + APPKey: !Ref APPKey + DatadogSite: !Ref DatadogSite + InstrumentationResourceTypes: !Join [",", !Ref InstrumentationResourceTypes] +Outputs: + IAMRoleName: + Description: AWS IAM Role named to be used with the DataDog AWS Integration + Value: !Ref IAMRoleName + AccountId: + Description: AWS Account number + Value: !Ref "AWS::AccountId" + Region: + Description: AWS Region + Value: !Ref "AWS::Region" +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: Required + Parameters: + - APIKey + - APPKey + - DatadogSite + - AccountId + - InstrumentationResourceTypes + - Label: + default: Advanced + Parameters: + - IAMRoleName + ParameterLabels: + APIKey: + default: "DatadogApiKey *" + APPKey: + default: "DatadogAppKey *" + DatadogSite: + default: "DatadogSite *" + AccountId: + default: "AccountId *" + InstrumentationResourceTypes: + default: "InstrumentationResourceTypes *" diff --git a/aws_quickstart/release.sh b/aws_quickstart/release.sh index 3b8728f2..d26ac07a 100755 --- a/aws_quickstart/release.sh +++
b/aws_quickstart/release.sh @@ -116,7 +116,7 @@ cp datadog_agentless_api_call.py "${TEMP_DIR}/" cd "${TEMP_DIR}" # Update placeholder -for template in main_workflow.yaml main_extended_workflow.yaml main_v2.yaml main_extended.yaml; do +for template in main_workflow.yaml main_extended_workflow.yaml main_v2.yaml main_extended.yaml datadog_integration_role.yaml main_agent_installation.yaml; do perl -pi -e "s/<BUCKET_PLACEHOLDER>/${BUCKET}/g" $template perl -pi -e "s/<VERSION_PLACEHOLDER>/${VERSION}/g" $template done diff --git a/aws_quickstart/version.txt b/aws_quickstart/version.txt index c4475d31..cabad0ce 100644 --- a/aws_quickstart/version.txt +++ b/aws_quickstart/version.txt @@ -1 +1 @@ -v4.13.0 +v4.14.0