diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..c1969e5f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,9 @@ +.git +.github +.claude +venv +dist +frontend/node_modules +frontend/dist +node_modules +.DS_Store diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3bedf897..eda2bbaf 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -8,13 +8,41 @@ on: required: true type: string frontend_version: - required: false + required: true + type: string + ecs_version: + required: true type: string - default: "" lambda_matrix: required: false type: string default: "[]" + ecs_matrix: + required: false + type: string + default: "[]" + outputs: + code_bucket: + description: "Bucket containing build artifacts" + value: ${{ jobs.bucket.outputs.code_bucket_name }} + lambda_version: + description: "Resolved lambda version" + value: ${{ inputs.lambda_version }} + frontend_version: + description: "Resolved frontend version" + value: ${{ inputs.frontend_version }} + ecs_version: + description: "Resolved ECS version" + value: ${{ inputs.ecs_version }} + repository_url: + description: "ECR repository url" + value: ${{ jobs.ecr.outputs.repository_url }} + ecs_image_uris: + description: "List of full ECS image URIs built by this workflow" + value: ${{ jobs.containers.outputs.ecs_image_uris }} + lambda_s3_keys: + description: "List of lambda S3 object keys built by this workflow" + value: ${{ jobs.lambdas.outputs.lambda_s3_keys }} concurrency: # only run one instance of workflow at any one time group: build-${{ inputs.environment }} @@ -57,36 +85,84 @@ jobs: steps: - uses: actions/checkout@v6 - - name: Build frontend - uses: chrispsheehan/just-aws-oidc-action@0.3.0 - with: - just_action: frontend-build - - name: Upload frontend uses: chrispsheehan/just-aws-oidc-action@0.3.0 env: BUCKET_NAME: ${{ needs.bucket.outputs.code_bucket_name }} - VERSION: ${{ inputs.frontend_version != '' && inputs.frontend_version || 
inputs.lambda_version }} + VERSION: ${{ inputs.frontend_version }} with: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} - just_action: frontend-upload + just_action: frontend-build frontend-upload - lambdas: - needs: bucket + ecr: runs-on: ubuntu-latest + outputs: + repository_url: ${{ steps.get_repository_url.outputs.repository_url }} + steps: + - uses: actions/checkout@v6 + + - name: Get ECR infra + id: get-ecr + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/ecr + tg_action: init + + - name: Get ECR repository url + id: get_repository_url + env: + TG_OUTPUTS: ${{ steps.get-ecr.outputs.tg_outputs }} + run: | + echo "repository_url=$(echo $TG_OUTPUTS | jq -r '.repository_url.value')" >> $GITHUB_OUTPUT + + containers: + needs: ecr + runs-on: ubuntu-latest + outputs: + ecs_image_uris: ${{ steps.image_uris.outputs.ecs_image_uris }} strategy: fail-fast: true matrix: - value: ${{ fromJson(inputs.lambda_matrix) }} + value: ${{ fromJson(inputs.ecs_matrix) }} steps: - uses: actions/checkout@v6 - - name: "Build ${{ matrix.value }} Lambda" + - name: "Build ${{ matrix.value }} ECS image" uses: chrispsheehan/just-aws-oidc-action@0.3.0 env: - LAMBDA_NAME: ${{ matrix.value }} + CONTAINER_NAME: ${{ matrix.value }} + IMAGE_URI: ${{ needs.ecr.outputs.repository_url }}:${{ matrix.value }}-${{ inputs.ecs_version }} with: - just_action: lambda-build + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + just_action: docker-build docker-push + + - name: Build ECS image URI list + if: ${{ matrix.value == fromJson(inputs.ecs_matrix)[0] }} + id: image_uris + shell: bash + env: + REPOSITORY_URL: ${{ needs.ecr.outputs.repository_url }} + ECS_VERSION: ${{ inputs.ecs_version }} + ECS_MATRIX: ${{ inputs.ecs_matrix }} + run: | + echo "ecs_image_uris=$(jq -cn \ + --arg repo "$REPOSITORY_URL" \ + --arg version "$ECS_VERSION" \ + --argjson images "$ECS_MATRIX" \ + '$images | 
map("\($repo):\(.)-\($version)")')" >> "$GITHUB_OUTPUT" + + lambdas: + needs: bucket + runs-on: ubuntu-latest + outputs: + lambda_s3_keys: ${{ steps.lambda_s3_keys.outputs.lambda_s3_keys }} + strategy: + fail-fast: true + matrix: + value: ${{ fromJson(inputs.lambda_matrix) }} + steps: + - uses: actions/checkout@v6 - name: "Upload ${{ matrix.value }} Lambda" uses: chrispsheehan/just-aws-oidc-action@0.3.0 @@ -96,4 +172,17 @@ jobs: VERSION: ${{ inputs.lambda_version }} with: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} - just_action: lambda-upload + just_action: lambda-build lambda-upload + + - name: Build lambda S3 key list + if: ${{ matrix.value == fromJson(inputs.lambda_matrix)[0] }} + id: lambda_s3_keys + shell: bash + env: + LAMBDA_VERSION: ${{ inputs.lambda_version }} + LAMBDA_MATRIX: ${{ inputs.lambda_matrix }} + run: | + echo "lambda_s3_keys=$(jq -cn \ + --arg version "$LAMBDA_VERSION" \ + --argjson lambdas "$LAMBDA_MATRIX" \ + '$lambdas | map("lambdas/\($version)/\(.).zip")')" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/build_get.yml b/.github/workflows/build_get.yml index 25bd0017..d7515096 100644 --- a/.github/workflows/build_get.yml +++ b/.github/workflows/build_get.yml @@ -10,20 +10,34 @@ on: frontend_version: required: false type: string - default: "" + ecs_version: + required: false + type: string outputs: code_bucket: description: "Bucket containing build artifacts" value: ${{ jobs.bucket.outputs.code_bucket_name }} lambda_version: - description: "Valid lambda version" + description: "Resolved lambda version" value: ${{ inputs.lambda_version }} - lambda_version_files: - description: "List of lambda version files" - value: ${{ jobs.lambdas.outputs.lambda_version_files }} frontend_version: - description: "Valid frontend version" + description: "Resolved frontend version" value: ${{ inputs.frontend_version != '' && inputs.frontend_version || inputs.lambda_version }} + ecs_version: + description: "Resolved ECS version" + value: ${{ 
inputs.ecs_version != '' && inputs.ecs_version || inputs.lambda_version }} + ecs_image_uris: + description: "List of full ECS image URIs" + value: ${{ jobs.images.outputs.ecs_image_uris }} + ecs_task_matrix: + description: "List of ECS service names for the version" + value: ${{ jobs.images.outputs.ecs_task_matrix }} + lambda_version_files: + description: "List of lambda names" + value: ${{ jobs.lambdas.outputs.lambda_version_files }} + lambda_s3_keys: + description: "List of lambda S3 object keys" + value: ${{ jobs.lambdas.outputs.lambda_s3_keys }} concurrency: # only run one instance of workflow at any one time group: ${{ github.workflow }}-${{ inputs.environment }} @@ -59,7 +73,98 @@ jobs: env: TG_OUTPUTS: ${{ steps.code_action.outputs.tg_outputs }} run: | - echo "bucket=$(echo $TG_OUTPUTS | jq -r '.bucket.value')" >> $GITHUB_OUTPUT + echo "πŸ” Raw TG_OUTPUTS:" + echo "$TG_OUTPUTS" | jq . + + bucket=$(echo "$TG_OUTPUTS" | jq -r '.bucket.value // empty') + + if [ -z "$bucket" ] || [ "$bucket" = "null" ]; then + echo "::error title=Missing code bucket::Failed to extract '.bucket.value' from Terragrunt output for infra/live/${{ inputs.environment }}/aws/code_bucket" + echo "::error::Full TG_OUTPUTS:" + exit 1 + fi + + echo "bucket=$bucket" >> "$GITHUB_OUTPUT" + + ecr: + runs-on: ubuntu-latest + outputs: + repository_url: ${{ steps.get_repository_url.outputs.repository_url }} + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.lambda_version }} + + - name: Get ECR infra + id: get-ecr + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/ecr + tg_action: init + + - name: Get ECR repository url + id: get_repository_url + env: + TG_OUTPUTS: ${{ steps.get-ecr.outputs.tg_outputs }} + run: | + echo "πŸ” Raw TG_OUTPUTS:" + echo "$TG_OUTPUTS" | jq . 
+ + repository_url=$(echo "$TG_OUTPUTS" | jq -r '.repository_url.value // empty') + + if [ -z "$repository_url" ] || [ "$repository_url" = "null" ]; then + echo "::error title=Missing ECR repository URL::Failed to extract '.repository_url.value' from Terragrunt output for infra/live/${{ inputs.environment }}/aws/ecr" + echo "::error::Full TG_OUTPUTS:" + exit 1 + fi + + echo "repository_url=$repository_url" >> "$GITHUB_OUTPUT" + + images: + needs: ecr + runs-on: ubuntu-latest + outputs: + ecs_image_uris: ${{ steps.image_uris.outputs.ecs_image_uris }} + ecs_task_matrix: ${{ steps.task_matrix.outputs.just_outputs }} + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.lambda_version }} + + - name: Get ECR version images + id: get_version_images + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + env: + REPOSITORY_URL: ${{ needs.ecr.outputs.repository_url }} + VERSION: ${{ inputs.ecs_version != '' && inputs.ecs_version || inputs.lambda_version }} + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + just_action: get-ecr-version-images + + - name: Build full image URIs + id: image_uris + shell: bash + env: + REPOSITORY_URL: ${{ needs.ecr.outputs.repository_url }} + VERSION: ${{ inputs.ecs_version != '' && inputs.ecs_version || inputs.lambda_version }} + IMAGE_NAMES: ${{ steps.get_version_images.outputs.just_outputs }} + run: | + echo "ecs_image_uris=$(jq -cn \ + --arg repo "$REPOSITORY_URL" \ + --arg version "$VERSION" \ + --argjson images "$IMAGE_NAMES" \ + '$images | map("\($repo):\(.)-\($version)")')" >> "$GITHUB_OUTPUT" + + - name: Build ECS task matrix + id: task_matrix + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + env: + REPOSITORY_URL: ${{ needs.ecr.outputs.repository_url }} + VERSION: ${{ inputs.ecs_version != '' && inputs.ecs_version || inputs.lambda_version }} + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + just_action: get-ecr-version-tasks frontend: needs: bucket @@ -83,6 +188,7 @@ jobs: runs-on: ubuntu-latest outputs: 
lambda_version_files: ${{ steps.get_build_files.outputs.just_outputs }} + lambda_s3_keys: ${{ steps.get_build_file_keys.outputs.just_outputs }} steps: - uses: actions/checkout@v6 @@ -98,7 +204,7 @@ jobs: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} just_action: lambda-check-version - - name: Get build files + - name: Get lambda names id: get_build_files uses: chrispsheehan/just-aws-oidc-action@0.3.0 env: @@ -107,3 +213,13 @@ jobs: with: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} just_action: get-version-files + + - name: Get lambda S3 keys + id: get_build_file_keys + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + env: + BUCKET_NAME: ${{ needs.bucket.outputs.code_bucket_name }} + VERSION: ${{ inputs.lambda_version }} + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + just_action: get-version-file-keys diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 6b56e5f2..dcc64a27 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -5,27 +5,36 @@ on: description: environment reference i.e. 
'prod' or 'dev' required: true type: string + lambda_version: + description: "Valid lambda version" + required: true + type: string + frontend_version: + description: "Valid frontend version" + required: false + type: string + default: "" code_bucket: description: "Bucket containing lambda and frontend zips" required: true type: string - lambda_version: - description: "Valid lambda version" - required: true + ecs_image_uris: + description: "List of full ECS image URIs" + required: false type: string + default: "[]" lambda_matrix: required: false type: string default: "[]" + task_matrix: + required: false + type: string + default: "[]" lambda_keep: description: "Number of lambda versions to keep" default: '5' type: string - frontend_version: - description: "Valid frontend version" - required: false - type: string - default: "" concurrency: # only run one instance of workflow at any one time @@ -95,47 +104,20 @@ jobs: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} just_action: lambda-get-version - - name: Upload AppSpec bundle + - name: Run CodeDeploy uses: chrispsheehan/just-aws-oidc-action@0.3.0 env: + KEEP: ${{ inputs.lambda_keep }} BUCKET_NAME: ${{ inputs.code_bucket }} FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }} ALIAS_NAME: ${{ steps.get_infra_detail.outputs.lambda_alias_name }} CURRENT_VERSION: ${{ steps.get-version.outputs.just_outputs }} NEW_VERSION: ${{ steps.publish.outputs.just_outputs }} - APP_SPEC_FILE: ${{ github.workspace }}/appspec.yml - APP_SPEC_KEY: ${{ steps.appspec.outputs.lambda_appspec_key }} - with: - aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} - just_action: lambda-upload-bundle - - - name: Set Alarms to OK for CodeDeploy (if applicable) - uses: chrispsheehan/just-aws-oidc-action@0.3.0 - env: - FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }} - with: - aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} - just_action: lambda-set-code-deploy-alarms - - - name: Run CodeDeploy - uses: 
chrispsheehan/just-aws-oidc-action@0.3.0 - env: - FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }} - BUCKET_NAME: ${{ inputs.code_bucket }} + APP_SPEC_FILE: ${{ github.workspace }}/appspec-lambda.yml APP_SPEC_KEY: ${{ steps.appspec.outputs.lambda_appspec_key }} with: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} - just_action: lambda-deploy - - - name: Prune old lambda versions - uses: chrispsheehan/just-aws-oidc-action@0.3.0 - env: - KEEP: ${{ inputs.lambda_keep }} - FUNCTION_NAME: ${{ steps.get_infra_detail.outputs.lambda_function_name }} - ALIAS_NAME: ${{ steps.get_infra_detail.outputs.lambda_alias_name }} - with: - aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} - just_action: lambda-prune + just_action: lambda-upload-bundle lambda-set-code-deploy-alarms lambda-deploy lambda-prune frontend: runs-on: ubuntu-latest @@ -168,12 +150,136 @@ jobs: DISTRIBUTION_ID: ${{ steps.get_infra_detail.outputs.distribution_id }} with: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} - just_action: frontend-deploy + just_action: frontend-deploy frontend-invalidate - - name: Invalidate CloudFront cache + tasks: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + value: ${{ fromJson(inputs.task_matrix) }} + steps: + - uses: actions/checkout@v6 + + - name: Resolve image URIs + id: image_uris + env: + ECS_IMAGE_URIS: ${{ inputs.ecs_image_uris }} + TASK_NAME: ${{ matrix.value }} uses: chrispsheehan/just-aws-oidc-action@0.3.0 + with: + just_action: ecs-task-get-image-uris + + - name: Set image outputs + id: task_inputs env: - DISTRIBUTION_ID: ${{ steps.get_infra_detail.outputs.distribution_id }} + IMAGE_URIS_JSON: ${{ steps.image_uris.outputs.just_outputs }} + run: | + echo "service_image_uri=$(echo "$IMAGE_URIS_JSON" | jq -r '.service_image_uri')" >> "$GITHUB_OUTPUT" + echo "debug_image_uri=$(echo "$IMAGE_URIS_JSON" | jq -r '.debug_image_uri')" >> "$GITHUB_OUTPUT" + echo "otel_image_uri=$(echo "$IMAGE_URIS_JSON" | jq -r '.otel_image_uri')" >> 
"$GITHUB_OUTPUT" + + - name: Deploy ${{ matrix.value }} ECS task + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + env: + TF_VAR_image_uri: ${{ steps.task_inputs.outputs.service_image_uri }} + TF_VAR_debug_image_uri: ${{ steps.task_inputs.outputs.debug_image_uri }} + TF_VAR_aws_otel_collector_image_uri: ${{ steps.task_inputs.outputs.otel_image_uri }} + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/task_${{ matrix.value }} + + ecs: + runs-on: ubuntu-latest + needs: tasks + strategy: + fail-fast: false + matrix: + value: ${{ fromJson(inputs.task_matrix) }} + steps: + - uses: actions/checkout@v6 + + - name: Get ${{ matrix.value }} task infra + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + id: get-task-infra + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/task_${{ matrix.value }} + tg_action: init + + - name: Get ${{ matrix.value }} task outputs + id: get-task-outputs + env: + TG_OUTPUTS: ${{ steps.get-task-infra.outputs.tg_outputs }} + run: | + echo "task_definition_arn=$(echo "$TG_OUTPUTS" | jq -r '.task_definition_arn.value')" >> "$GITHUB_OUTPUT" + echo "container_name=$(echo "$TG_OUTPUTS" | jq -r '.service_name.value')" >> "$GITHUB_OUTPUT" + + - name: Get ${{ matrix.value }} service infra + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + id: get-service-infra + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/service_${{ matrix.value }} + tg_action: init + + - name: Get ${{ matrix.value }} service outputs + id: get-service-outputs + env: + SERVICE_OUTPUTS: ${{ steps.get-service-infra.outputs.tg_outputs }} + run: | + echo "service_name=$(echo "$SERVICE_OUTPUTS" | jq -r '.service_name.value')" >> "$GITHUB_OUTPUT" + echo "cluster_name=$(echo "$SERVICE_OUTPUTS" | jq -r '.cluster_name.value')" >> "$GITHUB_OUTPUT" + echo "container_port=$(echo 
"$SERVICE_OUTPUTS" | jq -r '.container_port.value')" >> "$GITHUB_OUTPUT" + echo "codedeploy_app_name=$(echo "$SERVICE_OUTPUTS" | jq -r '.codedeploy_app_name.value')" >> "$GITHUB_OUTPUT" + echo "codedeploy_group_name=$(echo "$SERVICE_OUTPUTS" | jq -r '.codedeploy_deployment_group_name.value')" >> "$GITHUB_OUTPUT" + echo "app_spec_key=ecs/${{ inputs.environment }}/${{ matrix.value }}-$(echo "${{ steps.get-task-outputs.outputs.task_definition_arn }}" | awk -F: '{print $NF}').yml" >> "$GITHUB_OUTPUT" + + - name: Work out ECS deployment mode + id: deploy_mode + env: + CODE_DEPLOY_APP_NAME: ${{ steps.get-service-outputs.outputs.codedeploy_app_name }} + CODE_DEPLOY_GROUP_NAME: ${{ steps.get-service-outputs.outputs.codedeploy_group_name }} + run: | + if [[ -n "$CODE_DEPLOY_APP_NAME" && "$CODE_DEPLOY_APP_NAME" != "null" && -n "$CODE_DEPLOY_GROUP_NAME" && "$CODE_DEPLOY_GROUP_NAME" != "null" ]]; then + echo "mode=codedeploy" >> "$GITHUB_OUTPUT" + else + echo "mode=rolling" >> "$GITHUB_OUTPUT" + fi + + - name: Upload ECS AppSpec bundle + if: ${{ steps.deploy_mode.outputs.mode == 'codedeploy' }} + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + env: + BUCKET_NAME: ${{ inputs.code_bucket }} + APP_SPEC_FILE: ${{ github.workspace }}/appspec-ecs.rendered.yml + APP_SPEC_KEY: ${{ steps.get-service-outputs.outputs.app_spec_key }} + TASK_DEFINITION_ARN: ${{ steps.get-task-outputs.outputs.task_definition_arn }} + CONTAINER_NAME: ${{ steps.get-task-outputs.outputs.container_name }} + CONTAINER_PORT: ${{ steps.get-service-outputs.outputs.container_port }} + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + just_action: ecs-upload-bundle + + - name: Run ECS CodeDeploy + if: ${{ steps.deploy_mode.outputs.mode == 'codedeploy' }} + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + env: + BUCKET_NAME: ${{ inputs.code_bucket }} + APP_SPEC_KEY: ${{ steps.get-service-outputs.outputs.app_spec_key }} + CODE_DEPLOY_APP_NAME: ${{ steps.get-service-outputs.outputs.codedeploy_app_name }} + 
CODE_DEPLOY_GROUP_NAME: ${{ steps.get-service-outputs.outputs.codedeploy_group_name }} + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + just_action: ecs-deploy + + - name: Run ECS rolling deploy + if: ${{ steps.deploy_mode.outputs.mode == 'rolling' }} + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + env: + CLUSTER_NAME: ${{ steps.get-service-outputs.outputs.cluster_name }} + SERVICE_NAME: ${{ steps.get-service-outputs.outputs.service_name }} + TASK_DEFINITION_ARN: ${{ steps.get-task-outputs.outputs.task_definition_arn }} with: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} - just_action: frontend-invalidate \ No newline at end of file + just_action: ecs-rolling-deploy diff --git a/.github/workflows/deploy_dev_code_only.yml b/.github/workflows/deploy_dev_code_only.yml index b2cb132b..f97501a6 100644 --- a/.github/workflows/deploy_dev_code_only.yml +++ b/.github/workflows/deploy_dev_code_only.yml @@ -3,32 +3,14 @@ name: Dev Deploy Code Only on: workflow_dispatch: - inputs: - version: - description: "Git ref to deploy" - required: false - default: "" permissions: id-token: write contents: write -env: - VERSION: ${{ inputs.version != '' && inputs.version || github.sha }} - jobs: setup: - runs-on: ubuntu-latest - outputs: - lambdas_dirs: ${{ steps.lambdas_dirs.outputs.just_outputs }} - steps: - - uses: actions/checkout@v6 - - - name: Get lambdas Directories - id: lambdas_dirs - uses: chrispsheehan/just-aws-oidc-action@0.3.0 - with: - just_action: lambda-get-directories + uses: ./.github/workflows/get_directories.yml build: uses: ./.github/workflows/build.yml @@ -36,23 +18,33 @@ jobs: - setup with: environment: dev - lambda_version: ${{ inputs.version != '' && inputs.version || github.sha }} - lambda_matrix: ${{ needs.setup.outputs.lambdas_dirs }} + lambda_version: ${{ github.sha }} + frontend_version: ${{ github.sha }} + ecs_version: ${{ github.sha }} + lambda_matrix: ${{ needs.setup.outputs.lambda_dirs }} + ecs_matrix: ${{ 
needs.setup.outputs.container_dirs }} + get_build: needs: build uses: ./.github/workflows/build_get.yml with: environment: dev - lambda_version: ${{ inputs.version != '' && inputs.version || github.sha }} + lambda_version: ${{ needs.build.outputs.lambda_version }} + frontend_version: ${{ needs.build.outputs.frontend_version }} + ecs_version: ${{ needs.build.outputs.ecs_version }} deploy: uses: ./.github/workflows/deploy.yml needs: + - setup + - build - get_build with: environment: dev + lambda_version: ${{ needs.build.outputs.lambda_version }} + frontend_version: ${{ needs.build.outputs.frontend_version }} code_bucket: ${{ needs.get_build.outputs.code_bucket }} - lambda_version: ${{ needs.get_build.outputs.lambda_version }} - lambda_matrix: ${{ needs.get_build.outputs.lambda_version_files }} - frontend_version: ${{ needs.get_build.outputs.frontend_version }} + lambda_matrix: ${{ needs.setup.outputs.lambda_dirs }} + task_matrix: ${{ needs.get_build.outputs.ecs_task_matrix }} + ecs_image_uris: ${{ needs.get_build.outputs.ecs_image_uris }} diff --git a/.github/workflows/deploy_dev_full.yml b/.github/workflows/deploy_dev_full.yml index 84ef6cf6..17020023 100644 --- a/.github/workflows/deploy_dev_full.yml +++ b/.github/workflows/deploy_dev_full.yml @@ -1,4 +1,4 @@ -name: Dev Deploy Infra and Code +name: Dev Deploy Full on: workflow_dispatch: @@ -9,17 +9,7 @@ permissions: jobs: setup: - runs-on: ubuntu-latest - outputs: - lambdas_dirs: ${{ steps.lambdas_dirs.outputs.just_outputs }} - steps: - - uses: actions/checkout@v6 - - - name: Get lambdas Directories - id: lambdas_dirs - uses: chrispsheehan/just-aws-oidc-action@0.3.0 - with: - just_action: lambda-get-directories + uses: ./.github/workflows/get_directories.yml code: uses: ./.github/workflows/infra_releases.yml @@ -29,14 +19,16 @@ jobs: infra: needs: - - code - setup + - code uses: ./.github/workflows/infra.yml with: environment: dev infra_version: ${{ github.sha }} code_bucket: ${{ needs.code.outputs.code_bucket }} - 
lambda_matrix: ${{ needs.setup.outputs.lambdas_dirs }} + lambda_matrix: ${{ needs.setup.outputs.lambda_dirs }} + bootstrap_image_uri: ${{ needs.code.outputs.bootstrap_image_uri }} + service_matrix: ${{ needs.setup.outputs.ecs_service_dirs }} build: uses: ./.github/workflows/build.yml @@ -46,7 +38,10 @@ jobs: with: environment: dev lambda_version: ${{ github.sha }} - lambda_matrix: ${{ needs.setup.outputs.lambdas_dirs }} + frontend_version: ${{ github.sha }} + ecs_version: ${{ github.sha }} + lambda_matrix: ${{ needs.setup.outputs.lambda_dirs }} + ecs_matrix: ${{ needs.setup.outputs.container_dirs }} get_build: needs: build @@ -54,15 +49,21 @@ jobs: with: environment: dev lambda_version: ${{ github.sha }} + frontend_version: ${{ github.sha }} + ecs_version: ${{ github.sha }} deploy: uses: ./.github/workflows/deploy.yml needs: + - setup + - build - get_build - infra with: environment: dev + lambda_version: ${{ needs.build.outputs.lambda_version }} + frontend_version: ${{ needs.build.outputs.frontend_version }} code_bucket: ${{ needs.get_build.outputs.code_bucket }} - lambda_version: ${{ needs.get_build.outputs.lambda_version }} - lambda_matrix: ${{ needs.get_build.outputs.lambda_version_files }} - frontend_version: ${{ needs.get_build.outputs.frontend_version }} + lambda_matrix: ${{ needs.setup.outputs.lambda_dirs }} + task_matrix: ${{ needs.get_build.outputs.ecs_task_matrix }} + ecs_image_uris: ${{ needs.get_build.outputs.ecs_image_uris }} diff --git a/.github/workflows/deploy_prod_code_only.yml b/.github/workflows/deploy_prod_code_only.yml index e31afdd6..b2262925 100644 --- a/.github/workflows/deploy_prod_code_only.yml +++ b/.github/workflows/deploy_prod_code_only.yml @@ -29,9 +29,12 @@ jobs: - get_build with: environment: prod - code_bucket: ${{ needs.get_build.outputs.code_bucket }} lambda_version: ${{ needs.get_build.outputs.lambda_version }} - lambda_matrix: ${{ needs.get_build.outputs.lambda_version_files }} frontend_version: ${{ 
needs.get_build.outputs.frontend_version }} + code_bucket: ${{ needs.get_build.outputs.code_bucket }} + lambda_matrix: ${{ needs.get_build.outputs.lambda_version_files }} + task_matrix: ${{ needs.get_build.outputs.ecs_task_matrix }} + ecs_image_uris: ${{ needs.get_build.outputs.ecs_image_uris }} # we can also define a code-only deployment here if needed, as below # lambda_matrix: '["api"]' + # task_matrix: '["worker"]' diff --git a/.github/workflows/deploy_prod_full.yml b/.github/workflows/deploy_prod_full.yml index 7aadc882..ff53cec5 100644 --- a/.github/workflows/deploy_prod_full.yml +++ b/.github/workflows/deploy_prod_full.yml @@ -1,4 +1,4 @@ -name: Prod Deploy Infra and Code +name: Prod Deploy Full on: workflow_dispatch: @@ -13,6 +13,7 @@ jobs: with: environment: ci lambda_version: 0.7.2 + frontend_version: 0.7.2 infra: needs: @@ -31,7 +32,7 @@ jobs: - infra # this is only to ensure infra runs before deploy no dependencies on infra outputs i.e. infra is managed separately with: environment: prod - code_bucket: ${{ needs.get_build.outputs.code_bucket }} lambda_version: ${{ needs.get_build.outputs.lambda_version }} - lambda_matrix: ${{ needs.get_build.outputs.lambda_version_files }} frontend_version: ${{ needs.get_build.outputs.frontend_version }} + code_bucket: ${{ needs.get_build.outputs.code_bucket }} + lambda_matrix: ${{ needs.get_build.outputs.lambda_version_files }} diff --git a/.github/workflows/destroy.yml b/.github/workflows/destroy.yml index 3defe623..6d5b0854 100644 --- a/.github/workflows/destroy.yml +++ b/.github/workflows/destroy.yml @@ -25,60 +25,116 @@ env: jobs: setup: + uses: ./.github/workflows/get_directories.yml + + lambdas: runs-on: ubuntu-latest - outputs: - lambda_dirs: ${{ steps.lambda_dirs.outputs.just_outputs }} + needs: setup + strategy: + fail-fast: false + matrix: + value: ${{ fromJson(needs.setup.outputs.lambda_dirs) }} steps: - uses: actions/checkout@v6 - - uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + - name: Deploy ${{ 
matrix.value }} infra + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 with: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} - tg_directory: infra/live/${{ inputs.environment }}/aws/oidc + tg_directory: infra/live/${{ inputs.environment }}/aws/${{ matrix.value }} + tg_action: destroy - - name: Get Lambda Directories - id: lambda_dirs - uses: chrispsheehan/just-aws-oidc-action@0.3.0 + frontend: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Destroy frontend infra + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + env: + TF_VAR_api_invoke_url: "https://placeholder.execute-api.us-east-1.amazonaws.com" with: - just_action: lambda-get-directories + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/frontend + tg_action: destroy - lambdas: - needs: setup + services: runs-on: ubuntu-latest + needs: setup strategy: fail-fast: false matrix: - value: ${{ fromJson(needs.setup.outputs.lambda_dirs) }} + value: ${{ fromJson(needs.setup.outputs.ecs_service_dirs) }} steps: - uses: actions/checkout@v6 - - name: Deploy ${{ matrix.value }} infra + - name: Destroy ${{ matrix.value }} infra uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + env: + TF_VAR_bootstrap: "true" + TF_VAR_bootstrap_image_uri: "destroy-placeholder" with: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} tg_directory: infra/live/${{ inputs.environment }}/aws/${{ matrix.value }} tg_action: destroy - frontend: - needs: lambdas + tasks: runs-on: ubuntu-latest + needs: + - setup + - services + strategy: + fail-fast: false + matrix: + value: ${{ fromJson(needs.setup.outputs.task_dirs) }} steps: - uses: actions/checkout@v6 - - name: Destroy frontend infra + - name: Destroy ${{ matrix.value }} infra uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 env: - TF_VAR_api_invoke_url: "https://placeholder.execute-api.us-east-1.amazonaws.com" + TF_VAR_image_uri: "destroy-placeholder" + TF_VAR_debug_image_uri: 
"destroy-placeholder" + TF_VAR_aws_otel_collector_image_uri: "destroy-placeholder" with: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} - tg_directory: infra/live/${{ inputs.environment }}/aws/frontend + tg_directory: infra/live/${{ inputs.environment }}/aws/${{ matrix.value }} + tg_action: destroy + + network: + needs: + - frontend + - services + - tasks + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Destroy network infra + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/network tg_action: destroy - build: + security: + needs: + - network + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Destroy security infra + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/security + tg_action: destroy + + build-bucket: if: inputs.environment != 'prod' needs: - lambdas - - frontend runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 @@ -89,3 +145,32 @@ jobs: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} tg_directory: infra/live/${{ inputs.environment }}/aws/code_bucket tg_action: destroy + + ecr: + if: inputs.environment != 'prod' + needs: + - network + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Destroy code + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/ecr + tg_action: destroy + + cluster: + needs: + - network + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Destroy cluster infra + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/cluster + tg_action: destroy diff --git 
a/.github/workflows/get_changes.yml b/.github/workflows/get_changes.yml index d6b24bb9..66fc93e8 100644 --- a/.github/workflows/get_changes.yml +++ b/.github/workflows/get_changes.yml @@ -21,6 +21,9 @@ on: lambdas: description: "Whether lambdas files changed" value: ${{ jobs.changes.outputs.lambdas == 'true' }} + containers: + description: "Whether container files changed" + value: ${{ jobs.changes.outputs.containers == 'true' }} frontend: description: "Whether frontend files changed" value: ${{ jobs.changes.outputs.frontend == 'true' }} @@ -34,6 +37,7 @@ jobs: github: ${{ steps.filter.outputs.github }} frontend: ${{ steps.filter.outputs.frontend }} lambdas: ${{ steps.filter.outputs.lambdas }} + containers: ${{ steps.filter.outputs.containers }} steps: - name: Checkout repository uses: actions/checkout@v6 @@ -54,6 +58,8 @@ jobs: - '.github/**' lambdas: - 'lambdas/**' + containers: + - 'containers/**' frontend: - 'frontend/**' token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/get_directories.yml b/.github/workflows/get_directories.yml new file mode 100644 index 00000000..70167209 --- /dev/null +++ b/.github/workflows/get_directories.yml @@ -0,0 +1,60 @@ +on: + workflow_call: + outputs: + lambda_dirs: + description: "List of lambda directory names" + value: ${{ jobs.directories.outputs.lambda_dirs }} + container_dirs: + description: "List of container directory names" + value: ${{ jobs.directories.outputs.container_dirs }} + service_dirs: + description: "List of ECS service directory names" + value: ${{ jobs.directories.outputs.service_dirs }} + task_dirs: + description: "List of ECS task directory names" + value: ${{ jobs.directories.outputs.task_dirs }} + ecs_service_dirs: + description: "List of ECS service stack directory names" + value: ${{ jobs.directories.outputs.ecs_service_dirs }} + +jobs: + directories: + runs-on: ubuntu-latest + outputs: + lambda_dirs: ${{ steps.lambda_dirs.outputs.just_outputs }} + container_dirs: ${{ 
steps.container_dirs.outputs.just_outputs }} + service_dirs: ${{ steps.service_dirs.outputs.just_outputs }} + task_dirs: ${{ steps.task_dirs.outputs.just_outputs }} + ecs_service_dirs: ${{ steps.ecs_service_dirs.outputs.just_outputs }} + steps: + - uses: actions/checkout@v6 + + - name: Get Lambda Directories + id: lambda_dirs + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + with: + just_action: lambda-get-directories + + - name: Get Service Directories + id: service_dirs + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + with: + just_action: service-get-directories + + - name: Get Container Directories + id: container_dirs + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + with: + just_action: container-get-directories + + - name: Get Task Directories + id: task_dirs + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + with: + just_action: task-get-directories + + - name: Get ECS Service Directories + id: ecs_service_dirs + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + with: + just_action: ecs-service-get-directories diff --git a/.github/workflows/infra.yml b/.github/workflows/infra.yml index f47c3a6c..4bb02296 100644 --- a/.github/workflows/infra.yml +++ b/.github/workflows/infra.yml @@ -13,10 +13,19 @@ on: description: "Bucket containing build artifacts" required: true type: string + bootstrap_image_uri: + description: "Bootstrap ECS image URI" + required: false + type: string + default: "" lambda_matrix: required: false type: string default: "[]" + service_matrix: + required: false + type: string + default: "[]" concurrency: # only run one instance of workflow at any one time @@ -75,9 +84,54 @@ jobs: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} tg_directory: infra/live/${{ inputs.environment }}/aws/frontend - lambdas: + cluster: + needs: oidc + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.infra_version }} + + - name: Deploy cluster infra + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + with: + 
aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/cluster + + security: needs: oidc runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.infra_version }} + + - name: Deploy security infra + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/security + + network: + needs: security + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.infra_version }} + + - name: Deploy network infra + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/network + + lambdas: + needs: + - oidc + - security + - network + runs-on: ubuntu-latest strategy: fail-fast: false # this is to prevent terraform lock issues matrix: @@ -92,3 +146,26 @@ jobs: with: aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} tg_directory: infra/live/${{ inputs.environment }}/aws/${{ matrix.value }} + + services: + needs: + - cluster + - network + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + value: ${{ fromJson(inputs.service_matrix) }} + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.infra_version }} + + - name: Deploy ${{ matrix.value }} bootstrap service infra + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + env: + TF_VAR_bootstrap: "true" + TF_VAR_bootstrap_image_uri: ${{ inputs.bootstrap_image_uri }} + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/${{ matrix.value }} diff --git a/.github/workflows/infra_releases.yml b/.github/workflows/infra_releases.yml index 36d53ccb..c935083a 100644 --- a/.github/workflows/infra_releases.yml +++ b/.github/workflows/infra_releases.yml @@ -13,6 +13,12 @@ on: code_bucket: description: "Bucket containing build 
artifacts" value: ${{ jobs.bucket.outputs.bucket_name }} + repository_url: + description: "ECR repository url" + value: ${{ jobs.ecr.outputs.repository_url }} + bootstrap_image_uri: + description: "Bootstrap ECS image URI" + value: ${{ jobs.bootstrap.outputs.bootstrap_image_uri }} concurrency: # only run one instance of workflow at any one time group: ${{ github.workflow }}-${{ inputs.environment }} @@ -26,6 +32,55 @@ env: AWS_OIDC_ROLE_ARN: arn:aws:iam::${{ vars.AWS_ACCOUNT_ID }}:role/aws-serverless-github-deploy-${{ inputs.environment }}-github-oidc-role jobs: + ecr: + runs-on: ubuntu-latest + outputs: + repository_url: ${{ steps.get_repository_url.outputs.repository_url }} + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.infra_version }} + + - name: Deploy ECR + id: deploy_ecr + uses: chrispsheehan/terragrunt-aws-oidc-action@0.4.1 + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + tg_directory: infra/live/${{ inputs.environment }}/aws/ecr + + - name: Get ECR repository url + id: get_repository_url + env: + TG_OUTPUTS: ${{ steps.deploy_ecr.outputs.tg_outputs }} + run: | + echo "repository_url=$(echo "$TG_OUTPUTS" | jq -r '.repository_url.value')" >> "$GITHUB_OUTPUT" + + bootstrap: + needs: ecr + runs-on: ubuntu-latest + outputs: + bootstrap_image_uri: ${{ steps.set_bootstrap_image_uri.outputs.bootstrap_image_uri }} + steps: + - uses: actions/checkout@v6 + with: + ref: ${{ inputs.infra_version }} + + - name: Build and push bootstrap image + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + env: + SOURCE_IMAGE: nginx:latest + IMAGE_URI: ${{ needs.ecr.outputs.repository_url }}:bootstrap-${{ inputs.infra_version }} + with: + aws_oidc_role_arn: ${{ env.AWS_OIDC_ROLE_ARN }} + just_action: docker-mirror docker-push + + - name: Set bootstrap image uri + id: set_bootstrap_image_uri + env: + BOOTSTRAP_IMAGE_URI: ${{ needs.ecr.outputs.repository_url }}:bootstrap-${{ inputs.infra_version }} + run: | + echo "bootstrap_image_uri=$BOOTSTRAP_IMAGE_URI" >> 
"$GITHUB_OUTPUT" + bucket: runs-on: ubuntu-latest outputs: diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 50f646f3..6dcfb6ed 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -108,30 +108,70 @@ jobs: fi echo "βœ… All lambda directories use underscores." - setup-lambdas: - if: ${{ needs.check.outputs.lambdas == 'true' }} + check-ecs-module-pairs: needs: check runs-on: ubuntu-latest - outputs: - lambda_dirs: ${{ steps.lambda_dirs.outputs.just_outputs }} + name: Check ECS task/service module pairs steps: - uses: actions/checkout@v6 - - name: Get Lambda Directories - id: lambda_dirs - uses: chrispsheehan/just-aws-oidc-action@0.3.0 - with: - just_action: lambda-get-directories + + - name: Fail if task_/service_ pairs are incomplete + shell: bash + run: | + set -euo pipefail + + missing=0 + + while IFS= read -r env_dir; do + env_name="$(basename "$env_dir")" + aws_dir="$env_dir/aws" + + [ -d "$aws_dir" ] || continue + + while IFS= read -r service_dir; do + service_name="$(basename "$service_dir")" + suffix="${service_name#service_}" + task_dir="$aws_dir/task_$suffix" + + if [ ! -d "$task_dir" ]; then + echo "::error::❌ Missing task_$suffix for $service_name in $aws_dir" + missing=1 + fi + done < <(find "$aws_dir" -mindepth 1 -maxdepth 1 -type d -name 'service_*' | sort) + + while IFS= read -r task_dir; do + task_name="$(basename "$task_dir")" + suffix="${task_name#task_}" + service_dir="$aws_dir/service_$suffix" + + if [ ! -d "$service_dir" ]; then + echo "::error::❌ Missing service_$suffix for $task_name in $aws_dir" + missing=1 + fi + done < <(find "$aws_dir" -mindepth 1 -maxdepth 1 -type d -name 'task_*' | sort) + done < <(find infra/live -mindepth 1 -maxdepth 1 -type d | sort) + + if [ "$missing" -ne 0 ]; then + exit 1 + fi + + echo "βœ… All ECS task_/service_ pairs are present." 
+ + setup: + if: ${{ needs.check.outputs.lambdas == 'true' || needs.check.outputs.containers == 'true' }} + needs: check + uses: ./.github/workflows/get_directories.yml build-lambdas: if: ${{ needs.check.outputs.lambdas == 'true' }} needs: - check - - setup-lambdas + - setup runs-on: ubuntu-latest strategy: fail-fast: false matrix: - value: ${{ fromJson(needs.setup-lambdas.outputs.lambda_dirs) }} + value: ${{ fromJson(needs.setup.outputs.lambda_dirs) }} steps: - uses: actions/checkout@v6 @@ -142,9 +182,31 @@ jobs: with: just_action: lambda-build + build-containers: + if: ${{ needs.check.outputs.containers == 'true' }} + needs: + - check + - check-ecs-module-pairs + - setup + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + value: ${{ fromJson(needs.setup.outputs.container_dirs) }} + steps: + - uses: actions/checkout@v6 + + - name: "Build ${{ matrix.value }} ECS image" + uses: chrispsheehan/just-aws-oidc-action@0.3.0 + env: + CONTAINER_NAME: ${{ matrix.value }} + with: + just_action: docker-build + build-frontend: if: ${{ needs.check.outputs.frontend == 'true' }} - needs: check + needs: + - check runs-on: ubuntu-latest name: Build frontend timeout-minutes: 5 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 265aaa31..3d8ff69c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -87,16 +87,7 @@ jobs: echo "EOF" >> $GITHUB_OUTPUT get-apps: - runs-on: ubuntu-latest - outputs: - lambdas_dirs: ${{ steps.lambdas_dirs.outputs.just_outputs }} - steps: - - uses: actions/checkout@v6 - - name: Get lambdas Directories - id: lambdas_dirs - uses: chrispsheehan/just-aws-oidc-action@0.3.0 - with: - just_action: lambda-get-directories + uses: ./.github/workflows/get_directories.yml build: @@ -112,7 +103,10 @@ jobs: with: environment: ci lambda_version: ${{ needs.get-next-tag.outputs.tag }} - lambda_matrix: ${{ needs.get-apps.outputs.lambdas_dirs }} + frontend_version: ${{ needs.get-next-tag.outputs.tag }} + 
ecs_version: ${{ needs.get-next-tag.outputs.tag }} + lambda_matrix: ${{ needs.get-apps.outputs.lambda_dirs }} + ecs_matrix: ${{ needs.get-apps.outputs.container_dirs }} code: needs: diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..2c51f7e5 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,77 @@ +# Repo Instructions + +## Documentation + +Update documentation in the same change: + +- update the repo root `README.md` for cross-cutting behavior changes +- update affected module `README.md` files under `infra/modules/**` for module contract or responsibility changes + +## Deployment Guide + +Choose deployment modes that match the runtime shape. + +### Lambda + +- `all_at_once` + Use for background jobs and low-risk changes where fastest rollout is preferred. +- `canary` + Use for request-serving Lambdas such as APIs where a partial rollout and automatic rollback are valuable. +- `linear` + Use for user-facing or higher-risk Lambdas when you want a steadier rollout than canary. + +### ECS + +- `rolling` + Use for ECS services that are not load-balanced in this repo's model, such as internal workers without `internal_dns` or `vpc_link`. +- `all_at_once` + Use for load-balanced ECS services when you want CodeDeploy but do not need gradual traffic shifting. +- `canary` + Use for load-balanced ECS services where you want partial traffic shifting before full promotion. +- `linear` + Use for load-balanced ECS services where you want a gradual, repeated traffic shift. +- `blue_green` + Treat as an alias of ECS CodeDeploy all-at-once semantics unless and until the repo differentiates it further. + +### ECS Constraints + +- ECS CodeDeploy requires a load-balanced service shape in this repo. +- In practice that means `connection_type` must be `internal_dns` or `vpc_link` for CodeDeploy-backed ECS deploys. +- If `connection_type = "internal"`, prefer `rolling`. 
+ +## Feasibility Check + +Before implementing deployment-related changes, check that the requested combination is feasible in the current repo shape. + +### What To Check + +- runtime type: Lambda or ECS +- deployment mode: `rolling`, `all_at_once`, `canary`, `linear`, or `blue_green` +- connection type for ECS: `internal`, `internal_dns`, or `vpc_link` +- whether the service is load-balanced +- whether the required infra resources already exist, such as: + - CodeDeploy app and deployment group + - target groups and listeners + - VPC link + - alarm inputs + +### Expected Behavior + +- If the combination is valid, proceed with implementation. +- If the combination is invalid or incomplete, say so clearly and explain the missing requirement. +- If a requested combination is not feasible in the current repo shape, explicitly state that it fails the feasibility check and say what would need to change to make it feasible. +- Prefer the smallest viable change that matches the requested behavior. + +## CI Dependency Safety + +When changing CI workflows or Terraform module dependencies, check dependency behavior across the full lifecycle, not just the happy path. 
+ +- check apply, deploy, and destroy behavior +- when the same setup or lookup pattern appears in multiple workflows, suggest extracting it into a shared reusable workflow or shared `just` recipe instead of repeating it +- check workflow dependency wiring such as `needs`, job outputs, matrix values, and reused workflow inputs +- watch for `data.terraform_remote_state` dependencies that can fail if another stack has not been created yet or has already been destroyed +- check required Terraform input variables on destroy paths as well as apply paths; destroy can still fail before resource deletion if required vars are unset +- make sure every referenced `needs..outputs.*` value is actually in scope for that job +- make sure matrix values match the expected naming contract for the workflow, module, or path being used +- prefer making modules tolerant of unnecessary upstream state dependencies where possible +- do not change CI ordering blindly; first check whether the real issue is an avoidable cross-stack dependency diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..54105059 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,18 @@ +# Contributing + +## Docs Expectations + +Keep documentation aligned with code changes: + +- CI/CD behavior +- Terraform module inputs or outputs +- deployment strategy +- bootstrap behavior +- operator-facing commands + +Also update the affected module `README.md` files under `infra/modules/**` whenever module responsibilities, dependencies, inputs, or outputs change. 
+ +## Working Style + +- keep module READMEs short and operational +- prefer updating existing docs in the same PR rather than leaving follow-up documentation tasks diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..74508ac8 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,32 @@ +FROM python:3.12-slim AS python-base + +WORKDIR /usr/app + +COPY containers/worker/requirements.txt /tmp/requirements-worker.txt +RUN pip install --no-cache-dir -r /tmp/requirements-worker.txt + + +FROM python-base AS worker + +COPY containers/worker/app.py /usr/app/app.py + +CMD ["python", "-u", "app.py"] + + +FROM python-base AS debug + +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl \ + && rm -rf /var/lib/apt/lists/* + +CMD ["sleep", "infinity"] + + +FROM public.ecr.aws/aws-observability/aws-otel-collector:latest AS collector + +COPY config/otel/collector-config.yaml /opt/aws/aws-otel-collector/etc/collector-config.yaml + +CMD ["--config", "/opt/aws/aws-otel-collector/etc/collector-config.yaml"] + + +FROM collector AS otel_collector diff --git a/README.md b/README.md index a98fe040..3faa7eff 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # aws-serverless-github-deploy **Terraform + GitHub Actions for AWS serverless deployments.** -Lambda + API Gateway with CodeDeploy rollouts and provisioned concurrency controls β€” driven by clean module variables and `just` recipes. +Lambda + ECS with CodeDeploy rollouts, plus provisioned concurrency controls for Lambda β€” driven by clean module variables and `just` recipes. --- @@ -13,6 +13,31 @@ just tg dev aws/oidc apply just tg prod aws/oidc apply ``` +## 🧱 prerequisite network + +The AWS account must already have the landing-zone or StackSet network in place before deploying this repo. 
+ +- the Terraform in this repo reads the VPC and subnets with `data` sources rather than creating them +- the expected VPC and subnets must therefore already exist +- the private subnets must be tagged so the module lookups can find them, for example with names matching `*private*` + +If those shared network resources do not exist yet, the infra applies in this repo will fail during data lookup. + +The repo `network` module also owns the shared internal ALB and shared HTTP API Gateway surface used by ECS services: + +- HTTP API +- default API stage +- VPC link +- internal ALB and target groups + +The `api` module is Lambda-specific and plugs the Lambda integration and root routes into that shared API. + +Terragrunt also provides a shared default ECR repository name to ECS task modules: + +- shared artifact base: `dev -> ---dev`, otherwise `---ci` +- default ECR repository: `-ecs-worker` +- override it in `infra/live//environment_vars.hcl` only if the repository naming diverges from that convention + ## πŸ› οΈ local plan some infra Given a terragrunt file is found at `infra/live/dev/aws/api/terragrunt.hcl` @@ -120,6 +145,59 @@ deployment_config = { } ``` +## 🚦 types of ecs deploy + +```hcl +module "service_example" { + source = "../_shared/service" + ... 
+ deployment_strategy = var.your_deployment_strategy +} +``` + +#### ⚑ [default] All at once: + +- use case: internal services, queue workers, low-risk changes +- for load-balanced ECS services this uses CodeDeploy and shifts traffic in one step +```hcl +deployment_strategy = "all_at_once" +``` + +#### 🐀 canary deployment: + +- use case: HTTP services behind the load balancer +- shifts 10% of traffic for 5 minutes before moving to 100% +```hcl +deployment_strategy = "canary" +``` + +#### πŸ“Ά linear deployment: + +- use case: steady rollout with smaller blast radius +- shifts traffic 10% every minute until complete +```hcl +deployment_strategy = "linear" +``` + +#### 🟦🟩 blue/green deployment: + +- use case: explicit blue/green semantics while still using the default ECS all-at-once traffic switch +- currently maps to the ECS CodeDeploy all-at-once config +```hcl +deployment_strategy = "blue_green" +``` + +- ECS CodeDeploy is only created for load-balanced ECS services in `_shared/service` +- internal ECS services without load balancer integration should use native ECS rolling updates instead +- the shared ECS service resource ignores `task_definition` drift so later infra applies do not revert the live task revision after either a rolling deploy or a CodeDeploy rollout +- the deployment workflow: + - applies the new `task_*` revision + - if the service has CodeDeploy resources, reads `codedeploy_app_name` and `codedeploy_deployment_group_name` from `service_*` + - renders [`appspec-ecs.yml`](appspec-ecs.yml) + - uploads the AppSpec to the code bucket + - runs `just ecs-deploy` + - otherwise updates the ECS service to the new task definition with a native rolling deploy + ## πŸ”₯↩️ deployment roll-back - use cloudwatch metrics and alarms to automatically roll-back a deployment @@ -134,6 +212,7 @@ module "lambda_example" { ] } ``` +- the ECS shared service module accepts the same `codedeploy_alarm_names` input - if the alarm triggers during a deployment you will see 
the below in the CI ``` @@ -161,5 +240,6 @@ Error: Process completed with exit code 1. - Infrastructure and feature code deployments (via codedeploy) are completely decoupled. - Initial infrastructure deployments deploys `infra/modules/aws/_shared/lambda/bootstrap/index.py` which serves as a place-holder. +- Initial ECS infrastructure deployments can use a bootstrap task, while the deploy workflow later registers a real `task_*` revision and promotes it via CodeDeploy. - The code deploy app and group are also deployed, which is the mechanism used to deploy the real builds. - Subsequent re-runs of the infrastructure deployments will not update the code. diff --git a/appspec-ecs.yml b/appspec-ecs.yml new file mode 100644 index 00000000..9d530cfd --- /dev/null +++ b/appspec-ecs.yml @@ -0,0 +1,9 @@ +version: 0.0 +Resources: + - TargetService: + Type: AWS::ECS::Service + Properties: + TaskDefinition: TASK_DEFINITION_ARN + LoadBalancerInfo: + ContainerName: CONTAINER_NAME + ContainerPort: CONTAINER_PORT diff --git a/appspec-lambda.yml b/appspec-lambda.yml new file mode 100644 index 00000000..4a7cdc44 --- /dev/null +++ b/appspec-lambda.yml @@ -0,0 +1,9 @@ +version: 0.0 +Resources: + - LambdaFunction: + Type: AWS::Lambda::Function + Properties: + Name: FUNCTION_NAME + Alias: ALIAS_NAME + CurrentVersion: CURRENT_VERSION + TargetVersion: TARGET_VERSION diff --git a/appspec.yml b/appspec.yml deleted file mode 100644 index db9a9fc3..00000000 --- a/appspec.yml +++ /dev/null @@ -1,9 +0,0 @@ -version: 0.0 -Resources: - - LambdaFunction: - Type: AWS::Lambda::Function - Properties: - Name: ${FUNCTION_NAME} - Alias: ${FUNCTION_ALIAS} - CurrentVersion: ${CURRENT_VERSION} - TargetVersion: ${NEW_VERSION} \ No newline at end of file diff --git a/config/otel/collector-config.yaml b/config/otel/collector-config.yaml new file mode 100644 index 00000000..c68db8d1 --- /dev/null +++ b/config/otel/collector-config.yaml @@ -0,0 +1,28 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 
${OTEL_ENDPOINT} + +exporters: + awsxray: + region: ${AWS_REGION} + +processors: + probabilistic_sampler: + hash_seed: 22 + sampling_percentage: ${OTEL_SAMPLING_PERCENTAGE} + + filter/health: + traces: + span: + - 'name == "GET /health"' + - 'attributes["http.target"] == "/health"' + - 'attributes["http.path"] == "/health"' + +service: + pipelines: + traces: + receivers: [otlp] + processors: [filter/health, probabilistic_sampler] + exporters: [awsxray] diff --git a/containers/worker/app.py b/containers/worker/app.py new file mode 100644 index 00000000..ebe26110 --- /dev/null +++ b/containers/worker/app.py @@ -0,0 +1,40 @@ +import boto3 +import os +import time + +QUEUE_URL = os.environ['AWS_SQS_QUEUE_URL'] +AWS_REGION = os.environ['AWS_REGION'] +POLL_TIMEOUT = int(os.getenv("POLL_TIMEOUT", "60")) + +sqs = boto3.client('sqs', region_name=AWS_REGION) + + +def process_message(msg): + # TODO: implement business logic + print({"message_id": msg['MessageId'], "body": msg['Body'][:200]}) + + +def poll(): + response = sqs.receive_message( + QueueUrl=QUEUE_URL, + MaxNumberOfMessages=10, + WaitTimeSeconds=20, + VisibilityTimeout=30, + ) + messages = response.get('Messages', []) + if not messages: + print("No messages") + return + for msg in messages: + try: + process_message(msg) + sqs.delete_message(QueueUrl=QUEUE_URL, ReceiptHandle=msg['ReceiptHandle']) + except Exception as e: + print(f"Failed {msg['MessageId']}: {e}") + + +if __name__ == "__main__": + print(f"Starting SQS poller for {QUEUE_URL}") + while True: + poll() + time.sleep(POLL_TIMEOUT) diff --git a/containers/worker/requirements.txt b/containers/worker/requirements.txt new file mode 100644 index 00000000..30ddf823 --- /dev/null +++ b/containers/worker/requirements.txt @@ -0,0 +1 @@ +boto3 diff --git a/infra/README.md b/infra/README.md new file mode 100644 index 00000000..0677becf --- /dev/null +++ b/infra/README.md @@ -0,0 +1,97 @@ +# Infra Glossary + +This directory contains the Terraform and Terragrunt layout 
for the repo. + +## Structure + +- `infra/root.hcl` + Shared Terragrunt root config. This is where remote state, generated provider config, shared inputs, and naming conventions are defined. +- `infra/modules/aws` + Reusable Terraform modules. +- `infra/live//aws/` + Environment-specific Terragrunt stacks that point at modules in `infra/modules/aws`. + +## Environments + +- `dev` + Main development environment. +- `prod` + Production environment. +- `ci` + Shared CI-only infra such as ECR and code bucket where applicable. + +## How State Is Named + +The root Terragrunt file derives state paths from the live stack path: + +- bucket: `---tfstate` +- key: `///terraform.tfstate` + +Shared artifact names also follow environment-aware conventions from `infra/root.hcl`: + +- shared artifact base: `dev -> ...-dev`, otherwise `...-ci` +- code bucket: `-code` +- ECS ECR repository: `-ecs-worker` + +So a stack at: + +`infra/live/dev/aws/task_worker/terragrunt.hcl` + +stores state at: + +`dev/aws/task_worker/terraform.tfstate` + +## Module Types + +- `_shared/*` + Reusable building blocks such as Lambda, ECS task, ECS service, ECR, SQS, cluster, and code bucket. +- concrete modules such as `task_worker`, `service_worker`, `lambda_worker`, `api` + Thin wrappers that apply repo-specific behavior on top of shared modules. + +## Shared Stack Responsibilities + +- `network` + Owns the internal ALB, shared HTTP API Gateway API, VPC link, and VPC endpoints. +- `security` + Owns shared security groups. +- `cluster` + Owns the ECS cluster. +- `api` + Owns the Lambda-backed API integration and routes into the shared HTTP API. +- `task_*` + Register ECS task definitions. +- `service_*` + Own the ECS services and, when applicable, CodeDeploy resources. 
+ +## Dependency Notes + +- many modules use `data.terraform_remote_state` to read outputs from other stacks +- because of that, workflow ordering matters for apply, deploy, and destroy +- some shared infrastructure, such as the landing-zone VPC and tagged private subnets, is discovered with `data` lookups and must already exist + +## Deployment Model + +- infra workflows create or update infrastructure stacks +- build workflows produce Lambda zips and container images +- deploy workflows: + - publish Lambda versions and use Lambda CodeDeploy + - register ECS task revisions + - then either: + - use ECS CodeDeploy for load-balanced services + - or use native ECS rolling updates for internal services + +## Naming Conventions + +- `task_` + ECS task-definition stack/module +- `service_` + ECS service stack/module +- `lambda_` or concrete Lambda stack names + Lambda stacks + +In CI workflows, be careful whether a matrix is carrying: + +- logical service names like `worker` +- or concrete stack names like `task_worker` / `service_worker` + +That distinction has caused several workflow bugs already. 
diff --git a/infra/live/ci/aws/code_bucket/terragrunt.hcl b/infra/live/ci/aws/code_bucket/terragrunt.hcl index d8b45907..b0d65051 100644 --- a/infra/live/ci/aws/code_bucket/terragrunt.hcl +++ b/infra/live/ci/aws/code_bucket/terragrunt.hcl @@ -3,5 +3,5 @@ include { } terraform { - source = "../../../../modules//aws//code_bucket" + source = "../../../../modules//aws//_shared//code_bucket" } diff --git a/infra/live/ci/aws/ecr/terragrunt.hcl b/infra/live/ci/aws/ecr/terragrunt.hcl new file mode 100644 index 00000000..dcee4543 --- /dev/null +++ b/infra/live/ci/aws/ecr/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//_shared//ecr" +} diff --git a/infra/live/dev/aws/cluster/terragrunt.hcl b/infra/live/dev/aws/cluster/terragrunt.hcl new file mode 100644 index 00000000..7440c4ef --- /dev/null +++ b/infra/live/dev/aws/cluster/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//_shared//cluster" +} diff --git a/infra/live/dev/aws/code_bucket/terragrunt.hcl b/infra/live/dev/aws/code_bucket/terragrunt.hcl index d8b45907..b0d65051 100644 --- a/infra/live/dev/aws/code_bucket/terragrunt.hcl +++ b/infra/live/dev/aws/code_bucket/terragrunt.hcl @@ -3,5 +3,5 @@ include { } terraform { - source = "../../../../modules//aws//code_bucket" + source = "../../../../modules//aws//_shared//code_bucket" } diff --git a/infra/live/dev/aws/ecr/terragrunt.hcl b/infra/live/dev/aws/ecr/terragrunt.hcl new file mode 100644 index 00000000..dcee4543 --- /dev/null +++ b/infra/live/dev/aws/ecr/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//_shared//ecr" +} diff --git a/infra/live/dev/aws/network/terragrunt.hcl b/infra/live/dev/aws/network/terragrunt.hcl new file mode 100644 index 00000000..e6dc9947 --- /dev/null +++ 
b/infra/live/dev/aws/network/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//network" +} diff --git a/infra/live/dev/aws/security/terragrunt.hcl b/infra/live/dev/aws/security/terragrunt.hcl new file mode 100644 index 00000000..a072ef9a --- /dev/null +++ b/infra/live/dev/aws/security/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//security" +} diff --git a/infra/live/dev/aws/service_worker/terragrunt.hcl b/infra/live/dev/aws/service_worker/terragrunt.hcl new file mode 100644 index 00000000..ece97f02 --- /dev/null +++ b/infra/live/dev/aws/service_worker/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//service_worker" +} diff --git a/infra/live/dev/aws/task_worker/terragrunt.hcl b/infra/live/dev/aws/task_worker/terragrunt.hcl new file mode 100644 index 00000000..3448e18a --- /dev/null +++ b/infra/live/dev/aws/task_worker/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//task_worker" +} diff --git a/infra/live/dev/environment_vars.hcl b/infra/live/dev/environment_vars.hcl index 529ee12e..d168beac 100644 --- a/infra/live/dev/environment_vars.hcl +++ b/infra/live/dev/environment_vars.hcl @@ -1,7 +1,15 @@ locals { - deploy_branches = ["*"] + deploy_branches = ["*"] + image_expiration_days = 30 + force_delete = true + local_tunnel = true + xray_enabled = true } inputs = { - deploy_branches = local.deploy_branches + deploy_branches = local.deploy_branches + image_expiration_days = local.image_expiration_days + force_delete = local.force_delete + local_tunnel = local.local_tunnel + xray_enabled = local.xray_enabled } diff --git a/infra/live/global_vars.hcl b/infra/live/global_vars.hcl index 1f9e6756..70a5faa5 
100644 --- a/infra/live/global_vars.hcl +++ b/infra/live/global_vars.hcl @@ -1,4 +1,5 @@ locals { + vpc_name = "vpc" aws_region = "eu-west-2" allowed_role_actions = [ "s3:*", @@ -10,12 +11,20 @@ locals { "application-autoscaling:*", "cloudwatch:*", "sqs:*", + "sns:*", "cloudfront:*", - "xray:*" + "xray:*", + "ec2:*", + "ecs:*", + "ecr:*", + "elasticloadbalancing:*", ] + container_port = 80 } inputs = { + vpc_name = local.vpc_name aws_region = local.aws_region allowed_role_actions = local.allowed_role_actions + container_port = local.container_port } \ No newline at end of file diff --git a/infra/live/prod/aws/cluster/terragrunt.hcl b/infra/live/prod/aws/cluster/terragrunt.hcl new file mode 100644 index 00000000..7440c4ef --- /dev/null +++ b/infra/live/prod/aws/cluster/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//_shared//cluster" +} diff --git a/infra/live/prod/aws/network/terragrunt.hcl b/infra/live/prod/aws/network/terragrunt.hcl new file mode 100644 index 00000000..e6dc9947 --- /dev/null +++ b/infra/live/prod/aws/network/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//network" +} diff --git a/infra/live/prod/aws/security/terragrunt.hcl b/infra/live/prod/aws/security/terragrunt.hcl new file mode 100644 index 00000000..a072ef9a --- /dev/null +++ b/infra/live/prod/aws/security/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//security" +} diff --git a/infra/live/prod/aws/service_worker/terragrunt.hcl b/infra/live/prod/aws/service_worker/terragrunt.hcl new file mode 100644 index 00000000..ece97f02 --- /dev/null +++ b/infra/live/prod/aws/service_worker/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = 
"../../../../modules//aws//service_worker" +} diff --git a/infra/live/prod/aws/task_worker/terragrunt.hcl b/infra/live/prod/aws/task_worker/terragrunt.hcl new file mode 100644 index 00000000..3448e18a --- /dev/null +++ b/infra/live/prod/aws/task_worker/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//task_worker" +} diff --git a/infra/modules/aws/_shared/cluster/README.md b/infra/modules/aws/_shared/cluster/README.md new file mode 100644 index 00000000..9014a934 --- /dev/null +++ b/infra/modules/aws/_shared/cluster/README.md @@ -0,0 +1,14 @@ +# `_shared/cluster` + +Shared ECS cluster module. + +## Owns + +- ECS cluster + +## Key outputs + +- `cluster_id` +- `cluster_name` + +Used by ECS service modules and workflows that need the target cluster identity. diff --git a/infra/modules/aws/_shared/cluster/locals.tf b/infra/modules/aws/_shared/cluster/locals.tf new file mode 100644 index 00000000..e31f34c2 --- /dev/null +++ b/infra/modules/aws/_shared/cluster/locals.tf @@ -0,0 +1,3 @@ +locals { + cluster_name = "${var.environment}-${var.project_name}-cluster" +} diff --git a/infra/modules/aws/_shared/cluster/main.tf b/infra/modules/aws/_shared/cluster/main.tf new file mode 100644 index 00000000..3afaa8fb --- /dev/null +++ b/infra/modules/aws/_shared/cluster/main.tf @@ -0,0 +1,3 @@ +resource "aws_ecs_cluster" "this" { + name = local.cluster_name +} diff --git a/infra/modules/aws/_shared/cluster/outputs.tf b/infra/modules/aws/_shared/cluster/outputs.tf new file mode 100644 index 00000000..084ef6fe --- /dev/null +++ b/infra/modules/aws/_shared/cluster/outputs.tf @@ -0,0 +1,7 @@ +output "cluster_id" { + value = aws_ecs_cluster.this.id +} + +output "cluster_name" { + value = aws_ecs_cluster.this.name +} diff --git a/infra/modules/aws/_shared/cluster/variables.tf b/infra/modules/aws/_shared/cluster/variables.tf new file mode 100644 index 00000000..364cf69d --- /dev/null +++ 
b/infra/modules/aws/_shared/cluster/variables.tf @@ -0,0 +1,9 @@ +### start of static vars set in root.hcl ### +variable "project_name" { + type = string +} + +variable "environment" { + type = string +} +### end of static vars set in root.hcl ### diff --git a/infra/modules/aws/_shared/code_bucket/README.md b/infra/modules/aws/_shared/code_bucket/README.md new file mode 100644 index 00000000..c131c038 --- /dev/null +++ b/infra/modules/aws/_shared/code_bucket/README.md @@ -0,0 +1,15 @@ +# `_shared/code_bucket` + +Shared S3 bucket for deployable artifacts. + +## Owns + +- Lambda zip storage +- frontend bundle storage +- ECS AppSpec storage for CodeDeploy + +## Key outputs + +- artifact bucket name + +Used by build, build-get, and deploy workflows. diff --git a/infra/modules/aws/code_bucket/main.tf b/infra/modules/aws/_shared/code_bucket/main.tf similarity index 100% rename from infra/modules/aws/code_bucket/main.tf rename to infra/modules/aws/_shared/code_bucket/main.tf diff --git a/infra/modules/aws/code_bucket/outputs.tf b/infra/modules/aws/_shared/code_bucket/outputs.tf similarity index 100% rename from infra/modules/aws/code_bucket/outputs.tf rename to infra/modules/aws/_shared/code_bucket/outputs.tf diff --git a/infra/modules/aws/code_bucket/variables.tf b/infra/modules/aws/_shared/code_bucket/variables.tf similarity index 99% rename from infra/modules/aws/code_bucket/variables.tf rename to infra/modules/aws/_shared/code_bucket/variables.tf index 474e02cf..7ad8beb8 100644 --- a/infra/modules/aws/code_bucket/variables.tf +++ b/infra/modules/aws/_shared/code_bucket/variables.tf @@ -5,7 +5,6 @@ variable "code_bucket" { } ### end of static vars set in root.hcl ### - variable "s3_expiration_days" { description = "Number of days before objects are deleted (set to 0 to disable)" type = number diff --git a/infra/modules/aws/code_bucket/versions.tf b/infra/modules/aws/_shared/code_bucket/versions.tf similarity index 98% rename from 
infra/modules/aws/code_bucket/versions.tf rename to infra/modules/aws/_shared/code_bucket/versions.tf index f3218ff6..4dc01c4f 100644 --- a/infra/modules/aws/code_bucket/versions.tf +++ b/infra/modules/aws/_shared/code_bucket/versions.tf @@ -6,4 +6,4 @@ terraform { source = "hashicorp/aws" } } -} \ No newline at end of file +} diff --git a/infra/modules/aws/_shared/ecr/README.md b/infra/modules/aws/_shared/ecr/README.md new file mode 100644 index 00000000..05b0ec13 --- /dev/null +++ b/infra/modules/aws/_shared/ecr/README.md @@ -0,0 +1,18 @@ +# `_shared/ecr` + +Shared ECR repository module. + +## Owns + +- the repository used for ECS images +- repository lifecycle settings + +## Key inputs + +- `ecr_repository_name` + +## Key outputs + +- `repository_url` + +Used by image build, bootstrap image mirroring, and ECS deploy workflows. diff --git a/infra/modules/aws/_shared/ecr/data.tf b/infra/modules/aws/_shared/ecr/data.tf new file mode 100644 index 00000000..593a2387 --- /dev/null +++ b/infra/modules/aws/_shared/ecr/data.tf @@ -0,0 +1,29 @@ +data "aws_iam_policy_document" "allow_ecr_pull_policy" { + statement { + sid = "AllowCrossAccountPull" + effect = "Allow" + actions = local.ecr_pull_actions + principals { + type = "AWS" + identifiers = local.allowed_account_principals + } + } +} + +data "aws_ecr_lifecycle_policy_document" "this" { + rule { + priority = 1 + description = "Remove images after ${var.image_expiration_days} days" + + selection { + tag_status = "any" + count_type = "sinceImagePushed" + count_unit = "days" + count_number = var.image_expiration_days + } + + action { + type = "expire" + } + } +} diff --git a/infra/modules/aws/_shared/ecr/locals.tf b/infra/modules/aws/_shared/ecr/locals.tf new file mode 100644 index 00000000..d63c3fd3 --- /dev/null +++ b/infra/modules/aws/_shared/ecr/locals.tf @@ -0,0 +1,11 @@ +locals { + repository_name = var.ecr_repository_name + ecr_pull_actions = [ + "ecr:BatchCheckLayerAvailability", + "ecr:BatchGetImage", + 
"ecr:GetDownloadUrlForLayer", + ] + allowed_account_principals = [ + for account_id in var.allowed_read_aws_account_ids : "arn:aws:iam::${account_id}:root" + ] +} diff --git a/infra/modules/aws/_shared/ecr/main.tf b/infra/modules/aws/_shared/ecr/main.tf new file mode 100644 index 00000000..1ab0fdf0 --- /dev/null +++ b/infra/modules/aws/_shared/ecr/main.tf @@ -0,0 +1,21 @@ +resource "aws_ecr_repository" "this" { + name = local.repository_name + + force_delete = var.force_delete + + image_scanning_configuration { + scan_on_push = var.scan_on_push + } +} + +resource "aws_ecr_repository_policy" "this" { + repository = aws_ecr_repository.this.name + policy = data.aws_iam_policy_document.allow_ecr_pull_policy.json +} + +resource "aws_ecr_lifecycle_policy" "this" { + count = var.image_expiration_days > 0 ? 1 : 0 + + repository = aws_ecr_repository.this.name + policy = data.aws_ecr_lifecycle_policy_document.this.json +} diff --git a/infra/modules/aws/_shared/ecr/outputs.tf b/infra/modules/aws/_shared/ecr/outputs.tf new file mode 100644 index 00000000..b8f6b514 --- /dev/null +++ b/infra/modules/aws/_shared/ecr/outputs.tf @@ -0,0 +1,11 @@ +output "repository_url" { + value = aws_ecr_repository.this.repository_url +} + +output "repository_name" { + value = aws_ecr_repository.this.name +} + +output "repository_arn" { + value = aws_ecr_repository.this.arn +} diff --git a/infra/modules/aws/_shared/ecr/variables.tf b/infra/modules/aws/_shared/ecr/variables.tf new file mode 100644 index 00000000..ca49753e --- /dev/null +++ b/infra/modules/aws/_shared/ecr/variables.tf @@ -0,0 +1,30 @@ +### start of static vars set in root.hcl ### +variable "project_name" { + type = string +} + +variable "ecr_repository_name" { + type = string +} +### end of static vars set in root.hcl ### + +variable "allowed_read_aws_account_ids" { + description = "AWS Account allowed to pull from ci ecr" + type = list(string) +} + +variable "scan_on_push" { + type = bool + default = true +} + +variable 
"force_delete" { + type = bool + default = false +} + +variable "image_expiration_days" { + description = "Number of days before images are deleted (set to 0 to disable)" + type = number + default = 0 +} diff --git a/infra/modules/aws/_shared/lambda/README.md b/infra/modules/aws/_shared/lambda/README.md new file mode 100644 index 00000000..98ff0720 --- /dev/null +++ b/infra/modules/aws/_shared/lambda/README.md @@ -0,0 +1,26 @@ +# `_shared/lambda` + +Shared Lambda module with versioned deploys through CodeDeploy. + +## Owns + +- Lambda function +- alias and published versions +- optional provisioned concurrency +- Lambda CodeDeploy app and deployment group +- bootstrap zip used for initial infra applies + +## Key inputs + +- `deployment_config` +- `provisioned_config` +- `codedeploy_alarm_names` +- `code_bucket` + +## Key outputs + +- function name and ARN +- alias name and ARN +- log group + +Use this when you want Lambda infra and Lambda rollout behavior managed together. diff --git a/infra/modules/aws/_shared/service/README.md b/infra/modules/aws/_shared/service/README.md new file mode 100644 index 00000000..4d394934 --- /dev/null +++ b/infra/modules/aws/_shared/service/README.md @@ -0,0 +1,42 @@ +# `_shared/service` + +Shared ECS service module. + +## Owns + +- ECS service +- optional bootstrap task used for first infra deploys +- service-level ALB target group and listener rule for sub-path services +- API Gateway VPC link routing for HTTP services +- ECS CodeDeploy app and deployment group for load-balanced ECS services +- service autoscaling policies and alarms + +## Key inputs + +- `task_definition_arn` +- `connection_type` +- `deployment_strategy` +- `bootstrap` +- `bootstrap_image_uri` +- `codedeploy_alarm_names` + +## Deployment strategies + +- `all_at_once` +- `canary` +- `linear` +- `blue_green` + +These map to ECS CodeDeploy deployment configs for load-balanced services. 
+For internal non-load-balanced services, the deploy workflow falls back to native ECS rolling updates. + +## Drift ownership + +The ECS service ignores changes to `task_definition`. + +That is intentional: + +- deploy workflows own the live task revision +- infra applies own the stable service shape + +Without that split, a later infra apply would revert a successful rolling or CodeDeploy deployment back to the older task definition stored in Terraform state. diff --git a/infra/modules/aws/_shared/service/data.tf b/infra/modules/aws/_shared/service/data.tf new file mode 100644 index 00000000..aa4953b2 --- /dev/null +++ b/infra/modules/aws/_shared/service/data.tf @@ -0,0 +1,25 @@ +data "aws_iam_policy_document" "bootstrap_assume_role" { + statement { + effect = "Allow" + + principals { + type = "Service" + identifiers = ["ecs-tasks.amazonaws.com"] + } + + actions = ["sts:AssumeRole"] + } +} + +data "aws_iam_policy_document" "codedeploy_assume_role" { + statement { + effect = "Allow" + + principals { + type = "Service" + identifiers = ["codedeploy.amazonaws.com"] + } + + actions = ["sts:AssumeRole"] + } +} diff --git a/infra/modules/aws/_shared/service/locals.tf b/infra/modules/aws/_shared/service/locals.tf new file mode 100644 index 00000000..7214eb34 --- /dev/null +++ b/infra/modules/aws/_shared/service/locals.tf @@ -0,0 +1,99 @@ +locals { + use_vpc_link = var.connection_type == "vpc_link" + enable_codedeploy = ( + var.connection_type == "internal_dns" || var.connection_type == "vpc_link" + ) + codedeploy_deployment_config_name = var.codedeploy_deployment_config_name_override != "" ? var.codedeploy_deployment_config_name_override : ( + var.deployment_strategy == "canary" + ? "CodeDeployDefault.ECSCanary10Percent5Minutes" + : var.deployment_strategy == "linear" + ? 
"CodeDeployDefault.ECSLinear10PercentEvery1Minutes" + : "CodeDeployDefault.ECSAllAtOnce" + ) + + priority = parseint(substr(md5(var.service_name), 0, 2), 16) % 90 + 10 + vpc_link_count = local.use_vpc_link ? 1 : 0 + full_tg_name = "${var.service_name}-tg" + target_group_name = length(local.full_tg_name) > 32 ? substr(local.full_tg_name, 0, 32) : local.full_tg_name + green_target_group_name = "tg-${substr(md5("${var.service_name}-green"), 0, 8)}-green" + + is_default_path = var.root_path == "" + health_check_path = local.is_default_path ? "/health" : "/${var.root_path}/health" + exact_route_key = local.is_default_path ? "ANY /" : "ANY /${var.root_path}" + proxy_route_key = local.is_default_path ? "ANY /{proxy+}" : "ANY /${var.root_path}/{proxy+}" + target_group_arn = local.is_default_path ? var.default_target_group_arn : aws_lb_target_group.service_target_group[0].arn + blue_target_group_name = local.is_default_path ? element(split("/", var.default_target_group_arn), 1) : aws_lb_target_group.service_target_group[0].name + + load_balancers = var.connection_type == "internal_dns" || var.connection_type == "vpc_link" ? [{ + target_group_arn = local.target_group_arn + container_name = var.service_name + container_port = var.container_port + }] : [] + + enable_cpu_scaling = try(var.scaling_strategy.cpu != null, false) + enable_sqs_scaling = try(var.scaling_strategy.sqs != null, false) + enable_alb_scaling = try(var.scaling_strategy.alb != null, false) + enable_scaling = local.enable_cpu_scaling || local.enable_sqs_scaling || local.enable_alb_scaling + + evaluation_periods_cpu_out = local.enable_cpu_scaling ? ( + var.scaling_strategy.cpu.cooldown_out <= 60 + ? 1 + : floor(var.scaling_strategy.cpu.cooldown_out / 60) + ) : null + + evaluation_periods_cpu_in = local.enable_cpu_scaling ? ( + var.scaling_strategy.cpu.cooldown_in <= 60 + ? 1 + : floor(var.scaling_strategy.cpu.cooldown_in / 60) + ) : null + + evaluation_periods_sqs_out = local.enable_sqs_scaling ? 
( + var.scaling_strategy.sqs.cooldown_out <= 60 + ? 1 + : floor(var.scaling_strategy.sqs.cooldown_out / 60) + ) : null + + evaluation_periods_sqs_in = local.enable_sqs_scaling ? ( + var.scaling_strategy.sqs.cooldown_in <= 60 + ? 1 + : floor(var.scaling_strategy.sqs.cooldown_in / 60) + ) : null + + evaluation_periods_alb_out = local.enable_alb_scaling ? ( + var.scaling_strategy.alb.cooldown_out <= 60 + ? 1 + : floor(var.scaling_strategy.alb.cooldown_out / 60) + ) : null + + evaluation_periods_alb_in = local.enable_alb_scaling ? ( + var.scaling_strategy.alb.cooldown_in <= 60 + ? 1 + : floor(var.scaling_strategy.alb.cooldown_in / 60) + ) : null + + base_url = var.connection_type == "internal" ? null : ( + var.connection_type == "internal_dns" + ? var.internal_invoke_url + : var.api_invoke_url + ) + invoke_url = var.root_path == "" ? local.base_url : "${local.base_url}/${var.root_path}" + + deployment_controller_type = local.enable_codedeploy ? "CODE_DEPLOY" : "ECS" + selected_task_definition_arn = var.bootstrap ? aws_ecs_task_definition.bootstrap[0].arn : var.task_definition_arn + bootstrap_container_definitions = jsonencode([{ + name = var.service_name + image = var.bootstrap_image_uri + + portMappings = [ + { + name = "${var.service_name}-${var.container_port}-tcp" + containerPort = var.container_port + hostPort = var.container_port + protocol = "tcp" + appProtocol = "http" + } + ] + + essential = true + }]) +} diff --git a/infra/modules/aws/_shared/service/main.tf b/infra/modules/aws/_shared/service/main.tf new file mode 100644 index 00000000..5c99327c --- /dev/null +++ b/infra/modules/aws/_shared/service/main.tf @@ -0,0 +1,458 @@ +resource "aws_iam_role" "bootstrap_execution" { + count = var.bootstrap ? 1 : 0 + + name = "${var.service_name}-bootstrap-ecs-task-execution-role" + assume_role_policy = data.aws_iam_policy_document.bootstrap_assume_role.json +} + +resource "aws_iam_role" "bootstrap_task" { + count = var.bootstrap ? 
1 : 0 + + name = "${var.service_name}-bootstrap-ecs-task-role" + assume_role_policy = data.aws_iam_policy_document.bootstrap_assume_role.json +} + +resource "aws_iam_role_policy_attachment" "bootstrap_execution" { + count = var.bootstrap ? 1 : 0 + + role = aws_iam_role.bootstrap_execution[0].name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" +} + +resource "aws_ecs_task_definition" "bootstrap" { + count = var.bootstrap ? 1 : 0 + + family = "${var.service_name}-bootstrap-task" + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = 256 + memory = 512 + execution_role_arn = aws_iam_role.bootstrap_execution[0].arn + task_role_arn = aws_iam_role.bootstrap_task[0].arn + + runtime_platform { + cpu_architecture = "X86_64" + operating_system_family = "LINUX" + } + + container_definitions = local.bootstrap_container_definitions +} + +resource "aws_lb_target_group" "service_target_group" { + count = local.is_default_path ? 0 : 1 + + name = local.target_group_name + port = var.container_port + protocol = "HTTP" + target_type = "ip" + vpc_id = var.vpc_id + + health_check { + path = local.health_check_path + matcher = "200-399" + interval = 30 + timeout = 5 + healthy_threshold = 2 + unhealthy_threshold = 2 + port = "traffic-port" + protocol = "HTTP" + } +} + +resource "aws_lb_target_group" "green_target_group" { + count = local.enable_codedeploy ? 1 : 0 + + name = local.green_target_group_name + port = var.container_port + protocol = "HTTP" + target_type = "ip" + vpc_id = var.vpc_id + + health_check { + path = local.health_check_path + matcher = "200-399" + interval = 30 + timeout = 5 + healthy_threshold = 2 + unhealthy_threshold = 2 + port = "traffic-port" + protocol = "HTTP" + } +} + +resource "aws_lb_listener_rule" "service" { + count = local.is_default_path ? 
0 : 1 + + listener_arn = var.default_http_listener_arn + priority = local.priority + + action { + type = "forward" + target_group_arn = aws_lb_target_group.service_target_group[0].arn + } + + condition { + path_pattern { + values = ["/${var.root_path}/*"] + } + } +} + +resource "aws_apigatewayv2_route" "service_exact" { + count = local.vpc_link_count + + api_id = var.api_id + route_key = local.exact_route_key + + target = "integrations/${aws_apigatewayv2_integration.service[0].id}" +} + +resource "aws_apigatewayv2_route" "service_proxy" { + count = local.vpc_link_count + + api_id = var.api_id + route_key = local.proxy_route_key + + target = "integrations/${aws_apigatewayv2_integration.service[0].id}" +} + +resource "aws_apigatewayv2_integration" "service" { + count = local.vpc_link_count + + api_id = var.api_id + connection_id = var.vpc_link_id + connection_type = "VPC_LINK" + integration_type = "HTTP_PROXY" + integration_method = "ANY" + integration_uri = var.default_http_listener_arn + payload_format_version = "1.0" + + lifecycle { + precondition { + condition = var.vpc_link_id != null && var.vpc_link_id != "" + error_message = "vpc_link_id must be set in the shared API stack before using connection_type = \"vpc_link\"." + } + } +} + +resource "aws_ecs_service" "service" { + name = var.service_name + cluster = var.cluster_id + task_definition = local.selected_task_definition_arn + desired_count = var.desired_task_count + launch_type = "FARGATE" + + network_configuration { + subnets = var.private_subnet_ids + assign_public_ip = false + security_groups = concat( + [var.ecs_security_group_id], + var.additional_security_group_ids, + ) + } + + dynamic "load_balancer" { + for_each = local.load_balancers + content { + target_group_arn = load_balancer.value.target_group_arn + container_name = load_balancer.value.container_name + container_port = load_balancer.value.container_port + } + } + + enable_execute_command = var.local_tunnel ? 
true : false + wait_for_steady_state = var.wait_for_steady_state + + dynamic "deployment_circuit_breaker" { + for_each = local.enable_codedeploy ? [] : [1] + content { + enable = false + rollback = false + } + } + + deployment_controller { + type = local.deployment_controller_type + } + + lifecycle { + # Deploy workflows own the live task revision. Terraform keeps the service + # shape stable without reverting the currently deployed revision. + ignore_changes = [ + task_definition, + ] + } +} + +resource "aws_codedeploy_app" "ecs" { + count = local.enable_codedeploy ? 1 : 0 + + name = "${var.service_name}-app" + compute_platform = "ECS" +} + +resource "aws_iam_role" "codedeploy" { + count = local.enable_codedeploy ? 1 : 0 + + name = "${var.service_name}-codedeploy-role" + assume_role_policy = data.aws_iam_policy_document.codedeploy_assume_role.json +} + +resource "aws_iam_role_policy_attachment" "codedeploy" { + count = local.enable_codedeploy ? 1 : 0 + + role = aws_iam_role.codedeploy[0].name + policy_arn = "arn:aws:iam::aws:policy/AWSCodeDeployRoleForECS" +} + +resource "aws_codedeploy_deployment_group" "ecs" { + count = local.enable_codedeploy ? 1 : 0 + + app_name = aws_codedeploy_app.ecs[0].name + deployment_group_name = "${var.service_name}-dg" + deployment_config_name = local.codedeploy_deployment_config_name + service_role_arn = aws_iam_role.codedeploy[0].arn + + auto_rollback_configuration { + enabled = true + events = ["DEPLOYMENT_FAILURE", "DEPLOYMENT_STOP_ON_ALARM"] + } + + dynamic "alarm_configuration" { + for_each = length(var.codedeploy_alarm_names) > 0 ? 
[1] : [] + content { + enabled = true + alarms = var.codedeploy_alarm_names + } + } + + blue_green_deployment_config { + deployment_ready_option { + action_on_timeout = "CONTINUE_DEPLOYMENT" + } + + terminate_blue_instances_on_deployment_success { + action = "TERMINATE" + termination_wait_time_in_minutes = 1 + } + } + + deployment_style { + deployment_option = "WITH_TRAFFIC_CONTROL" + deployment_type = "BLUE_GREEN" + } + + ecs_service { + cluster_name = var.cluster_name + service_name = aws_ecs_service.service.name + } + + load_balancer_info { + target_group_pair_info { + prod_traffic_route { + listener_arns = [var.default_http_listener_arn] + } + + target_group { + name = local.blue_target_group_name + } + + target_group { + name = aws_lb_target_group.green_target_group[0].name + } + } + } + + depends_on = [ + aws_ecs_service.service, + aws_iam_role_policy_attachment.codedeploy, + ] +} + +resource "aws_appautoscaling_target" "ecs" { + count = local.enable_scaling ? 1 : 0 + + max_capacity = var.scaling_strategy.max_scaled_task_count + min_capacity = var.desired_task_count + resource_id = "service/${var.cluster_name}/${var.service_name}" + scalable_dimension = "ecs:service:DesiredCount" + service_namespace = "ecs" +} + +resource "aws_appautoscaling_policy" "cpu_scale_in" { + count = local.enable_cpu_scaling ? 
1 : 0 + name = "${var.service_name}-cpu-scale-in" + policy_type = "StepScaling" + resource_id = aws_appautoscaling_target.ecs[0].resource_id + scalable_dimension = aws_appautoscaling_target.ecs[0].scalable_dimension + service_namespace = aws_appautoscaling_target.ecs[0].service_namespace + + step_scaling_policy_configuration { + adjustment_type = "ChangeInCapacity" + + step_adjustment { + scaling_adjustment = var.scaling_strategy.cpu.scale_in_adjustment + metric_interval_upper_bound = 0 + } + + cooldown = var.scaling_strategy.cpu.cooldown_in + metric_aggregation_type = "Average" + } +} + +resource "aws_appautoscaling_policy" "cpu_scale_out" { + count = local.enable_cpu_scaling ? 1 : 0 + name = "${var.service_name}-cpu-scale-out" + policy_type = "StepScaling" + resource_id = aws_appautoscaling_target.ecs[0].resource_id + scalable_dimension = aws_appautoscaling_target.ecs[0].scalable_dimension + service_namespace = aws_appautoscaling_target.ecs[0].service_namespace + + step_scaling_policy_configuration { + adjustment_type = "ChangeInCapacity" + + step_adjustment { + scaling_adjustment = var.scaling_strategy.cpu.scale_out_adjustment + metric_interval_lower_bound = 0 + } + + cooldown = var.scaling_strategy.cpu.cooldown_out + metric_aggregation_type = "Average" + } +} + +resource "aws_cloudwatch_metric_alarm" "cpu_scale_in_alarm" { + count = local.enable_cpu_scaling ? 
1 : 0 + + alarm_name = "${var.service_name}-cpu-scale-in-alarm" + comparison_operator = "LessThanOrEqualToThreshold" + evaluation_periods = local.evaluation_periods_cpu_in + metric_name = "CPUUtilization" + namespace = "AWS/ECS" + period = var.scaling_strategy.cpu.cooldown_in + statistic = "Average" + threshold = var.scaling_strategy.cpu.scale_in_threshold + alarm_actions = [aws_appautoscaling_policy.cpu_scale_in[0].arn] + + dimensions = { + ClusterName = var.cluster_name + ServiceName = var.service_name + } +} + +resource "aws_cloudwatch_metric_alarm" "cpu_scale_out_alarm" { + count = local.enable_cpu_scaling ? 1 : 0 + + alarm_name = "${var.service_name}-cpu-scale-out-alarm" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = local.evaluation_periods_cpu_out + metric_name = "CPUUtilization" + namespace = "AWS/ECS" + period = var.scaling_strategy.cpu.cooldown_out + statistic = "Average" + threshold = var.scaling_strategy.cpu.scale_out_threshold + alarm_actions = [aws_appautoscaling_policy.cpu_scale_out[0].arn] + + dimensions = { + ClusterName = var.cluster_name + ServiceName = var.service_name + } +} + +resource "aws_appautoscaling_policy" "sqs_scale_in" { + count = local.enable_sqs_scaling ? 1 : 0 + name = "${var.service_name}-sqs-scale-in" + policy_type = "StepScaling" + resource_id = aws_appautoscaling_target.ecs[0].resource_id + scalable_dimension = aws_appautoscaling_target.ecs[0].scalable_dimension + service_namespace = aws_appautoscaling_target.ecs[0].service_namespace + + step_scaling_policy_configuration { + adjustment_type = "ChangeInCapacity" + + step_adjustment { + scaling_adjustment = var.scaling_strategy.sqs.scale_in_adjustment + metric_interval_upper_bound = 0 + } + + cooldown = var.scaling_strategy.sqs.cooldown_in + metric_aggregation_type = "Average" + } +} + +resource "aws_appautoscaling_policy" "sqs_scale_out" { + count = local.enable_sqs_scaling ? 
1 : 0 + name = "${var.service_name}-sqs-scale-out" + policy_type = "StepScaling" + resource_id = aws_appautoscaling_target.ecs[0].resource_id + scalable_dimension = aws_appautoscaling_target.ecs[0].scalable_dimension + service_namespace = aws_appautoscaling_target.ecs[0].service_namespace + + step_scaling_policy_configuration { + adjustment_type = "ChangeInCapacity" + + step_adjustment { + scaling_adjustment = var.scaling_strategy.sqs.scale_out_adjustment + metric_interval_lower_bound = 0 + } + + cooldown = var.scaling_strategy.sqs.cooldown_out + metric_aggregation_type = "Average" + } +} + +resource "aws_cloudwatch_metric_alarm" "sqs_scale_in_alarm" { + count = local.enable_sqs_scaling ? 1 : 0 + + alarm_name = "${var.service_name}-sqs-scale-in-alarm" + comparison_operator = "LessThanOrEqualToThreshold" + evaluation_periods = local.evaluation_periods_sqs_in + metric_name = "ApproximateNumberOfMessagesVisible" + namespace = "AWS/SQS" + period = var.scaling_strategy.sqs.cooldown_in + statistic = "Average" + threshold = var.scaling_strategy.sqs.scale_in_threshold + alarm_actions = [aws_appautoscaling_policy.sqs_scale_in[0].arn] + + dimensions = { + QueueName = var.scaling_strategy.sqs.queue_name + } +} + +resource "aws_cloudwatch_metric_alarm" "sqs_scale_out_alarm" { + count = local.enable_sqs_scaling ? 1 : 0 + + alarm_name = "${var.service_name}-sqs-scale-out-alarm" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = local.evaluation_periods_sqs_out + metric_name = "ApproximateNumberOfMessagesVisible" + namespace = "AWS/SQS" + period = var.scaling_strategy.sqs.cooldown_out + statistic = "Average" + threshold = var.scaling_strategy.sqs.scale_out_threshold + alarm_actions = [aws_appautoscaling_policy.sqs_scale_out[0].arn] + + dimensions = { + QueueName = var.scaling_strategy.sqs.queue_name + } +} + +resource "aws_appautoscaling_policy" "alb_req_per_target" { + count = local.enable_alb_scaling ? 
1 : 0 + name = "${var.service_name}-alb-req-per-target" + policy_type = "TargetTrackingScaling" + resource_id = aws_appautoscaling_target.ecs[0].resource_id + scalable_dimension = aws_appautoscaling_target.ecs[0].scalable_dimension + service_namespace = aws_appautoscaling_target.ecs[0].service_namespace + + target_tracking_scaling_policy_configuration { + predefined_metric_specification { + predefined_metric_type = "ALBRequestCountPerTarget" + resource_label = "${var.load_balancer_arn_suffix}/${var.target_group_arn_suffix}" + } + + target_value = var.scaling_strategy.alb.target_requests_per_task + scale_in_cooldown = var.scaling_strategy.alb.cooldown_in + scale_out_cooldown = var.scaling_strategy.alb.cooldown_out + } + } diff --git a/infra/modules/aws/_shared/service/outputs.tf b/infra/modules/aws/_shared/service/outputs.tf new file mode 100644 index 00000000..313b0e05 --- /dev/null +++ b/infra/modules/aws/_shared/service/outputs.tf @@ -0,0 +1,23 @@ +output "invoke_url" { + value = local.invoke_url +} + +output "service_name" { + value = aws_ecs_service.service.name +} + +output "codedeploy_app_name" { + value = local.enable_codedeploy ? aws_codedeploy_app.ecs[0].name : null +} + +output "codedeploy_deployment_group_name" { + value = local.enable_codedeploy ? aws_codedeploy_deployment_group.ecs[0].deployment_group_name : null +} + +output "blue_target_group_name" { + value = local.enable_codedeploy ? local.blue_target_group_name : null +} + +output "green_target_group_name" { + value = local.enable_codedeploy ?
aws_lb_target_group.green_target_group[0].name : null +} diff --git a/infra/modules/aws/_shared/service/variables.tf b/infra/modules/aws/_shared/service/variables.tf new file mode 100644 index 00000000..c18d81fc --- /dev/null +++ b/infra/modules/aws/_shared/service/variables.tf @@ -0,0 +1,181 @@ +### start of static vars set in root.hcl ### +variable "aws_region" { + type = string + description = "AWS region" +} +### end of static vars set in root.hcl ### + +variable "service_name" { + type = string +} + +variable "container_port" { + type = number +} + +variable "task_definition_arn" { + type = string +} + +variable "vpc_id" { + type = string +} + +variable "private_subnet_ids" { + type = list(string) +} + +variable "cluster_id" { + type = string +} + +variable "cluster_name" { + type = string +} + +variable "ecs_security_group_id" { + type = string +} + +variable "default_target_group_arn" { + type = string +} + +variable "default_http_listener_arn" { + type = string +} + +variable "load_balancer_arn_suffix" { + type = string +} + +variable "target_group_arn_suffix" { + type = string +} + +variable "api_id" { + type = string +} + +variable "vpc_link_id" { + type = string +} + +variable "internal_invoke_url" { + type = string +} + +variable "api_invoke_url" { + type = string +} + +variable "root_path" { + description = "The path to serve the service from. / is for default /example_service is for subpath" + type = string + default = "" +} + +variable "connection_type" { + description = "Type of connectivity/integration to use for the service (choices: internal, internal_dns, vpc_link)." + type = string + validation { + condition = can(regex("^(internal|internal_dns|vpc_link)$", var.connection_type)) + error_message = "connection_type must be one of: internal, internal_dns, vpc_link." 
+ } +} + +variable "local_tunnel" { + type = bool + default = false +} + +variable "xray_enabled" { + type = bool + default = false +} + +variable "wait_for_steady_state" { + type = bool + default = false +} + +variable "bootstrap" { + type = bool + default = false +} + +variable "bootstrap_image_uri" { + type = string + default = "" + + validation { + condition = !var.bootstrap || var.bootstrap_image_uri != "" + error_message = "bootstrap_image_uri must be set when bootstrap is true." + } +} + +variable "deployment_strategy" { + type = string + default = "all_at_once" + + validation { + condition = contains([ + "all_at_once", + "blue_green", + "canary", + "linear", + ], var.deployment_strategy) + error_message = "deployment_strategy must be one of: all_at_once, blue_green, canary, linear." + } +} + +variable "codedeploy_deployment_config_name_override" { + type = string + default = "" +} + +variable "codedeploy_alarm_names" { + type = list(string) + default = [] +} + +variable "additional_security_group_ids" { + description = "List of security groups to attach to ECS service" + type = list(string) + default = [] +} + +variable "desired_task_count" { + type = number +} + +variable "scaling_strategy" { + type = object({ + max_scaled_task_count = optional(number) + cpu = optional(object({ + scale_out_threshold = number + scale_in_threshold = number + scale_out_adjustment = number + scale_in_adjustment = number + cooldown_out = number + cooldown_in = number + })) + sqs = optional(object({ + scale_out_threshold = number + scale_in_threshold = number + scale_out_adjustment = number + scale_in_adjustment = number + cooldown_out = number + cooldown_in = number + queue_name = string + })) + alb = optional(object({ + target_requests_per_task = number + cooldown_in = number + cooldown_out = number + })) + }) + + # {} = "off" by convention + default = {} +} diff --git a/infra/modules/aws/_shared/sqs/README.md b/infra/modules/aws/_shared/sqs/README.md new file mode 100644 
index 00000000..3f057c17 --- /dev/null +++ b/infra/modules/aws/_shared/sqs/README.md @@ -0,0 +1,16 @@ +# `_shared/sqs` + +Shared SQS queue module. + +## Owns + +- primary queue +- dead-letter queue +- redrive policy + +## Key outputs + +- queue URL +- dead-letter queue URL + +Used by worker-style Lambda and ECS consumers. diff --git a/infra/modules/aws/_shared/sqs/variables.tf b/infra/modules/aws/_shared/sqs/variables.tf index 904fd674..6a679d05 100644 --- a/infra/modules/aws/_shared/sqs/variables.tf +++ b/infra/modules/aws/_shared/sqs/variables.tf @@ -1,3 +1,6 @@ +### start of static vars set in root.hcl ### +### end of static vars set in root.hcl ### + variable "sqs_queue_name" { type = string } diff --git a/infra/modules/aws/_shared/task/README.md b/infra/modules/aws/_shared/task/README.md new file mode 100644 index 00000000..a6d18633 --- /dev/null +++ b/infra/modules/aws/_shared/task/README.md @@ -0,0 +1,31 @@ +# `_shared/task` + +Shared ECS task-definition module. + +## Owns + +- ECS task definition +- task execution role +- task role +- log groups +- optional debug and OpenTelemetry sidecars + +## Key inputs + +- `image_uri` +- `ecr_repository_name` +- `debug_image_uri` +- `aws_otel_collector_image_uri` +- `local_tunnel` +- `xray_enabled` +- `command` + +## Key outputs + +- `task_definition_arn` +- `service_name` +- log group names + +Use this for task revision creation. Traffic rollout happens at the service layer. + +The ECR repository access policy uses the explicit `ecr_repository_name` input. In this repo, Terragrunt sets a root-level default and environments can override it if the repository naming ever changes. 
diff --git a/infra/modules/aws/_shared/task/data.tf b/infra/modules/aws/_shared/task/data.tf new file mode 100644 index 00000000..c84e9309 --- /dev/null +++ b/infra/modules/aws/_shared/task/data.tf @@ -0,0 +1,81 @@ +data "aws_ecr_repository" "this" { + name = var.ecr_repository_name +} + +data "aws_iam_policy_document" "assume_role" { + statement { + effect = "Allow" + + principals { + type = "Service" + identifiers = ["ecs-tasks.amazonaws.com"] + } + + actions = ["sts:AssumeRole"] + } +} + +data "aws_iam_policy_document" "logs_policy" { + statement { + actions = [ + "logs:CreateLogStream", + "logs:PutLogEvents" + ] + + effect = "Allow" + + resources = [ + "${aws_cloudwatch_log_group.ecs_log_group.arn}", + "${aws_cloudwatch_log_group.ecs_log_group.arn}:*", + "${aws_cloudwatch_log_group.ecs_otel_log_group.arn}", + "${aws_cloudwatch_log_group.ecs_otel_log_group.arn}:*" + ] + } +} + +data "aws_iam_policy_document" "ecr_policy" { + statement { + actions = [ + "ecr:GetAuthorizationToken" + ] + effect = "Allow" + resources = ["*"] + } + + statement { + actions = [ + "ecr:GetDownloadUrlForLayer", + "ecr:BatchGetImage", + "ecr:BatchCheckLayerAvailability" + ] + effect = "Allow" + resources = [data.aws_ecr_repository.this.arn] + } +} + +data "aws_iam_policy_document" "ssm_messages" { + statement { + actions = [ + "ssmmessages:CreateControlChannel", + "ssmmessages:CreateDataChannel", + "ssmmessages:OpenControlChannel", + "ssmmessages:OpenDataChannel" + ] + effect = "Allow" + resources = ["*"] + } +} + +data "aws_iam_policy_document" "xray_put" { + statement { + actions = [ + "xray:PutTraceSegments", + "xray:PutTelemetryRecords", + "xray:GetSamplingRules", + "xray:GetSamplingTargets", + "xray:GetSamplingStatisticSummaries" + ] + effect = "Allow" + resources = ["*"] + } +} diff --git a/infra/modules/aws/_shared/task/locals.tf b/infra/modules/aws/_shared/task/locals.tf new file mode 100644 index 00000000..1d8ecf73 --- /dev/null +++ b/infra/modules/aws/_shared/task/locals.tf @@ 
-0,0 +1,129 @@ +locals { + cloudwatch_log_name = "/ecs/${var.service_name}" + cloudwatch_otel_log_name = "/ecs/${var.service_name}/otel" + image_uri = var.image_uri + aws_otel_collector_image_uri = var.aws_otel_collector_image_uri + debug_image_uri = var.debug_image_uri + root_path_prefix = var.root_path != "" ? "/${var.root_path}" : "" + + shared_environment = [ + { + name = "AWS_REGION" + value = "${var.aws_region}" + }, + { + name = "AWS_SERVICE_NAME" + value = "${var.service_name}" + }, + { + name = "IMAGE" + value = "${local.image_uri}" + }, + { + name = "AWS_XRAY_ENDPOINT" + value = "http://localhost:4317" + }, + { + name = "ROOT_PATH" + value = local.root_path_prefix + }, + ] + + otel_environment = [ + { + name = "AWS_REGION" + value = "${var.aws_region}" + }, + { + name = "OTEL_SAMPLING_PERCENTAGE" + value = tostring(var.otel_sampling_percentage) + }, + { + name = "OTEL_ENDPOINT" + value = "0.0.0.0:4317" + }, + ] + + base_containers = [ + local.svc-container + ] + + debug_sidecar = var.local_tunnel ? [local.debug-container] : [] + xray_sidecar = var.xray_enabled ? [local.otel-collector] : [] + + container_definitions = concat( + local.base_containers, + local.debug_sidecar, + local.xray_sidecar + ) + + svc-container = merge( + { + name = var.service_name + image = local.image_uri + + portMappings = [ + { + name = "${var.service_name}-${var.container_port}-tcp" + containerPort = var.container_port + hostPort = var.container_port + protocol = "tcp" + appProtocol = "http" + } + ] + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = "${local.cloudwatch_log_name}" + "awslogs-region" = "${var.aws_region}" + "awslogs-stream-prefix" = "ecs" + } + } + + essential = true + environment = concat(local.shared_environment, var.additional_env_vars) + }, + var.command == null ? 
{} : { + command = var.command + } + ) + + otel-collector = { + name = "${var.service_name}-otel-collector" + image = local.aws_otel_collector_image_uri + + portMappings = [ + { + name = "${var.service_name}-otel-collector-${var.container_port}-tcp" + containerPort = 4317 + hostPort = 4317 + protocol = "tcp" + appProtocol = "http" + } + ] + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = "${local.cloudwatch_otel_log_name}" + "awslogs-region" = "${var.aws_region}" + "awslogs-stream-prefix" = "ecs" + } + } + + command = ["--config", "/opt/aws/aws-otel-collector/etc/collector-config.yaml"] + + essential = false + environment = local.otel_environment + } + + debug-container = { + name = "${var.service_name}-debug" + image = local.debug_image_uri + + command = ["sleep", "infinity"] + + essential = false + } +} diff --git a/infra/modules/aws/_shared/task/main.tf b/infra/modules/aws/_shared/task/main.tf new file mode 100644 index 00000000..a7425473 --- /dev/null +++ b/infra/modules/aws/_shared/task/main.tf @@ -0,0 +1,88 @@ +resource "aws_iam_role" "ecs_task_execution_role" { + name = "${var.service_name}-ecs-task-execution-role" + description = "Role used to pull from ECR and setup Cloudwatch logging access" + assume_role_policy = data.aws_iam_policy_document.assume_role.json +} + +resource "aws_iam_policy" "logs_access_policy" { + name = "${var.service_name}-logs-access-policy" + policy = data.aws_iam_policy_document.logs_policy.json +} + +resource "aws_iam_policy" "ecr_access_policy" { + name = "${var.service_name}-ecr-access-policy" + policy = data.aws_iam_policy_document.ecr_policy.json +} + +resource "aws_iam_policy" "ssm_messages_policy" { + name = "${var.service_name}-ssm-messages-policy" + policy = data.aws_iam_policy_document.ssm_messages.json +} + +resource "aws_iam_role_policy_attachment" "logs_access_policy_attachment" { + role = aws_iam_role.ecs_task_execution_role.name + policy_arn = aws_iam_policy.logs_access_policy.arn +} + 
+resource "aws_iam_role_policy_attachment" "ecr_access_policy_attachment" { + role = aws_iam_role.ecs_task_execution_role.name + policy_arn = aws_iam_policy.ecr_access_policy.arn +} + +resource "aws_iam_role_policy_attachment" "ssm_messages_policy_attachment" { + count = var.local_tunnel ? 1 : 0 + + role = aws_iam_role.ecs_task_role.name + policy_arn = aws_iam_policy.ssm_messages_policy.arn +} + +resource "aws_cloudwatch_log_group" "ecs_log_group" { + name = local.cloudwatch_log_name + retention_in_days = 1 +} + +resource "aws_cloudwatch_log_group" "ecs_otel_log_group" { + name = local.cloudwatch_otel_log_name + retention_in_days = 1 +} + +resource "aws_iam_role" "ecs_task_role" { + name = "${var.service_name}-ecs-task-role" + description = "Role used to give the task runtime access" + assume_role_policy = data.aws_iam_policy_document.assume_role.json +} + +resource "aws_iam_policy" "xray_put_policy" { + name = "${var.service_name}-xray-put-policy" + policy = data.aws_iam_policy_document.xray_put.json +} + +resource "aws_iam_role_policy_attachment" "xray_put_policy_attachment" { + role = aws_iam_role.ecs_task_role.name + policy_arn = aws_iam_policy.xray_put_policy.arn +} + +resource "aws_iam_role_policy_attachment" "task_runtime_additional_attachments" { + for_each = { for idx, arn in var.additional_runtime_policy_arns : idx => arn } + + role = aws_iam_role.ecs_task_role.name + policy_arn = each.value +} + +resource "aws_ecs_task_definition" "task" { + family = "${var.service_name}-task" + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = var.cpu + memory = var.memory + + execution_role_arn = aws_iam_role.ecs_task_execution_role.arn + task_role_arn = aws_iam_role.ecs_task_role.arn + + runtime_platform { + cpu_architecture = "X86_64" + operating_system_family = "LINUX" + } + + container_definitions = jsonencode(local.container_definitions) +} \ No newline at end of file diff --git a/infra/modules/aws/_shared/task/outputs.tf 
b/infra/modules/aws/_shared/task/outputs.tf new file mode 100644 index 00000000..ee16d55a --- /dev/null +++ b/infra/modules/aws/_shared/task/outputs.tf @@ -0,0 +1,15 @@ +output "task_definition_arn" { + value = aws_ecs_task_definition.task.arn +} + +output "cloudwatch_log_group" { + value = aws_cloudwatch_log_group.ecs_log_group.name +} + +output "root_path" { + value = var.root_path +} + +output "service_name" { + value = var.service_name +} diff --git a/infra/modules/aws/_shared/task/variables.tf b/infra/modules/aws/_shared/task/variables.tf new file mode 100644 index 00000000..0425a242 --- /dev/null +++ b/infra/modules/aws/_shared/task/variables.tf @@ -0,0 +1,80 @@ +### start of static vars set in root.hcl ### +variable "project_name" { + type = string +} + +variable "ecr_repository_name" { + type = string +} + +variable "service_name" { + type = string +} + +variable "aws_region" { + type = string +} +### end of static vars set in root.hcl ### + +variable "container_port" { + type = number +} + +variable "cpu" { + type = number + default = 256 +} + +variable "memory" { + type = number + default = 512 +} + +variable "image_uri" { + type = string +} + +variable "aws_otel_collector_image_uri" { + type = string +} + +variable "otel_sampling_percentage" { + description = "Percentage of requests to send to x-ray" + type = number + default = 10.0 +} + +variable "debug_image_uri" { + type = string +} + +variable "local_tunnel" { + type = bool +} + +variable "xray_enabled" { + type = bool +} + +variable "additional_env_vars" { + type = list(object({ + name = string + value = string + })) + default = [] +} + +variable "command" { + type = list(string) + nullable = true +} + +variable "root_path" { + type = string +} + +variable "additional_runtime_policy_arns" { + description = "List of IAM runtime policy ARNs to attach to the role" + type = list(string) + default = [] +} diff --git a/infra/modules/aws/api/README.md b/infra/modules/aws/api/README.md new file mode 100644 
index 00000000..41918fe4 --- /dev/null +++ b/infra/modules/aws/api/README.md @@ -0,0 +1,24 @@ +# `api` + +Lambda-backed public HTTP API module. + +## Owns + +- Lambda API function via `_shared/lambda` +- Lambda proxy integration into the shared HTTP API +- root and proxy routes +- API 5xx CloudWatch alarm + +## Dependencies + +- shared API Gateway HTTP API and VPC link from `network` +- shared security state + +## Key outputs + +- `invoke_url` +- `api_id` +- `vpc_link_id` +- Lambda function and alias names + +This module is Lambda-specific. The shared API surface now lives in `network`. diff --git a/infra/modules/aws/api/data.tf b/infra/modules/aws/api/data.tf new file mode 100644 index 00000000..981bf5d7 --- /dev/null +++ b/infra/modules/aws/api/data.tf @@ -0,0 +1,9 @@ +data "terraform_remote_state" "network" { + backend = "s3" + + config = { + bucket = var.state_bucket + key = "${var.environment}/aws/network/terraform.tfstate" + region = var.aws_region + } +} diff --git a/infra/modules/aws/api/local.tf b/infra/modules/aws/api/local.tf index aade786d..f35a9c0e 100644 --- a/infra/modules/aws/api/local.tf +++ b/infra/modules/aws/api/local.tf @@ -2,4 +2,4 @@ locals { lambda_name = "${var.environment}-${var.project_name}-api" apigw_http_5xx_metric = "5xx" api_5xx_alarm_name = "${local.lambda_name}-api-v2-5xx-rate-critical" -} \ No newline at end of file +} diff --git a/infra/modules/aws/api/main.tf b/infra/modules/aws/api/main.tf index 4714b87d..dcdb0369 100644 --- a/infra/modules/aws/api/main.tf +++ b/infra/modules/aws/api/main.tf @@ -21,42 +21,31 @@ module "lambda_api" { provisioned_config = var.provisioned_config } -resource "aws_apigatewayv2_api" "http_api" { - name = "${module.lambda_api.name}-http" - protocol_type = "HTTP" -} - resource "aws_apigatewayv2_integration" "lambda_proxy" { - api_id = aws_apigatewayv2_api.http_api.id + api_id = data.terraform_remote_state.network.outputs.api_id integration_type = "AWS_PROXY" integration_uri = 
module.lambda_api.alias_arn payload_format_version = "2.0" } resource "aws_apigatewayv2_route" "root" { - api_id = aws_apigatewayv2_api.http_api.id + api_id = data.terraform_remote_state.network.outputs.api_id route_key = "ANY /" target = "integrations/${aws_apigatewayv2_integration.lambda_proxy.id}" } resource "aws_apigatewayv2_route" "proxy" { - api_id = aws_apigatewayv2_api.http_api.id + api_id = data.terraform_remote_state.network.outputs.api_id route_key = "ANY /{proxy+}" target = "integrations/${aws_apigatewayv2_integration.lambda_proxy.id}" } -resource "aws_apigatewayv2_stage" "default" { - api_id = aws_apigatewayv2_api.http_api.id - name = "$default" - auto_deploy = true -} - resource "aws_lambda_permission" "allow_invoke" { statement_id = "AllowAPIGatewayInvoke" action = "lambda:InvokeFunction" function_name = module.lambda_api.alias_arn principal = "apigateway.amazonaws.com" - source_arn = "${aws_apigatewayv2_api.http_api.execution_arn}/*/*" # all routes/stages + source_arn = "${data.terraform_remote_state.network.outputs.api_execution_arn}/*/*" # all routes/stages } resource "aws_cloudwatch_metric_alarm" "api_5xx_rate" { @@ -93,8 +82,8 @@ resource "aws_cloudwatch_metric_alarm" "api_5xx_rate" { period = 60 # most aws metrics are emitted at 1-minute intervals, so using a shorter period can lead to more volatile alarms dimensions = { - ApiId = aws_apigatewayv2_api.http_api.id - Stage = aws_apigatewayv2_stage.default.name + ApiId = data.terraform_remote_state.network.outputs.api_id + Stage = data.terraform_remote_state.network.outputs.api_stage_name } } } @@ -111,8 +100,8 @@ resource "aws_cloudwatch_metric_alarm" "api_5xx_rate" { period = 60 dimensions = { - ApiId = aws_apigatewayv2_api.http_api.id - Stage = aws_apigatewayv2_stage.default.name + ApiId = data.terraform_remote_state.network.outputs.api_id + Stage = data.terraform_remote_state.network.outputs.api_stage_name } } } diff --git a/infra/modules/aws/api/outputs.tf b/infra/modules/aws/api/outputs.tf 
index 502f51db..770a3861 100644 --- a/infra/modules/aws/api/outputs.tf +++ b/infra/modules/aws/api/outputs.tf @@ -1,5 +1,13 @@ output "invoke_url" { - value = aws_apigatewayv2_api.http_api.api_endpoint + value = data.terraform_remote_state.network.outputs.api_invoke_url +} + +output "api_id" { + value = data.terraform_remote_state.network.outputs.api_id +} + +output "vpc_link_id" { + value = data.terraform_remote_state.network.outputs.vpc_link_id } output "cloudwatch_log_group" { diff --git a/infra/modules/aws/api/variables.tf b/infra/modules/aws/api/variables.tf index 5542827f..dba57dda 100644 --- a/infra/modules/aws/api/variables.tf +++ b/infra/modules/aws/api/variables.tf @@ -9,6 +9,16 @@ variable "environment" { description = "Environment reference used in naming resources i.e. 'dev'" } +variable "aws_region" { + type = string + description = "AWS region used for remote state lookups" +} + +variable "state_bucket" { + type = string + description = "Terraform state bucket" +} + variable "code_bucket" { type = string description = "Bucket where deployable code artifacts are uploaded" @@ -64,3 +74,8 @@ variable "api_5xx_alarm_datapoints_to_alarm" { type = number description = "The number of evaluated periods that must be breaching to trigger ALARM" } + +variable "vpc_name" { + type = string + description = "VPC name tag used to look up private subnets for the shared API Gateway VPC link" +} diff --git a/infra/modules/aws/frontend/README.md b/infra/modules/aws/frontend/README.md new file mode 100644 index 00000000..04c1cf8f --- /dev/null +++ b/infra/modules/aws/frontend/README.md @@ -0,0 +1,15 @@ +# `frontend` + +Static frontend hosting module. + +## Owns + +- website bucket and distribution resources +- deployment destination for built frontend assets + +## Key outputs + +- website bucket name +- CloudFront distribution id + +Used by the frontend build and deploy workflow path. 
diff --git a/infra/modules/aws/lambda_worker/README.md b/infra/modules/aws/lambda_worker/README.md new file mode 100644 index 00000000..7e1b8685 --- /dev/null +++ b/infra/modules/aws/lambda_worker/README.md @@ -0,0 +1,16 @@ +# `lambda_worker` + +Worker Lambda wrapper module. + +## Owns + +- worker Lambda via `_shared/lambda` +- worker queue integration via `_shared/sqs` + +## Key outputs + +- Lambda function and alias names +- queue URLs +- log group + +This is the concrete worker implementation on top of the shared Lambda primitives. diff --git a/infra/modules/aws/network/README.md b/infra/modules/aws/network/README.md new file mode 100644 index 00000000..cdabc645 --- /dev/null +++ b/infra/modules/aws/network/README.md @@ -0,0 +1,28 @@ +# `network` + +Shared network and routing module. + +## Owns + +- internal ALB +- default listener and target group +- shared HTTP API Gateway API +- default API stage +- VPC link +- interface VPC endpoints +- S3 gateway endpoint + +## Dependencies + +- pre-existing tagged VPC and private subnets discovered with `data` lookups +- shared security groups from `security` + +## Key outputs + +- ALB listener and target group identifiers +- `internal_invoke_url` +- `api_id` +- `api_invoke_url` +- `api_execution_arn` +- `api_stage_name` +- `vpc_link_id` diff --git a/infra/modules/aws/network/data.tf b/infra/modules/aws/network/data.tf new file mode 100644 index 00000000..0e46aad3 --- /dev/null +++ b/infra/modules/aws/network/data.tf @@ -0,0 +1,35 @@ +data "aws_vpc" "this" { + filter { + name = "tag:Name" + values = [var.vpc_name] + } +} + +data "aws_subnets" "private" { + filter { + name = "vpc-id" + values = [data.aws_vpc.this.id] + } + + filter { + name = "tag:Name" + values = ["*private*"] + } +} + +data "aws_route_tables" "private" { + filter { + name = "association.subnet-id" + values = data.aws_subnets.private.ids + } +} + +data "terraform_remote_state" "security" { + backend = "s3" + + config = { + bucket = var.state_bucket + key = 
"${var.environment}/aws/security/terraform.tfstate" + region = var.aws_region + } +} diff --git a/infra/modules/aws/network/locals.tf b/infra/modules/aws/network/locals.tf new file mode 100644 index 00000000..42da6e06 --- /dev/null +++ b/infra/modules/aws/network/locals.tf @@ -0,0 +1,26 @@ +locals { + resource_suffix = substr(md5("${var.project_name}-${var.environment}"), 0, 8) + load_balancer_name = "alb-${var.environment}-${local.resource_suffix}" + target_group_name = "tg-${var.environment}-${local.resource_suffix}" + + base_interface_endpoints = { + ecr_api = "ecr.api" + ecr_dkr = "ecr.dkr" + logs = "logs" + } + + tunnel_interface_endpoints = var.local_tunnel ? { + ssmmessages = "ssmmessages" + ec2messages = "ec2messages" + } : {} + + xray_interface_endpoints = var.xray_enabled ? { + xray = "xray" + } : {} + + interface_endpoints = merge( + local.base_interface_endpoints, + local.tunnel_interface_endpoints, + local.xray_interface_endpoints, + ) +} diff --git a/infra/modules/aws/network/main.tf b/infra/modules/aws/network/main.tf new file mode 100644 index 00000000..14a60ffb --- /dev/null +++ b/infra/modules/aws/network/main.tf @@ -0,0 +1,72 @@ +resource "aws_lb" "this" { + name = local.load_balancer_name + internal = true + load_balancer_type = "application" + security_groups = [data.terraform_remote_state.security.outputs.load_balancer_sg] + subnets = data.aws_subnets.private.ids +} + +resource "aws_apigatewayv2_api" "http_api" { + name = "${var.project_name}-${var.environment}-http" + protocol_type = "HTTP" +} + +resource "aws_apigatewayv2_vpc_link" "http_api" { + name = "${var.project_name}-${var.environment}-http-vpc-link" + subnet_ids = data.aws_subnets.private.ids + security_group_ids = [data.terraform_remote_state.security.outputs.api_vpc_link_sg] +} + +resource "aws_apigatewayv2_stage" "default" { + api_id = aws_apigatewayv2_api.http_api.id + name = "$default" + auto_deploy = true +} + +resource "aws_vpc_endpoint" "interface_endpoints" { + for_each = 
local.interface_endpoints + + vpc_id = data.aws_vpc.this.id + service_name = "com.amazonaws.${var.aws_region}.${each.value}" + vpc_endpoint_type = "Interface" + security_group_ids = [data.terraform_remote_state.security.outputs.vpc_endpoint_sg] + subnet_ids = data.aws_subnets.private.ids + private_dns_enabled = true +} + +resource "aws_vpc_endpoint" "gateway_s3" { + vpc_id = data.aws_vpc.this.id + service_name = "com.amazonaws.${var.aws_region}.s3" + vpc_endpoint_type = "Gateway" + route_table_ids = data.aws_route_tables.private.ids +} + +resource "aws_lb_target_group" "default" { + name = local.target_group_name + port = var.container_port + protocol = "HTTP" + target_type = "ip" + vpc_id = data.aws_vpc.this.id + + health_check { + path = "/health" + matcher = "200-399" + interval = 30 + timeout = 5 + healthy_threshold = 2 + unhealthy_threshold = 2 + port = "traffic-port" + protocol = "HTTP" + } +} + +resource "aws_lb_listener" "http" { + load_balancer_arn = aws_lb.this.arn + port = var.container_port + protocol = "HTTP" + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.default.arn + } +} diff --git a/infra/modules/aws/network/outputs.tf b/infra/modules/aws/network/outputs.tf new file mode 100644 index 00000000..d427a931 --- /dev/null +++ b/infra/modules/aws/network/outputs.tf @@ -0,0 +1,39 @@ +output "default_target_group_arn" { + value = aws_lb_target_group.default.arn +} + +output "default_http_listener_arn" { + value = aws_lb_listener.http.arn +} + +output "load_balancer_arn_suffix" { + value = aws_lb.this.arn_suffix +} + +output "target_group_arn_suffix" { + value = aws_lb_target_group.default.arn_suffix +} + +output "internal_invoke_url" { + value = "http://${aws_lb.this.dns_name}:${var.container_port}" +} + +output "api_id" { + value = aws_apigatewayv2_api.http_api.id +} + +output "api_invoke_url" { + value = aws_apigatewayv2_api.http_api.api_endpoint +} + +output "api_execution_arn" { + value = aws_apigatewayv2_api.http_api.execution_arn +} + 
+output "api_stage_name" { + value = aws_apigatewayv2_stage.default.name +} + +output "vpc_link_id" { + value = aws_apigatewayv2_vpc_link.http_api.id +} diff --git a/infra/modules/aws/network/variables.tf b/infra/modules/aws/network/variables.tf new file mode 100644 index 00000000..aa66a8ce --- /dev/null +++ b/infra/modules/aws/network/variables.tf @@ -0,0 +1,35 @@ +### start of static vars set in root.hcl ### +variable "project_name" { + type = string +} + +variable "environment" { + type = string +} + +variable "aws_region" { + type = string +} + +variable "state_bucket" { + type = string +} +### end of static vars set in root.hcl ### + +variable "vpc_name" { + type = string +} + +variable "container_port" { + type = number +} + +variable "local_tunnel" { + type = bool + default = false +} + +variable "xray_enabled" { + type = bool + default = false +} diff --git a/infra/modules/aws/security/README.md b/infra/modules/aws/security/README.md new file mode 100644 index 00000000..4282a4fe --- /dev/null +++ b/infra/modules/aws/security/README.md @@ -0,0 +1,19 @@ +# `security` + +Shared security-group module. + +## Owns + +- load balancer security group +- ECS service security group +- VPC endpoint security group +- API VPC link security group + +## Key outputs + +- `load_balancer_sg` +- `ecs_sg` +- `vpc_endpoint_sg` +- `api_vpc_link_sg` + +Used by `network`, `api`, and ECS service modules. 
diff --git a/infra/modules/aws/security/data.tf b/infra/modules/aws/security/data.tf new file mode 100644 index 00000000..e67c042e --- /dev/null +++ b/infra/modules/aws/security/data.tf @@ -0,0 +1,6 @@ +data "aws_vpc" "this" { + filter { + name = "tag:Name" + values = [var.vpc_name] + } +} diff --git a/infra/modules/aws/security/locals.tf b/infra/modules/aws/security/locals.tf new file mode 100644 index 00000000..1d3c247a --- /dev/null +++ b/infra/modules/aws/security/locals.tf @@ -0,0 +1,4 @@ +locals { + load_balancer_sg_name = "${var.project_name}-${var.environment}-alb-sg" + ecs_sg_name = "${var.project_name}-${var.environment}-ecs-sg" +} diff --git a/infra/modules/aws/security/main.tf b/infra/modules/aws/security/main.tf new file mode 100644 index 00000000..856a6e91 --- /dev/null +++ b/infra/modules/aws/security/main.tf @@ -0,0 +1,72 @@ +resource "aws_security_group" "load_balancer" { + name = local.load_balancer_sg_name + description = "Security group for the internal application load balancer" + vpc_id = data.aws_vpc.this.id + + ingress { + from_port = var.container_port + to_port = var.container_port + protocol = "tcp" + cidr_blocks = [data.aws_vpc.this.cidr_block] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_security_group" "ecs" { + name = local.ecs_sg_name + description = "Security group for ECS services" + vpc_id = data.aws_vpc.this.id + + ingress { + from_port = var.container_port + to_port = var.container_port + protocol = "tcp" + security_groups = [aws_security_group.load_balancer.id] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_security_group" "vpc_endpoint" { + name = "${var.project_name}-${var.environment}-vpce-sg" + description = "Security group for interface VPC endpoints" + vpc_id = data.aws_vpc.this.id + + ingress { + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = 
[data.aws_vpc.this.cidr_block] + } + + egress { + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = [data.aws_vpc.this.cidr_block] + } +} + +resource "aws_security_group" "api_vpc_link" { + name = "${var.project_name}-${var.environment}-api-vpc-link-sg" + description = "Security group for API Gateway VPC link ENIs" + vpc_id = data.aws_vpc.this.id + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} diff --git a/infra/modules/aws/security/outputs.tf b/infra/modules/aws/security/outputs.tf new file mode 100644 index 00000000..5ae4a423 --- /dev/null +++ b/infra/modules/aws/security/outputs.tf @@ -0,0 +1,15 @@ +output "load_balancer_sg" { + value = aws_security_group.load_balancer.id +} + +output "ecs_sg" { + value = aws_security_group.ecs.id +} + +output "vpc_endpoint_sg" { + value = aws_security_group.vpc_endpoint.id +} + +output "api_vpc_link_sg" { + value = aws_security_group.api_vpc_link.id +} diff --git a/infra/modules/aws/security/variables.tf b/infra/modules/aws/security/variables.tf new file mode 100644 index 00000000..6f97077b --- /dev/null +++ b/infra/modules/aws/security/variables.tf @@ -0,0 +1,17 @@ +### start of static vars set in root.hcl ### +variable "project_name" { + type = string +} + +variable "environment" { + type = string +} +### end of static vars set in root.hcl ### + +variable "vpc_name" { + type = string +} + +variable "container_port" { + type = number +} diff --git a/infra/modules/aws/service_worker/README.md b/infra/modules/aws/service_worker/README.md new file mode 100644 index 00000000..a657418a --- /dev/null +++ b/infra/modules/aws/service_worker/README.md @@ -0,0 +1,22 @@ +# `service_worker` + +Concrete ECS worker service wrapper. 
+ +## Owns + +- worker ECS service via `_shared/service` + +## Dependencies + +- `task_worker` remote state +- `cluster`, `network`, `security`, `api`, and `lambda_worker` remote state + +## Key outputs + +- `service_name` +- `cluster_name` +- `codedeploy_app_name` +- `codedeploy_deployment_group_name` +- `container_port` + +This module wires the worker-specific service onto the shared ECS service behavior. diff --git a/infra/modules/aws/service_worker/data.tf b/infra/modules/aws/service_worker/data.tf new file mode 100644 index 00000000..f20b5fd0 --- /dev/null +++ b/infra/modules/aws/service_worker/data.tf @@ -0,0 +1,69 @@ +data "terraform_remote_state" "task_worker" { + count = var.bootstrap ? 0 : 1 + backend = "s3" + + config = { + bucket = var.state_bucket + key = "${var.environment}/aws/task_worker/terraform.tfstate" + region = var.aws_region + } +} + +data "terraform_remote_state" "lambda_worker" { + backend = "s3" + + config = { + bucket = var.state_bucket + key = "${var.environment}/aws/lambda_worker/terraform.tfstate" + region = var.aws_region + } +} + +data "terraform_remote_state" "network" { + backend = "s3" + + config = { + bucket = var.state_bucket + key = "${var.environment}/aws/network/terraform.tfstate" + region = var.aws_region + } +} + +data "terraform_remote_state" "security" { + backend = "s3" + + config = { + bucket = var.state_bucket + key = "${var.environment}/aws/security/terraform.tfstate" + region = var.aws_region + } +} + +data "terraform_remote_state" "cluster" { + backend = "s3" + + config = { + bucket = var.state_bucket + key = "${var.environment}/aws/cluster/terraform.tfstate" + region = var.aws_region + } +} + +data "aws_vpc" "this" { + filter { + name = "tag:Name" + values = [var.vpc_name] + } +} + +data "aws_subnets" "private" { + filter { + name = "vpc-id" + values = [data.aws_vpc.this.id] + } + + filter { + name = "tag:Name" + values = ["*private*"] + } +} diff --git a/infra/modules/aws/service_worker/main.tf 
b/infra/modules/aws/service_worker/main.tf new file mode 100644 index 00000000..48f7bb32 --- /dev/null +++ b/infra/modules/aws/service_worker/main.tf @@ -0,0 +1,47 @@ +module "service_consumer" { + source = "../_shared/service" + + service_name = var.service_name + task_definition_arn = var.bootstrap ? "" : data.terraform_remote_state.task_worker[0].outputs.task_definition_arn + container_port = var.container_port + root_path = var.root_path + connection_type = var.connection_type + + aws_region = var.aws_region + vpc_id = data.aws_vpc.this.id + private_subnet_ids = data.aws_subnets.private.ids + + cluster_id = data.terraform_remote_state.cluster.outputs.cluster_id + cluster_name = data.terraform_remote_state.cluster.outputs.cluster_name + ecs_security_group_id = data.terraform_remote_state.security.outputs.ecs_sg + + default_target_group_arn = data.terraform_remote_state.network.outputs.default_target_group_arn + default_http_listener_arn = data.terraform_remote_state.network.outputs.default_http_listener_arn + load_balancer_arn_suffix = data.terraform_remote_state.network.outputs.load_balancer_arn_suffix + target_group_arn_suffix = data.terraform_remote_state.network.outputs.target_group_arn_suffix + + api_id = data.terraform_remote_state.network.outputs.api_id + vpc_link_id = data.terraform_remote_state.network.outputs.vpc_link_id + internal_invoke_url = data.terraform_remote_state.network.outputs.internal_invoke_url + api_invoke_url = data.terraform_remote_state.network.outputs.api_invoke_url + + bootstrap = var.bootstrap + bootstrap_image_uri = var.bootstrap_image_uri + xray_enabled = var.xray_enabled + local_tunnel = var.local_tunnel + wait_for_steady_state = var.wait_for_steady_state + + desired_task_count = 1 + scaling_strategy = { + max_scaled_task_count = 4 + sqs = { + scale_out_threshold = 10 # Start scaling at 10 msgs avg + scale_in_threshold = 2 # Scale in below 2 msgs avg + scale_out_adjustment = 2 # Add 2 tasks at once + scale_in_adjustment = 1 # 
Remove 1 task + cooldown_out = 60 # 1min cooldown (more stable) + cooldown_in = 300 # 5min cooldown (prevent flapping) + queue_name = "tbc" # SQS queue name to monitor for scaling + } + } +} diff --git a/infra/modules/aws/service_worker/outputs.tf b/infra/modules/aws/service_worker/outputs.tf new file mode 100644 index 00000000..1eb85248 --- /dev/null +++ b/infra/modules/aws/service_worker/outputs.tf @@ -0,0 +1,19 @@ +output "service_name" { + value = module.service_consumer.service_name +} + +output "cluster_name" { + value = data.terraform_remote_state.cluster.outputs.cluster_name +} + +output "codedeploy_app_name" { + value = module.service_consumer.codedeploy_app_name +} + +output "codedeploy_deployment_group_name" { + value = module.service_consumer.codedeploy_deployment_group_name +} + +output "container_port" { + value = var.container_port +} diff --git a/infra/modules/aws/service_worker/variables.tf b/infra/modules/aws/service_worker/variables.tf new file mode 100644 index 00000000..75980e2f --- /dev/null +++ b/infra/modules/aws/service_worker/variables.tf @@ -0,0 +1,76 @@ +### start of static vars set in root.hcl ### +variable "state_bucket" { + type = string +} + +variable "environment" { + type = string +} + +variable "aws_region" { + type = string +} + +variable "project_name" { + type = string +} +### end of static vars set in root.hcl ### + +variable "service_name" { + type = string + default = "ecs-worker" +} + +variable "vpc_name" { + type = string +} + +variable "container_port" { + type = number +} + +variable "root_path" { + description = "The path to serve the service from. / is for default /example_service is for subpath" + default = "" + type = string +} + +variable "connection_type" { + description = "Type of connectivity/integration to use for the service (choices: internal, internal_dns, vpc_link)." 
+ type = string + default = "internal" + validation { + condition = can(regex("^(internal|internal_dns|vpc_link)$", var.connection_type)) + error_message = "connection_type must be one of: internal, internal_dns, vpc_link." + } +} + +variable "local_tunnel" { + type = bool + default = false +} + +variable "xray_enabled" { + type = bool + default = false +} + +variable "wait_for_steady_state" { + type = bool + default = false +} + +variable "bootstrap" { + type = bool + default = false +} + +variable "bootstrap_image_uri" { + type = string + default = "" + + validation { + condition = !var.bootstrap || var.bootstrap_image_uri != "" + error_message = "bootstrap_image_uri must be set when bootstrap is true." + } +} diff --git a/infra/modules/aws/task_worker/README.md b/infra/modules/aws/task_worker/README.md new file mode 100644 index 00000000..aea9cd01 --- /dev/null +++ b/infra/modules/aws/task_worker/README.md @@ -0,0 +1,21 @@ +# `task_worker` + +Concrete ECS worker task wrapper. + +## Owns + +- worker ECS task definition via `_shared/task` + +## Key behavior + +- runs `python -u consumer/app.py` +- publishes worker task revisions for ECS deploys +- uses the shared ECR repository named by `ecr_repository_name` + +## Key outputs + +- `task_definition_arn` +- `service_name` +- log group name + +This module is the image-driven deployment unit for the ECS worker. 
diff --git a/infra/modules/aws/task_worker/main.tf b/infra/modules/aws/task_worker/main.tf new file mode 100644 index 00000000..7d50919e --- /dev/null +++ b/infra/modules/aws/task_worker/main.tf @@ -0,0 +1,25 @@ +module "task_consumer" { + source = "../_shared/task" + + project_name = var.project_name + ecr_repository_name = var.ecr_repository_name + aws_region = var.aws_region + container_port = var.container_port + cpu = var.cpu + memory = var.memory + + image_uri = var.image_uri + debug_image_uri = var.debug_image_uri + aws_otel_collector_image_uri = var.aws_otel_collector_image_uri + otel_sampling_percentage = var.otel_sampling_percentage + + local_tunnel = var.local_tunnel + xray_enabled = var.xray_enabled + + additional_env_vars = [] + additional_runtime_policy_arns = [] + + root_path = "" + service_name = "ecs-worker" + command = ["python", "-u", "consumer/app.py"] +} diff --git a/infra/modules/aws/task_worker/outputs.tf b/infra/modules/aws/task_worker/outputs.tf new file mode 100644 index 00000000..8aadc299 --- /dev/null +++ b/infra/modules/aws/task_worker/outputs.tf @@ -0,0 +1,15 @@ +output "task_definition_arn" { + value = module.task_consumer.task_definition_arn +} + +output "cloudwatch_log_group" { + value = module.task_consumer.cloudwatch_log_group +} + +output "root_path" { + value = module.task_consumer.root_path +} + +output "service_name" { + value = module.task_consumer.service_name +} diff --git a/infra/modules/aws/task_worker/variables.tf b/infra/modules/aws/task_worker/variables.tf new file mode 100644 index 00000000..cdeef13e --- /dev/null +++ b/infra/modules/aws/task_worker/variables.tf @@ -0,0 +1,61 @@ +### start of static vars set in root.hcl ### +variable "state_bucket" { + type = string +} + +variable "environment" { + type = string +} + +variable "aws_region" { + type = string +} + +variable "project_name" { + type = string +} + +variable "ecr_repository_name" { + type = string +} +### end of static vars set in root.hcl ### + +variable 
"container_port" { + type = number +} + +variable "cpu" { + type = number + default = 256 +} + +variable "memory" { + type = number + default = 512 +} + +variable "image_uri" { + type = string +} + +variable "aws_otel_collector_image_uri" { + type = string +} + +variable "otel_sampling_percentage" { + description = "Percentage of requests to send to x-ray" + type = string + default = 10.0 +} + +variable "debug_image_uri" { + type = string +} + +variable "local_tunnel" { + type = bool +} + +variable "xray_enabled" { + type = bool +} diff --git a/infra/root.hcl b/infra/root.hcl index 99f15a28..993c3588 100644 --- a/infra/root.hcl +++ b/infra/root.hcl @@ -1,10 +1,11 @@ locals { github_token = get_env("GITHUB_TOKEN", "not_set") - git_remote = run_cmd("--terragrunt-quiet", "git", "remote", "get-url", "origin") - github_repo = regex("[/:]([-0-9_A-Za-z]*/[-0-9_A-Za-z]*)[^/]*$", local.git_remote)[0] - repo_owner = split("/", local.github_repo)[0] - aws_account_id = get_aws_account_id() + git_remote = run_cmd("--terragrunt-quiet", "git", "remote", "get-url", "origin") + github_repo = regex("[/:]([-0-9_A-Za-z]*/[-0-9_A-Za-z]*)[^/]*$", local.git_remote)[0] + repo_owner = split("/", local.github_repo)[0] + aws_account_id = get_aws_account_id() + allowed_read_aws_account_ids = [local.aws_account_id] path_parts = split("/", get_terragrunt_dir()) module = local.path_parts[length(local.path_parts) - 1] @@ -24,9 +25,10 @@ locals { state_key = "${local.environment}/${local.provider}/${local.module}/terraform.tfstate" state_lock_table = "${local.project_name}-tf-lockid" - # separate s3 version bucket when dev, otherwise ci - s3_bucket_base = local.environment == "dev" ? "${local.base_reference}-${local.environment}" : "${local.base_reference}-ci" - code_bucket = "${local.s3_bucket_base}-code" + # separate shared artifact resources when dev, otherwise ci + artifact_base = local.environment == "dev" ? 
"${local.base_reference}-${local.environment}" : "${local.base_reference}-ci" + code_bucket = "${local.artifact_base}-code" + ecr_repository_name = "${local.artifact_base}-ecr" } terraform { @@ -88,15 +90,17 @@ inputs = merge( local.global_vars.inputs, local.environment_vars.inputs, { - aws_account_id = local.aws_account_id - aws_region = local.aws_region - project_name = local.project_name - environment = local.environment - github_repo = local.github_repo - deploy_role_name = local.deploy_role_name - deploy_role_arn = local.deploy_role_arn - state_bucket = local.state_bucket - state_lock_table = local.state_lock_table - code_bucket = local.code_bucket + aws_account_id = local.aws_account_id + allowed_read_aws_account_ids = local.allowed_read_aws_account_ids + aws_region = local.aws_region + project_name = local.project_name + environment = local.environment + github_repo = local.github_repo + deploy_role_name = local.deploy_role_name + deploy_role_arn = local.deploy_role_arn + state_bucket = local.state_bucket + state_lock_table = local.state_lock_table + code_bucket = local.code_bucket + ecr_repository_name = local.ecr_repository_name } ) diff --git a/justfile b/justfile index d3959077..c57e5451 100644 --- a/justfile +++ b/justfile @@ -4,7 +4,9 @@ _default: PROJECT_DIR := justfile_directory() LAMBDA_DIR := "lambdas" +CONTAINERS_DIR := "containers" FRONTEND_DIR := "frontend" +EXTRA_CONTAINER_DIRECTORIES := "[\"debug\",\"otel_collector\"]" tf-lint-check: @@ -19,7 +21,6 @@ tf-lint-check: done - lambda-invoke: #!/bin/bash set -euo pipefail @@ -139,6 +140,74 @@ get-version-files: | jq -s -c . +get-version-file-keys: + #!/usr/bin/env bash + set -euo pipefail + + if [[ -z "$BUCKET_NAME" ]]; then + echo "❌ BUCKET_NAME environment variable is not set." + exit 1 + fi + + if [[ -z "$VERSION" ]]; then + echo "❌ VERSION environment variable is not set." 
+ exit 1 + fi + + FULL_BUCKET_PATH="s3://$BUCKET_NAME/lambdas/$VERSION/" + + aws s3api head-bucket --bucket "$BUCKET_NAME" >/dev/null + aws s3 ls "$FULL_BUCKET_PATH" >/dev/null + + aws s3 ls "$FULL_BUCKET_PATH" --recursive \ + | awk '{print $4}' \ + | grep '\.zip$' \ + | grep -v 'appspec' \ + | jq -R . \ + | jq -s -c . + + +get-ecr-version-images: + #!/usr/bin/env bash + set -euo pipefail + + if [[ -z "${REPOSITORY_URL:-}" ]]; then + echo "❌ REPOSITORY_URL environment variable is not set." + exit 1 + fi + + if [[ -z "${VERSION:-}" ]]; then + echo "❌ VERSION environment variable is not set." + exit 1 + fi + + repository_name="${REPOSITORY_URL#*/}" + + aws ecr describe-images \ + --repository-name "$repository_name" \ + --query 'imageDetails[].imageTags[]' \ + --output text \ + | tr '\t' '\n' \ + | grep -- "-$VERSION\$" \ + | sed "s/-$VERSION$//" \ + | jq -R . \ + | jq -s -c . + + +get-ecr-version-tasks: + #!/usr/bin/env bash + set -euo pipefail + + image_names="$(just --justfile "{{PROJECT_DIR}}/justfile" get-ecr-version-images)" + + jq -cn \ + --argjson images "$image_names" \ + ' + $images + | map(select(. != "bootstrap" and . != "debug" and . != "otel_collector")) + ' + + lambda-get-directories: #!/usr/bin/env bash set -euo pipefail @@ -148,6 +217,141 @@ lambda-get-directories: | jq -R . \ | jq -s -c . + +docker-build: + #!/usr/bin/env bash + set -euo pipefail + + if [[ -z "$CONTAINER_NAME" ]]; then + echo "❌ CONTAINER_NAME environment variable is not set." + exit 1 + fi + + TAG="${IMAGE_URI:-$CONTAINER_NAME}" + + docker build \ + --file "{{PROJECT_DIR}}/Dockerfile" \ + --target "$CONTAINER_NAME" \ + -t "$TAG" \ + "{{PROJECT_DIR}}" + + +docker-mirror: + #!/usr/bin/env bash + set -euo pipefail + + if [[ -z "${SOURCE_IMAGE:-}" ]]; then + echo "❌ SOURCE_IMAGE environment variable is not set." + exit 1 + fi + + if [[ -z "${IMAGE_URI:-}" ]]; then + echo "❌ IMAGE_URI environment variable is not set." 
+ exit 1 + fi + + docker pull "$SOURCE_IMAGE" + docker tag "$SOURCE_IMAGE" "$IMAGE_URI" + + +docker-push: + #!/usr/bin/env bash + set -euo pipefail + + if [[ -z "${IMAGE_URI:-}" ]]; then + echo "❌ IMAGE_URI environment variable is not set." + exit 1 + fi + + registry="${IMAGE_URI%%/*}" + aws_region="$(echo "$registry" | cut -d. -f4)" + + if [[ -z "$aws_region" ]]; then + echo "❌ Could not determine AWS region from IMAGE_URI: $IMAGE_URI" + exit 1 + fi + + aws ecr get-login-password --region "$aws_region" \ + | docker login --username AWS --password-stdin "$registry" + + docker push "$IMAGE_URI" + + +service-get-directories: + #!/usr/bin/env bash + set -euo pipefail + + find "{{CONTAINERS_DIR}}" -mindepth 1 -maxdepth 1 -type d \ + | xargs -I{} basename "{}" \ + | tr '-' '_' \ + | jq -R . \ + | jq -s -c . + + +task-get-directories: + #!/usr/bin/env bash + set -euo pipefail + + found_dirs="$(just --justfile "{{PROJECT_DIR}}/justfile" service-get-directories)" + + jq -cn \ + --argjson found "$found_dirs" \ + '$found | map("task_" + .)' + + +ecs-task-get-image-uris: + #!/usr/bin/env bash + set -euo pipefail + + if [[ -z "${ECS_IMAGE_URIS:-}" ]]; then + echo "❌ ECS_IMAGE_URIS environment variable is not set." + exit 1 + fi + + if [[ -z "${TASK_NAME:-}" ]]; then + echo "❌ TASK_NAME environment variable is not set." + exit 1 + fi + + service_name="${TASK_NAME#task_}" + + jq -cn \ + --argjson image_uris "$ECS_IMAGE_URIS" \ + --arg service_name "$service_name" \ + ' + { + service_image_uri: ($image_uris | map(select(test(":" + $service_name + "-")))[0] // ""), + debug_image_uri: ($image_uris | map(select(test(":debug-")))[0] // ""), + otel_image_uri: ($image_uris | map(select(test(":otel_collector-")))[0] // "") + } + | if .service_image_uri == "" then error("Missing ECS image URI for " + $service_name) else . end + | if .debug_image_uri == "" then error("Missing debug image URI") else . end + | if .otel_image_uri == "" then error("Missing otel_collector image URI") else . 
end + ' + + +ecs-service-get-directories: + #!/usr/bin/env bash + set -euo pipefail + + found_dirs="$(just --justfile "{{PROJECT_DIR}}/justfile" service-get-directories)" + + jq -cn \ + --argjson found "$found_dirs" \ + '$found | map("service_" + .)' + + +container-get-directories: + #!/usr/bin/env bash + set -euo pipefail + + found_dirs="$(just --justfile "{{PROJECT_DIR}}/justfile" service-get-directories)" + + jq -cn \ + --argjson found "$found_dirs" \ + --argjson extra '{{EXTRA_CONTAINER_DIRECTORIES}}' \ + '$found + $extra | unique' + lambda-build: #!/usr/bin/env bash @@ -234,9 +438,11 @@ lambda-upload-bundle: #!/usr/bin/env bash just lambda-prepare-appspec - LOCAL_APP_SPEC_ZIP="{{justfile_directory()}}/appspec.zip" + LOCAL_APP_SPEC_ZIP="{{justfile_directory()}}/appspec-lambda.zip" + TMPDIR="$(mktemp -d)" rm -f $LOCAL_APP_SPEC_ZIP - zip -q -j $LOCAL_APP_SPEC_ZIP $APP_SPEC_FILE + cp "$APP_SPEC_FILE" "$TMPDIR/appspec.yml" + zip -q -j $LOCAL_APP_SPEC_ZIP "$TMPDIR/appspec.yml" aws s3 cp $LOCAL_APP_SPEC_ZIP "s3://${BUCKET_NAME}/${APP_SPEC_KEY}" @@ -423,6 +629,163 @@ lambda-prune: done +ecs-prepare-appspec: + #!/usr/bin/env bash + set -euo pipefail + + if [[ -z "${APP_SPEC_FILE:-}" ]]; then + echo "❌ APP_SPEC_FILE environment variable is not set." + exit 1 + fi + + if [[ -z "${TASK_DEFINITION_ARN:-}" ]]; then + echo "❌ TASK_DEFINITION_ARN environment variable is not set." + exit 1 + fi + + if [[ -z "${CONTAINER_NAME:-}" ]]; then + echo "❌ CONTAINER_NAME environment variable is not set." + exit 1 + fi + + if [[ -z "${CONTAINER_PORT:-}" ]]; then + echo "❌ CONTAINER_PORT environment variable is not set." 
+ exit 1 + fi + + cp "{{justfile_directory()}}/appspec-ecs.yml" "$APP_SPEC_FILE" + + yq eval -i ' + .Resources[0].TargetService.Properties.TaskDefinition = env(TASK_DEFINITION_ARN) | + .Resources[0].TargetService.Properties.LoadBalancerInfo.ContainerName = env(CONTAINER_NAME) | + .Resources[0].TargetService.Properties.LoadBalancerInfo.ContainerPort = (env(CONTAINER_PORT) | tonumber) + ' "$APP_SPEC_FILE" + + cat "$APP_SPEC_FILE" + + +ecs-upload-bundle: + #!/usr/bin/env bash + set -euo pipefail + + if [[ -z "${BUCKET_NAME:-}" ]]; then + echo "❌ BUCKET_NAME environment variable is not set." + exit 1 + fi + + if [[ -z "${APP_SPEC_KEY:-}" ]]; then + echo "❌ APP_SPEC_KEY environment variable is not set." + exit 1 + fi + + just ecs-prepare-appspec + aws s3 cp "$APP_SPEC_FILE" "s3://${BUCKET_NAME}/${APP_SPEC_KEY}" + + +ecs-deploy: + #!/usr/bin/env bash + set -euo pipefail + + if [[ -z "${CODE_DEPLOY_APP_NAME:-}" ]]; then + echo "❌ CODE_DEPLOY_APP_NAME environment variable is not set." + exit 1 + fi + + if [[ -z "${CODE_DEPLOY_GROUP_NAME:-}" ]]; then + echo "❌ CODE_DEPLOY_GROUP_NAME environment variable is not set." + exit 1 + fi + + if [[ -z "${BUCKET_NAME:-}" ]]; then + echo "❌ BUCKET_NAME environment variable is not set." + exit 1 + fi + + if [[ -z "${APP_SPEC_KEY:-}" ]]; then + echo "❌ APP_SPEC_KEY environment variable is not set." + exit 1 + fi + + DEPLOYMENT_ID=$(aws deploy create-deployment \ + --application-name "$CODE_DEPLOY_APP_NAME" \ + --deployment-group-name "$CODE_DEPLOY_GROUP_NAME" \ + --revision revisionType=S3,s3Location="{bucket=$BUCKET_NAME,key=$APP_SPEC_KEY,bundleType=YAML}" \ + --query "deploymentId" --output text) + + if [[ -z "$DEPLOYMENT_ID" || "$DEPLOYMENT_ID" == "None" ]]; then + echo "❌ Failed to create ECS deployment β€” no deployment ID returned." 
+ exit 1 + fi + + echo "πŸš€ Deployment started: $DEPLOYMENT_ID" + echo "🏷️ CodeDeploy App: $CODE_DEPLOY_APP_NAME | Group: $CODE_DEPLOY_GROUP_NAME" + echo "πŸ“¦ AppSpec artifact: s3://$BUCKET_NAME/$APP_SPEC_KEY" + echo "⏳ Monitoring deployment status…" + + MAX_ATTEMPTS=40 + SLEEP_INTERVAL=15 + + for ((i=1; i<=MAX_ATTEMPTS; i++)); do + STATUS=$(aws deploy get-deployment \ + --deployment-id "$DEPLOYMENT_ID" \ + --query "deploymentInfo.status" \ + --output text) + + echo "[$i/$MAX_ATTEMPTS] Status: $STATUS" + + if [[ "$STATUS" == "Succeeded" ]]; then + echo "βœ… ECS deployment $DEPLOYMENT_ID completed successfully." + exit 0 + elif [[ "$STATUS" == "Failed" || "$STATUS" == "Stopped" ]]; then + echo "❌ ECS deployment $DEPLOYMENT_ID failed or was stopped." + aws deploy get-deployment \ + --deployment-id "$DEPLOYMENT_ID" \ + --query 'deploymentInfo.{Status:status, ErrorCode:errorInformation.code, ErrorMessage:errorInformation.message}' \ + --output table + exit 1 + fi + + sleep "$SLEEP_INTERVAL" + done + + echo "❌ ECS deployment $DEPLOYMENT_ID did not complete within expected time." + exit 1 + + +ecs-rolling-deploy: + #!/usr/bin/env bash + set -euo pipefail + + if [[ -z "${CLUSTER_NAME:-}" ]]; then + echo "❌ CLUSTER_NAME environment variable is not set." + exit 1 + fi + + if [[ -z "${SERVICE_NAME:-}" ]]; then + echo "❌ SERVICE_NAME environment variable is not set." + exit 1 + fi + + if [[ -z "${TASK_DEFINITION_ARN:-}" ]]; then + echo "❌ TASK_DEFINITION_ARN environment variable is not set." 
+ exit 1 + fi + + echo "πŸš€ Starting ECS rolling deployment for $SERVICE_NAME on $CLUSTER_NAME" + + aws ecs update-service \ + --cluster "$CLUSTER_NAME" \ + --service "$SERVICE_NAME" \ + --task-definition "$TASK_DEFINITION_ARN" \ + >/dev/null + + aws ecs wait services-stable \ + --cluster "$CLUSTER_NAME" \ + --services "$SERVICE_NAME" + + echo "βœ… ECS rolling deployment completed for $SERVICE_NAME" + + frontend-build: #!/usr/bin/env bash set -euo pipefail @@ -578,4 +941,4 @@ test-send-dlq-messages: sleep 1 done - echo "Finished sending messages." \ No newline at end of file + echo "Finished sending messages."