diff --git a/.github/workflows/deploy_dev_code.yml b/.github/workflows/deploy_dev_code.yml index 294e41d5..5d6e6a0d 100644 --- a/.github/workflows/deploy_dev_code.yml +++ b/.github/workflows/deploy_dev_code.yml @@ -1,4 +1,4 @@ -name: Dev Deploy Code +name: Dev Code # This deploys code changes to dev without touching infra. on: @@ -10,11 +10,11 @@ permissions: jobs: setup: - name: Discover Directories + name: Discover uses: ./.github/workflows/get_directories.yml build: - name: Build Artifacts + name: Build uses: ./.github/workflows/build.yml needs: - setup @@ -28,7 +28,7 @@ jobs: get_build: - name: Resolve Build Outputs + name: Resolve needs: build uses: ./.github/workflows/build_get.yml with: @@ -38,7 +38,7 @@ jobs: ecs_version: ${{ needs.build.outputs.ecs_version }} deploy: - name: Deploy Code + name: Deploy uses: ./.github/workflows/deploy.yml needs: - setup diff --git a/.github/workflows/deploy_dev_infra.yml b/.github/workflows/deploy_dev_infra.yml index a9c355c9..c94033b2 100644 --- a/.github/workflows/deploy_dev_infra.yml +++ b/.github/workflows/deploy_dev_infra.yml @@ -1,4 +1,4 @@ -name: Dev Deploy Infra +name: Dev Infra on: workflow_dispatch: @@ -9,18 +9,18 @@ permissions: jobs: setup: - name: Discover Directories + name: Discover uses: ./.github/workflows/get_directories.yml code: - name: Prepare Infra Artifacts + name: Artifacts uses: ./.github/workflows/infra_releases.yml with: environment: dev infra_version: ${{ github.sha }} infra: - name: Apply Infrastructure + name: Apply needs: - setup - code @@ -32,44 +32,3 @@ jobs: lambda_matrix: ${{ needs.setup.outputs.lambda_dirs }} bootstrap_image_uri: ${{ needs.code.outputs.bootstrap_image_uri }} service_matrix: ${{ needs.setup.outputs.ecs_service_dirs }} - - build: - name: Build Artifacts - uses: ./.github/workflows/build.yml - needs: - - code - - setup - with: - environment: dev - lambda_version: ${{ github.sha }} - frontend_version: ${{ github.sha }} - ecs_version: ${{ github.sha }} - lambda_matrix: ${{ 
needs.setup.outputs.lambda_dirs }} - ecs_matrix: ${{ needs.setup.outputs.container_dirs }} - - get_build: - name: Resolve Build Outputs - needs: build - uses: ./.github/workflows/build_get.yml - with: - environment: dev - lambda_version: ${{ github.sha }} - frontend_version: ${{ github.sha }} - ecs_version: ${{ github.sha }} - - deploy: - name: Deploy Code - uses: ./.github/workflows/deploy.yml - needs: - - setup - - build - - get_build - - infra - with: - environment: dev - lambda_version: ${{ needs.build.outputs.lambda_version }} - frontend_version: ${{ needs.build.outputs.frontend_version }} - code_bucket: ${{ needs.get_build.outputs.code_bucket }} - lambda_matrix: ${{ needs.setup.outputs.lambda_dirs }} - task_matrix: ${{ needs.get_build.outputs.ecs_task_matrix }} - ecs_image_uris: ${{ needs.get_build.outputs.ecs_image_uris }} diff --git a/.github/workflows/deploy_prod_code.yml b/.github/workflows/deploy_prod_code.yml index 49bb5c74..172c1374 100644 --- a/.github/workflows/deploy_prod_code.yml +++ b/.github/workflows/deploy_prod_code.yml @@ -1,4 +1,4 @@ -name: Prod Deploy Code +name: Prod Code # This deploys code changes to prod from release artifacts already present in the shared CI bucket and ECR repository. 
on: @@ -20,7 +20,7 @@ permissions: jobs: get_build: - name: Resolve Release Artifacts + name: Resolve uses: ./.github/workflows/build_get.yml with: environment: ci @@ -29,7 +29,7 @@ jobs: ecs_version: ${{ inputs.ecs_version }} deploy: - name: Deploy Code + name: Deploy uses: ./.github/workflows/deploy.yml needs: - get_build diff --git a/.github/workflows/deploy_prod_infra.yml b/.github/workflows/deploy_prod_infra.yml index afffbe55..b55fe9cc 100644 --- a/.github/workflows/deploy_prod_infra.yml +++ b/.github/workflows/deploy_prod_infra.yml @@ -1,4 +1,4 @@ -name: Prod Deploy Infra +name: Prod Infra on: workflow_dispatch: @@ -9,7 +9,7 @@ permissions: jobs: get_build: - name: Resolve Release Artifacts + name: Resolve uses: ./.github/workflows/build_get.yml with: environment: ci @@ -18,7 +18,7 @@ jobs: ecs_version: 0.9.4 infra: - name: Apply Infrastructure + name: Apply needs: - get_build uses: ./.github/workflows/infra.yml @@ -29,18 +29,3 @@ jobs: lambda_matrix: ${{ needs.get_build.outputs.lambda_version_files }} bootstrap_image_uri: ${{ needs.get_build.outputs.bootstrap_image_uri }} service_matrix: ${{ needs.get_build.outputs.ecs_service_matrix }} - - deploy: - name: Deploy Code - uses: ./.github/workflows/deploy.yml - needs: - - get_build - - infra # this is only to ensure infra runs before deploy no dependencies on infra outputs i.e. 
infra is managed separately - with: - environment: prod - lambda_version: ${{ needs.get_build.outputs.lambda_version }} - frontend_version: ${{ needs.get_build.outputs.frontend_version }} - code_bucket: ${{ needs.get_build.outputs.code_bucket }} - lambda_matrix: ${{ needs.get_build.outputs.lambda_version_files }} - task_matrix: ${{ needs.get_build.outputs.ecs_task_matrix }} - ecs_image_uris: ${{ needs.get_build.outputs.ecs_image_uris }} diff --git a/.github/workflows/destroy.yml b/.github/workflows/destroy.yml index a2435f9d..d0f372b8 100644 --- a/.github/workflows/destroy.yml +++ b/.github/workflows/destroy.yml @@ -1,4 +1,4 @@ -name: Kill Environment +name: Destroy on: workflow_dispatch: @@ -25,11 +25,11 @@ env: jobs: setup: - name: Discover Directories + name: Discover uses: ./.github/workflows/get_directories.yml lambdas: - name: Destroy Lambda Infra + name: Lambdas runs-on: ubuntu-latest needs: setup strategy: @@ -47,7 +47,7 @@ jobs: tg_action: destroy frontend: - name: Destroy Frontend Infra + name: Frontend runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 @@ -62,7 +62,7 @@ jobs: tg_action: destroy services: - name: Destroy Service Infra + name: Services runs-on: ubuntu-latest needs: setup strategy: @@ -83,7 +83,7 @@ jobs: tg_action: destroy tasks: - name: Destroy Task Infra + name: Tasks runs-on: ubuntu-latest needs: - setup @@ -107,7 +107,7 @@ jobs: tg_action: destroy network: - name: Destroy Network Infra + name: Network needs: - frontend - services @@ -124,7 +124,7 @@ jobs: tg_action: destroy security: - name: Destroy Security Infra + name: Security needs: - network runs-on: ubuntu-latest @@ -139,7 +139,7 @@ jobs: tg_action: destroy build-bucket: - name: Destroy Code Bucket + name: Code Bucket if: inputs.environment != 'prod' needs: - lambdas @@ -155,7 +155,7 @@ jobs: tg_action: destroy ecr: - name: Destroy ECR + name: ECR if: inputs.environment != 'prod' needs: - network @@ -171,7 +171,7 @@ jobs: tg_action: destroy cluster: - name: Destroy 
Cluster Infra + name: Cluster needs: - network runs-on: ubuntu-latest diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 0fc5744a..bf2d9f26 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -1,4 +1,4 @@ -name: Pull Request +name: PR on: pull_request: @@ -14,7 +14,7 @@ on: jobs: check-pr-title: - name: Validate PR Title + name: PR Title runs-on: ubuntu-latest env: PR_TITLE: ${{ github.event.pull_request.title }} @@ -33,7 +33,7 @@ jobs: exit 1 check: - name: Detect Changes + name: Changes needs: check-pr-title permissions: pull-requests: read @@ -47,7 +47,7 @@ jobs: needs: check runs-on: ubuntu-latest if: ${{ needs.check.outputs.github == 'true' }} - name: Run github formatting checks + name: GH Fmt timeout-minutes: 2 steps: - uses: actions/checkout@v6 @@ -60,7 +60,7 @@ jobs: needs: check runs-on: ubuntu-latest if: ${{ needs.check.outputs.terragrunt == 'true' }} - name: Run terragrunt formatting checks + name: TG Fmt timeout-minutes: 2 steps: - uses: actions/checkout@v6 @@ -81,7 +81,7 @@ jobs: needs: check runs-on: ubuntu-latest if: ${{ needs.check.outputs.terraform == 'true' }} - name: Run terraform lint checks + name: TF Lint timeout-minutes: 2 steps: - uses: actions/checkout@v6 @@ -98,7 +98,7 @@ jobs: check-lambda-naming: needs: check runs-on: ubuntu-latest - name: Check lambda directory naming uses underscores + name: Lambda Names steps: - uses: actions/checkout@v6 - name: Fail if any lambda directory uses hyphens @@ -113,7 +113,7 @@ jobs: check-ecs-module-pairs: needs: check runs-on: ubuntu-latest - name: Check ECS task/service module pairs + name: ECS Pairs steps: - uses: actions/checkout@v6 @@ -160,13 +160,13 @@ jobs: echo "โœ… All ECS task_/service_ pairs are present." 
setup: - name: Discover App Directories + name: Discover if: ${{ needs.check.outputs.lambdas == 'true' || needs.check.outputs.containers == 'true' }} needs: check uses: ./.github/workflows/get_directories.yml build-lambdas: - name: Build Lambda Changes + name: Build Lambdas if: ${{ needs.check.outputs.lambdas == 'true' }} needs: - check @@ -187,7 +187,7 @@ jobs: just_action: lambda-build build-containers: - name: Build Container Changes + name: Build Containers if: ${{ needs.check.outputs.containers == 'true' }} needs: - check @@ -213,7 +213,7 @@ jobs: needs: - check runs-on: ubuntu-latest - name: Build frontend + name: Build Frontend timeout-minutes: 5 steps: - uses: actions/checkout@v6 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5c6c0454..54f87d0f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,7 +15,7 @@ permissions: jobs: get-current-tag: - name: Get Current Tag + name: Current Tag runs-on: ubuntu-latest outputs: current_tag: ${{ steps.get_tag.outputs.CURRENT_TAG }} @@ -30,7 +30,7 @@ jobs: echo "CURRENT_TAG=${CURRENT_TAG}" >> $GITHUB_OUTPUT get-next-tag: - name: Calculate Next Tag + name: Next Tag runs-on: ubuntu-latest outputs: tag: ${{ steps.get_next_version.outputs.version }} @@ -51,7 +51,7 @@ jobs: echo ${{ steps.get_next_version.outputs.hasNextVersion }} create-tag: - name: Create Tag + name: Tag needs: get-next-tag if: ${{ needs.get-next-tag.outputs.has-next-version == 'true' }} runs-on: ubuntu-latest @@ -66,7 +66,7 @@ jobs: git push origin --tag "$TAG" get-commits: - name: Collect Release Commits + name: Commits needs: - get-next-tag - create-tag @@ -91,12 +91,12 @@ jobs: echo "EOF" >> $GITHUB_OUTPUT get-apps: - name: Discover App Directories + name: Discover uses: ./.github/workflows/get_directories.yml build: - name: Build Release Artifacts + name: Build needs: - create-tag - get-next-tag @@ -115,7 +115,7 @@ jobs: ecs_matrix: ${{ needs.get-apps.outputs.container_dirs }} code: - 
name: Prepare Shared Infra Artifacts + name: Artifacts needs: - create-tag - get-next-tag @@ -128,7 +128,7 @@ jobs: infra_version: ${{ needs.get-next-tag.outputs.tag }} release: - name: Publish GitHub Release + name: Publish runs-on: ubuntu-latest needs: - get-next-tag diff --git a/AGENTS.md b/AGENTS.md index 642fb668..094c0330 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -44,6 +44,7 @@ Choose deployment modes that match the runtime shape. - ECS CodeDeploy requires a load-balanced service shape in this repo. - In practice that means `connection_type` must be `internal_dns` or `vpc_link` for CodeDeploy-backed ECS deploys. +- In this repo, subpath ECS services need a dedicated ALB listener if they are meant to use CodeDeploy blue/green. - If `connection_type = "internal"`, prefer `rolling`. ## Feasibility Check @@ -83,10 +84,13 @@ When changing CI workflows or Terraform module dependencies, check dependency be - when the same setup or lookup pattern appears in multiple workflows, suggest extracting it into a shared reusable workflow or shared `just` recipe instead of repeating it - check workflow dependency wiring such as `needs`, job outputs, matrix values, and reused workflow inputs - watch for `data.terraform_remote_state` dependencies that can fail if another stack has not been created yet or has already been destroyed +- avoid cross-runtime ownership when a resource is really part of one app shape; for example, keep the ECS worker queue with `task_worker` rather than making ECS consume `lambda_worker` state +- when a bootstrap path needs placeholder values, prefer hiding that conditional logic in locals instead of repeating `count`-indexed remote-state references through the module body +- if you do add a genuinely new stack type, update the discovery and lifecycle workflows too: `get_directories.yml`, `infra.yml`, and `destroy.yml` - check required Terraform input variables on destroy paths as well as apply paths; destroy can still fail before resource deletion if 
required vars are unset - make sure every referenced `needs.<job>.outputs.*` value is actually in scope for that job - make sure matrix values match the expected naming contract for the workflow, module, or path being used -- for `*_infra` deploy wrappers, verify the infra workflow receives the directory-based infra matrices it needs, while deploy workflows receive the artifact-based matrices and image URIs they need +- for `*_infra` wrappers, verify they stop at infrastructure apply and do not also run the reusable `deploy.yml` code rollout - for prod wrappers in this repo, remember that shared artifact resources come from `ci`, while deploy target resources are still in `prod` - prefer making modules tolerant of unnecessary upstream state dependencies where possible - do not change CI ordering blindly; first check whether the real issue is an avoidable cross-stack dependency diff --git a/Dockerfile b/Dockerfile index 74508ac8..9038ba74 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,31 @@ +ARG SERVICE + FROM python:3.12-slim AS python-base WORKDIR /usr/app -COPY containers/worker/requirements.txt /tmp/requirements-worker.txt -RUN pip install --no-cache-dir -r /tmp/requirements-worker.txt +FROM python-base AS service-base + +ARG SERVICE + +COPY containers/${SERVICE}/requirements.txt /tmp/requirements.txt +RUN pip install --no-cache-dir -r /tmp/requirements.txt + + +FROM service-base AS worker + +ARG SERVICE + +COPY containers/${SERVICE}/app.py /usr/app/app.py + +CMD ["python", "-u", "app.py"] + +FROM service-base AS api -FROM python-base AS worker +ARG SERVICE -COPY containers/worker/app.py /usr/app/app.py +COPY containers/${SERVICE}/app.py /usr/app/app.py CMD ["python", "-u", "app.py"] diff --git a/README.md b/README.md index c56923c4..040d342a 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,21 @@ The repo `network` module also owns the shared internal ALB and shared HTTP API - VPC link - internal ALB and target groups +This repo now includes a sample ECS API
container service exposed separately from the Lambda API: + +- public Lambda path via CloudFront: `/api/*` +- public ECS path via CloudFront: `/api/ecs/*` +- API Gateway Lambda route namespace: `/*` +- API Gateway ECS route namespace: `/ecs/*` +- deployment model: ECS CodeDeploy `blue_green` +- ALB shape: shared private ALB with a dedicated ECS API listener on port `8080` +- stacks: `task_api` and `service_api` +- the sample frontend calls both backends and renders both responses so the path split is visible in the UI + The `api` module is Lambda-specific and plugs the Lambda integration and root routes into that shared API. +The frontend infra module also uploads a bootstrap `index.html` during infra apply so CloudFront serves a placeholder page before the built frontend assets are deployed. + Terragrunt also provides a shared default ECR repository name to ECS task modules: - shared artifact base: `dev -> ---dev`, otherwise `---ci` @@ -41,10 +54,23 @@ Terragrunt also provides a shared default ECR repository name to ECS task module - override it in `infra/live//environment_vars.hcl` only if the repository naming diverges from that convention - the concrete ECS worker task wrapper defaults `local_tunnel = false` and `xray_enabled = false` unless you explicitly set them -The reusable deploy workflows follow the same split: `prod` `*_code` and `*_infra` wrappers read shared artifact resources from `ci`, but `*_infra` still applies `prod` infrastructure stacks using the repo's directory-derived service and lambda matrices. +The reusable deploy workflows follow the same split: `prod` `*_code` and `*_infra` wrappers read shared artifact resources from `ci`, but `*_infra` only applies `prod` infrastructure stacks using the repo's directory-derived service and lambda matrices. For `*_code` release deploys, pass explicit release versions for each runtime you want to roll out. 
In particular, ECS code deploys should provide an `ecs_version` rather than relying on a Lambda-version fallback. +The ECS worker queue is now owned by `task_worker`, and `service_worker` reads that queue name from `task_worker` remote state. That keeps the ECS worker queue aligned with the worker stack lifecycle without depending on the Lambda worker queue. +For bootstrap service applies, `service_worker` now uses placeholder task and queue values locally rather than spreading `count`-indexed remote-state access through the module. +The ECS worker task uses a local heartbeat-file health check, which is a better fit for a non-HTTP worker than probing a service endpoint or tying task health directly to transient AWS API calls. + +## 🧪 example prompts + +Use prompts like these when asking for a new service in this repo: + +- `Add a new ECS service called billing_api exposed on /billing via API Gateway VPC link, with task_billing_api/service_billing_api, canary deploys, and update the docs.` +- `Create a new internal ECS worker called report_worker using task_report_worker/service_report_worker, rolling deploys, and hook it into the existing container build flow.` +- `Add a new Lambda called invoice_sync with its live stacks in dev and prod, wire it into the existing lambda build/deploy workflows, and document the new module contract.` +- `Create a new public Lambda API endpoint for /reports, keep it Lambda-backed rather than ECS, and update the repo docs and workflow expectations.` + ## 🛠️ local plan some infra Given a terragrunt file is found at `infra/live/dev/aws/api/terragrunt.hcl` @@ -195,15 +221,14 @@ deployment_strategy = "blue_green" ``` - ECS CodeDeploy is only created for load-balanced ECS services in `_shared/service` +- subpath ECS services need a dedicated ALB listener if they are meant to use CodeDeploy blue/green in this repo - internal ECS services without load balancer integration should use native ECS rolling updates instead -- the shared
ECS service resource ignores `task_definition` drift so later infra applies do not revert the live task revision after either a rolling deploy or a CodeDeploy rollout +- infra ignores ECS `task_definition` drift +- for CodeDeploy ECS services, infra also ignores `load_balancer` drift - the deployment workflow: - applies the new `task_*` revision - - if the service has CodeDeploy resources, reads `codedeploy_app_name` and `codedeploy_deployment_group_name` from `service_*` - - renders [`appspec-ecs.yml`](appspec-ecs.yml) - - uploads the AppSpec to the code bucket - - runs `just ecs-deploy` - - otherwise updates the ECS service to the new task definition with a native rolling deploy + - uses CodeDeploy for load-balanced services + - uses native rolling deploys for internal services ## 🔥↩️ deployment roll-back diff --git a/containers/api/app.py b/containers/api/app.py new file mode 100644 index 00000000..17e06642 --- /dev/null +++ b/containers/api/app.py @@ -0,0 +1,75 @@ +import json +import os +import socket +from http.server import BaseHTTPRequestHandler, HTTPServer + + +HOST = "0.0.0.0" +PORT = int(os.getenv("PORT", "80")) +ROOT_PATH = os.getenv("ROOT_PATH", "") +SERVICE_NAME = os.getenv("AWS_SERVICE_NAME", "ecs-service-api") +IMAGE = os.getenv("IMAGE", "unknown") + + +def _normalize_root_path(root_path: str) -> str: + if not root_path: + return "" + return root_path if root_path.startswith("/") else f"/{root_path}" + + +ROOT_PATH_PREFIX = _normalize_root_path(ROOT_PATH.rstrip("/")) + + +def route_for(path: str) -> str: + if ROOT_PATH_PREFIX and path.startswith(ROOT_PATH_PREFIX): + trimmed = path[len(ROOT_PATH_PREFIX):] + return trimmed or "/" + return path or "/" + + +class Handler(BaseHTTPRequestHandler): + server_version = "BlueGreenAPI/1.0" + + def _write_json(self, status: int, body: dict) -> None: + encoded = json.dumps(body).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json") +
self.send_header("Content-Length", str(len(encoded))) + self.end_headers() + self.wfile.write(encoded) + + def do_GET(self) -> None: # noqa: N802 + route = route_for(self.path.split("?", 1)[0]) + + if route == "/health": + self._write_json(200, {"status": "ok", "service": SERVICE_NAME}) + return + + if route in ("/fail", "/error"): + self._write_json( + 500, + { + "message": "Forced failure for testing", + "service": SERVICE_NAME, + "route": route, + }, + ) + return + + self._write_json( + 200, + { + "message": "Hello from the blue/green ECS API", + "service": SERVICE_NAME, + "hostname": socket.gethostname(), + "image": IMAGE, + "root_path": ROOT_PATH_PREFIX, + "route": route, + }, + ) + + +if __name__ == "__main__": + httpd = HTTPServer((HOST, PORT), Handler) + print(f"Starting {SERVICE_NAME} on {HOST}:{PORT} with root path {ROOT_PATH_PREFIX or '/'}") + httpd.serve_forever() diff --git a/containers/api/requirements.txt b/containers/api/requirements.txt new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/containers/api/requirements.txt @@ -0,0 +1 @@ + diff --git a/containers/worker/app.py b/containers/worker/app.py index ebe26110..9ee30187 100644 --- a/containers/worker/app.py +++ b/containers/worker/app.py @@ -5,10 +5,16 @@ QUEUE_URL = os.environ['AWS_SQS_QUEUE_URL'] AWS_REGION = os.environ['AWS_REGION'] POLL_TIMEOUT = int(os.getenv("POLL_TIMEOUT", "60")) +HEARTBEAT_FILE = os.getenv("HEARTBEAT_FILE", "/tmp/worker-heartbeat") sqs = boto3.client('sqs', region_name=AWS_REGION) +def write_heartbeat(): + with open(HEARTBEAT_FILE, "w", encoding="utf-8") as heartbeat: + heartbeat.write(str(int(time.time()))) + + def process_message(msg): # TODO: implement business logic print({"message_id": msg['MessageId'], "body": msg['Body'][:200]}) @@ -35,6 +41,8 @@ def poll(): if __name__ == "__main__": print(f"Starting SQS poller for {QUEUE_URL}") + write_heartbeat() while True: poll() + write_heartbeat() time.sleep(POLL_TIMEOUT) diff --git a/frontend/src/App.jsx 
b/frontend/src/App.jsx index 6d33c84c..3eba86a6 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -1,33 +1,57 @@ import { useEffect, useState } from 'react' +async function fetchJson(url) { + const response = await fetch(url) + const text = await response.text() + + try { + return JSON.parse(text) + } catch { + throw new Error(`${response.status} ${response.statusText}: ${text.slice(0, 200)}`) + } +} + export default function App() { - const [data, setData] = useState(null) - const [error, setError] = useState(null) + const [lambdaData, setLambdaData] = useState(null) + const [lambdaError, setLambdaError] = useState(null) + const [ecsData, setEcsData] = useState(null) + const [ecsError, setEcsError] = useState(null) useEffect(() => { - fetch('/api/') - .then((r) => r.json()) - .then(setData) - .catch(setError) + fetchJson('/api/') + .then(setLambdaData) + .catch(setLambdaError) + + fetchJson('/api/ecs') + .then(setEcsData) + .catch(setEcsError) }, []) + const renderTable = (data) => ( + + + {Object.entries(data).map(([key, value]) => ( + + + + + ))} + +
{key}{String(value)}
+ ) + return (

Serverless App

- {error &&

Error: {String(error)}

} - {data && ( - - - {Object.entries(data).map(([k, v]) => ( - - - - - ))} - -
{k}{String(v)}
- )} - {!data && !error &&

Loading...

} +

Lambda Response

+ {lambdaError &&

Error: {String(lambdaError)}

} + {lambdaData && renderTable(lambdaData)} + {!lambdaData && !lambdaError &&

Loading Lambda response...

} + +

ECS Response

+ {ecsError &&

Error: {String(ecsError)}

} + {ecsData && renderTable(ecsData)} + {!ecsData && !ecsError &&

Loading ECS response...

}
) } diff --git a/infra/README.md b/infra/README.md index 838158b4..bb15b04a 100644 --- a/infra/README.md +++ b/infra/README.md @@ -63,20 +63,26 @@ stores state at: - `service_*` Own the ECS services and, when applicable, CodeDeploy resources. +Current examples include: + +- `task_worker` / `service_worker` + Internal ECS worker service shape, with the worker queue owned by `task_worker` and a container health check based on a local worker heartbeat file. +- `task_api` / `service_api` + ECS API service shape exposed on the shared API Gateway at `/ecs` using `vpc_link` and `blue_green`, backed by a dedicated listener on the shared ALB. Through the frontend distribution it is reached at `/api/ecs/*`, while the Lambda API is reached at `/api/*`. + ## Dependency Notes - many modules use `data.terraform_remote_state` to read outputs from other stacks - because of that, workflow ordering matters for apply, deploy, and destroy +- avoid making one runtime depend on another runtime's state ownership unnecessarily; for example, the ECS worker queue is owned by `task_worker` rather than by `lambda_worker` - some shared infrastructure, such as the landing-zone VPC and tagged private subnets, is discovered with `data` lookups and must already exist ## Deployment Model - infra workflows create or update infrastructure stacks - build workflows produce Lambda zips and container images -- `*_infra` deploy wrappers need two kinds of reusable-workflow inputs: - - directory-derived infra matrices for stack applies - - artifact-derived versions, task matrices, and image URIs for code deploys -- in `prod`, the wrappers read shared artifact resources from `ci` but still apply service and task stacks in `prod` +- `*_infra` wrappers need the inputs required to apply infra safely, such as directory-derived stack matrices and any artifact-derived bootstrap references +- in `prod`, the `*_infra` wrappers read shared artifact resources from `ci` but only apply service and task stacks in `prod` 
- deploy workflows: - publish Lambda versions and use Lambda CodeDeploy - register ECS task revisions diff --git a/infra/live/dev/aws/service_api/terragrunt.hcl b/infra/live/dev/aws/service_api/terragrunt.hcl new file mode 100644 index 00000000..d5d10fdb --- /dev/null +++ b/infra/live/dev/aws/service_api/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//service_api" +} diff --git a/infra/live/dev/aws/task_api/terragrunt.hcl b/infra/live/dev/aws/task_api/terragrunt.hcl new file mode 100644 index 00000000..1263aabf --- /dev/null +++ b/infra/live/dev/aws/task_api/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//task_api" +} diff --git a/infra/live/prod/aws/service_api/terragrunt.hcl b/infra/live/prod/aws/service_api/terragrunt.hcl new file mode 100644 index 00000000..d5d10fdb --- /dev/null +++ b/infra/live/prod/aws/service_api/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//service_api" +} diff --git a/infra/live/prod/aws/task_api/terragrunt.hcl b/infra/live/prod/aws/task_api/terragrunt.hcl new file mode 100644 index 00000000..1263aabf --- /dev/null +++ b/infra/live/prod/aws/task_api/terragrunt.hcl @@ -0,0 +1,7 @@ +include { + path = find_in_parent_folders("root.hcl") +} + +terraform { + source = "../../../../modules//aws//task_api" +} diff --git a/infra/modules/aws/_shared/service/README.md b/infra/modules/aws/_shared/service/README.md index 4d394934..1142f1b7 100644 --- a/infra/modules/aws/_shared/service/README.md +++ b/infra/modules/aws/_shared/service/README.md @@ -19,6 +19,16 @@ Shared ECS service module. - `bootstrap` - `bootstrap_image_uri` - `codedeploy_alarm_names` +- optional `dedicated_listener_port` + +Subpath services match both `/<root_path>` and `/<root_path>/*`.
+If `dedicated_listener_port` is set, the service gets its own ALB listener and uses that listener for API Gateway integration and ECS CodeDeploy traffic routing. + +## Bootstrap behavior + +Bootstrap ECS services use the shared placeholder image. +Bootstrap health checks use `/`. +Real task deploys use the normal app health path, such as `/health` or `/<root_path>/health`. ## Deployment strategies @@ -32,11 +42,13 @@ For internal non-load-balanced services, the deploy workflow falls back to nativ ## Drift ownership -The ECS service ignores changes to `task_definition`. +The ECS service ignores: -That is intentional: +- `task_definition` +- `load_balancer` -- deploy workflows own the live task revision -- infra applies own the stable service shape +Reason: -Without that split, a later infra apply would revert a successful rolling or CodeDeploy deployment back to the older task definition stored in Terraform state. +- deploy workflows own the live revision +- infra owns the stable service shape +- CodeDeploy ECS services reject `load_balancer` updates via `UpdateService` diff --git a/infra/modules/aws/_shared/service/locals.tf b/infra/modules/aws/_shared/service/locals.tf index 7214eb34..a80d124d 100644 --- a/infra/modules/aws/_shared/service/locals.tf +++ b/infra/modules/aws/_shared/service/locals.tf @@ -1,5 +1,6 @@ locals { - use_vpc_link = var.connection_type == "vpc_link" + use_vpc_link = var.connection_type == "vpc_link" + use_dedicated_listener = var.dedicated_listener_port != null enable_codedeploy = ( var.connection_type == "internal_dns" || var.connection_type == "vpc_link" ) @@ -18,11 +19,12 @@ locals { green_target_group_name = "tg-${substr(md5("${var.service_name}-green"), 0, 8)}-green" is_default_path = var.root_path == "" - health_check_path = local.is_default_path ? "/health" : "/${var.root_path}/health" + health_check_path = var.bootstrap ? "/" : (local.is_default_path ? "/health" : "/${var.root_path}/health") exact_route_key = local.is_default_path ? 
"ANY /" : "ANY /${var.root_path}" proxy_route_key = local.is_default_path ? "ANY /{proxy+}" : "ANY /${var.root_path}/{proxy+}" target_group_arn = local.is_default_path ? var.default_target_group_arn : aws_lb_target_group.service_target_group[0].arn blue_target_group_name = local.is_default_path ? element(split("/", var.default_target_group_arn), 1) : aws_lb_target_group.service_target_group[0].name + traffic_route_arn = local.use_dedicated_listener ? aws_lb_listener.service[0].arn : var.default_http_listener_arn load_balancers = var.connection_type == "internal_dns" || var.connection_type == "vpc_link" ? [{ target_group_arn = local.target_group_arn diff --git a/infra/modules/aws/_shared/service/main.tf b/infra/modules/aws/_shared/service/main.tf index 5c99327c..c92656f8 100644 --- a/infra/modules/aws/_shared/service/main.tf +++ b/infra/modules/aws/_shared/service/main.tf @@ -81,7 +81,7 @@ resource "aws_lb_target_group" "green_target_group" { } resource "aws_lb_listener_rule" "service" { - count = local.is_default_path ? 0 : 1 + count = (!local.is_default_path && !local.use_dedicated_listener) ? 1 : 0 listener_arn = var.default_http_listener_arn priority = local.priority @@ -93,7 +93,27 @@ resource "aws_lb_listener_rule" "service" { condition { path_pattern { - values = ["/${var.root_path}/*"] + values = ["/${var.root_path}", "/${var.root_path}/*"] + } + } +} + +resource "aws_lb_listener" "service" { + count = local.use_dedicated_listener ? 1 : 0 + + load_balancer_arn = var.load_balancer_arn + port = var.dedicated_listener_port + protocol = "HTTP" + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.service_target_group[0].arn + } + + lifecycle { + precondition { + condition = var.load_balancer_arn != "" + error_message = "load_balancer_arn must be set when dedicated_listener_port is used."
} } } @@ -124,7 +144,7 @@ resource "aws_apigatewayv2_integration" "service" { connection_type = "VPC_LINK" integration_type = "HTTP_PROXY" integration_method = "ANY" - integration_uri = var.default_http_listener_arn + integration_uri = local.traffic_route_arn payload_format_version = "1.0" lifecycle { @@ -176,9 +196,12 @@ resource "aws_ecs_service" "service" { } lifecycle { - # Deploy workflows own the live task revision. Terraform keeps the service - # shape stable without reverting the currently deployed revision. + # Deploy workflows own the live task revision. Terraform keeps the service stable without reverting the currently deployed revision. + + # For CODE_DEPLOY services, ECS also rejects load balancer updates through UpdateService. Terraform still owns the target group and listener-rule + # resources themselves, but the ECS service attachment must stay stable after first creation. ignore_changes = [ + load_balancer, task_definition, ] } @@ -250,7 +273,7 @@ resource "aws_codedeploy_deployment_group" "ecs" { load_balancer_info { target_group_pair_info { prod_traffic_route { - listener_arns = [var.default_http_listener_arn] + listener_arns = [local.traffic_route_arn] } target_group { diff --git a/infra/modules/aws/_shared/service/variables.tf b/infra/modules/aws/_shared/service/variables.tf index c18d81fc..9d917d03 100644 --- a/infra/modules/aws/_shared/service/variables.tf +++ b/infra/modules/aws/_shared/service/variables.tf @@ -41,6 +41,11 @@ variable "default_target_group_arn" { type = string } +variable "load_balancer_arn" { + type = string + default = "" +} + variable "default_http_listener_arn" { type = string } @@ -69,6 +74,11 @@ variable "api_invoke_url" { type = string } +variable "dedicated_listener_port" { + type = number + default = null +} + variable "root_path" { description = "The path to serve the service from. 
/ is for default /example_service is for subpath" type = string diff --git a/infra/modules/aws/_shared/task/README.md b/infra/modules/aws/_shared/task/README.md index 72a157ba..58bdd316 100644 --- a/infra/modules/aws/_shared/task/README.md +++ b/infra/modules/aws/_shared/task/README.md @@ -19,6 +19,7 @@ Shared ECS task-definition module. - `local_tunnel` - `xray_enabled` - `command` +- optional `health_check` In the concrete ECS task wrappers in this repo, `local_tunnel` and `xray_enabled` default to `false` unless the environment explicitly opts in. @@ -31,3 +32,5 @@ In the concrete ECS task wrappers in this repo, `local_tunnel` and `xray_enabled Use this for task revision creation. Traffic rollout happens at the service layer. The ECR repository access policy uses the explicit `ecr_repository_name` input. In this repo, Terragrunt sets a root-level default and environments can override it if the repository naming ever changes. + +When `health_check` is set, the module adds an ECS container health check to the main service container. diff --git a/infra/modules/aws/_shared/task/locals.tf b/infra/modules/aws/_shared/task/locals.tf index 1d8ecf73..f66ee275 100644 --- a/infra/modules/aws/_shared/task/locals.tf +++ b/infra/modules/aws/_shared/task/locals.tf @@ -84,6 +84,15 @@ locals { essential = true environment = concat(local.shared_environment, var.additional_env_vars) }, + var.health_check == null ? {} : { + healthCheck = { + command = var.health_check.command + interval = var.health_check.interval + timeout = var.health_check.timeout + retries = var.health_check.retries + startPeriod = var.health_check.start_period + } + }, var.command == null ? 
{} : { command = var.command } diff --git a/infra/modules/aws/_shared/task/variables.tf b/infra/modules/aws/_shared/task/variables.tf index 0425a242..8149b287 100644 --- a/infra/modules/aws/_shared/task/variables.tf +++ b/infra/modules/aws/_shared/task/variables.tf @@ -78,3 +78,15 @@ variable "additional_runtime_policy_arns" { type = list(string) default = [] } + +variable "health_check" { + description = "Optional ECS container health check configuration." + type = object({ + command = list(string) + interval = optional(number, 30) + timeout = optional(number, 5) + retries = optional(number, 3) + start_period = optional(number, 0) + }) + default = null +} diff --git a/infra/modules/aws/api/README.md b/infra/modules/aws/api/README.md index 41918fe4..4b12c187 100644 --- a/infra/modules/aws/api/README.md +++ b/infra/modules/aws/api/README.md @@ -6,7 +6,7 @@ Lambda-backed public HTTP API module. - Lambda API function via `_shared/lambda` - Lambda proxy integration into the shared HTTP API -- root and proxy routes +- root and proxy routes on the shared API - API 5xx CloudWatch alarm ## Dependencies @@ -22,3 +22,4 @@ Lambda-backed public HTTP API module. - Lambda function and alias names This module is Lambda-specific. The shared API surface now lives in `network`. +When accessed through the frontend CloudFront distribution, the public Lambda path is `/api/*` because CloudFront strips the leading `/api` prefix before forwarding to API Gateway. diff --git a/infra/modules/aws/frontend/README.md b/infra/modules/aws/frontend/README.md index 04c1cf8f..c94ce3c0 100644 --- a/infra/modules/aws/frontend/README.md +++ b/infra/modules/aws/frontend/README.md @@ -5,7 +5,18 @@ Static frontend hosting module. 
## Owns - website bucket and distribution resources +- bootstrap `index.html` object for first-time infra deploys - deployment destination for built frontend assets +- path-based forwarding of `/api/*` requests to the shared API origin + +## Routing behavior + +- `/api/*` + forwarded to API Gateway and stripped to `/*` for the Lambda-backed API +- `/api/ecs/*` + forwarded to API Gateway and stripped to `/ecs/*` +- all other paths + served from the frontend bucket with SPA routing ## Key outputs @@ -13,3 +24,5 @@ Static frontend hosting module. - CloudFront distribution id Used by the frontend build and deploy workflow path. + +The Terraform module uploads a bootstrap `index.html` so the distribution serves a valid page before the built frontend assets are published. Later frontend deploys replace that object with the real app bundle output. diff --git a/infra/modules/aws/frontend/bootstrap/index.html b/infra/modules/aws/frontend/bootstrap/index.html new file mode 100644 index 00000000..63d7a9de --- /dev/null +++ b/infra/modules/aws/frontend/bootstrap/index.html @@ -0,0 +1,44 @@ + + + + + + Frontend Bootstrap + + + +
+

Frontend Bootstrap

+

The frontend infrastructure is up, but the built frontend assets have not been deployed yet.

+

Deploy the frontend bundle to replace this bootstrap page and serve the full app from index.html.

+
+ + diff --git a/infra/modules/aws/frontend/main.tf b/infra/modules/aws/frontend/main.tf index ef786fd1..24b24014 100644 --- a/infra/modules/aws/frontend/main.tf +++ b/infra/modules/aws/frontend/main.tf @@ -25,6 +25,20 @@ resource "aws_s3_bucket_policy" "frontend" { policy = data.aws_iam_policy_document.frontend_bucket_policy.json } +resource "aws_s3_object" "bootstrap_index" { + bucket = aws_s3_bucket.frontend.id + key = local.root_file + source = "${path.module}/bootstrap/index.html" + etag = filemd5("${path.module}/bootstrap/index.html") + content_type = "text/html; charset=utf-8" + + # Frontend deploys overwrite this key with the built app. Ignore content drift so a later + # terraform apply does not restore the bootstrap page over the deployed index.html. + lifecycle { + ignore_changes = [source, etag, content_type, cache_control, metadata] + } +} + resource "aws_cloudfront_function" "spa_routing" { name = "${local.name}-spa-routing" runtime = "cloudfront-js-2.0" diff --git a/infra/modules/aws/lambda_worker/README.md b/infra/modules/aws/lambda_worker/README.md index 7e1b8685..6082ba03 100644 --- a/infra/modules/aws/lambda_worker/README.md +++ b/infra/modules/aws/lambda_worker/README.md @@ -5,12 +5,14 @@ Worker Lambda wrapper module. ## Owns - worker Lambda via `_shared/lambda` -- worker queue integration via `_shared/sqs` +- Lambda worker queue integration via `_shared/sqs` ## Key outputs - Lambda function and alias names +- queue name +- SQS read policy ARN - queue URLs - log group -This is the concrete worker implementation on top of the shared Lambda primitives. +This is the concrete worker implementation on top of the shared Lambda primitives. Its queue is dedicated to Lambda worker processing and is no longer used by the ECS worker service. 
diff --git a/infra/modules/aws/lambda_worker/outputs.tf b/infra/modules/aws/lambda_worker/outputs.tf index 0b31175a..7f21284b 100644 --- a/infra/modules/aws/lambda_worker/outputs.tf +++ b/infra/modules/aws/lambda_worker/outputs.tf @@ -18,6 +18,14 @@ output "sqs_queue_url" { value = module.sqs_queue.sqs_queue_url } +output "sqs_queue_name" { + value = module.sqs_queue.sqs_queue_name +} + +output "sqs_queue_read_policy_arn" { + value = module.sqs_queue.sqs_queue_read_policy_arn +} + output "dead_letter_queue_url" { value = module.sqs_queue.dead_letter_queue_url } diff --git a/infra/modules/aws/network/README.md b/infra/modules/aws/network/README.md index cdabc645..37ddf590 100644 --- a/infra/modules/aws/network/README.md +++ b/infra/modules/aws/network/README.md @@ -19,6 +19,7 @@ Shared network and routing module. ## Key outputs +- `load_balancer_arn` - ALB listener and target group identifiers - `internal_invoke_url` - `api_id` diff --git a/infra/modules/aws/network/outputs.tf b/infra/modules/aws/network/outputs.tf index d427a931..94ead26d 100644 --- a/infra/modules/aws/network/outputs.tf +++ b/infra/modules/aws/network/outputs.tf @@ -2,6 +2,10 @@ output "default_target_group_arn" { value = aws_lb_target_group.default.arn } +output "load_balancer_arn" { + value = aws_lb.this.arn +} + output "default_http_listener_arn" { value = aws_lb_listener.http.arn } diff --git a/infra/modules/aws/security/README.md b/infra/modules/aws/security/README.md index 4282a4fe..4b788fea 100644 --- a/infra/modules/aws/security/README.md +++ b/infra/modules/aws/security/README.md @@ -17,3 +17,5 @@ Shared security-group module. - `api_vpc_link_sg` Used by `network`, `api`, and ECS service modules. + +The load balancer security group also allows the additional internal listener port used by blue/green ECS services with dedicated listeners. 
diff --git a/infra/modules/aws/security/main.tf b/infra/modules/aws/security/main.tf index 856a6e91..594956e3 100644 --- a/infra/modules/aws/security/main.tf +++ b/infra/modules/aws/security/main.tf @@ -10,6 +10,13 @@ resource "aws_security_group" "load_balancer" { cidr_blocks = [data.aws_vpc.this.cidr_block] } + ingress { + from_port = var.additional_listener_port + to_port = var.additional_listener_port + protocol = "tcp" + cidr_blocks = [data.aws_vpc.this.cidr_block] + } + egress { from_port = 0 to_port = 0 diff --git a/infra/modules/aws/security/variables.tf b/infra/modules/aws/security/variables.tf index 6f97077b..0bea6c1a 100644 --- a/infra/modules/aws/security/variables.tf +++ b/infra/modules/aws/security/variables.tf @@ -15,3 +15,8 @@ variable "vpc_name" { variable "container_port" { type = number } + +variable "additional_listener_port" { + type = number + default = 8080 +} diff --git a/infra/modules/aws/service_api/README.md b/infra/modules/aws/service_api/README.md new file mode 100644 index 00000000..63538a63 --- /dev/null +++ b/infra/modules/aws/service_api/README.md @@ -0,0 +1,30 @@ +# `service_api` + +Concrete ECS API service wrapper for the sample API service. + +## Owns + +- sample ECS API service via `_shared/service` +- API Gateway VPC link routing on `/ecs` + +## Dependencies + +- `task_api` remote state +- `cluster`, `network`, and `security` remote state + +## Key behavior + +- exposes the ECS API container on the shared HTTP API Gateway using `connection_type = "vpc_link"` +- uses `deployment_strategy = "blue_green"` +- uses a dedicated ALB listener on port `8080` so ECS CodeDeploy can own traffic +- defaults `local_tunnel` and `xray_enabled` to `false` unless explicitly enabled + +## Key outputs + +- `service_name` +- `cluster_name` +- `codedeploy_app_name` +- `codedeploy_deployment_group_name` +- `container_port` + +This module wires the sample ECS API service into the shared API Gateway and ALB infrastructure. 
diff --git a/infra/modules/aws/service_api/data.tf b/infra/modules/aws/service_api/data.tf new file mode 100644 index 00000000..4e912710 --- /dev/null +++ b/infra/modules/aws/service_api/data.tf @@ -0,0 +1,59 @@ +data "terraform_remote_state" "task_api" { + count = var.bootstrap ? 0 : 1 + backend = "s3" + + config = { + bucket = var.state_bucket + key = "${var.environment}/aws/task_api/terraform.tfstate" + region = var.aws_region + } +} + +data "terraform_remote_state" "network" { + backend = "s3" + + config = { + bucket = var.state_bucket + key = "${var.environment}/aws/network/terraform.tfstate" + region = var.aws_region + } +} + +data "terraform_remote_state" "security" { + backend = "s3" + + config = { + bucket = var.state_bucket + key = "${var.environment}/aws/security/terraform.tfstate" + region = var.aws_region + } +} + +data "terraform_remote_state" "cluster" { + backend = "s3" + + config = { + bucket = var.state_bucket + key = "${var.environment}/aws/cluster/terraform.tfstate" + region = var.aws_region + } +} + +data "aws_vpc" "this" { + filter { + name = "tag:Name" + values = [var.vpc_name] + } +} + +data "aws_subnets" "private" { + filter { + name = "vpc-id" + values = [data.aws_vpc.this.id] + } + + filter { + name = "tag:Name" + values = ["*private*"] + } +} diff --git a/infra/modules/aws/service_api/main.tf b/infra/modules/aws/service_api/main.tf new file mode 100644 index 00000000..d4e1909a --- /dev/null +++ b/infra/modules/aws/service_api/main.tf @@ -0,0 +1,44 @@ +module "service_api" { + source = "../_shared/service" + + service_name = var.service_name + task_definition_arn = var.bootstrap ? 
"" : data.terraform_remote_state.task_api[0].outputs.task_definition_arn + container_port = var.container_port + root_path = var.root_path + connection_type = var.connection_type + + aws_region = var.aws_region + vpc_id = data.aws_vpc.this.id + private_subnet_ids = data.aws_subnets.private.ids + + cluster_id = data.terraform_remote_state.cluster.outputs.cluster_id + cluster_name = data.terraform_remote_state.cluster.outputs.cluster_name + ecs_security_group_id = data.terraform_remote_state.security.outputs.ecs_sg + + default_target_group_arn = data.terraform_remote_state.network.outputs.default_target_group_arn + load_balancer_arn = data.terraform_remote_state.network.outputs.load_balancer_arn + default_http_listener_arn = data.terraform_remote_state.network.outputs.default_http_listener_arn + load_balancer_arn_suffix = data.terraform_remote_state.network.outputs.load_balancer_arn_suffix + target_group_arn_suffix = data.terraform_remote_state.network.outputs.target_group_arn_suffix + + api_id = data.terraform_remote_state.network.outputs.api_id + vpc_link_id = data.terraform_remote_state.network.outputs.vpc_link_id + internal_invoke_url = data.terraform_remote_state.network.outputs.internal_invoke_url + api_invoke_url = data.terraform_remote_state.network.outputs.api_invoke_url + + bootstrap = var.bootstrap + bootstrap_image_uri = var.bootstrap_image_uri + xray_enabled = var.xray_enabled + local_tunnel = var.local_tunnel + wait_for_steady_state = var.wait_for_steady_state + + desired_task_count = 1 + deployment_strategy = "blue_green" + dedicated_listener_port = 8080 + codedeploy_alarm_names = [] + additional_security_group_ids = [] + + scaling_strategy = { + max_scaled_task_count = 2 + } +} diff --git a/infra/modules/aws/service_api/outputs.tf b/infra/modules/aws/service_api/outputs.tf new file mode 100644 index 00000000..d2be3728 --- /dev/null +++ b/infra/modules/aws/service_api/outputs.tf @@ -0,0 +1,19 @@ +output "service_name" { + value = 
module.service_api.service_name +} + +output "cluster_name" { + value = data.terraform_remote_state.cluster.outputs.cluster_name +} + +output "codedeploy_app_name" { + value = module.service_api.codedeploy_app_name +} + +output "codedeploy_deployment_group_name" { + value = module.service_api.codedeploy_deployment_group_name +} + +output "container_port" { + value = var.container_port +} diff --git a/infra/modules/aws/service_api/variables.tf b/infra/modules/aws/service_api/variables.tf new file mode 100644 index 00000000..7facdd20 --- /dev/null +++ b/infra/modules/aws/service_api/variables.tf @@ -0,0 +1,77 @@ +### start of static vars set in root.hcl ### +variable "state_bucket" { + type = string +} + +variable "environment" { + type = string +} + +variable "aws_region" { + type = string +} + +variable "project_name" { + type = string +} +### end of static vars set in root.hcl ### + +variable "service_name" { + type = string + default = "ecs-service-api" +} + +variable "vpc_name" { + type = string +} + +variable "container_port" { + type = number + default = 80 +} + +variable "root_path" { + description = "The path to serve the service from. / is for default /example_service is for subpath" + default = "ecs" + type = string +} + +variable "connection_type" { + description = "Type of connectivity/integration to use for the service (choices: internal, internal_dns, vpc_link)." + type = string + default = "vpc_link" + validation { + condition = can(regex("^(internal|internal_dns|vpc_link)$", var.connection_type)) + error_message = "connection_type must be one of: internal, internal_dns, vpc_link." 
+ } +} + +variable "local_tunnel" { + type = bool + default = false +} + +variable "xray_enabled" { + type = bool + default = false +} + +variable "wait_for_steady_state" { + type = bool + default = false +} + +variable "bootstrap" { + type = bool + default = false +} + +variable "bootstrap_image_uri" { + type = string + default = "" + + validation { + condition = !var.bootstrap || var.bootstrap_image_uri != "" + error_message = "bootstrap_image_uri must be set when bootstrap is true." + } +} diff --git a/infra/modules/aws/service_worker/README.md b/infra/modules/aws/service_worker/README.md index a657418a..139a8f00 100644 --- a/infra/modules/aws/service_worker/README.md +++ b/infra/modules/aws/service_worker/README.md @@ -9,7 +9,7 @@ Concrete ECS worker service wrapper. ## Dependencies - `task_worker` remote state -- `cluster`, `network`, `security`, `api`, and `lambda_worker` remote state +- `cluster`, `network`, and `security` remote state ## Key outputs @@ -20,3 +20,6 @@ Concrete ECS worker service wrapper. - `container_port` This module wires the worker-specific service onto the shared ECS service behavior. + +It uses the ECS worker queue name exported by `task_worker` for service autoscaling. +During bootstrap applies, it uses placeholder values instead of reading task outputs directly so the bootstrap path does not need a pre-existing task state file. 
diff --git a/infra/modules/aws/service_worker/data.tf b/infra/modules/aws/service_worker/data.tf index f20b5fd0..45e6c986 100644 --- a/infra/modules/aws/service_worker/data.tf +++ b/infra/modules/aws/service_worker/data.tf @@ -9,16 +9,6 @@ data "terraform_remote_state" "task_worker" { } } -data "terraform_remote_state" "lambda_worker" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/lambda_worker/terraform.tfstate" - region = var.aws_region - } -} - data "terraform_remote_state" "network" { backend = "s3" diff --git a/infra/modules/aws/service_worker/locals.tf b/infra/modules/aws/service_worker/locals.tf new file mode 100644 index 00000000..05e6e036 --- /dev/null +++ b/infra/modules/aws/service_worker/locals.tf @@ -0,0 +1,6 @@ +locals { + task_worker_outputs = var.bootstrap ? null : one(data.terraform_remote_state.task_worker[*].outputs) + + task_definition_arn = var.bootstrap ? "" : local.task_worker_outputs.task_definition_arn + autoscaling_queue_name = var.bootstrap ? "not_set" : local.task_worker_outputs.sqs_queue_name +} diff --git a/infra/modules/aws/service_worker/main.tf b/infra/modules/aws/service_worker/main.tf index 48f7bb32..626599dc 100644 --- a/infra/modules/aws/service_worker/main.tf +++ b/infra/modules/aws/service_worker/main.tf @@ -1,8 +1,8 @@ -module "service_consumer" { +module "service_worker" { source = "../_shared/service" service_name = var.service_name - task_definition_arn = var.bootstrap ? 
"" : data.terraform_remote_state.task_worker[0].outputs.task_definition_arn + task_definition_arn = local.task_definition_arn container_port = var.container_port root_path = var.root_path connection_type = var.connection_type @@ -35,13 +35,13 @@ module "service_consumer" { scaling_strategy = { max_scaled_task_count = 4 sqs = { - scale_out_threshold = 10 # Start scaling at 10 msgs avg - scale_in_threshold = 2 # Scale in below 2 msgs avg - scale_out_adjustment = 2 # Add 2 tasks at once - scale_in_adjustment = 1 # Remove 1 task - cooldown_out = 60 # 1min cooldown (more stable) - cooldown_in = 300 # 5min cooldown (prevent flapping) - queue_name = "tbc" # SQS queue name to monitor for scaling + scale_out_threshold = 10 # Start scaling at 10 msgs avg + scale_in_threshold = 2 # Scale in below 2 msgs avg + scale_out_adjustment = 2 # Add 2 tasks at once + scale_in_adjustment = 1 # Remove 1 task + cooldown_out = 60 # 1min cooldown (more stable) + cooldown_in = 300 # 5min cooldown (prevent flapping) + queue_name = local.autoscaling_queue_name } } } diff --git a/infra/modules/aws/service_worker/outputs.tf b/infra/modules/aws/service_worker/outputs.tf index 1eb85248..38a97a9d 100644 --- a/infra/modules/aws/service_worker/outputs.tf +++ b/infra/modules/aws/service_worker/outputs.tf @@ -1,5 +1,5 @@ output "service_name" { - value = module.service_consumer.service_name + value = module.service_worker.service_name } output "cluster_name" { @@ -7,11 +7,11 @@ output "cluster_name" { } output "codedeploy_app_name" { - value = module.service_consumer.codedeploy_app_name + value = module.service_worker.codedeploy_app_name } output "codedeploy_deployment_group_name" { - value = module.service_consumer.codedeploy_deployment_group_name + value = module.service_worker.codedeploy_deployment_group_name } output "container_port" { diff --git a/infra/modules/aws/task_api/README.md b/infra/modules/aws/task_api/README.md new file mode 100644 index 00000000..20144935 --- /dev/null +++ 
b/infra/modules/aws/task_api/README.md @@ -0,0 +1,22 @@ +# `task_api` + +Concrete ECS API task wrapper for the sample API service. + +## Owns + +- sample ECS API task definition via `_shared/task` + +## Key behavior + +- runs the `containers/api` image +- publishes API task revisions for ECS deploys +- exposes the service on the `/ecs` root path +- defaults `local_tunnel` and `xray_enabled` to `false` unless explicitly enabled + +## Key outputs + +- `task_definition_arn` +- `service_name` +- log group name + +This module is the image-driven deployment unit for the sample ECS API service. diff --git a/infra/modules/aws/task_api/main.tf b/infra/modules/aws/task_api/main.tf new file mode 100644 index 00000000..fd03b27c --- /dev/null +++ b/infra/modules/aws/task_api/main.tf @@ -0,0 +1,25 @@ +module "task_api" { + source = "../_shared/task" + + project_name = var.project_name + ecr_repository_name = var.ecr_repository_name + aws_region = var.aws_region + container_port = var.container_port + cpu = var.cpu + memory = var.memory + + image_uri = var.image_uri + debug_image_uri = var.debug_image_uri + aws_otel_collector_image_uri = var.aws_otel_collector_image_uri + otel_sampling_percentage = var.otel_sampling_percentage + + local_tunnel = var.local_tunnel + xray_enabled = var.xray_enabled + + additional_env_vars = [] + additional_runtime_policy_arns = [] + + root_path = "ecs" + service_name = "ecs-service-api" + command = ["python", "-u", "app.py"] +} diff --git a/infra/modules/aws/task_api/outputs.tf b/infra/modules/aws/task_api/outputs.tf new file mode 100644 index 00000000..b70f6c55 --- /dev/null +++ b/infra/modules/aws/task_api/outputs.tf @@ -0,0 +1,15 @@ +output "task_definition_arn" { + value = module.task_api.task_definition_arn +} + +output "cloudwatch_log_group" { + value = module.task_api.cloudwatch_log_group +} + +output "root_path" { + value = module.task_api.root_path +} + +output "service_name" { + value = module.task_api.service_name +} diff --git 
a/infra/modules/aws/task_api/variables.tf b/infra/modules/aws/task_api/variables.tf new file mode 100644 index 00000000..c4bd09a5 --- /dev/null +++ b/infra/modules/aws/task_api/variables.tf @@ -0,0 +1,64 @@ +### start of static vars set in root.hcl ### +variable "state_bucket" { + type = string +} + +variable "environment" { + type = string +} + +variable "aws_region" { + type = string +} + +variable "project_name" { + type = string +} + +variable "ecr_repository_name" { + type = string +} +### end of static vars set in root.hcl ### + +variable "container_port" { + type = number + default = 80 +} + +variable "cpu" { + type = number + default = 256 +} + +variable "memory" { + type = number + default = 512 +} + +variable "image_uri" { + type = string +} + +variable "aws_otel_collector_image_uri" { + type = string +} + +variable "otel_sampling_percentage" { + description = "Percentage of requests to send to x-ray" + type = string + default = 10.0 +} + +variable "debug_image_uri" { + type = string +} + +variable "local_tunnel" { + type = bool + default = false +} + +variable "xray_enabled" { + type = bool + default = false +} diff --git a/infra/modules/aws/task_worker/README.md b/infra/modules/aws/task_worker/README.md index 6b43505c..7d50b839 100644 --- a/infra/modules/aws/task_worker/README.md +++ b/infra/modules/aws/task_worker/README.md @@ -5,18 +5,23 @@ Concrete ECS worker task wrapper. 
## Owns - worker ECS task definition via `_shared/task` +- ECS worker queue via `_shared/sqs` ## Key behavior -- runs `python -u consumer/app.py` +- runs `python -u app.py` - publishes worker task revisions for ECS deploys - uses the shared ECR repository named by `ecr_repository_name` +- injects its own queue URL into the container via `AWS_SQS_QUEUE_URL` +- updates a local heartbeat file as it polls and uses an ECS container health check against that heartbeat - defaults `local_tunnel` and `xray_enabled` to `false` unless explicitly enabled ## Key outputs - `task_definition_arn` - `service_name` +- `sqs_queue_name` +- `sqs_queue_url` - log group name -This module is the image-driven deployment unit for the ECS worker. +This module is the image-driven deployment unit for the ECS worker and owns the ECS worker queue directly so queue creation follows the task stack lifecycle. diff --git a/infra/modules/aws/task_worker/data.tf b/infra/modules/aws/task_worker/data.tf new file mode 100644 index 00000000..e69de29b diff --git a/infra/modules/aws/task_worker/local.tf b/infra/modules/aws/task_worker/local.tf new file mode 100644 index 00000000..88347cfb --- /dev/null +++ b/infra/modules/aws/task_worker/local.tf @@ -0,0 +1,4 @@ +locals { + sqs_queue_name = "${var.project_name}-${var.environment}-ecs-worker-queue" + sqs_dlq_name = "${var.project_name}-${var.environment}-ecs-worker-dlq" +} diff --git a/infra/modules/aws/task_worker/main.tf b/infra/modules/aws/task_worker/main.tf index 7d50919e..095feab0 100644 --- a/infra/modules/aws/task_worker/main.tf +++ b/infra/modules/aws/task_worker/main.tf @@ -1,4 +1,11 @@ -module "task_consumer" { +module "sqs_queue" { + source = "../_shared/sqs" + + sqs_queue_name = local.sqs_queue_name + sqs_dlq_name = local.sqs_dlq_name +} + +module "task_worker" { source = "../_shared/task" project_name = var.project_name @@ -16,10 +23,29 @@ module "task_consumer" { local_tunnel = var.local_tunnel xray_enabled = var.xray_enabled - 
additional_env_vars = [] - additional_runtime_policy_arns = [] + additional_env_vars = [ + { + name = "AWS_SQS_QUEUE_URL" + value = module.sqs_queue.sqs_queue_url + }, + { + name = "HEARTBEAT_FILE" + value = "/tmp/worker-heartbeat" + } + ] + additional_runtime_policy_arns = [ + module.sqs_queue.sqs_queue_read_policy_arn + ] + + health_check = { + command = ["CMD-SHELL", "python -c \"import os, time; path=os.environ['HEARTBEAT_FILE']; now=time.time(); mtime=os.path.getmtime(path); raise SystemExit(0 if now - mtime < 180 else 1)\""] + interval = 60 + timeout = 5 + retries = 3 + start_period = 30 + } root_path = "" service_name = "ecs-worker" - command = ["python", "-u", "consumer/app.py"] + command = ["python", "-u", "app.py"] } diff --git a/infra/modules/aws/task_worker/outputs.tf b/infra/modules/aws/task_worker/outputs.tf index 8aadc299..20ee03fb 100644 --- a/infra/modules/aws/task_worker/outputs.tf +++ b/infra/modules/aws/task_worker/outputs.tf @@ -1,15 +1,27 @@ output "task_definition_arn" { - value = module.task_consumer.task_definition_arn + value = module.task_worker.task_definition_arn } output "cloudwatch_log_group" { - value = module.task_consumer.cloudwatch_log_group + value = module.task_worker.cloudwatch_log_group } output "root_path" { - value = module.task_consumer.root_path + value = module.task_worker.root_path } output "service_name" { - value = module.task_consumer.service_name + value = module.task_worker.service_name +} + +output "sqs_queue_name" { + value = module.sqs_queue.sqs_queue_name +} + +output "sqs_queue_url" { + value = module.sqs_queue.sqs_queue_url +} + +output "sqs_queue_read_policy_arn" { + value = module.sqs_queue.sqs_queue_read_policy_arn } diff --git a/justfile b/justfile index c57e5451..8033eddb 100644 --- a/justfile +++ b/justfile @@ -231,6 +231,7 @@ docker-build: docker build \ --file "{{PROJECT_DIR}}/Dockerfile" \ + --build-arg "SERVICE=$CONTAINER_NAME" \ --target "$CONTAINER_NAME" \ -t "$TAG" \ "{{PROJECT_DIR}}" @@ -869,7 
+870,7 @@ frontend-invalidate: exit 1 fi - MAX_ATTEMPTS=18 + MAX_ATTEMPTS=30 SLEEP_INTERVAL=10 echo "🔄 Creating CloudFront invalidation..."