From fed0be0098f6f22a559ea1e21a1c7c41af521e88 Mon Sep 17 00:00:00 2001 From: MarvelintheCloud Date: Wed, 20 May 2026 23:47:59 -0500 Subject: [PATCH 01/16] Add POC: Security and OAuth 2.0 Configuration --- docs/POC-Security-OAuth-Configuration.md | 166 +++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 docs/POC-Security-OAuth-Configuration.md diff --git a/docs/POC-Security-OAuth-Configuration.md b/docs/POC-Security-OAuth-Configuration.md new file mode 100644 index 0000000..42cddc9 --- /dev/null +++ b/docs/POC-Security-OAuth-Configuration.md @@ -0,0 +1,166 @@ +# POC: Security and OAuth 2.0 Configuration + +**Purpose:** Validate the repo-aligned OAuth 2.0 trust chain across client -> Container App -> APIM, including the three Entra app registrations and Graph-based app role assignments. + +## TL;DR (< 5 minutes) + +1. **Most important rule: each hop must use its own token audience (`aud`) and role check; do not pass the same token end-to-end.** +2. Create three app registrations: APIM protected API, ACA protected API, and client caller app. +3. Assign app roles with Microsoft Graph PowerShell when portal UI cannot target managed identities. + +## What you will observe + +- Client obtains a token for ACA (`aud = api://<ACA_APP_ID>`) and can call ACA when ACA auth is enabled. +- ACA uses managed identity to obtain a separate token for APIM (`aud = api://<APIM_APP_ID>`) and APIM accepts `roles = API.Caller`. +- App role assignment succeeds for identities not selectable in portal UI by using Graph PowerShell cmdlets. 
+ +## Reference + +| Setting | Value in this POC | Unit | Set in | Takes effect | +| :--- | :--- | :--- | :--- | :--- | +| APIM API app registration | `SimpleL7Proxy-APIM-API` | name | Entra App registrations | immediate | +| ACA API app registration | `SimpleL7Proxy-ACA-API` | name | Entra App registrations | immediate | +| Client app registration | `SimpleL7Proxy-Client` | name | Entra App registrations | immediate | +| App role value | `API.Caller` | claim value | APIM API + ACA API app registrations | token issuance | +| ACA scope value | `api.access` | scope value | ACA API app registration | token issuance | +| APIM audience | `api://` | URI | APIM `validate-jwt` policy | policy save | +| ACA audience | `api://` | URI | ACA auth config | config save | +| Client -> ACA token resource | `api://` | URI | token request | per request | +| ACA -> APIM token resource | `api:///.default` | URI | managed identity token request | per request | +| Graph module permission | `AppRoleAssignment.ReadWrite.All` | Graph scope | `Connect-MgGraph` | login session | + +> [!NOTE] +> Units used in this doc: all IDs are GUIDs; audiences are URI strings; role/scope values are string claims. + +## Setup + +### 1) Register APIM protected API app + +**Rule: APIM must validate tokens issued for the APIM API audience, not the ACA audience.** + +```text +Name: SimpleL7Proxy-APIM-API +Application ID URI: api:// +App role: API.Caller (Allowed member types: Applications) +``` + +> [!WARNING] +> If `Allowed member types` excludes `Applications`, app-to-app role assignment fails. + +### 2) Register ACA protected API app + +**Rule: ACA must expose its own audience and scope for inbound client tokens.** + +```text +Name: SimpleL7Proxy-ACA-API +Application ID URI: api:// +Scope: api.access +``` + +> [!TIP] +> Keep this audience distinct from APIM to avoid token confusion between hops. 
+ +### 3) Register client app + +**Rule: the client app needs permission to ACA scope and must be allowed by ACA auth policy.** + +```text +Name: SimpleL7Proxy-Client +API permission: ACA API -> Delegated -> api.access +Credential: client secret (or cert) +``` + +> [!NOTE] +> For service-to-service calls, use client credentials and validate `roles` where applicable. + +### 4) Assign app roles with PowerShell (Graph) + +**Rule: use Graph PowerShell for app role assignments when managed identities do not appear in portal options.** + +```powershell +Connect-MgGraph -Scopes "Application.Read.All AppRoleAssignment.ReadWrite.All" +Select-MgProfile -Name "v1.0" +Get-MgContext +``` + +> [!TIP] +> If `Connect-MgGraph` fails on permissions, sign in with an Entra admin account and consent the requested scopes. + +#### 4a) Assign ACA managed identity -> APIM API role (`API.Caller`) + +```powershell +$acaSpId = "" +$apimResourceSpId = "" +$apimRoleId = "" +New-MgServicePrincipalAppRoleAssignment -ServicePrincipalId $acaSpId -PrincipalId $acaSpId -ResourceId $apimResourceSpId -AppRoleId $apimRoleId +``` + +> [!WARNING] +> Use object IDs, not app IDs, for `ServicePrincipalId`, `PrincipalId`, and `ResourceId`. + +#### 4b) Assign client service principal -> ACA API role (`API.Caller`) + +```powershell +$clientSpId = "" +$acaResourceSpId = "" +$acaRoleId = "" +New-MgServicePrincipalAppRoleAssignment -ServicePrincipalId $clientSpId -PrincipalId $clientSpId -ResourceId $acaResourceSpId -AppRoleId $acaRoleId +``` + +> [!NOTE] +> If you use delegated-only access to ACA (`api.access`), keep this step optional; for app-role based enforcement, keep it required. 
+ +### 5) Configure ACA auth and APIM JWT validation + +**Rule: ACA validates client token audience; APIM validates ACA managed identity token audience and role.** + +```text +ACA allowed audience: api:// +APIM validate-jwt audience: api:// +APIM required claim: roles contains API.Caller +``` + +> [!WARNING] +> Passing ACA audience to APIM `validate-jwt` is a common misconfiguration and causes authorization failures. + +## Full flow + +```mermaid +flowchart LR + A[Client App Registration\n(SimpleL7Proxy-Client)] -->|Token aud=api://ACA_APP_ID| B[ACA Ingress + Easy Auth] + B -->|Validates ACA audience| C[SimpleL7Proxy in ACA] + C -->|Managed Identity token\naud=api://APIM_APP_ID/.default| D[APIM] + D -->|validate-jwt: audience + role API.Caller| E[Backend routing/policy] + + F[ACA API App Registration\n(SimpleL7Proxy-ACA-API)] -. defines audience/scope .-> B + G[APIM API App Registration\n(SimpleL7Proxy-APIM-API)] -. defines API.Caller role .-> D +``` + +## Worked example + +| Step | Example value | Result | +| :--- | :--- | :--- | +| Create APIM API app | `appId = 11111111-1111-1111-1111-111111111111` | APIM audience becomes `api://11111111-1111-1111-1111-111111111111` | +| Create ACA API app | `appId = 22222222-2222-2222-2222-222222222222` | ACA audience becomes `api://22222222-2222-2222-2222-222222222222` | +| Create client app | `appId = 33333333-3333-3333-3333-333333333333` | Client can request token for ACA audience | +| Assign ACA MI role on APIM API | `New-MgServicePrincipalAppRoleAssignment ...` | APIM accepts ACA token with `roles: API.Caller` | +| Request token in ACA for APIM | `resource = api://111.../.default` | ACA -> APIM call authorized | + +## Verify + +- [ ] APIM API app registration exists with app role `API.Caller`. +- [ ] ACA API app registration exists with scope `api.access` and identifier URI. +- [ ] Client app registration has permission to call ACA API. +- [ ] ACA managed identity is assigned to APIM API app role. 
+- [ ] ACA auth is enabled and configured with ACA audience. +- [ ] APIM `validate-jwt` checks APIM audience and `roles=API.Caller`. +- [ ] Client can call ACA with token for ACA audience. +- [ ] ACA can call APIM with managed identity token for APIM audience. + +## Related docs + +- [scripts/README.md](../scripts/README.md) +- [scripts/ca2apimSetup.sh](../scripts/ca2apimSetup.sh) +- [scripts/console2caSetup.sh](../scripts/console2caSetup.sh) +- [scripts/enableContainerAppAuth.sh](../scripts/enableContainerAppAuth.sh) +- [APIM-Policy/readme.md](../APIM-Policy/readme.md) From cb4273772adaa3d4d9139696a56e6035726de68d Mon Sep 17 00:00:00 2001 From: MarvelintheCloud Date: Thu, 21 May 2026 10:34:14 -0500 Subject: [PATCH 02/16] Fix Mermaid syntax in OAuth POC full-flow diagram, added jwt validation code, and added configuration steps for oauth in APIM interface --- docs/POC-Security-OAuth-Configuration.md | 82 +++++++++++++++++++----- 1 file changed, 67 insertions(+), 15 deletions(-) diff --git a/docs/POC-Security-OAuth-Configuration.md b/docs/POC-Security-OAuth-Configuration.md index 42cddc9..4406357 100644 --- a/docs/POC-Security-OAuth-Configuration.md +++ b/docs/POC-Security-OAuth-Configuration.md @@ -78,9 +78,9 @@ Credential: client secret (or cert) **Rule: use Graph PowerShell for app role assignments when managed identities do not appear in portal options.** ```powershell -Connect-MgGraph -Scopes "Application.Read.All AppRoleAssignment.ReadWrite.All" -Select-MgProfile -Name "v1.0" -Get-MgContext +Install-Module Microsoft.Graph.Applications -Scope CurrentUser -Repository PSGallery -Force +Import-Module Microsoft.Graph.Applications +Connect-MgGraph -TenantId "" -Scopes "Application.ReadWrite.All", "AppRoleAssignment.ReadWrite.All" ``` > [!TIP] @@ -89,14 +89,14 @@ Get-MgContext #### 4a) Assign ACA managed identity -> APIM API role (`API.Caller`) ```powershell -$acaSpId = "" +$acaSpId = "" $apimResourceSpId = "" $apimRoleId = "" New-MgServicePrincipalAppRoleAssignment 
-ServicePrincipalId $acaSpId -PrincipalId $acaSpId -ResourceId $apimResourceSpId -AppRoleId $apimRoleId ``` > [!WARNING] -> Use object IDs, not app IDs, for `ServicePrincipalId`, `PrincipalId`, and `ResourceId`. +> Use object IDs, not app IDs, for `ServicePrincipalId`, `PrincipalId`, and `ResourceId`. The managed identity object ID can be found on the ACA resource itself. For the APIM Service Principal, you must use the object ID found under Enterprise Apps in Entra, NOT under the corresponding App Registrations. #### 4b) Assign client service principal -> ACA API role (`API.Caller`) @@ -108,7 +108,7 @@ New-MgServicePrincipalAppRoleAssignment -ServicePrincipalId $clientSpId -Princip ``` > [!NOTE] -> If you use delegated-only access to ACA (`api.access`), keep this step optional; for app-role based enforcement, keep it required. +> If you use delegated-only access to ACA (`api.access`), keep this step optional; for app-role based enforcement, keep it required. For the Service Principal object IDs, you must use the object ID found under Enterprise Apps in Entra, NOT under the corresponding App Registrations. ### 5) Configure ACA auth and APIM JWT validation @@ -123,26 +123,78 @@ APIM required claim: roles contains API.Caller > [!WARNING] > Passing ACA audience to APIM `validate-jwt` is a common misconfiguration and causes authorization failures. +#### 5a) APIM inbound `validate-jwt` policy (example) + +**Rule: APIM must validate issuer + audience + role on the token ACA presents to APIM.** + +```xml +<inbound> + <base /> + <validate-jwt header-name="Authorization" failed-validation-httpcode="401" failed-validation-error-message="Unauthorized. Missing or invalid token."> + <openid-config url="https://login.microsoftonline.com/<TENANT_ID>/v2.0/.well-known/openid-configuration" /> + <audiences> + <audience>api://<APIM_APP_ID></audience> + </audiences> + <issuers> + <issuer>https://login.microsoftonline.com/<TENANT_ID>/v2.0</issuer> + </issuers> + <required-claims> + <claim name="roles" match="any"> + <value>API.Caller</value> + </claim> + </required-claims> + </validate-jwt> +</inbound> +``` + +> [!TIP] +> If your API uses delegated user tokens instead of app roles, validate `scp` instead of `roles`. + +#### 5b) Configure OAuth 2.0 in APIM interface (portal) + +**Rule: configure an APIM OAuth 2.0 authorization server for interactive auth/testing; keep `validate-jwt` as the enforcement control on APIs.** + +1. In Azure portal, open your APIM instance. +2. 
Go to Security -> OAuth 2.0 + OpenID Connect -> Add OAuth 2.0 server. +3. Set these fields: + - Display name: `EntraOAuth` (or your standard name) + - Grant types: `Authorization code` (and `Client credentials` if needed) + - Client ID: `` + - Client secret: `` + - Authorization endpoint URL: `https://login.microsoftonline.com//oauth2/v2.0/authorize` + - Token endpoint URL: `https://login.microsoftonline.com//oauth2/v2.0/token` + - Default scope: `api:///api.access` (or your API scope) +4. Save the OAuth 2.0 server. +5. Open your API in APIM -> Settings and attach this OAuth 2.0 server under Security if you want Developer Portal Authorize support. +6. Open your API -> Design -> Inbound processing and ensure the `validate-jwt` policy above is present. + +> [!NOTE] +> APIM OAuth server configuration enables the Authorize experience; token acceptance is still controlled by the API policy (`validate-jwt`). + ## Full flow ```mermaid flowchart LR - A[Client App Registration\n(SimpleL7Proxy-Client)] -->|Token aud=api://ACA_APP_ID| B[ACA Ingress + Easy Auth] - B -->|Validates ACA audience| C[SimpleL7Proxy in ACA] - C -->|Managed Identity token\naud=api://APIM_APP_ID/.default| D[APIM] - D -->|validate-jwt: audience + role API.Caller| E[Backend routing/policy] + A["Client App Registration<br/>SimpleL7Proxy-Client"] -->|"Token aud=api://ACA_APP_ID"| B["ACA Ingress + Easy Auth"] + B -->|"Validates ACA audience"| C["SimpleL7Proxy in ACA"] + C -->|"Managed identity token<br/>aud=api://APIM_APP_ID/.default"| D["APIM"] + D -->|"validate-jwt: audience + role API.Caller"| E["Backend routing/policy"] - F[ACA API App Registration\n(SimpleL7Proxy-ACA-API)] -. defines audience/scope .-> B - G[APIM API App Registration\n(SimpleL7Proxy-APIM-API)] -. defines API.Caller role .-> D + F["ACA API App Registration<br/>SimpleL7Proxy-ACA-API"] -. "Defines audience and scope" .-> B + G["APIM API App Registration<br/>SimpleL7Proxy-APIM-API"] -. "Defines API.Caller role" .-> D ``` ## Worked example | Step | Example value | Result | | :--- | :--- | :--- | -| Create APIM API app | `appId = 11111111-1111-1111-1111-111111111111` | APIM audience becomes `api://11111111-1111-1111-1111-111111111111` | -| Create ACA API app | `appId = 22222222-2222-2222-2222-222222222222` | ACA audience becomes `api://22222222-2222-2222-2222-222222222222` | -| Create client app | `appId = 33333333-3333-3333-3333-333333333333` | Client can request token for ACA audience | +| Create APIM API app registration | `appId = 11111111-1111-1111-1111-111111111111` | APIM audience becomes `api://11111111-1111-1111-1111-111111111111` | +| Create ACA API app registration | `appId = 22222222-2222-2222-2222-222222222222` | ACA audience becomes `api://22222222-2222-2222-2222-222222222222` | +| Create client app registration | `appId = 33333333-3333-3333-3333-333333333333` | Client can request token for ACA audience | | Assign ACA MI role on APIM API | `New-MgServicePrincipalAppRoleAssignment ...` | APIM accepts ACA token with `roles: API.Caller` | | Request token in ACA for APIM | `resource = api://111.../.default` | ACA -> APIM call authorized | From bd708e8ade6de6a3666fb4a11646439131f823dd Mon Sep 17 00:00:00 2001 From: MarvelintheCloud Date: Thu, 21 May 2026 10:53:01 -0500 Subject: [PATCH 03/16] Add APIM policy test steps for OAuth configuration POC --- docs/POC-Security-OAuth-Configuration.md | 99 +++++++++++++++++++++++- 1 file changed, 97 insertions(+), 2 deletions(-) diff --git a/docs/POC-Security-OAuth-Configuration.md b/docs/POC-Security-OAuth-Configuration.md index 4406357..b3b8575 100644 --- a/docs/POC-Security-OAuth-Configuration.md +++ b/docs/POC-Security-OAuth-Configuration.md @@ -25,6 +25,7 @@ | ACA scope value | `api.access` | scope value | ACA API app registration | token issuance | | APIM audience | `api://` | URI | APIM `validate-jwt` policy | policy save | | ACA audience | `api://` | URI | ACA auth 
config | config save | +| Client secret requirement by app | APIM API app: `No`; ACA API app: `Yes` (when used by ACA auth config); Client app: `Yes` (client credentials) | flag | Entra App registrations | immediate | | Client -> ACA token resource | `api://` | URI | token request | per request | | ACA -> APIM token resource | `api:///.default` | URI | managed identity token request | per request | | Graph module permission | `AppRoleAssignment.ReadWrite.All` | Graph scope | `Connect-MgGraph` | login session | @@ -73,6 +74,24 @@ Credential: client secret (or cert) > [!NOTE] > For service-to-service calls, use client credentials and validate `roles` where applicable. +### 3a) Client secret requirements by app registration + +**Rule: only apps that actively request tokens as confidential clients need a client secret.** + +1. APIM protected API app (`SimpleL7Proxy-APIM-API`): no client secret required for this POC. +2. ACA protected API app (`SimpleL7Proxy-ACA-API`): create a client secret if you configure ACA Easy Auth with Entra app credentials (`-c` and `-s` values in `enableContainerAppAuth.sh`). +3. Client app (`SimpleL7Proxy-Client`): create a client secret (or certificate) when using client credentials flow. + +Portal steps to create a secret: + +1. Entra ID -> App registrations -> select the app. +2. Go to Certificates & secrets -> New client secret. +3. Add description + expiry, then create. +4. Copy the secret Value immediately and store it securely. + +> [!WARNING] +> Secret values are shown only once. If lost, create a new secret and update ACA/APIM config that depends on it. + ### 4) Assign app roles with PowerShell (Graph) **Rule: use Graph PowerShell for app role assignments when managed identities do not appear in portal options.** @@ -163,8 +182,8 @@ APIM required claim: roles contains API.Caller 3. 
Set these fields: - Display name: `EntraOAuth` (or your standard name) - Grant types: `Authorization code` (and `Client credentials` if needed) - - Client ID: `` - - Client secret: `` + - Client ID: `` (typically the client app registration) + - Client secret: `` - Authorization endpoint URL: `https://login.microsoftonline.com//oauth2/v2.0/authorize` - Token endpoint URL: `https://login.microsoftonline.com//oauth2/v2.0/token` - Default scope: `api:///api.access` (or your API scope) @@ -175,6 +194,82 @@ APIM required claim: roles contains API.Caller > [!NOTE] > APIM OAuth server configuration enables the Authorize experience; token acceptance is still controlled by the API policy (`validate-jwt`). +### 6) Test APIM policy after configuration + +**Rule: validate both positive and negative paths to confirm `validate-jwt` is enforcing audience and role correctly.** + +Set your test variables first: + +```bash +APIM_BASE="https://.azure-api.net/" +APIM_SUB_KEY="" +TENANT_ID="" +APIM_APP_ID="" +``` + +#### 6a) Positive test: ACA managed identity (or equivalent caller) succeeds + +```bash +# This token should be requested for APIM audience: api:///.default +TOKEN="" + +curl -i "$APIM_BASE/health" \ + -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" \ + -H "Authorization: Bearer $TOKEN" +``` + +Expected result: + +- `200` (or your API's expected success code) +- No `Unauthorized. Missing or invalid token.` message + +#### 6b) Negative test: no token should fail + +```bash +curl -i "$APIM_BASE/health" \ + -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" +``` + +Expected result: + +- `401 Unauthorized` +- Error from `validate-jwt` policy + +#### 6c) Negative test: wrong audience should fail + +```bash +# Use a token for ACA audience instead of APIM audience. 
+BAD_TOKEN="" + +curl -i "$APIM_BASE/health" \ + -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" \ + -H "Authorization: Bearer $BAD_TOKEN" +``` + +Expected result: + +- `401 Unauthorized` +- Audience validation failure + +#### 6d) Negative test: missing role should fail + +```bash +# Use a token that has APIM audience but lacks roles: API.Caller. +NO_ROLE_TOKEN="" + +curl -i "$APIM_BASE/health" \ + -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" \ + -H "Authorization: Bearer $NO_ROLE_TOKEN" +``` + +Expected result: + +- `401 Unauthorized` +- Required claim (`roles=API.Caller`) validation failure + +> [!TIP] +> For fast diagnosis, temporarily project token claims in APIM trace and verify `aud`, `iss`, and `roles` match your `validate-jwt` policy. + ## Full flow ```mermaid From d3a380a3eee89a6050444a05f4a95e4cd7c7ec2f Mon Sep 17 00:00:00 2001 From: MarvelintheCloud Date: Thu, 21 May 2026 11:02:19 -0500 Subject: [PATCH 04/16] Added instructions for creating app registrations --- docs/POC-Security-OAuth-Configuration.md | 62 ++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/docs/POC-Security-OAuth-Configuration.md b/docs/POC-Security-OAuth-Configuration.md index b3b8575..e19d8a0 100644 --- a/docs/POC-Security-OAuth-Configuration.md +++ b/docs/POC-Security-OAuth-Configuration.md @@ -35,7 +35,7 @@ ## Setup -### 1) Register APIM protected API app +### 1) Create an App Registration for APIM in Entra **Rule: APIM must validate tokens issued for the APIM API audience, not the ACA audience.** @@ -45,10 +45,27 @@ Application ID URI: api:// App role: API.Caller (Allowed member types: Applications) ``` +Portal steps (repo-aligned): + +1. Go to Entra ID -> App registrations -> New registration. +2. Name it `SimpleL7Proxy-APIM-API` (or your environment naming standard), then create. +3. Open Expose an API -> Set Application ID URI -> `api://`. +4. 
Open App roles -> Create app role with: + - Display name: `Caller` + - Allowed member types: `Users/Groups` and `Applications` + - Value: `API.Caller` + - Description: `Caller` + - Enable app role: `Yes` +5. Open Enterprise applications -> find this app's service principal -> set assignment required to `Yes` (repo script equivalent: `appRoleAssignmentRequired=true`). +6. Capture and save these IDs for later steps: + - App (client) ID (`APIM_APP_ID`) + - Service principal object ID (`APIM_API_SERVICE_PRINCIPAL_OBJECT_ID`) + - App role ID for `API.Caller` (`APIM_API_CALLER_ROLE_ID`) + > [!WARNING] > If `Allowed member types` excludes `Applications`, app-to-app role assignment fails. -### 2) Register ACA protected API app +### 2) Create an App Registration for ACA in Entra **Rule: ACA must expose its own audience and scope for inbound client tokens.** @@ -58,10 +75,32 @@ Application ID URI: api:// Scope: api.access ``` +Portal steps (repo-aligned): + +1. Go to Entra ID -> App registrations -> New registration. +2. Name it `SimpleL7Proxy-ACA-API`, then create. +3. Open Expose an API -> Set Application ID URI -> `api://`. +4. In Expose an API -> Add a scope with: + - Scope name/value: `api.access` + - Who can consent: `Admins only` (repo script sets scope type `Admin`) + - Admin consent display name: `Admin Access` + - Admin consent description: `Access the API` + - State: `Enabled` +5. Open App roles -> Create app role with: + - Display name: `Caller` + - Allowed member types: `Users/Groups` and `Applications` + - Value: `API.Caller` + - Enable app role: `Yes` +6. Open Enterprise applications -> find this app's service principal -> set assignment required to `Yes`. +7. Capture and save these IDs for later steps: + - App (client) ID (`ACA_APP_ID`) + - Service principal object ID (`ACA_API_SERVICE_PRINCIPAL_OBJECT_ID`) + - App role ID for `API.Caller` (`ACA_API_CALLER_ROLE_ID`) + > [!TIP] > Keep this audience distinct from APIM to avoid token confusion between hops. 
-### 3) Register client app +### 3) Create an App Registration for client app in Entra **Rule: the client app needs permission to ACA scope and must be allowed by ACA auth policy.** @@ -71,6 +110,23 @@ API permission: ACA API -> Delegated -> api.access Credential: client secret (or cert) ``` +Portal steps (repo-aligned): + +1. Go to Entra ID -> App registrations -> New registration. +2. Name it `SimpleL7Proxy-Client`, then create. +3. Open API permissions -> Add a permission -> My APIs -> select `SimpleL7Proxy-ACA-API`. +4. Add delegated permission `api.access`. +5. If required by tenant policy, select Grant admin consent. +6. Open Certificates & secrets -> create a client secret (or configure a certificate). +7. Ensure a service principal exists for this app in Enterprise applications (repo script equivalent creates one explicitly). +8. Capture and save: + - App (client) ID (`CLIENT_APP_ID`) + - Service principal object ID (`CLIENT_SERVICE_PRINCIPAL_OBJECT_ID`) + - Client secret value (`CLIENT_SECRET`) + +> [!NOTE] +> The repo scripts use this client identity as the caller to ACA and then assign app roles to its service principal as needed. + > [!NOTE] > For service-to-service calls, use client credentials and validate `roles` where applicable. 
From 4c1b2b1e4aa62d7afe396144054ac468d4b29fa9 Mon Sep 17 00:00:00 2001 From: MarvelintheCloud Date: Thu, 21 May 2026 20:31:41 -0500 Subject: [PATCH 05/16] Split security OAuth POC into ACA and APIM focused docs --- docs/POC-ACA-Proxy-Security-Authorization.md | 201 ++++++++++ docs/POC-APIM-Security-Authorization.md | 207 ++++++++++ docs/POC-Security-OAuth-Configuration.md | 378 +------------------ 3 files changed, 425 insertions(+), 361 deletions(-) create mode 100644 docs/POC-ACA-Proxy-Security-Authorization.md create mode 100644 docs/POC-APIM-Security-Authorization.md diff --git a/docs/POC-ACA-Proxy-Security-Authorization.md b/docs/POC-ACA-Proxy-Security-Authorization.md new file mode 100644 index 0000000..07920a8 --- /dev/null +++ b/docs/POC-ACA-Proxy-Security-Authorization.md @@ -0,0 +1,201 @@ +# POC: ACA Proxy Security and Authorization + +**Purpose:** Validate inbound OAuth 2.0 authentication and authorization for the SimpleL7Proxy Container App (ACA), including Entra app registration setup, ACA auth configuration, and caller validation. + +## TL;DR (< 5 minutes) + +1. **Most important rule: tokens sent to ACA must have `aud = api://`.** +2. Create two Entra apps for this hop: ACA API app and client caller app. +3. Enable ACA authentication and test both success and failure paths. + +## What you will observe + +- A token minted for `api://` is accepted by ACA. +- Requests without a token or with the wrong audience are rejected. +- The client identity can be restricted through allowed applications and app role assignment. 
+ +## Reference + +| Setting | Value in this POC | Unit | Set in | Takes effect | +| :--- | :--- | :--- | :--- | :--- | +| ACA API app registration | `SimpleL7Proxy-ACA-API` | name | Entra App registrations | immediate | +| Client app registration | `SimpleL7Proxy-Client` | name | Entra App registrations | immediate | +| ACA audience | `api://` | URI | ACA auth config | config save | +| ACA scope | `api.access` | scope value | ACA API app | token issuance | +| App role value | `API.Caller` | claim value | ACA API app | token issuance | +| Client secret requirement | ACA API app: `Yes` when used in ACA auth config; Client app: `Yes` for client credentials flow | flag | Entra App registrations | immediate | + +> [!NOTE] +> Units used in this doc: IDs are GUIDs, audience values are URI strings, and roles/scopes are string claims. + +## Setup + +### 1) Create ACA API app registration in Entra + +**Rule: the ACA proxy must expose its own audience and scope for inbound client tokens.** + +```text +Name: SimpleL7Proxy-ACA-API +Application ID URI: api:// +Scope: api.access +App role: API.Caller +``` + +Portal steps (repo-aligned): + +1. Entra ID -> App registrations -> New registration. +2. Name: `SimpleL7Proxy-ACA-API`. +3. Expose an API -> Set Application ID URI -> `api://`. +4. Expose an API -> Add scope: + - Scope value: `api.access` + - Who can consent: `Admins only` + - Admin consent display name: `Admin Access` + - Admin consent description: `Access the API` + - State: `Enabled` +5. App roles -> Create app role: + - Display name: `Caller` + - Allowed member types: `Users/Groups` and `Applications` + - Value: `API.Caller` + - Enable app role: `Yes` +6. Enterprise applications -> corresponding service principal -> set assignment required to `Yes`. +7. 
Save IDs: + - `ACA_APP_ID` + - `ACA_API_SERVICE_PRINCIPAL_OBJECT_ID` + - `ACA_API_CALLER_ROLE_ID` + +### 2) Create client app registration in Entra + +**Rule: the caller must be able to request a token for ACA scope and present it to ACA.** + +```text +Name: SimpleL7Proxy-Client +Permission: ACA API delegated permission api.access +Credential: client secret (or certificate) +``` + +Portal steps: + +1. Entra ID -> App registrations -> New registration. +2. Name: `SimpleL7Proxy-Client`. +3. API permissions -> Add permission -> My APIs -> `SimpleL7Proxy-ACA-API`. +4. Add delegated permission `api.access`. +5. Grant admin consent if required by tenant policy. +6. Certificates and secrets -> New client secret. +7. Save IDs/secrets: + - `CLIENT_APP_ID` + - `CLIENT_SERVICE_PRINCIPAL_OBJECT_ID` + - `CLIENT_SECRET` + +### 3) Configure ACA authentication + +**Rule: ACA auth must validate the ACA audience and only allow authorized caller applications.** + +Script-aligned configuration using repo flow: + +```bash +./scripts/enableContainerAppAuth.sh \ + -g \ + -n \ + -t \ + -c \ + -s \ + -a +``` + +Equivalent portal checks: + +1. Container Apps -> your app -> Authentication. +2. Identity provider: Microsoft. +3. Tenant: ``. +4. Client ID: ``. +5. Client secret: ACA app secret. +6. Allowed token audiences includes `api://`. +7. Allowed applications includes ``. +8. Unauthenticated requests set to `401`/reject. + +> [!WARNING] +> Use tenant ID for `-t`; do not use a service principal object ID in this field. 
+ +### 4) Optional app-role assignment for client service principal + +**Rule: if you enforce role-based app auth, assign `API.Caller` to the client service principal on ACA API.** + +```powershell +Connect-MgGraph -TenantId "<TENANT_ID>" -Scopes "Application.ReadWrite.All", "AppRoleAssignment.ReadWrite.All" +$clientSpId = "<CLIENT_SERVICE_PRINCIPAL_OBJECT_ID>" +$acaResourceSpId = "<ACA_API_SERVICE_PRINCIPAL_OBJECT_ID>" +$acaRoleId = "<ACA_API_CALLER_ROLE_ID>" +New-MgServicePrincipalAppRoleAssignment -ServicePrincipalId $clientSpId -PrincipalId $clientSpId -ResourceId $acaResourceSpId -AppRoleId $acaRoleId +``` + +## Full flow + +```mermaid +flowchart LR + A["Client App<br/>SimpleL7Proxy-Client"] -->|"Bearer token aud=api://ACA_APP_ID"| B["ACA Ingress + Easy Auth"] + B -->|"Validate audience + app constraints"| C["SimpleL7Proxy in ACA"] + D["ACA API App<br/>SimpleL7Proxy-ACA-API"] -. "Defines audience, scope, and optional role" .-> B +``` + +## Worked example + +| Step | Example value | Result | +| :--- | :--- | :--- | +| Create ACA API app | `appId = 22222222-2222-2222-2222-222222222222` | ACA audience is `api://22222222-2222-2222-2222-222222222222` | +| Create client app | `appId = 33333333-3333-3333-3333-333333333333` | Client can request ACA token | +| Enable ACA auth | Allowed audience set to ACA URI | ACA validates incoming bearer tokens | +| Send valid token | `aud` matches ACA URI | Request succeeds | +| Send token with wrong audience | `aud` is APIM URI | Request fails with `401` | + +## Test ACA authorization + +**Rule: run one success test and two failure tests to validate auth controls.** + +Set variables: + +```bash +ACA_FQDN="https://<ACA_APP_FQDN>" +ACA_RESOURCE="api://<ACA_APP_ID>" +``` + +Positive test: + +```bash +TOKEN="$(az account get-access-token --resource "$ACA_RESOURCE" --query accessToken -o tsv)" +curl -i "$ACA_FQDN/health" -H "Authorization: Bearer $TOKEN" +``` + +Expected: success response from ACA. + +Negative test (no token): + +```bash +curl -i "$ACA_FQDN/health" +``` + +Expected: `401 Unauthorized`. + +Negative test (wrong audience token): + +```bash +BAD_TOKEN="$(az account get-access-token --resource api://<APIM_APP_ID> --query accessToken -o tsv)" +curl -i "$ACA_FQDN/health" -H "Authorization: Bearer $BAD_TOKEN" +``` + +Expected: `401 Unauthorized` due to audience mismatch. + +## Verify + +- [ ] ACA API app exists with `api://<ACA_APP_ID>` identifier URI. +- [ ] ACA API app exposes scope `api.access`. +- [ ] Client app has delegated permission to `api.access`. +- [ ] ACA auth is enabled and allowed audience includes ACA URI. +- [ ] Allowed applications includes client app ID. +- [ ] Valid ACA token succeeds; missing/wrong-audience token fails. 
+ +## Related docs + +- [scripts/README.md](../scripts/README.md) +- [scripts/console2caSetup.sh](../scripts/console2caSetup.sh) +- [scripts/enableContainerAppAuth.sh](../scripts/enableContainerAppAuth.sh) +- [POC-APIM-Security-Authorization.md](POC-APIM-Security-Authorization.md) diff --git a/docs/POC-APIM-Security-Authorization.md b/docs/POC-APIM-Security-Authorization.md new file mode 100644 index 0000000..2afc8ff --- /dev/null +++ b/docs/POC-APIM-Security-Authorization.md @@ -0,0 +1,207 @@ +# POC: APIM Security and Authorization + +**Purpose:** Validate OAuth 2.0 authentication and authorization at APIM for calls coming from ACA, including Entra app registration, app-role assignment, APIM OAuth interface setup, and `validate-jwt` enforcement. + +## TL;DR (< 5 minutes) + +1. **Most important rule: APIM must only accept tokens with `aud = api://<APIM_APP_ID>` and `roles = API.Caller`.** +2. Create an APIM API app registration and assign `API.Caller` to ACA managed identity. +3. Configure APIM `validate-jwt` and test both success and failure paths. + +## What you will observe + +- APIM accepts calls with valid bearer tokens minted for APIM audience and role. +- APIM rejects calls with no token, wrong audience, or missing role. +- OAuth server config in APIM enables interactive authorization, while policy still enforces acceptance. 
+ +## Reference + +| Setting | Value in this POC | Unit | Set in | Takes effect | +| :--- | :--- | :--- | :--- | :--- | +| APIM API app registration | `SimpleL7Proxy-APIM-API` | name | Entra App registrations | immediate | +| App role value | `API.Caller` | claim value | APIM API app | token issuance | +| APIM audience | `api://` | URI | APIM `validate-jwt` policy | policy save | +| ACA caller identity | ACA managed identity service principal | principal | ACA + Entra | immediate | +| OAuth server client secret | secret from client app used in APIM OAuth server UI | secret | Entra + APIM | save | + +> [!NOTE] +> Units used in this doc: IDs are GUIDs and audience values are URI strings. + +## Setup + +### 1) Create APIM API app registration in Entra + +**Rule: APIM policy audience must match this app's identifier URI.** + +```text +Name: SimpleL7Proxy-APIM-API +Application ID URI: api:// +App role: API.Caller +``` + +Portal steps (repo-aligned): + +1. Entra ID -> App registrations -> New registration. +2. Name: `SimpleL7Proxy-APIM-API`. +3. Expose an API -> Set Application ID URI -> `api://`. +4. App roles -> Create app role: + - Display name: `Caller` + - Allowed member types: `Users/Groups` and `Applications` + - Value: `API.Caller` + - Enable app role: `Yes` +5. Enterprise applications -> corresponding service principal -> set assignment required to `Yes`. +6. 
Save IDs: + - `APIM_APP_ID` + - `APIM_API_SERVICE_PRINCIPAL_OBJECT_ID` + - `APIM_API_CALLER_ROLE_ID` + +### 2) Assign ACA managed identity to APIM role + +**Rule: ACA managed identity must have `API.Caller` role on APIM API enterprise app.** + +```powershell +Connect-MgGraph -TenantId "" -Scopes "Application.ReadWrite.All", "AppRoleAssignment.ReadWrite.All" +$acaSpId = "" +$apimResourceSpId = "" +$apimRoleId = "" +New-MgServicePrincipalAppRoleAssignment -ServicePrincipalId $acaSpId -PrincipalId $acaSpId -ResourceId $apimResourceSpId -AppRoleId $apimRoleId +``` + +> [!WARNING] +> Use service principal object IDs from Enterprise applications, not app registration app IDs. + +### 3) Configure APIM inbound `validate-jwt` policy + +**Rule: APIM must validate issuer, audience, and role before backend routing.** + +```xml + + + + + + api:// + + + https://login.microsoftonline.com//v2.0 + + + + API.Caller + + + + +``` + +> [!TIP] +> If your APIM endpoint accepts delegated user tokens, validate `scp` instead of `roles`. + +### 4) Configure OAuth 2.0 in APIM interface + +**Rule: OAuth server settings support authorize/testing UX, while `validate-jwt` policy remains the true gate.** + +1. Azure portal -> APIM instance. +2. Security -> OAuth 2.0 + OpenID Connect -> Add OAuth 2.0 server. +3. Configure: + - Display name: `EntraOAuth` + - Grant types: `Authorization code` and optionally `Client credentials` + - Client ID: `` + - Client secret: `` + - Authorization endpoint: `https://login.microsoftonline.com//oauth2/v2.0/authorize` + - Token endpoint: `https://login.microsoftonline.com//oauth2/v2.0/token` + - Default scope: `api:///.default` (or your API scope) +4. Save. +5. API -> Settings -> attach OAuth server if Developer Portal Authorize is needed. +6. API -> Design -> verify `validate-jwt` is present in inbound policy. 
+ +## Full flow + +```mermaid +flowchart LR + A["ACA Managed Identity"] -->|"Bearer token aud=api://APIM_APP_ID"| B["APIM validate-jwt"] + B -->|"Checks issuer + audience + roles=API.Caller"| C["APIM backend routing"] + D["APIM API App
SimpleL7Proxy-APIM-API"] -. "Defines audience and API.Caller role" .-> B +``` + +## Worked example + +| Step | Example value | Result | +| :--- | :--- | :--- | +| Create APIM API app | `appId = 11111111-1111-1111-1111-111111111111` | APIM audience is `api://11111111-1111-1111-1111-111111111111` | +| Assign ACA managed identity role | `API.Caller` granted | ACA token can satisfy APIM role check | +| Apply validate-jwt policy | audience + role checks active | Unauthorized tokens are blocked | +| Send valid APIM token | `aud` and `roles` match | Request succeeds | +| Send ACA audience token | `aud` mismatch | Request fails with `401` | + +## Test APIM authorization policy + +**Rule: run one positive and three negative tests to confirm policy enforcement.** + +Set variables: + +```bash +APIM_BASE="https://.azure-api.net/" +APIM_SUB_KEY="" +``` + +Positive test: + +```bash +TOKEN="" +curl -i "$APIM_BASE/health" \ + -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" \ + -H "Authorization: Bearer $TOKEN" +``` + +Expected: success response. + +Negative test (no token): + +```bash +curl -i "$APIM_BASE/health" -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" +``` + +Expected: `401 Unauthorized`. + +Negative test (wrong audience): + +```bash +BAD_TOKEN="" +curl -i "$APIM_BASE/health" \ + -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" \ + -H "Authorization: Bearer $BAD_TOKEN" +``` + +Expected: `401 Unauthorized` due to audience mismatch. + +Negative test (missing role): + +```bash +NO_ROLE_TOKEN="" +curl -i "$APIM_BASE/health" \ + -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" \ + -H "Authorization: Bearer $NO_ROLE_TOKEN" +``` + +Expected: `401 Unauthorized` due to missing required claim. + +## Verify + +- [ ] APIM API app registration exists with identifier URI `api://`. +- [ ] App role `API.Caller` exists and allows `Applications`. +- [ ] ACA managed identity is assigned `API.Caller` on APIM API enterprise app. +- [ ] APIM inbound policy validates issuer, audience, and role. 
+- [ ] Valid APIM token succeeds. +- [ ] Missing token, wrong audience, and missing role all fail with `401`. + +## Related docs + +- [scripts/README.md](../scripts/README.md) +- [scripts/ca2apimSetup.sh](../scripts/ca2apimSetup.sh) +- [APIM-Policy/readme.md](../APIM-Policy/readme.md) +- [POC-ACA-Proxy-Security-Authorization.md](POC-ACA-Proxy-Security-Authorization.md) diff --git a/docs/POC-Security-OAuth-Configuration.md b/docs/POC-Security-OAuth-Configuration.md index e19d8a0..3250103 100644 --- a/docs/POC-Security-OAuth-Configuration.md +++ b/docs/POC-Security-OAuth-Configuration.md @@ -1,369 +1,25 @@ -# POC: Security and OAuth 2.0 Configuration +# POC: Security and OAuth 2.0 Configuration (Split Index) -**Purpose:** Validate the repo-aligned OAuth 2.0 trust chain across client -> Container App -> APIM, including the three Entra app registrations and Graph-based app role assignments. +**Purpose:** This index replaces the previous combined security POC and points to two focused runbooks. -## TL;DR (< 5 minutes) +## TL;DR -1. **Most important rule: each hop must use its own token audience (`aud`) and role check; do not pass the same token end-to-end.** -2. Create three app registrations: APIM protected API, ACA protected API, and client caller app. -3. Assign app roles with Microsoft Graph PowerShell when portal UI cannot target managed identities. +1. **Most important rule: use the doc that matches the security boundary you are configuring.** +2. Use the ACA document for client -> ACA authentication and authorization. +3. Use the APIM document for ACA -> APIM token validation and policy enforcement. -## What you will observe +## Use these documents -- Client obtains a token for ACA (`aud = api://`) and can call ACA when ACA auth is enabled. -- ACA uses managed identity to obtain a separate token for APIM (`aud = api://`) and APIM accepts `roles = API.Caller`. 
-- App role assignment succeeds for identities not selectable in portal UI by using Graph PowerShell cmdlets. +- [POC-ACA-Proxy-Security-Authorization.md](POC-ACA-Proxy-Security-Authorization.md) + - Focus: securing and authorizing the ACA proxy ingress. + - Includes: ACA app registration, client app registration, ACA auth setup, and ACA validation tests. -## Reference +- [POC-APIM-Security-Authorization.md](POC-APIM-Security-Authorization.md) + - Focus: securing and authorizing APIM. + - Includes: APIM app registration, ACA managed identity role assignment, APIM OAuth interface setup, `validate-jwt` policy, and APIM policy tests. -| Setting | Value in this POC | Unit | Set in | Takes effect | -| :--- | :--- | :--- | :--- | :--- | -| APIM API app registration | `SimpleL7Proxy-APIM-API` | name | Entra App registrations | immediate | -| ACA API app registration | `SimpleL7Proxy-ACA-API` | name | Entra App registrations | immediate | -| Client app registration | `SimpleL7Proxy-Client` | name | Entra App registrations | immediate | -| App role value | `API.Caller` | claim value | APIM API + ACA API app registrations | token issuance | -| ACA scope value | `api.access` | scope value | ACA API app registration | token issuance | -| APIM audience | `api://` | URI | APIM `validate-jwt` policy | policy save | -| ACA audience | `api://` | URI | ACA auth config | config save | -| Client secret requirement by app | APIM API app: `No`; ACA API app: `Yes` (when used by ACA auth config); Client app: `Yes` (client credentials) | flag | Entra App registrations | immediate | -| Client -> ACA token resource | `api://` | URI | token request | per request | -| ACA -> APIM token resource | `api:///.default` | URI | managed identity token request | per request | -| Graph module permission | `AppRoleAssignment.ReadWrite.All` | Graph scope | `Connect-MgGraph` | login session | +## Why the split -> [!NOTE] -> Units used in this doc: all IDs are GUIDs; audiences are URI strings; role/scope 
values are string claims. - -## Setup - -### 1) Create an App Registration for APIM in Entra - -**Rule: APIM must validate tokens issued for the APIM API audience, not the ACA audience.** - -```text -Name: SimpleL7Proxy-APIM-API -Application ID URI: api:// -App role: API.Caller (Allowed member types: Applications) -``` - -Portal steps (repo-aligned): - -1. Go to Entra ID -> App registrations -> New registration. -2. Name it `SimpleL7Proxy-APIM-API` (or your environment naming standard), then create. -3. Open Expose an API -> Set Application ID URI -> `api://`. -4. Open App roles -> Create app role with: - - Display name: `Caller` - - Allowed member types: `Users/Groups` and `Applications` - - Value: `API.Caller` - - Description: `Caller` - - Enable app role: `Yes` -5. Open Enterprise applications -> find this app's service principal -> set assignment required to `Yes` (repo script equivalent: `appRoleAssignmentRequired=true`). -6. Capture and save these IDs for later steps: - - App (client) ID (`APIM_APP_ID`) - - Service principal object ID (`APIM_API_SERVICE_PRINCIPAL_OBJECT_ID`) - - App role ID for `API.Caller` (`APIM_API_CALLER_ROLE_ID`) - -> [!WARNING] -> If `Allowed member types` excludes `Applications`, app-to-app role assignment fails. - -### 2) Create an App Registration for ACA in Entra - -**Rule: ACA must expose its own audience and scope for inbound client tokens.** - -```text -Name: SimpleL7Proxy-ACA-API -Application ID URI: api:// -Scope: api.access -``` - -Portal steps (repo-aligned): - -1. Go to Entra ID -> App registrations -> New registration. -2. Name it `SimpleL7Proxy-ACA-API`, then create. -3. Open Expose an API -> Set Application ID URI -> `api://`. -4. In Expose an API -> Add a scope with: - - Scope name/value: `api.access` - - Who can consent: `Admins only` (repo script sets scope type `Admin`) - - Admin consent display name: `Admin Access` - - Admin consent description: `Access the API` - - State: `Enabled` -5. 
Open App roles -> Create app role with: - - Display name: `Caller` - - Allowed member types: `Users/Groups` and `Applications` - - Value: `API.Caller` - - Enable app role: `Yes` -6. Open Enterprise applications -> find this app's service principal -> set assignment required to `Yes`. -7. Capture and save these IDs for later steps: - - App (client) ID (`ACA_APP_ID`) - - Service principal object ID (`ACA_API_SERVICE_PRINCIPAL_OBJECT_ID`) - - App role ID for `API.Caller` (`ACA_API_CALLER_ROLE_ID`) - -> [!TIP] -> Keep this audience distinct from APIM to avoid token confusion between hops. - -### 3) Create an App Registration for client app in Entra - -**Rule: the client app needs permission to ACA scope and must be allowed by ACA auth policy.** - -```text -Name: SimpleL7Proxy-Client -API permission: ACA API -> Delegated -> api.access -Credential: client secret (or cert) -``` - -Portal steps (repo-aligned): - -1. Go to Entra ID -> App registrations -> New registration. -2. Name it `SimpleL7Proxy-Client`, then create. -3. Open API permissions -> Add a permission -> My APIs -> select `SimpleL7Proxy-ACA-API`. -4. Add delegated permission `api.access`. -5. If required by tenant policy, select Grant admin consent. -6. Open Certificates & secrets -> create a client secret (or configure a certificate). -7. Ensure a service principal exists for this app in Enterprise applications (repo script equivalent creates one explicitly). -8. Capture and save: - - App (client) ID (`CLIENT_APP_ID`) - - Service principal object ID (`CLIENT_SERVICE_PRINCIPAL_OBJECT_ID`) - - Client secret value (`CLIENT_SECRET`) - -> [!NOTE] -> The repo scripts use this client identity as the caller to ACA and then assign app roles to its service principal as needed. - -> [!NOTE] -> For service-to-service calls, use client credentials and validate `roles` where applicable. 
- -### 3a) Client secret requirements by app registration - -**Rule: only apps that actively request tokens as confidential clients need a client secret.** - -1. APIM protected API app (`SimpleL7Proxy-APIM-API`): no client secret required for this POC. -2. ACA protected API app (`SimpleL7Proxy-ACA-API`): create a client secret if you configure ACA Easy Auth with Entra app credentials (`-c` and `-s` values in `enableContainerAppAuth.sh`). -3. Client app (`SimpleL7Proxy-Client`): create a client secret (or certificate) when using client credentials flow. - -Portal steps to create a secret: - -1. Entra ID -> App registrations -> select the app. -2. Go to Certificates & secrets -> New client secret. -3. Add description + expiry, then create. -4. Copy the secret Value immediately and store it securely. - -> [!WARNING] -> Secret values are shown only once. If lost, create a new secret and update ACA/APIM config that depends on it. - -### 4) Assign app roles with PowerShell (Graph) - -**Rule: use Graph PowerShell for app role assignments when managed identities do not appear in portal options.** - -```powershell -Install-Module Microsoft.Graph.Applications -Scope CurrentUser -Repository PSGallery -Force -Import-Module Microsoft.Graph.Applications -Connect-MgGraph -TenantId "" -Scopes "Application.ReadWrite.All", "AppRoleAssignment.ReadWrite.All" -``` - -> [!TIP] -> If `Connect-MgGraph` fails on permissions, sign in with an Entra admin account and consent the requested scopes. - -#### 4a) Assign ACA managed identity -> APIM API role (`API.Caller`) - -```powershell -$acaSpId = "" -$apimResourceSpId = "" -$apimRoleId = "" -New-MgServicePrincipalAppRoleAssignment -ServicePrincipalId $acaSpId -PrincipalId $acaSpId -ResourceId $apimResourceSpId -AppRoleId $apimRoleId -``` - -> [!WARNING] -> Use object IDs, not app IDs, for `ServicePrincipalId`, `PrincipalId`, and `ResourceId`. The managed identity object ID can be found on the ACA resource itself. 
For, the APIM Service Principal, you must use the object ID found under Enterprise Apps in Entra, NOT under the corresponding App Registrations. - -#### 4b) Assign client service principal -> ACA API role (`API.Caller`) - -```powershell -$clientSpId = "" -$acaResourceSpId = "" -$acaRoleId = "" -New-MgServicePrincipalAppRoleAssignment -ServicePrincipalId $clientSpId -PrincipalId $clientSpId -ResourceId $acaResourceSpId -AppRoleId $acaRoleId -``` - -> [!NOTE] -> If you use delegated-only access to ACA (`api.access`), keep this step optional; for app-role based enforcement, keep it required. For, the Service Principal object IDs, you must use the object ID found under Enterprise Apps in Entra, NOT under the corresponding App Registrations. - -### 5) Configure ACA auth and APIM JWT validation - -**Rule: ACA validates client token audience; APIM validates ACA managed identity token audience and role.** - -```text -ACA allowed audience: api:// -APIM validate-jwt audience: api:// -APIM required claim: roles contains API.Caller -``` - -> [!WARNING] -> Passing ACA audience to APIM `validate-jwt` is a common misconfiguration and causes authorization failures. - -#### 5a) APIM inbound `validate-jwt` policy (example) - -**Rule: APIM must validate issuer + audience + role on the token ACA presents to APIM.** - -```xml - - - - - - api:// - - - https://login.microsoftonline.com//v2.0 - - - - API.Caller - - - - -``` - -> [!TIP] -> If your API uses delegated user tokens instead of app roles, validate `scp` instead of `roles`. - -#### 5b) Configure OAuth 2.0 in APIM interface (portal) - -**Rule: configure an APIM OAuth 2.0 authorization server for interactive auth/testing; keep `validate-jwt` as the enforcement control on APIs.** - -1. In Azure portal, open your APIM instance. -2. Go to Security -> OAuth 2.0 + OpenID Connect -> Add OAuth 2.0 server. -3. 
Set these fields: - - Display name: `EntraOAuth` (or your standard name) - - Grant types: `Authorization code` (and `Client credentials` if needed) - - Client ID: `` (typically the client app registration) - - Client secret: `` - - Authorization endpoint URL: `https://login.microsoftonline.com//oauth2/v2.0/authorize` - - Token endpoint URL: `https://login.microsoftonline.com//oauth2/v2.0/token` - - Default scope: `api:///api.access` (or your API scope) -4. Save the OAuth 2.0 server. -5. Open your API in APIM -> Settings and attach this OAuth 2.0 server under Security if you want Developer Portal Authorize support. -6. Open your API -> Design -> Inbound processing and ensure the `validate-jwt` policy above is present. - -> [!NOTE] -> APIM OAuth server configuration enables the Authorize experience; token acceptance is still controlled by the API policy (`validate-jwt`). - -### 6) Test APIM policy after configuration - -**Rule: validate both positive and negative paths to confirm `validate-jwt` is enforcing audience and role correctly.** - -Set your test variables first: - -```bash -APIM_BASE="https://.azure-api.net/" -APIM_SUB_KEY="" -TENANT_ID="" -APIM_APP_ID="" -``` - -#### 6a) Positive test: ACA managed identity (or equivalent caller) succeeds - -```bash -# This token should be requested for APIM audience: api:///.default -TOKEN="" - -curl -i "$APIM_BASE/health" \ - -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" \ - -H "Authorization: Bearer $TOKEN" -``` - -Expected result: - -- `200` (or your API's expected success code) -- No `Unauthorized. Missing or invalid token.` message - -#### 6b) Negative test: no token should fail - -```bash -curl -i "$APIM_BASE/health" \ - -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" -``` - -Expected result: - -- `401 Unauthorized` -- Error from `validate-jwt` policy - -#### 6c) Negative test: wrong audience should fail - -```bash -# Use a token for ACA audience instead of APIM audience. 
-BAD_TOKEN="" - -curl -i "$APIM_BASE/health" \ - -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" \ - -H "Authorization: Bearer $BAD_TOKEN" -``` - -Expected result: - -- `401 Unauthorized` -- Audience validation failure - -#### 6d) Negative test: missing role should fail - -```bash -# Use a token that has APIM audience but lacks roles: API.Caller. -NO_ROLE_TOKEN="" - -curl -i "$APIM_BASE/health" \ - -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" \ - -H "Authorization: Bearer $NO_ROLE_TOKEN" -``` - -Expected result: - -- `401 Unauthorized` -- Required claim (`roles=API.Caller`) validation failure - -> [!TIP] -> For fast diagnosis, temporarily project token claims in APIM trace and verify `aud`, `iss`, and `roles` match your `validate-jwt` policy. - -## Full flow - -```mermaid -flowchart LR - A["Client App Registration
SimpleL7Proxy-Client"] -->|"Token aud=api://ACA_APP_ID"| B["ACA Ingress + Easy Auth"] - B -->|"Validates ACA audience"| C["SimpleL7Proxy in ACA"] - C -->|"Managed identity token
aud=api://APIM_APP_ID/.default"| D["APIM"] - D -->|"validate-jwt: audience + role API.Caller"| E["Backend routing/policy"] - - F["ACA API App Registration
SimpleL7Proxy-ACA-API"] -. "Defines audience and scope" .-> B - G["APIM API App Registration
SimpleL7Proxy-APIM-API"] -. "Defines API.Caller role" .-> D -``` - -## Worked example - -| Step | Example value | Result | -| :--- | :--- | :--- | -| Create APIM API app registration | `appId = 11111111-1111-1111-1111-111111111111` | APIM audience becomes `api://11111111-1111-1111-1111-111111111111` | -| Create ACA API app registration | `appId = 22222222-2222-2222-2222-222222222222` | ACA audience becomes `api://22222222-2222-2222-2222-222222222222` | -| Create client app registration | `appId = 33333333-3333-3333-3333-333333333333` | Client can request token for ACA audience | -| Assign ACA MI role on APIM API | `New-MgServicePrincipalAppRoleAssignment ...` | APIM accepts ACA token with `roles: API.Caller` | -| Request token in ACA for APIM | `resource = api://111.../.default` | ACA -> APIM call authorized | - -## Verify - -- [ ] APIM API app registration exists with app role `API.Caller`. -- [ ] ACA API app registration exists with scope `api.access` and identifier URI. -- [ ] Client app registration has permission to call ACA API. -- [ ] ACA managed identity is assigned to APIM API app role. -- [ ] ACA auth is enabled and configured with ACA audience. -- [ ] APIM `validate-jwt` checks APIM audience and `roles=API.Caller`. -- [ ] Client can call ACA with token for ACA audience. -- [ ] ACA can call APIM with managed identity token for APIM audience. - -## Related docs - -- [scripts/README.md](../scripts/README.md) -- [scripts/ca2apimSetup.sh](../scripts/ca2apimSetup.sh) -- [scripts/console2caSetup.sh](../scripts/console2caSetup.sh) -- [scripts/enableContainerAppAuth.sh](../scripts/enableContainerAppAuth.sh) -- [APIM-Policy/readme.md](../APIM-Policy/readme.md) +- ACA and APIM have different enforcement points and token audiences. +- Splitting reduces setup confusion and makes validation steps clearer. +- Each document now has a dedicated diagram, worked example, and verification checklist. 
From b65e82e5d0407022ca922b88c2d5eb3423a483e1 Mon Sep 17 00:00:00 2001 From: MarvelintheCloud Date: Thu, 21 May 2026 20:48:49 -0500 Subject: [PATCH 06/16] Expand APIM security POC with MI setup, issuer guidance, and operational tests --- docs/POC-APIM-Security-Authorization.md | 54 ++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/docs/POC-APIM-Security-Authorization.md b/docs/POC-APIM-Security-Authorization.md index 2afc8ff..f4a8b82 100644 --- a/docs/POC-APIM-Security-Authorization.md +++ b/docs/POC-APIM-Security-Authorization.md @@ -29,6 +29,20 @@ ## Setup +### 0) Enable system-assigned managed identity on ACA + +**Rule: ACA managed identity `principalId` is the identity that must receive `API.Caller` on the APIM API app.** + +```bash +RG="" +CA_NAME="" + +az containerapp identity assign -g "$RG" -n "$CA_NAME" --system-assigned + +ACA_MANAGED_IDENTITY_OBJECT_ID="$(az containerapp show -g "$RG" -n "$CA_NAME" --query "identity.principalId" -o tsv)" +echo "$ACA_MANAGED_IDENTITY_OBJECT_ID" +``` + ### 1) Create APIM API app registration in Entra **Rule: APIM policy audience must match this app's identifier URI.** @@ -85,8 +99,10 @@ New-MgServicePrincipalAppRoleAssignment -ServicePrincipalId $acaSpId -PrincipalI api:// + + https://sts.windows.net// https://login.microsoftonline.com//v2.0 @@ -101,6 +117,9 @@ New-MgServicePrincipalAppRoleAssignment -ServicePrincipalId $acaSpId -PrincipalI > [!TIP] > If your APIM endpoint accepts delegated user tokens, validate `scp` instead of `roles`. +> [!NOTE] +> Match the issuer to the token you actually receive. Depending on flow and token version, `iss` may be `https://sts.windows.net//` or `https://login.microsoftonline.com//v2.0`. 
+ ### 4) Configure OAuth 2.0 in APIM interface **Rule: OAuth server settings support authorize/testing UX, while `validate-jwt` policy remains the true gate.** @@ -147,12 +166,13 @@ Set variables: ```bash APIM_BASE="https://.azure-api.net/" APIM_SUB_KEY="" +APIM_APP_ID="" ``` Positive test: ```bash -TOKEN="" +TOKEN="$(az account get-access-token --resource "api://$APIM_APP_ID" --query accessToken -o tsv)" curl -i "$APIM_BASE/health" \ -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" \ -H "Authorization: Bearer $TOKEN" @@ -171,7 +191,7 @@ Expected: `401 Unauthorized`. Negative test (wrong audience): ```bash -BAD_TOKEN="" +BAD_TOKEN="$(az account get-access-token --resource "https://management.azure.com/" --query accessToken -o tsv)" curl -i "$APIM_BASE/health" \ -H "Ocp-Apim-Subscription-Key: $APIM_SUB_KEY" \ -H "Authorization: Bearer $BAD_TOKEN" @@ -190,6 +210,26 @@ curl -i "$APIM_BASE/health" \ Expected: `401 Unauthorized` due to missing required claim. +> [!NOTE] +> Role assignment changes are not retroactive to already-issued tokens. If you revoke a role, old tokens may continue to work until token expiry. + +## Optional: proxy -> APIM with managed identity + +**Rule: when proxy host config uses `usemi=true`, set `audience` so ACA requests the correct APIM token.** + +```bash +Host1="host=https://.azure-api.net;usemi=true;audience=api://;probe=/health" +``` + +Runtime behavior: + +- Proxy acquires a managed identity token for `api://`. +- Proxy forwards requests to APIM with `Authorization: Bearer `. +- APIM `validate-jwt` evaluates that token before backend routing. + +> [!WARNING] +> If `usemi=true` and `audience` is missing or incorrect, APIM receives an invalid or missing token and returns `401`. + ## Verify - [ ] APIM API app registration exists with identifier URI `api://`. @@ -199,6 +239,16 @@ Expected: `401 Unauthorized` due to missing required claim. - [ ] Valid APIM token succeeds. - [ ] Missing token, wrong audience, and missing role all fail with `401`. 
+## Troubleshooting + +| Symptom | Likely cause | Check | +| :--- | :--- | :--- | +| `401` with token that seems valid | Token audience mismatch | Decode token and verify `aud` equals `api://` or configured accepted value | +| `401` with correct audience | Missing `roles` claim | Verify role assignment to ACA managed identity on APIM API service principal | +| `401` only from ACA path but local test succeeds | Wrong ACA principal or missing role assignment | Confirm `identity.principalId` on ACA matches assigned principal | +| `401` with signature or issuer validation issues | Issuer mismatch in policy | Compare token `iss` to policy `` entries | +| Role removed but calls still succeed | Old token still valid | Wait for token expiry, then retest with a new token | + ## Related docs - [scripts/README.md](../scripts/README.md) From 8ef52e2ed5f8cdda09658521c4e0caab833a49e3 Mon Sep 17 00:00:00 2001 From: Nagendra Mishr Date: Fri, 22 May 2026 13:58:32 -0400 Subject: [PATCH 07/16] merge in olivias changes, reduce cognitive load --- docs/POC-Secure-the-proxy.md | 208 ++++++++++++++++++----------------- 1 file changed, 107 insertions(+), 101 deletions(-) diff --git a/docs/POC-Secure-the-proxy.md b/docs/POC-Secure-the-proxy.md index 47d31b3..86bcaf4 100644 --- a/docs/POC-Secure-the-proxy.md +++ b/docs/POC-Secure-the-proxy.md @@ -1,183 +1,189 @@ -# POC: Securing the Proxy with Container Apps Easy Auth +# POC: Securing the Proxy with Container Apps EasyAuth -**Every request to the proxy is authenticated by the Container Apps platform before it reaches your app — no auth code required.** +**Protect the proxy from unauthorized access.** ## TL;DR (< 5 minutes) -1. Create an Entra app registration with a client secret, redirect URI, and ID token issuance enabled. -2. Wire it to your Container App with `az containerapp auth microsoft update` and set unauthenticated action to `Return401`. -3. Hit the app without a session — get `401`. 
Hit `/.auth/login/aad` — authenticate — get through. +1. Register an Entra app, create a client secret, and enable ID token issuance. +2. Enable EasyAuth on the Container App and set the unauthenticated action to `Return401`. +3. Acquire a bearer token scoped to `api://` and include it in the `Authorization: Bearer` header. -**Expected outcome:** unauthenticated `curl` → `401`. Browser flow via `/.auth/login/aad` → `200`. +**Expected outcome:** `curl` without a token → `401`. `curl` with a valid token → request reaches the proxy and returns `200`. + +> EasyAuth rejects any request without a valid Entra token before it reaches the proxy. ## What you will observe -- `curl https:///` with no session cookie → `401 Unauthorized` (EasyAuth sidecar rejects before the proxy sees the request). -- Browser navigation to `https:///` without a session → redirect to `https://login.microsoftonline.com/...`. -- After successful Entra login → redirect back to `https:///.auth/login/aad/callback` → session cookie set → proxy receives request with identity headers. -- `X-MS-CLIENT-PRINCIPAL-NAME` header contains the authenticated user's UPN. -- `X-MS-CLIENT-PRINCIPAL` header contains a base64-encoded claims JSON. -- Requests from users not in the tenant → `401` (Entra rejects the login, session is never issued). +- `curl https:///` with no `Authorization` header → `401 Unauthorized`; the proxy never processes the request. +- `curl` with a token for the wrong audience → `401`. +- `curl` with a valid bearer token scoped to `api://` → request reaches the proxy, proxy returns its normal response. +- The proxy receives no unauthenticated traffic at any point. 
## Flow ``` -Browser / API client - │ - ▼ -Container Apps EasyAuth sidecar - ├─ No session / invalid token ──► 401 or redirect to login.microsoftonline.com - └─ Valid session / token - │ injects X-MS-CLIENT-PRINCIPAL-NAME, X-MS-CLIENT-PRINCIPAL headers - ▼ - Proxy app code (never sees unauthenticated requests) +API client / service + │ + │ Authorization: Bearer + ▼ +ACA EasyAuth sidecar + ├─ No token ──► 401 + ├─ Wrong audience ──► 401 + └─ Valid token + ▼ + SimpleL7Proxy + (receives only authenticated requests) ``` > [!NOTE] -> EasyAuth runs as a platform sidecar. The proxy receives only authenticated requests and reads identity from headers — no SDK or middleware needed. +> EasyAuth runs as a platform sidecar managed by Azure Container Apps and only forwards validated requests. ## Setup -**What matters:** `--enable-id-token-issuance true`, the correct redirect URI, and unauthenticated action set to reject (not redirect) for API scenarios. - -| Item | Value used in this POC | -| :--- | :--- | -| Redirect URI pattern | `https:///.auth/login/aad/callback` | -| Unauthenticated action | `Return401` (API) or `RedirectToLoginPage` (browser) | -| Identity headers injected | `X-MS-CLIENT-PRINCIPAL-NAME`, `X-MS-CLIENT-PRINCIPAL` | -| Login endpoint | `https:///.auth/login/aad` | -| Token refresh endpoint | `https:///.auth/refresh` | - **Prerequisites:** - Azure subscription with Contributor access -- `az` CLI logged in (`az login`) +- `az` CLI authenticated (`az login`) - Container App deployed (the proxy) -- Tenant ID on hand: `az account show --query tenantId -o tsv` +- Tenant ID: `az account show --query tenantId -o tsv` -### 1. Create the Entra App Registration +### Step 1 — Set Variables -**This registration tells Entra ID which app is allowed to authenticate users — the redirect URI must match exactly.** +Set `ENTRA_APP_NAME`, `CONTAINER_APP_NAME`, `RG` to match your environment. 
```bash -APP_NAME="aca-auth-poc" +export ENTRA_APP_NAME="aca-proxy" # display name for the Entra app registration +export CONTAINER_APP_NAME="" # your Container App name +export RG="" # your resource group +``` + +### Step 2 — Create the Entra App Registration and enable EasyAuth + +Save the generated `APP_ID` and `CLIENT_SECRET` variables for troubleshooting. + +```bash + +# Lookup +export TENANT_ID="$(az account show --query tenantId -o tsv)" +export APP_FQDN="https://$(az containerapp show --name "$CONTAINER_APP_NAME" --resource-group "$RG" --query properties.configuration.ingress.fqdn -o tsv)" +export HEALTH_URL="$APP_FQDN/health" -# Create registration -APP_ID=$(az ad app create \ - --display-name "$APP_NAME" \ +export APP_ID=$(az ad app create \ + --display-name "$ENTRA_APP_NAME" \ --sign-in-audience AzureADMyOrg \ --query appId -o tsv) echo "APP_ID=$APP_ID" -# Enable ID token issuance (required for EasyAuth) +# Enable ID token issuance az ad app update --id "$APP_ID" --enable-id-token-issuance true -# Set redirect URI -APP_FQDN="https://...azurecontainerapps.io" -az ad app update --id "$APP_ID" \ - --web-redirect-uris "$APP_FQDN/.auth/login/aad/callback" - -# Create client secret (set end-date explicitly — required in some tenants) -CLIENT_SECRET=$(az ad app credential reset \ +# Create client secret +export CLIENT_SECRET=$(az ad app credential reset \ --id "$APP_ID" \ - --display-name "easyauth-secret" \ + --display-name "proxy-auth-secret" \ --end-date "$(date -d '+30 days' '+%Y-%m-%d')" \ --query password -o tsv) echo "CLIENT_SECRET=$CLIENT_SECRET" # Create service principal az ad sp create --id "$APP_ID" 1>/dev/null -``` - -> [!WARNING] -> `--enable-id-token-issuance true` is mandatory. EasyAuth will fail silently at login without it. - -### 2. 
Enable EasyAuth on the Container App -**This wires the app registration to the Container App platform — the sidecar handles all token validation from this point.** - -> [!WARNING] -> If this command partially applies (enables auth but fails to register the provider), **the app returns `503` for all traffic** until auth is either fixed or disabled. Verify immediately after running — do not proceed to Step 3 until the provider check passes. - -```bash +# Enable EazyAuth az containerapp auth microsoft update \ - --name "" \ - --resource-group "" \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ --client-id "$APP_ID" \ --client-secret "$CLIENT_SECRET" \ - --tenant-id "" \ + --tenant-id "$TENANT_ID" \ --yes ``` -> [!NOTE] -> This command also enables authentication on the app. +### Step 3 — Verify -Verify the Microsoft identity provider was registered — **if the `identityProviders` block is empty, re-run the command above**: +Run the verify command to ensure that both auth and the identity provider were registered. ```bash az containerapp auth show \ - --name "" \ - --resource-group "" \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ --query "{enabled:platform.enabled, provider:identityProviders.azureActiveDirectory.enabled}" \ -o table ``` -Expected output: both `enabled` columns show `True`. If the portal shows **"No identity provider"**, re-run Step 2 — the `az containerapp auth microsoft update` command may have partially applied. +Both columns must show `True`. If either shows `False` or `identityProviders` is empty, re-run the `auth microsoft update` command above — do not continue to Step 4. -### 3. Set the Unauthenticated Action +### Step 4 — Set the Unauthenticated Action -**For API/proxy scenarios, use `Return401` — `RedirectToLoginPage` is for browser-only apps.** +Rejects unauthenticated requests outright — callers get a `401` with no redirect. 
```bash az containerapp auth update \ - --name "" \ - --resource-group "" \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ --enabled true \ --unauthenticated-client-action Return401 ``` -> [!TIP] -> Use `RedirectToLoginPage` if you want the browser to be sent to the Entra login page automatically instead of receiving a `401`. - ## Run ```bash -APP_FQDN="https://...azurecontainerapps.io" - -# 1. Should return 401 +# 1. No token — expect 401 curl -i "$APP_FQDN" -# 2. Browser login flow — open in browser -echo "$APP_FQDN/.auth/login/aad" +# 2. Acquire a token scoped to the proxy's app registration +TOKEN=$(az account get-access-token \ + --resource "api://$APP_ID" \ + --query accessToken -o tsv) -# 3. After login, check injected identity headers -curl -i "$APP_FQDN" --cookie "" +# 3. Call with token — expect 200 +curl -i "$APP_FQDN" \ + -H "Authorization: Bearer $TOKEN" ``` -> [!TIP] -> After logging in via the browser, use browser DevTools → Network → copy the `AppServiceAuthSession` cookie value for use with `curl`. - ## Verify -Run each check in order. All four must pass. +```bash +# No token — expect 401 +curl -i "$HEALTH_URL" + +# Wrong audience (valid Azure token, wrong resource) — expect 401 +# Uses the Azure management API as the resource to produce a real Entra-signed token +# whose 'aud' claim does not match api://$APP_ID — EasyAuth will reject it due to the mismatch. 
+BAD_TOKEN=$(az account get-access-token --resource "https://management.azure.com/" --query accessToken -o tsv) +curl -i "$HEALTH_URL" -H "Authorization: Bearer $BAD_TOKEN" + +# Valid token — expect 200 +curl -i "$HEALTH_URL" -H "Authorization: Bearer $TOKEN" +``` + +## Remove -- [ ] `curl -i $APP_FQDN` (no session) → `HTTP/1.1 401`, no proxy app output in response -- [ ] Open `$APP_FQDN/.auth/login/aad` in a browser → redirected to `login.microsoftonline.com` -- [ ] Complete Entra login → redirected back to `$APP_FQDN`, session cookie set, app responds normally -- [ ] Inspect response headers or app logs — `X-MS-CLIENT-PRINCIPAL-NAME` is present and contains your UPN +Use this to temporarily disable auth — for example, to isolate whether a problem is in EasyAuth or in the proxy itself. -> [!TIP] -> `curl "$APP_FQDN/.auth/me"` with a valid session returns a JSON array of claims — use this to confirm the session is active without inspecting headers manually. +```bash +az containerapp auth update \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --enabled false +``` + +All traffic is accepted again immediately. The app registration and secret are left in place. To restore protection: + +```bash +az containerapp auth update \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --enabled true +``` ## Troubleshooting -| Symptom | Likely cause | Check | +| Symptom | Cause | Check | | :--- | :--- | :--- | -| `curl` returns `503` instead of `401` | Auth is enabled but no identity provider is configured — platform blocks all traffic | **To restore the app immediately:** `az containerapp auth update -n -g --enabled false`. Then re-run Step 2 and verify the provider is registered before re-enabling. 
Check with `az containerapp auth show -n -g --query "identityProviders"` | -| Login redirects to Entra but then fails with `AADSTS50011` | Redirect URI mismatch | Compare the URI in the error with what is registered: `az ad app show --id $APP_ID --query "web.redirectUris"` | -| `401` even after successful Entra login | ID token issuance not enabled | Run `az ad app update --id $APP_ID --enable-id-token-issuance true` | -| Browser gets `401` instead of login redirect | Unauthenticated action set to `Return401` | Change to `RedirectToLoginPage`: `az containerapp auth update --unauthenticated-client-action RedirectToLoginPage` | -| `X-MS-CLIENT-PRINCIPAL-NAME` header missing in app | EasyAuth not fully enabled | Run `az containerapp auth show -n -g ` and confirm `"enabled": true` | -| Client secret error at login | Secret expired or wrong value | Reset: `az ad app credential reset --id $APP_ID --display-name "easyauth-secret"` then re-run Step 2 | +| `401` on all requests including valid tokens | Wrong token audience | Decode at [jwt.ms](https://jwt.ms); `aud` claim must be `api://` | +| `503` on all traffic after Step 2 | Auth enabled but no identity provider registered | `az containerapp auth show -n -g --query identityProviders` — if empty, re-run Step 2. 
To restore immediately: `az containerapp auth update -n -g --enabled false` | +| Authentication fails with `AADSTS50019` or similar | ID token issuance not enabled | `az ad app update --id $APP_ID --enable-id-token-issuance true` | +| `401` despite a valid Entra token | Auth not fully enabled or provider missing | `az containerapp auth show -n -g ` — confirm `"enabled": true` and provider is configured | +| Client secret rejected at authentication | Secret expired or rotated | `az ad app credential reset --id $APP_ID --display-name "proxy-auth-secret"` then re-run Step 2 | From a91e1e554edeed33160489e0b6fb1061c6b4c6a0 Mon Sep 17 00:00:00 2001 From: Nagendra Mishr Date: Fri, 22 May 2026 16:50:14 -0400 Subject: [PATCH 08/16] merge the verification steps --- docs/POC-Secure-the-proxy.md | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/docs/POC-Secure-the-proxy.md b/docs/POC-Secure-the-proxy.md index 86bcaf4..e0e812c 100644 --- a/docs/POC-Secure-the-proxy.md +++ b/docs/POC-Secure-the-proxy.md @@ -98,7 +98,7 @@ az containerapp auth microsoft update \ --yes ``` -### Step 3 — Verify +### Step 3 — Verify Container App Run the verify command to ensure that both auth and the identity provider were registered. @@ -124,11 +124,11 @@ az containerapp auth update \ --unauthenticated-client-action Return401 ``` -## Run +## Verify Access ```bash # 1. No token — expect 401 -curl -i "$APP_FQDN" +curl -i "$HEALTH_URL" # 2. Acquire a token scoped to the proxy's app registration TOKEN=$(az account get-access-token \ @@ -136,15 +136,8 @@ TOKEN=$(az account get-access-token \ --query accessToken -o tsv) # 3. 
Call with token — expect 200 -curl -i "$APP_FQDN" \ +curl -i "$HEALTH_URL" \ -H "Authorization: Bearer $TOKEN" -``` - -## Verify - -```bash -# No token — expect 401 -curl -i "$HEALTH_URL" # Wrong audience (valid Azure token, wrong resource) — expect 401 # Uses the Azure management API as the resource to produce a real Entra-signed token @@ -152,8 +145,6 @@ curl -i "$HEALTH_URL" BAD_TOKEN=$(az account get-access-token --resource "https://management.azure.com/" --query accessToken -o tsv) curl -i "$HEALTH_URL" -H "Authorization: Bearer $BAD_TOKEN" -# Valid token — expect 200 -curl -i "$HEALTH_URL" -H "Authorization: Bearer $TOKEN" ``` ## Remove From caad7933a8bb567af8f3fdd7e9c255a8c4077b2d Mon Sep 17 00:00:00 2001 From: MarvelintheCloud Date: Fri, 22 May 2026 19:25:56 -0500 Subject: [PATCH 09/16] docs: update secure proxy EasyAuth and token flow --- docs/POC-Secure-the-proxy.md | 114 +++++++++++++++++++++++++++++++---- 1 file changed, 101 insertions(+), 13 deletions(-) diff --git a/docs/POC-Secure-the-proxy.md b/docs/POC-Secure-the-proxy.md index e0e812c..fc4aa4f 100644 --- a/docs/POC-Secure-the-proxy.md +++ b/docs/POC-Secure-the-proxy.md @@ -57,6 +57,9 @@ export CONTAINER_APP_NAME="" # your Container App name export RG="" # your resource group ``` +> [!NOTE] +> In Windows/WSL environments, sanitize `-o tsv` outputs with `tr -d '\r\n'` before reusing values in later CLI calls. + ### Step 2 — Create the Entra App Registration and enable EasyAuth Save the generated `APP_ID` and `CLIENT_SECRET` variables for troubleshooting. @@ -64,16 +67,42 @@ Save the generated `APP_ID` and `CLIENT_SECRET` variables for troubleshooting. 
```bash # Lookup -export TENANT_ID="$(az account show --query tenantId -o tsv)" -export APP_FQDN="https://$(az containerapp show --name "$CONTAINER_APP_NAME" --resource-group "$RG" --query properties.configuration.ingress.fqdn -o tsv)" +export TENANT_ID="$(az account show --query tenantId -o tsv | tr -d '\r\n')" +export APP_FQDN="https://$(az containerapp show --name "$CONTAINER_APP_NAME" --resource-group "$RG" --query properties.configuration.ingress.fqdn -o tsv | tr -d '\r\n')" export HEALTH_URL="$APP_FQDN/health" export APP_ID=$(az ad app create \ --display-name "$ENTRA_APP_NAME" \ --sign-in-audience AzureADMyOrg \ - --query appId -o tsv) + --query appId -o tsv | tr -d '\r\n') echo "APP_ID=$APP_ID" +# Required so az token requests to api://$APP_ID can resolve the resource principal. +az ad app update --id "$APP_ID" --identifier-uris "api://$APP_ID" + +# Create service principal +az ad sp create --id "$APP_ID" 1>/dev/null + +# Create delegated scope required by Step 2a consent flow: api.access +if [ -z "$APP_ID" ]; then + echo "APP_ID is empty. Re-run Step 2 app creation or app lookup first." 
+ exit 1 +fi + +SCOPE_ID="$(uuidgen | tr '[:upper:]' '[:lower:]')" +API_OBJ="$(az ad app show --id "$APP_ID" --query api -o json)" +UPDATED_API_OBJ="$(echo "$API_OBJ" | jq --arg id "$SCOPE_ID" '.oauth2PermissionScopes = [{ + adminConsentDescription: "Access the API", + adminConsentDisplayName: "Admin Access", + id: $id, + isEnabled: true, + type: "Admin", + userConsentDescription: "Access the API", + userConsentDisplayName: "User Access", + value: "api.access" +}]')" +az ad app update --id "$APP_ID" --set api="$UPDATED_API_OBJ" + # Enable ID token issuance az ad app update --id "$APP_ID" --enable-id-token-issuance true @@ -82,11 +111,9 @@ export CLIENT_SECRET=$(az ad app credential reset \ --id "$APP_ID" \ --display-name "proxy-auth-secret" \ --end-date "$(date -d '+30 days' '+%Y-%m-%d')" \ - --query password -o tsv) + --query password -o tsv | tr -d '\r\n') echo "CLIENT_SECRET=$CLIENT_SECRET" -# Create service principal -az ad sp create --id "$APP_ID" 1>/dev/null # Enable EazyAuth az containerapp auth microsoft update \ @@ -96,21 +123,81 @@ az containerapp auth microsoft update \ --client-secret "$CLIENT_SECRET" \ --tenant-id "$TENANT_ID" \ --yes + +# Verify identifier URI is set correctly. +az ad app show --id "$APP_ID" --query "{appId:appId,identifierUris:identifierUris,scopes:api.oauth2PermissionScopes[].value}" -o table +``` + +### Step 2a — Grant admin consent for token acquisition (fix for `AADSTS65001`) + +If `az account get-access-token --resource "api://$APP_ID"` fails with `consent_required`, grant consent to a dedicated client app instead of broad tenant-wide grants. + +> [!NOTE] +> App IDs are identifiers, not secrets. Keep secrets in secure stores; avoid printing or committing secret values. + +Preferred (least privilege): grant consent to a named client app only. 
+ +```bash +CLIENT_APP_NAME="aca-proxy-client" # your caller app registration +CLIENT_APP_ID="$(az ad app list --display-name "$CLIENT_APP_NAME" --query "[0].appId" -o tsv | tr -d '\r\n')" +SCOPE_ID="$(az ad app show --id "$APP_ID" --query "api.oauth2PermissionScopes[?value=='api.access'].id | [0]" -o tsv | tr -d '\r\n')" + +if [ -z "$CLIENT_APP_ID" ] || [ -z "$SCOPE_ID" ]; then + echo "Missing CLIENT_APP_ID or api.access scope. Verify app registrations first." + exit 1 +fi + +# Add delegated permission and grant admin consent for this specific client app. +az ad app permission add \ + --id "$CLIENT_APP_ID" \ + --api "$APP_ID" \ + --api-permissions "${SCOPE_ID}=Scope" + +az ad app permission admin-consent --id "$CLIENT_APP_ID" +``` + +Optional shortcut for local `az account get-access-token` testing: + +```bash +az logout +az login --tenant "$TENANT_ID" --scope "api://$APP_ID/.default" ``` +> [!WARNING] +> Admin consent requires Entra admin privileges. + ### Step 3 — Verify Container App -Run the verify command to ensure that both auth and the identity provider were registered. +Run these checks to ensure auth is enabled and the Microsoft identity provider is registered. 
```bash -az containerapp auth show \ +ENABLED="$(az containerapp auth show \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --query "platform.enabled" -o tsv | tr -d '\r\n')" + +AAD_CLIENT_ID="$(az containerapp auth show \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --query "identityProviders.azureActiveDirectory.registration.clientId" -o tsv | tr -d '\r\n')" + +AUDIENCE="$(az containerapp auth show \ --name "$CONTAINER_APP_NAME" \ --resource-group "$RG" \ - --query "{enabled:platform.enabled, provider:identityProviders.azureActiveDirectory.enabled}" \ - -o table + --query "identityProviders.azureActiveDirectory.validation.allowedAudiences[0]" -o tsv | tr -d '\r\n')" + +echo "enabled=$ENABLED" +echo "aad_client_id=$AAD_CLIENT_ID" +echo "allowed_audience=$AUDIENCE" ``` -Both columns must show `True`. If either shows `False` or `identityProviders` is empty, re-run the `auth microsoft update` command above — do not continue to Step 4. +Expected: + +- enabled=true +- aad_client_id= +- allowed_audience=api:// + +If `aad_client_id` or `allowed_audience` is empty, or `enabled` is not `true`, re-run the auth microsoft update command above and do not continue to Step 4. ### Step 4 — Set the Unauthenticated Action @@ -133,7 +220,7 @@ curl -i "$HEALTH_URL" # 2. Acquire a token scoped to the proxy's app registration TOKEN=$(az account get-access-token \ --resource "api://$APP_ID" \ - --query accessToken -o tsv) + --query accessToken -o tsv | tr -d '\r\n') # 3. Call with token — expect 200 curl -i "$HEALTH_URL" \ @@ -142,7 +229,7 @@ curl -i "$HEALTH_URL" \ # Wrong audience (valid Azure token, wrong resource) — expect 401 # Uses the Azure management API as the resource to produce a real Entra-signed token # whose 'aud' claim does not match api://$APP_ID — EasyAuth will reject it due to the mismatch. 
-BAD_TOKEN=$(az account get-access-token --resource "https://management.azure.com/" --query accessToken -o tsv) +BAD_TOKEN=$(az account get-access-token --resource "https://management.azure.com/" --query accessToken -o tsv | tr -d '\r\n') curl -i "$HEALTH_URL" -H "Authorization: Bearer $BAD_TOKEN" ``` @@ -176,5 +263,6 @@ az containerapp auth update \ | Authentication fails with `AADSTS50019` or similar | ID token issuance not enabled | `az ad app update --id $APP_ID --enable-id-token-issuance true` | | `401` despite a valid Entra token | Auth not fully enabled or provider missing | `az containerapp auth show -n -g ` — confirm `"enabled": true` and provider is configured | | Client secret rejected at authentication | Secret expired or rotated | `az ad app credential reset --id $APP_ID --display-name "proxy-auth-secret"` then re-run Step 2 | +| `AADSTS65001 consent_required` when requesting token | Delegated consent not granted for client -> API scope | Run Step 2a to grant consent, then retry token request | From 2cb7842a4de608fc1ad5120254b67c595e386785 Mon Sep 17 00:00:00 2001 From: MarvelintheCloud Date: Sat, 23 May 2026 20:52:25 -0500 Subject: [PATCH 10/16] docs: harden secure proxy auth guidance --- docs/POC-Secure-the-proxy.md | 50 ++++++++++-------------------------- 1 file changed, 13 insertions(+), 37 deletions(-) diff --git a/docs/POC-Secure-the-proxy.md b/docs/POC-Secure-the-proxy.md index fc4aa4f..470f550 100644 --- a/docs/POC-Secure-the-proxy.md +++ b/docs/POC-Secure-the-proxy.md @@ -83,7 +83,7 @@ az ad app update --id "$APP_ID" --identifier-uris "api://$APP_ID" # Create service principal az ad sp create --id "$APP_ID" 1>/dev/null -# Create delegated scope required by Step 2a consent flow: api.access +# Create delegated scope if [ -z "$APP_ID" ]; then echo "APP_ID is empty. Re-run Step 2 app creation or app lookup first." 
exit 1 @@ -112,7 +112,8 @@ export CLIENT_SECRET=$(az ad app credential reset \ --display-name "proxy-auth-secret" \ --end-date "$(date -d '+30 days' '+%Y-%m-%d')" \ --query password -o tsv | tr -d '\r\n') -echo "CLIENT_SECRET=$CLIENT_SECRET" + +# Do not print or commit secret values. Keep them in memory only. # Enable EazyAuth @@ -126,45 +127,17 @@ az containerapp auth microsoft update \ # Verify identifier URI is set correctly. az ad app show --id "$APP_ID" --query "{appId:appId,identifierUris:identifierUris,scopes:api.oauth2PermissionScopes[].value}" -o table -``` - -### Step 2a — Grant admin consent for token acquisition (fix for `AADSTS65001`) - -If `az account get-access-token --resource "api://$APP_ID"` fails with `consent_required`, grant consent to a dedicated client app instead of broad tenant-wide grants. - -> [!NOTE] -> App IDs are identifiers, not secrets. Keep secrets in secure stores; avoid printing or committing secret values. - -Preferred (least privilege): grant consent to a named client app only. -```bash -CLIENT_APP_NAME="aca-proxy-client" # your caller app registration -CLIENT_APP_ID="$(az ad app list --display-name "$CLIENT_APP_NAME" --query "[0].appId" -o tsv | tr -d '\r\n')" -SCOPE_ID="$(az ad app show --id "$APP_ID" --query "api.oauth2PermissionScopes[?value=='api.access'].id | [0]" -o tsv | tr -d '\r\n')" - -if [ -z "$CLIENT_APP_ID" ] || [ -z "$SCOPE_ID" ]; then - echo "Missing CLIENT_APP_ID or api.access scope. Verify app registrations first." - exit 1 -fi - -# Add delegated permission and grant admin consent for this specific client app. 
-az ad app permission add \ - --id "$CLIENT_APP_ID" \ - --api "$APP_ID" \ - --api-permissions "${SCOPE_ID}=Scope" - -az ad app permission admin-consent --id "$CLIENT_APP_ID" -``` - -Optional shortcut for local `az account get-access-token` testing: - -```bash -az logout -az login --tenant "$TENANT_ID" --scope "api://$APP_ID/.default" +# Optional hygiene: clear secret from shell after auth configuration is complete. +# unset CLIENT_SECRET ``` > [!WARNING] -> Admin consent requires Entra admin privileges. +> You may need to grant admin consent in the Azure portal before token acquisition works. +> If `az account get-access-token --resource "api://$APP_ID"` returns `AADSTS65001` (`consent_required`), ask a tenant admin to grant consent for your client app/API scope in Entra ID: +> **App registrations** -> your client app -> **API permissions** -> **Grant admin consent**. +> +> For better secret hygiene, avoid sharing terminal output that includes auth commands and never paste secret values into tickets, PR comments, or chat logs. ### Step 3 — Verify Container App @@ -238,6 +211,9 @@ curl -i "$HEALTH_URL" -H "Authorization: Bearer $BAD_TOKEN" Use this to temporarily disable auth — for example, to isolate whether a problem is in EasyAuth or in the proxy itself. +> [!WARNING] +> Disabling auth exposes the app endpoint to unauthenticated traffic. Use this only for short-lived troubleshooting in non-production environments, and re-enable auth immediately after validation. + ```bash az containerapp auth update \ --name "$CONTAINER_APP_NAME" \ From f7a1a348cab761772aabc2ca6e78b19ecf990c1e Mon Sep 17 00:00:00 2001 From: MarvelintheCloud Date: Mon, 25 May 2026 15:12:29 -0500 Subject: [PATCH 11/16] docs: improve APIM JWT test guidance - Sanitized ID variables; added app-role overwrite warning and Graph permission note. - Added cross-platform token copy tips. - Clarified CLI token identity vs Container App MI and added MI exec test command. 
--- docs/POC-security-the-apim.md | 46 +++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/docs/POC-security-the-apim.md b/docs/POC-security-the-apim.md index e8470bb..eeedd3e 100644 --- a/docs/POC-security-the-apim.md +++ b/docs/POC-security-the-apim.md @@ -71,7 +71,7 @@ Capture the managed identity's object ID: ```bash CA_PRINCIPAL_ID=$(az containerapp show \ -g "$RG" -n "$CA_NAME" \ - --query "identity.principalId" -o tsv) + --query "identity.principalId" -o tsv | tr -d '\r\n') echo "CA_PRINCIPAL_ID=$CA_PRINCIPAL_ID" ``` @@ -83,7 +83,7 @@ echo "CA_PRINCIPAL_ID=$CA_PRINCIPAL_ID" APP_NAME="apim-protected-api-poc" # 1. Create the app registration -APP_ID=$(az ad app create --display-name "$APP_NAME" --query "appId" -o tsv) +APP_ID=$(az ad app create --display-name "$APP_NAME" --query "appId" -o tsv | tr -d '\r\n') echo "APP_ID=$APP_ID" # 2. Create the service principal @@ -93,7 +93,7 @@ az ad sp create --id "$APP_ID" 1>/dev/null az ad app update --id "$APP_ID" --identifier-uris "api://$APP_ID" # 4. Add an app role -ROLE_ID=$(python3 -c "import uuid; print(uuid.uuid4())") +ROLE_ID=$(python3 -c "import uuid; print(uuid.uuid4())" | tr -d '\r\n') az ad app update --id "$APP_ID" --app-roles "[ { \"allowedMemberTypes\": [\"User\", \"Application\"], @@ -110,6 +110,9 @@ az ad app update --id "$APP_ID" --app-roles "[ az ad sp update --id "$APP_ID" --set appRoleAssignmentRequired=true ``` +> [!WARNING] +> `az ad app update --app-roles` replaces the app's full role list. If the app already has roles, merge them first instead of applying this single-role JSON as-is. + > [!NOTE] > For delegated user access (OAuth2 scopes), add a scope via **App registrations → [API App] → Expose an API → Add a scope**. App roles alone are sufficient for this POC. 
@@ -120,7 +123,7 @@ az ad sp update --id "$APP_ID" --set appRoleAssignmentRequired=true Get the service principal object ID of the protected API: ```bash -API_SP_OBJECT_ID=$(az ad sp show --id "$APP_ID" --query "id" -o tsv) +API_SP_OBJECT_ID=$(az ad sp show --id "$APP_ID" --query "id" -o tsv | tr -d '\r\n') echo "API_SP_OBJECT_ID=$API_SP_OBJECT_ID" ``` @@ -140,6 +143,23 @@ New-AzureADServiceAppRoleAssignment ` -Id $AppRoleId ``` +Or assign the role from Bash using Microsoft Graph via `az rest`: + +```bash +AssigneeObjectId=$(printf "%s" "$CA_PRINCIPAL_ID" | tr -d '\r\n') # Managed Identity principalId +ResourceObjectId=$(printf "%s" "$API_SP_OBJECT_ID" | tr -d '\r\n') # Service principal objectId of the API app +AppRoleId=$(printf "%s" "$ROLE_ID" | tr -d '\r\n') # App role GUID from Step 2 + +az rest \ + --method POST \ + --url "https://graph.microsoft.com/v1.0/servicePrincipals/${AssigneeObjectId}/appRoleAssignments" \ + --headers "Content-Type=application/json" \ + --body "{\"principalId\":\"${AssigneeObjectId}\",\"resourceId\":\"${ResourceObjectId}\",\"appRoleId\":\"${AppRoleId}\"}" +``` + +> [!NOTE] +> Graph role assignment via `az rest` requires directory permissions. If this call fails with authorization errors, run it as a tenant admin or use the Azure portal assignment flow. + > [!TIP] > You can also assign the role via the Azure portal: **Enterprise Applications → [API App] → Users and groups → Add assignment**. @@ -193,6 +213,18 @@ curl -i "$APIM_URL" \ -H "Authorization: Bearer $TOKEN" ``` +> [!NOTE] +> `az account get-access-token` uses your current Azure CLI identity (user or service principal), not the Container App managed identity. +> For this call to return `200`, the identity used by `az login` must also be assigned the `API.Caller` app role. 
+> If you want to validate the exact managed identity path instead, run from inside the Container App: +> +> ```bash +> az containerapp exec -g "$RG" -n "$CA_NAME" --command "sh -lc ' +> TOKEN=\$(curl -s \"\$IDENTITY_ENDPOINT?resource=api://$APP_ID&api-version=2019-08-01\" -H \"X-IDENTITY-HEADER: \$IDENTITY_HEADER\" | jq -r .access_token) +> curl -i \"$APIM_URL\" -H \"Authorization: Bearer \$TOKEN\" +> '" +> ``` + > [!TIP] > Paste the token into [jwt.io](https://jwt.io) and confirm `aud = api://` and `roles` contains `"API.Caller"`. @@ -227,7 +259,10 @@ Run each check in order. All five must pass. - [ ] Decode the `200` token at [jwt.io](https://jwt.io) and confirm: `aud = api://`, `iss = https://sts.windows.net//`, `roles` array contains `"API.Caller"` > [!TIP] -> `az account get-access-token --resource "api://$APP_ID" --query accessToken -o tsv | pbcopy` puts the token straight on the clipboard for jwt.io. +> Copy token to clipboard by platform: +> - macOS: `az account get-access-token --resource "api://$APP_ID" --query accessToken -o tsv | pbcopy` +> - Linux/WSL: `az account get-access-token --resource "api://$APP_ID" --query accessToken -o tsv | xclip -selection clipboard` +> - Windows PowerShell: `az account get-access-token --resource "api://$APP_ID" --query accessToken -o tsv | Set-Clipboard` ## Troubleshooting @@ -238,6 +273,7 @@ Run each check in order. All five must pass. 
| `401` from Container App but `200` from `az` CLI | MI not assigned the role | Check `CA_PRINCIPAL_ID` matches the MI object ID: `az containerapp show -g $RG -n $CA_NAME --query "identity.principalId"` | | `401` with `"IDX10511: Signature validation failed"` | Issuer URL mismatch | APIM policy `` must match the `iss` claim exactly — copy from jwt.io decode | | Role assignment succeeds but token still lacks `roles` | `appRoleAssignmentRequired` not set | Run `az ad sp update --id "$APP_ID" --set appRoleAssignmentRequired=true` | +| Role assignment call fails with Graph authorization error | Caller lacks Entra directory privileges | Use an admin account for `az rest` assignment or assign via portal Enterprise Applications UI | From e847c817be49191c34d9010c8abb879b942995df Mon Sep 17 00:00:00 2001 From: Nagendra Mishr Date: Tue, 26 May 2026 09:30:53 -0400 Subject: [PATCH 12/16] cleanup docs --- .github/copilot-instructions.md | 68 + docs/Glossary.md | 346 +-- docs/POC-ACA-Proxy-Security-Authorization.md | 74 +- docs/POC-Chargeback.md | 646 ++-- docs/POC-Failover-configuration.md | 944 +++--- docs/POC-OpenAI-Failover.md | 1798 +++++------ docs/POC-Priority-configuration.md | 1344 ++++----- docs/QUICKSTART.md | 164 +- docs/TABLE_OF_CONTENTS.md | 414 +-- taxonomy/concepts.json | 2810 +++++++++--------- 10 files changed, 4367 insertions(+), 4241 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 801d246..7f7ed5a 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -87,3 +87,71 @@ If the file doesn't exist, create it and document any important lessons learned - When a change requires modifications outside the immediate scope of what was requested, ask first. - When the user says "undo", revert ALL changes from the last action, not just some. 
+ +## definition of a gold standard document: + - A POC doc is complete when: A new user can run it in <5 minutes, Behavior is visible and verifiable, No section requires rereading to understand + - The reader can explain: what happened, why it happened, how to reproduce it + +## when writing POCs +POC docs must prioritize runnable usability over completeness; use direct engineer-to-engineer tone; no marketing language; always include TLDR with <5 min steps and expected outcome; include “what you will observe” as pure behavior bullets; separate sections strictly into setup (minimal prereqs), run (exact commands), verify (checklist mapping signals→meaning), deep dive (step-by-step execution flow), optional variants, and troubleshooting; prefer bullets over prose; avoid repetition and narrative phrasing; every config section must start with “what matters” and highlight only critical knobs (timeout, retry, backend behavior); always define observable signals (headers/logs/state changes) and map them explicitly; include execution cycles (cycle 1 fail, cycle 2 retry, final result); include mental model as simple state machine (select→fail→throttle→retry→recover); include minimal flow diagram (client→proxy→backend A fail→backend B success); verification must be checklist not table; troubleshooting must map symptom→cause→check; all claims must be reproducible and observable; avoid vague phrasing; front-load value in first 30%; reader must be able to run, see, and explain behavior without reading entire document + +## When generating a Reference Document, enforce the following rules: +1. Purpose: +- Produce the authoritative, canonical, single source of truth for the topic. +- Output must be complete, deterministic, and audit‑ready. +2. Language Rules: +- Use mandatory language: MUST, REQUIRED, MANDATORY, SHALL. +- Avoid ambiguity: do not use should, could, might, typically, generally. +- Use exact values, configurations, constraints, and specifications. 
+- No conversational tone. No filler. No speculation. +3. Required Document Structure (always in this order): +A. Document Metadata (Title, Version, Last Updated, Owner, Review Cycle, Compliance Tags) +B. Summary (what this defines, why it exists, who must follow it) +C. Scope & Applicability (in-scope, out-of-scope, dependencies) +D. Authoritative Specification (architecture, configurations, patterns, constraints, SLAs/SLOs, security requirements) +E. Reference Implementation (canonical diagrams, workflows, configuration blocks, API contracts) +F. Validation & Compliance (required tests, checks, evidence, audit artifacts) +G. Version History (changes, rationale, approvers) +4. Behavioral Rules: +- Never invent facts. Request missing parameters before finalizing. +- Ensure internal consistency across all sections. +- All examples must be canonical, valid, and copy‑paste‑ready. +- All diagrams, tables, and configs must be deterministic and aligned with the specification. +- No placeholders unless explicitly allowed by the user. +- No contradictions across sections. +5. Output Requirements: +- Produce a fully structured, complete document. +- Enforce strict formatting and section order. +- Ensure the document is suitable for governance, compliance, and long‑term reference. + +## When generating an Overview Document, enforce the following rules: +1. Purpose: +- Provide a high‑clarity, high‑signal summary of a system, solution, or domain. +- Communicate essential concepts, architecture, flows, and rationale without deep implementation detail. +- Serve as the onboarding and orientation artifact for new readers. +2. Language Rules: +- Use concise, precise, high‑signal language. +- Avoid ambiguity, filler, marketing language, or conversational tone. +- Use factual, neutral, technically accurate statements. +- Avoid mandatory language unless describing non‑negotiable constraints. +3. Required Document Structure (always in this order): +A. 
Document Metadata (Title, Version, Last Updated, Owner) +B. Overview Summary (what the system is, what problem it solves, why it exists) +C. Key Objectives (primary goals, outcomes, and value) +D. High‑Level Architecture (major components, interactions, boundaries) +E. Core Concepts (definitions, domain terms, key abstractions) +F. High‑Level Workflows (end‑to‑end flows, sequence summaries) +G. Key Constraints & Assumptions (technical, operational, business) +H. Integration Points (external systems, APIs, dependencies) +I. Non‑Goals (what is intentionally excluded) +J. Future Considerations (roadmap‑level items only) +4. Behavioral Rules: +- Never invent facts. Request missing parameters before finalizing. +- Ensure internal consistency across all sections. +- Keep all diagrams and workflows high‑level; no implementation detail. +- Do not include configuration, SLAs, or compliance details unless explicitly requested. +- No placeholders unless explicitly allowed by the user. +5. Output Requirements: +- Produce a complete, structured overview document. +- Maintain strict section order and formatting. +- Ensure the document is suitable for onboarding, orientation, and executive‑level understanding. diff --git a/docs/Glossary.md b/docs/Glossary.md index 5148ea4..03f37d8 100644 --- a/docs/Glossary.md +++ b/docs/Glossary.md @@ -1,174 +1,174 @@ -# SimpleL7Proxy — Glossary - -| | | -|---|---| -| **Version** | 1.0 | -| **Last Updated** | 2026-05-21 | -| **Owner** | Platform Engineering | -| **Review Cycle** | Updated with each feature release | - -## Summary - -This glossary defines every named concept, setting, and runtime behavior used across SimpleL7Proxy documentation. Each entry is grouped by the domain it belongs to — matching the ten domains in the [Table of Contents](TABLE_OF_CONTENTS.md) and the [machine-readable taxonomy](../taxonomy/concepts.json) — and links to the authoritative reference document where that concept is fully specified. 
- -**Who this is for:** anyone reading, writing, or reviewing SimpleL7Proxy documentation or configuration files. - -## TL;DR - -- Terms are grouped by domain, matching the [Table of Contents](TABLE_OF_CONTENTS.md) structure. -- Each entry links to the document where that concept is fully specified. -- Configuration setting names appear in `code` style; deprecated terms are marked explicitly. - -## Scope & Applicability - -| | | -|---|---| -| **In scope** | All named concepts, configuration settings, runtime behaviors, and HTTP headers documented in `SimpleL7Proxy/docs/`. | -| **Out of scope** | General Azure service terminology (App Configuration, Service Bus, Blob Storage) except where it directly intersects with proxy behavior. | -| **Dependencies** | [TABLE_OF_CONTENTS.md](TABLE_OF_CONTENTS.md) · [../taxonomy/concepts.json](../taxonomy/concepts.json) | - ---- - -## Request Lifecycle - -Concepts covering how a request moves from client ingress through the priority queue to worker dispatch. - -| Term | Definition | See Also | -|------|-----------|----------| -| `DefaultPriority` | Fallback priority level assigned when the request carries no matching priority header value. | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md) | -| Priority Level | Integer assigned to every request. Lower integer = higher dispatch precedence in the queue. | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md) | -| Priority Queue | Min-heap data structure ordered by priority level. Drives the order in which worker threads pick up requests. | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md) | -| TTL (Time-to-Live) | Total wall-clock budget for a request covering queue wait and all retry attempts. Expiry returns 412 to the client. | [TIMEOUTS.md](TIMEOUTS.md) | -| `Workers` | Count of concurrent proxy worker threads. Cold setting — the default of 10 is for local testing only. 
| [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md) | - ---- - -## Backend Management - -Concepts covering backend host configuration, health probing, and the selection pipeline. - -| Term | Definition | See Also | -|------|-----------|----------| -| Active Pool | The set of backend hosts currently eligible to receive traffic, filtered by rolling success rate threshold. | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | -| Connection String Format | Preferred per-host configuration using a semicolon-delimited `key=value` string (e.g., `host=…;probe=…;path=…`). Supports all modern options. | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | -| Direct Mode | Backend mode where the host is always treated as healthy and no probe is ever sent. Use for serverless or on-demand backends. | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | -| Health Poller | Background loop that probes each configured host at `PollInterval` ms and tracks rolling success rate and average latency. | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | -| `IterationMode` | Controls retry breadth. `SinglePass` tries each host at most once. `MultiPass` cycles up to `MaxAttempts` total. | [LOAD_BALANCING.md](LOAD_BALANCING.md) | -| Load Balance Mode | Determines host ordering within the candidate set: `roundrobin` (even), `latency` (fastest first), or `random`. | [LOAD_BALANCING.md](LOAD_BALANCING.md) | -| Path Filter | Stage 1 of backend selection. Specific-path hosts are checked first; catch-all hosts receive requests that match no specific path. | [LOAD_BALANCING.md](LOAD_BALANCING.md) | -| Shared Iterator | A single load-balance iterator shared across all concurrent requests to the same path, enabling strict round-robin fairness. | [LOAD_BALANCING.md](LOAD_BALANCING.md) | -| Success Rate | Rolling percentage of successful probe responses for a host. Hosts that fall below `SuccessRate` leave the active pool until they recover. 
| [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | - ---- - -## Reliability - -Concepts covering circuit breaking, retry, requeue, and the timeout model. - -| Term | Definition | See Also | -|------|-----------|----------| -| `AcceptableStatusCodes` | HTTP status codes from backends that are forwarded directly to the client and not counted as circuit-breaker failures. | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | -| Auto-Recovery | The circuit breaker closes automatically once all failures age out of the sliding window. No manual action is required. | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | -| Circuit Breaker | Per-host failure counter with a sliding time window. Opens when failures exceed `CBErrorThreshold`; skips that host in the selection pipeline. | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | -| Progressive Delay | Artificial per-request delay (100 – 500 ms) added as a host's failure count approaches the open threshold, slowing traffic before the circuit trips. | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | -| Requeue | Returning a request to the priority queue after all backends return 429 with `S7PREQUEUE: true`, using the shortest `Retry-After` seen across all backends. | [LOAD_BALANCING.md](LOAD_BALANCING.md) | -| `Timeout` | Per-host-attempt window in milliseconds. Resets on each retry. Effective limit per attempt = `min(remaining TTL, Timeout)`. | [TIMEOUTS.md](TIMEOUTS.md) | - -> [!NOTE] -> **Circuit breaker vs. active pool:** A host leaves the active pool when its *probe success rate* drops below `SuccessRate`. A circuit breaker opens when *live request failures* exceed `CBErrorThreshold`. Both mechanisms can remove a host independently. - ---- - -## Request Governance - -Concepts covering the validation pipeline, user profiles, and priority mapping. - -| Term | Definition | See Also | -|------|-----------|----------| -| App ID Allowlist | File or URL returning permitted Entra Application IDs. Enforced at step 1 of the validation pipeline, before any other check. 
| [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md) | -| Priority Mapping | Maps an incoming request header value to an internal priority integer and allocates dedicated worker threads to that tier. | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md) | -| Priority Workers | `PriorityLevel:WorkerCount` pairs that reserve dedicated worker threads for each priority level, ensuring high-priority traffic always has capacity. | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md) | -| User Profile | Per-user JSON object loaded periodically from a URL or file. Drives priority assignment, async configuration, custom header injection, and throttling. | [USER_PROFILES.md](USER_PROFILES.md) | - -> [!TIP] -> User profiles reload on a configurable interval (default 1 hour) without a proxy restart. Suspend a user by adding their ID to the suspended-users list — it takes effect on the next reload cycle. - ---- - -## Async Mode - -Concepts covering long-running request handling decoupled from the HTTP connection. - -| Term | Definition | See Also | -|------|-----------|----------| -| `AsyncTimeout` | Maximum backend processing time in milliseconds once a request is in async mode (default 30 minutes). | [TIMEOUTS.md](TIMEOUTS.md) | -| `AsyncTriggerTimeout` | Milliseconds elapsed since enqueue before the proxy releases the client with a 202 response and continues in the background. | [TIMEOUTS.md](TIMEOUTS.md) | -| `AsyncTTLSecs` | Retention period in seconds for the async result blob after processing completes. Also sets the SAS token lifetime. | [AsyncOperation.md](AsyncOperation.md) | -| Blob Lifecycle Policy | Azure Storage lifecycle management rule that automatically deletes result blobs after `BlobRetentionDays`. Must be configured in the storage account — the proxy setting alone does not delete blobs. 
| [StorageBlobConfig.md](StorageBlobConfig.md) | - -> [!WARNING] -> Async mode requires three simultaneous opt-ins: the proxy-wide `AsyncModeEnabled` flag, an `async-config` block in the user profile, and the `S7PAsyncMode` header on the individual request. All three MUST be present for async upgrade to occur. - ---- - -## Observability - -Concepts covering telemetry events, sinks, token tracking, and health endpoints. - -| Term | Definition | See Also | -|------|-----------|----------| -| CompositeEventClient | Fan-out dispatcher that sends every serialized `ProxyEvent` to all registered telemetry sinks simultaneously. Custom sinks register by implementing `IEventClient + IHostedService`. | [OBSERVABILITY.md](OBSERVABILITY.md) | -| ProxyEvent | Per-request key/value dictionary capturing HTTP status, queue duration, processing duration, backend host, and token counts. | [OBSERVABILITY.md](OBSERVABILITY.md) | -| Sidecar Mode | Deployment pattern where a separate HealthProbe container on port 9000 handles Kubernetes probes. The proxy pushes its health state to the sidecar every second, isolating probe responses from proxy load. | [HEALTH_CHECKING.md](HEALTH_CHECKING.md) | -| Token Telemetry | Prompt and completion token counts extracted from SSE streams in flight by the `processor=OpenAI` stream handler. Logged per request without buffering the full response. | [OBSERVABILITY.md](OBSERVABILITY.md) | - ---- - -## Configuration Management - -Concepts covering how settings reach the proxy, when they take effect, and how they are organized. - -| Term | Definition | See Also | -|------|-----------|----------| -| Cold Setting | Configuration value that takes effect only after a container restart. Examples: `Workers`, `AsyncModeEnabled`. 
| [CONFIGURATION_SETTINGS.md](CONFIGURATION_SETTINGS.md) | -| Composite Connection String | Semicolon-delimited `key=value` string encoding multiple related settings in a single environment variable (e.g., `Host1`, `AsyncBlobStorageConfig`). | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | -| Hidden Setting | Runtime-derived value never published to Azure App Configuration. Typically computed from a composite connection string at startup. | [CONFIGURATION_SETTINGS.md](CONFIGURATION_SETTINGS.md) | -| Sentinel | The `Warm:Sentinel` key in Azure App Configuration. Updating its value to anything new triggers hot-reload of all Warm settings across all running proxy instances. | [AZURE_APP_CONFIGURATION.md](AZURE_APP_CONFIGURATION.md) | -| Warm Setting | Configuration value hot-reloaded from Azure App Configuration within ~30 seconds when the Sentinel key changes. No container restart required. | [CONFIGURATION_SETTINGS.md](CONFIGURATION_SETTINGS.md) | - -> [!TIP] -> To apply a batch of Warm setting changes atomically, update all values in Azure App Configuration first, then bump the Sentinel key once. All instances reload together. - ---- - -## Authentication and Security - -Concepts covering how the proxy authenticates to backends and restricts inbound callers. - -| Term | Definition | See Also | -|------|-----------|----------| -| Keyless Auth | Using `usemi=true` in a host connection string to eliminate static API keys. The proxy acquires OAuth2 Bearer tokens from Managed Identity at runtime. | [AI_FOUNDRY_INTEGRATION.md](AI_FOUNDRY_INTEGRATION.md) | -| Managed Identity | Azure-managed credential attached to the container. Used for keyless authentication to backends, App Configuration, Event Hubs, Blob Storage, and Service Bus — no secrets stored. | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | - ---- - -## Protocol and Headers - -Named HTTP signals that cross the client-proxy and proxy-backend boundaries. 
- -| Term | Direction | Definition | See Also | -|------|-----------|-----------|----------| -| `S7PAsyncMode` | Client → proxy | Per-request opt-in header that enables async mode for that call. Default header name is configurable. | [AsyncOperation.md](AsyncOperation.md) | -| `S7PDEBUG` | Client → proxy | Enables debug-level response headers on the proxied response for a single request. | [RESPONSE_CODES.md](RESPONSE_CODES.md) | -| `S7PPriorityKey` | Client → proxy | Carries the caller's priority tier value. Mapped via `PriorityKeys` to an internal priority integer. | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md) | -| `S7PREQUEUE` | Backend → proxy | Response header a backend sets on a 429 reply to signal the proxy MUST requeue the request rather than advance to the next host. | [RESPONSE_CODES.md](RESPONSE_CODES.md) | -| `S7PTimeout` | Client → proxy | Per-request override for the host-attempt timeout in milliseconds. | [TIMEOUTS.md](TIMEOUTS.md) | -| `S7PTTL` | Client → proxy | Per-request override for the total TTL budget in seconds. | [TIMEOUTS.md](TIMEOUTS.md) | - ---- - -## Version History - -| Version | Date | Changes | -|---------|------|---------| +# SimpleL7Proxy — Glossary + +| | | +|---|---| +| **Version** | 1.0 | +| **Last Updated** | 2026-05-21 | +| **Owner** | Platform Engineering | +| **Review Cycle** | Updated with each feature release | + +## Summary + +This glossary defines every named concept, setting, and runtime behavior used across SimpleL7Proxy documentation. Each entry is grouped by the domain it belongs to — matching the ten domains in the [Table of Contents](TABLE_OF_CONTENTS.md) and the [machine-readable taxonomy](../taxonomy/concepts.json) — and links to the authoritative reference document where that concept is fully specified. + +**Who this is for:** anyone reading, writing, or reviewing SimpleL7Proxy documentation or configuration files. 
+ +## TL;DR + +- Terms are grouped by domain, matching the [Table of Contents](TABLE_OF_CONTENTS.md) structure. +- Each entry links to the document where that concept is fully specified. +- Configuration setting names appear in `code` style; deprecated terms are marked explicitly. + +## Scope & Applicability + +| | | +|---|---| +| **In scope** | All named concepts, configuration settings, runtime behaviors, and HTTP headers documented in `SimpleL7Proxy/docs/`. | +| **Out of scope** | General Azure service terminology (App Configuration, Service Bus, Blob Storage) except where it directly intersects with proxy behavior. | +| **Dependencies** | [TABLE_OF_CONTENTS.md](TABLE_OF_CONTENTS.md) · [../taxonomy/concepts.json](../taxonomy/concepts.json) | + +--- + +## Request Lifecycle + +Concepts covering how a request moves from client ingress through the priority queue to worker dispatch. + +| Term | Definition | See Also | +|------|-----------|----------| +| `DefaultPriority` | Fallback priority level assigned when the request carries no matching priority header value. | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md) | +| Priority Level | Integer assigned to every request. Lower integer = higher dispatch precedence in the queue. | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md) | +| Priority Queue | Min-heap data structure ordered by priority level. Drives the order in which worker threads pick up requests. | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md) | +| TTL (Time-to-Live) | Total wall-clock budget for a request covering queue wait and all retry attempts. Expiry returns 412 to the client. | [TIMEOUTS.md](TIMEOUTS.md) | +| `Workers` | Count of concurrent proxy worker threads. Cold setting — the default of 10 is for local testing only. | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md) | + +--- + +## Backend Management + +Concepts covering backend host configuration, health probing, and the selection pipeline. 
+ +| Term | Definition | See Also | +|------|-----------|----------| +| Active Pool | The set of backend hosts currently eligible to receive traffic, filtered by rolling success rate threshold. | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | +| Connection String Format | Preferred per-host configuration using a semicolon-delimited `key=value` string (e.g., `host=…;probe=…;path=…`). Supports all modern options. | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | +| Direct Mode | Backend mode where the host is always treated as healthy and no probe is ever sent. Use for serverless or on-demand backends. | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | +| Health Poller | Background loop that probes each configured host at `PollInterval` ms and tracks rolling success rate and average latency. | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | +| `IterationMode` | Controls retry breadth. `SinglePass` tries each host at most once. `MultiPass` cycles up to `MaxAttempts` total. | [LOAD_BALANCING.md](LOAD_BALANCING.md) | +| Load Balance Mode | Determines host ordering within the candidate set: `roundrobin` (even), `latency` (fastest first), or `random`. | [LOAD_BALANCING.md](LOAD_BALANCING.md) | +| Path Filter | Stage 1 of backend selection. Specific-path hosts are checked first; catch-all hosts receive requests that match no specific path. | [LOAD_BALANCING.md](LOAD_BALANCING.md) | +| Shared Iterator | A single load-balance iterator shared across all concurrent requests to the same path, enabling strict round-robin fairness. | [LOAD_BALANCING.md](LOAD_BALANCING.md) | +| Success Rate | Rolling percentage of successful probe responses for a host. Hosts that fall below `SuccessRate` leave the active pool until they recover. | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | + +--- + +## Reliability + +Concepts covering circuit breaking, retry, requeue, and the timeout model. 
+ +| Term | Definition | See Also | +|------|-----------|----------| +| `AcceptableStatusCodes` | HTTP status codes from backends that are forwarded directly to the client and not counted as circuit-breaker failures. | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | +| Auto-Recovery | The circuit breaker closes automatically once all failures age out of the sliding window. No manual action is required. | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | +| Circuit Breaker | Per-host failure counter with a sliding time window. Opens when failures exceed `CBErrorThreshold`; skips that host in the selection pipeline. | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | +| Progressive Delay | Artificial per-request delay (100 – 500 ms) added as a host's failure count approaches the open threshold, slowing traffic before the circuit trips. | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | +| Requeue | Returning a request to the priority queue after all backends return 429 with `S7PREQUEUE: true`, using the shortest `Retry-After` seen across all backends. | [LOAD_BALANCING.md](LOAD_BALANCING.md) | +| `Timeout` | Per-host-attempt window in milliseconds. Resets on each retry. Effective limit per attempt = `min(remaining TTL, Timeout)`. | [TIMEOUTS.md](TIMEOUTS.md) | + +> [!NOTE] +> **Circuit breaker vs. active pool:** A host leaves the active pool when its *probe success rate* drops below `SuccessRate`. A circuit breaker opens when *live request failures* exceed `CBErrorThreshold`. Both mechanisms can remove a host independently. + +--- + +## Request Governance + +Concepts covering the validation pipeline, user profiles, and priority mapping. + +| Term | Definition | See Also | +|------|-----------|----------| +| App ID Allowlist | File or URL returning permitted Entra Application IDs. Enforced at step 1 of the validation pipeline, before any other check. 
| [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md) | +| Priority Mapping | Maps an incoming request header value to an internal priority integer and allocates dedicated worker threads to that tier. | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md) | +| Priority Workers | `PriorityLevel:WorkerCount` pairs that reserve dedicated worker threads for each priority level, ensuring high-priority traffic always has capacity. | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md) | +| User Profile | Per-user JSON object loaded periodically from a URL or file. Drives priority assignment, async configuration, custom header injection, and throttling. | [USER_PROFILES.md](USER_PROFILES.md) | + +> [!TIP] +> User profiles reload on a configurable interval (default 1 hour) without a proxy restart. Suspend a user by adding their ID to the suspended-users list — it takes effect on the next reload cycle. + +--- + +## Async Mode + +Concepts covering long-running request handling decoupled from the HTTP connection. + +| Term | Definition | See Also | +|------|-----------|----------| +| `AsyncTimeout` | Maximum backend processing time in milliseconds once a request is in async mode (default 30 minutes). | [TIMEOUTS.md](TIMEOUTS.md) | +| `AsyncTriggerTimeout` | Milliseconds elapsed since enqueue before the proxy releases the client with a 202 response and continues in the background. | [TIMEOUTS.md](TIMEOUTS.md) | +| `AsyncTTLSecs` | Retention period in seconds for the async result blob after processing completes. Also sets the SAS token lifetime. | [AsyncOperation.md](AsyncOperation.md) | +| Blob Lifecycle Policy | Azure Storage lifecycle management rule that automatically deletes result blobs after `BlobRetentionDays`. Must be configured in the storage account — the proxy setting alone does not delete blobs. 
| [StorageBlobConfig.md](StorageBlobConfig.md) | + +> [!WARNING] +> Async mode requires three simultaneous opt-ins: the proxy-wide `AsyncModeEnabled` flag, an `async-config` block in the user profile, and the `S7PAsyncMode` header on the individual request. All three MUST be present for async upgrade to occur. + +--- + +## Observability + +Concepts covering telemetry events, sinks, token tracking, and health endpoints. + +| Term | Definition | See Also | +|------|-----------|----------| +| CompositeEventClient | Fan-out dispatcher that sends every serialized `ProxyEvent` to all registered telemetry sinks simultaneously. Custom sinks register by implementing `IEventClient + IHostedService`. | [OBSERVABILITY.md](OBSERVABILITY.md) | +| ProxyEvent | Per-request key/value dictionary capturing HTTP status, queue duration, processing duration, backend host, and token counts. | [OBSERVABILITY.md](OBSERVABILITY.md) | +| Sidecar Mode | Deployment pattern where a separate HealthProbe container on port 9000 handles Kubernetes probes. The proxy pushes its health state to the sidecar every second, isolating probe responses from proxy load. | [HEALTH_CHECKING.md](HEALTH_CHECKING.md) | +| Token Telemetry | Prompt and completion token counts extracted from SSE streams in flight by the `processor=OpenAI` stream handler. Logged per request without buffering the full response. | [OBSERVABILITY.md](OBSERVABILITY.md) | + +--- + +## Configuration Management + +Concepts covering how settings reach the proxy, when they take effect, and how they are organized. + +| Term | Definition | See Also | +|------|-----------|----------| +| Cold Setting | Configuration value that takes effect only after a container restart. Examples: `Workers`, `AsyncModeEnabled`. 
| [CONFIGURATION_SETTINGS.md](CONFIGURATION_SETTINGS.md) | +| Composite Connection String | Semicolon-delimited `key=value` string encoding multiple related settings in a single environment variable (e.g., `Host1`, `AsyncBlobStorageConfig`). | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | +| Hidden Setting | Runtime-derived value never published to Azure App Configuration. Typically computed from a composite connection string at startup. | [CONFIGURATION_SETTINGS.md](CONFIGURATION_SETTINGS.md) | +| Sentinel | The `Warm:Sentinel` key in Azure App Configuration. Updating its value to anything new triggers hot-reload of all Warm settings across all running proxy instances. | [AZURE_APP_CONFIGURATION.md](AZURE_APP_CONFIGURATION.md) | +| Warm Setting | Configuration value hot-reloaded from Azure App Configuration within ~30 seconds when the Sentinel key changes. No container restart required. | [CONFIGURATION_SETTINGS.md](CONFIGURATION_SETTINGS.md) | + +> [!TIP] +> To apply a batch of Warm setting changes atomically, update all values in Azure App Configuration first, then bump the Sentinel key once. All instances reload together. + +--- + +## Authentication and Security + +Concepts covering how the proxy authenticates to backends and restricts inbound callers. + +| Term | Definition | See Also | +|------|-----------|----------| +| Keyless Auth | Using `usemi=true` in a host connection string to eliminate static API keys. The proxy acquires OAuth2 Bearer tokens from Managed Identity at runtime. | [AI_FOUNDRY_INTEGRATION.md](AI_FOUNDRY_INTEGRATION.md) | +| Managed Identity | Azure-managed credential attached to the container. Used for keyless authentication to backends, App Configuration, Event Hubs, Blob Storage, and Service Bus — no secrets stored. | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | + +--- + +## Protocol and Headers + +Named HTTP signals that cross the client-proxy and proxy-backend boundaries. 
+ +| Term | Direction | Definition | See Also | +|------|-----------|-----------|----------| +| `S7PAsyncMode` | Client → proxy | Per-request opt-in header that enables async mode for that call. The header name is configurable; `S7PAsyncMode` is the default. | [AsyncOperation.md](AsyncOperation.md) | +| `S7PDEBUG` | Client → proxy | Enables debug-level response headers on the proxied response for a single request. | [RESPONSE_CODES.md](RESPONSE_CODES.md) | +| `S7PPriorityKey` | Client → proxy | Carries the caller's priority tier value. Mapped via `PriorityKeys` to an internal priority integer. | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md) | +| `S7PREQUEUE` | Backend → proxy | Response header a backend sets on a 429 reply to signal the proxy MUST requeue the request rather than advance to the next host. | [RESPONSE_CODES.md](RESPONSE_CODES.md) | +| `S7PTimeout` | Client → proxy | Per-request override for the host-attempt timeout in milliseconds. | [TIMEOUTS.md](TIMEOUTS.md) | +| `S7PTTL` | Client → proxy | Per-request override for the total TTL budget in seconds. | [TIMEOUTS.md](TIMEOUTS.md) | + +--- + +## Version History + +| Version | Date | Changes | +|---------|------|---------| | 1.0 | 2026-05-21 | Initial gold-standard release. Reorganized by domain, added See Also links, added callouts, added protocol headers section. | \ No newline at end of file diff --git a/docs/POC-ACA-Proxy-Security-Authorization.md b/docs/POC-ACA-Proxy-Security-Authorization.md index 07920a8..b91e9c9 100644 --- a/docs/POC-ACA-Proxy-Security-Authorization.md +++ b/docs/POC-ACA-Proxy-Security-Authorization.md @@ -8,11 +8,14 @@ 2. Create two Entra apps for this hop: ACA API app and client caller app. 3. Enable ACA authentication and test both success and failure paths. +EasyAuth is the enforcement layer. It validates Entra-issued tokens and blocks requests that do not meet the configured requirements. + ## What you will observe -- A token minted for `api://` is accepted by ACA.
-- Requests without a token or with the wrong audience are rejected. -- The client identity can be restricted through allowed applications and app role assignment. +- `curl https://<ACA_FQDN>/` with no `Authorization` header → `401 Unauthorized`; the proxy never processes the request. +- `curl` with a token for the wrong audience → `401`. +- `curl` with a valid bearer token scoped to `api://<ACA_APP_ID>` → the request reaches the proxy, which returns its normal response. +- The proxy receives no unauthenticated traffic at any point. ## Reference @@ -30,6 +33,13 @@ ## Setup +**What matters:** `--enable-id-token-issuance true`, a valid client secret, and `Return401` as the unauthenticated action. + +| Item | Value used in this POC | +| :--- | :--- | +| Token audience | `api://<ACA_APP_ID>` | +| Unauthenticated action | `Return401` | + ### 1) Create ACA API app registration in Entra **Rule: the ACA proxy must expose its own audience and scope for inbound client tokens.** @@ -116,6 +126,18 @@ Equivalent portal checks: > [!WARNING] > Use tenant ID for `-t`; do not use a service principal object ID in this field. +Verify the configuration was applied: + +```bash +az containerapp auth show \ + --name "<CONTAINER_APP_NAME>" \ + --resource-group "<RESOURCE_GROUP>" \ + --query "{enabled:platform.enabled, provider:identityProviders.azureActiveDirectory.enabled}" \ + -o table +``` + +Both columns must show `True`. If `identityProviders` is empty, re-run Step 3. + ### 4) Optional app-role assignment for client service principal **Rule: if you enforce role-based app auth, assign `API.Caller` to the client service principal on ACA API.** @@ -128,13 +150,38 @@ $acaRoleId = "" New-MgServicePrincipalAppRoleAssignment -ServicePrincipalId $clientSpId -PrincipalId $clientSpId -ResourceId $acaResourceSpId -AppRoleId $acaRoleId ``` +## Run + +```bash +APP_FQDN="https://<APP_NAME>.<ENV_ID>.<REGION>.azurecontainerapps.io" + +# 1. No token — expect 401 +curl -i "$APP_FQDN" + +# 2.
Acquire a token scoped to the proxy's app registration (set APP_ID to the ACA API app's client ID first) +TOKEN=$(az account get-access-token \ + --resource "api://$APP_ID" \ + --query accessToken -o tsv) + +# 3. Call with token — expect 200 +curl -i "$APP_FQDN" \ + -H "Authorization: Bearer $TOKEN" +``` + ## Full flow -```mermaid -flowchart LR - A["Client App
SimpleL7Proxy-Client"] -->|"Bearer token aud=api://ACA_APP_ID"| B["ACA Ingress + Easy Auth"] - B -->|"Validate audience + app constraints"| C["SimpleL7Proxy in ACA"] - D["ACA API App
SimpleL7Proxy-ACA-API"] -. "Defines audience, scope, and optional role" .-> B +```text +API client / service + │ + │ Authorization: Bearer <token> + ▼ +ACA EasyAuth sidecar + ├─ No token ──► 401 + ├─ Wrong audience ──► 401 + └─ Valid token + ▼ + SimpleL7Proxy + (receives only authenticated requests) ``` ## Worked example @@ -199,3 +246,14 @@ Expected: `401 Unauthorized` due to audience mismatch. - [scripts/console2caSetup.sh](../scripts/console2caSetup.sh) - [scripts/enableContainerAppAuth.sh](../scripts/enableContainerAppAuth.sh) - [POC-APIM-Security-Authorization.md](POC-APIM-Security-Authorization.md) + +## Troubleshooting + +| Symptom | Likely cause | Fix | +| :--- | :--- | :--- | +| `401` on every request | ACA auth not enabled, or wrong audience configured | Run `az containerapp auth show` and confirm `enabled=True` and `allowedAudiences` includes `api://<ACA_APP_ID>` | +| `401` with a seemingly valid token | Token `aud` does not match `api://<ACA_APP_ID>` | Re-acquire token: `az account get-access-token --resource api://<ACA_APP_ID>` | +| `503 Service Unavailable` after enabling auth | `identityProviders` block is empty in auth config | Re-run `enableContainerAppAuth.sh` (Step 3) and re-run `az containerapp auth show` to confirm both columns are `True` | +| Requests pass without a token | Unauthenticated action is not set to `Return401` | In the portal: Container Apps → Authentication → set unauthenticated requests to `HTTP 401`; or re-run the script | +| Valid token still results in `401` | Client app ID not in allowed applications list | Add `<CLIENT_APP_ID>` to the allowed applications list in ACA auth config | +| ACA auth config shows enabled but token check fails | Client secret expired or rotated | Generate a new client secret for the ACA API app registration and update the ACA auth config | diff --git a/docs/POC-Chargeback.md b/docs/POC-Chargeback.md index 2bc5052..56fabfa 100644 --- a/docs/POC-Chargeback.md +++ b/docs/POC-Chargeback.md @@ -1,323 +1,323 @@ -# POC: Token-Level Chargeback - -## Overview - -This
POC shows how SimpleL7Proxy captures token usage from streaming Azure OpenAI responses and emits it as structured telemetry — without buffering the response or adding meaningful latency. Once the data is in Application Insights, a single KQL query can break down consumption by user, priority tier, or backend, giving you the raw numbers needed for internal chargeback or cost reporting. - -The goal is to verify that: - -1. Token counts (`prompt_tokens`, `completion_tokens`, `total_tokens`) are extracted from the SSE stream and appear in Application Insights custom dimensions. -2. The `userId` header flows through to telemetry, so consumption can be attributed to an individual caller. -3. A KQL query can aggregate total tokens per user over a time window. - -The LLM Simulator covers all three cases. Its sample files return real `usage` blocks in the same format Azure OpenAI uses, so the proxy's stream processor extracts and logs the same fields it would against a real endpoint. - ---- - -## How it works - -The proxy includes stream processors that read the SSE or JSON response stream on-the-fly and extract token usage without buffering the full response. The processor to use depends on the provider (LLM model): - -| Provider | `processor=` value | Usage fields logged | -|----------|--------------------|---------------------| -| Azure OpenAI / OpenAI | `OpenAI` | `Usage.Prompt_Tokens`, `Usage.Completion_Tokens`, `Usage.Total_Tokens` | -| Anthropic | `AllUsage-2` | `Usage.Input_Tokens`, `Usage.Output_Tokens` | -| Google Gemini | `MultiLineAllUsage` | `Usage.PromptTokenCount`, `Usage.CandidatesTokenCount`, `Usage.TotalTokenCount` | - -All processors attach their extracted values to the `ProxyEvent` for that request and write them to every configured telemetry sink — Application Insights, Event Hubs, and the local file logger all receive the same token fields. This POC focuses on Application Insights. For Event Hubs see the tuning section below. - -
-Custom dimensions emitted per request - -| Custom Dimension | Content | -|------------------|---------| -| `Usage.Prompt_Tokens` | Tokens consumed by the input prompt | -| `Usage.Completion_Tokens` | Tokens generated in the response | -| `Usage.Total_Tokens` | Sum of prompt + completion | -| `S7P_RequestId` | Unique request correlation ID | -| `S7P_Priority` | Priority queue the request was assigned to | -| `BackendHost` | Backend URL that served the request | - -
- -The `userId` header value is forwarded to the backend and appears in request telemetry, enabling per-user attribution. - -> **Note:** Token extraction requires the appropriate `processor=` value on the backend host configuration (see table above). Without it, the proxy forwards the stream transparently but does not parse usage. - ---- - -## Prerequisites - -- SimpleL7Proxy running locally or on ACA, pointed at the LLM Simulator. -- `processor=OpenAI` set on the backend host (see [Backend Configuration](#backend-configuration) below). -- Application Insights connected via `APPINSIGHTS_CONNECTIONSTRING`. -- The LLM Simulator deployed as an Azure Function. See [`test/LLMSimulator/Readme.md`](../test/LLMSimulator/Readme.md) — the fastest path is the portal ZIP deploy. Verify it's up: - ```bash - curl https://.azurewebsites.net/api/health - # → 200 OK - ``` - ---- - -## Backend Configuration - -
-Direct backend - -Set one `Host` environment variable per provider. The `path=` prefix tells the proxy which incoming URL paths belong to that host, `processor=` selects the right token extractor, and `mode=direct` disables health probing (appropriate for Azure Functions which scale to zero). Only configure the hosts you actually need — all three are not required. - -```bash -# Azure OpenAI — handles requests to /openai/... -export Host1="host=https://.azurewebsites.net;mode=direct;path=/openai;processor=OpenAI" - -# Anthropic — handles requests to /anthropic/... -export Host2="host=https://.azurewebsites.net;mode=direct;path=/anthropic;processor=AllUsage-2" - -# Google Gemini — handles requests to /v1beta/... -export Host3="host=https://.azurewebsites.net;mode=direct;path=/v1beta;processor=MultiLineAllUsage" -``` - -
- -
-APIM - -With APIM the processor is not set in the host config. Instead, APIM returns it as the `TOKENPROCESSOR` response header. The proxy reads that header from each `200 OK` response and selects the processor dynamically — useful when a single APIM gateway fronts multiple models and the policy knows which backend was actually called. - -The included policy already does this in its `` block: - -```xml - - - MultiLineAllUsage - - ... - -``` - -Change the value to match the model family the policy is routing to (`OpenAI`, `AllUsage-2`, or `MultiLineAllUsage`). If the policy routes to multiple providers, use a policy expression to set it conditionally based on whichever backend was selected. - -The host config does not need a `processor=` value — the header overrides it at runtime. Use `mode=apim` with a probe path so the proxy health-checks the gateway: - -```bash -export Host1="host=https://.azure-api.net;mode=apim;probe=/status-0123456789abcdef" -``` - -
- -```bash -export APPINSIGHTS_CONNECTIONSTRING="InstrumentationKey=..." -export Workers=5 -dotnet run --project src/SimpleL7Proxy -``` - ---- - -## Sending Test Requests - -The simulator returns deterministic responses with realistic `usage` blocks — the same JSON structure the real providers return. Because the token counts are fixed, you can verify telemetry exactly: if the KQL query shows 1058 total tokens for an OpenAI call, the stream processor is working correctly end-to-end. Send at least a few requests across two different `userId` values so the chargeback query has something meaningful to aggregate. - -
-curl commands - -**Azure OpenAI (`processor=OpenAI`) — 58 prompt / 1000 completion / 1058 total:** -```bash -curl -i \ - -H "userId: alice" \ - -H "Content-Type: application/json" \ - -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hello"}],"stream":true}' \ - "http://localhost:8000/openai/deployments/gpt-4o-mini/chat/completions" -``` - -**Anthropic (`processor=AllUsage-2`) — 10 input / 35 output:** -```bash -curl -i \ - -H "userId: alice" \ - -H "Content-Type: application/json" \ - -d '{"model":"claude-sonnet-3-5","messages":[{"role":"user","content":"hello"}]}' \ - "http://localhost:8000/anthropic/v1/messages" -``` - -**Gemini (`processor=MultiLineAllUsage`) — 6 prompt / 19 candidates / 1465 total (includes thinking tokens):** -```bash -curl -i \ - -H "userId: alice" \ - -H "Content-Type: application/json" \ - -d '{"contents":[{"role":"user","parts":[{"text":"hello"}]}]}' \ - "http://localhost:8000/v1beta/models/gemini-2.5-pro:generateContent" -``` - -**Batch — two users, multiple requests (OpenAI):** -```bash -for i in {1..5}; do - curl -s -o /dev/null \ - -H "userId: alice" \ - -H "Content-Type: application/json" \ - -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hello"}],"stream":true}' \ - "http://localhost:8000/openai/deployments/gpt-4o-mini/chat/completions" & -done - -for i in {1..3}; do - curl -s -o /dev/null \ - -H "userId: bob" \ - -H "Content-Type: application/json" \ - -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hello"}],"stream":true}' \ - "http://localhost:8000/openai/deployments/gpt-4o-mini/chat/completions" & -done - -wait -echo "Done" -``` - -
- -After a few seconds, the events will appear in Application Insights. - ---- - -## Verifying the Data - -Now that the proxy is running and sample requests have been sent, it's time to verify that token data is flowing through correctly. The proxy writes a `customEvent` to Application Insights for every completed request, with token counts in `customDimensions`. Because the simulator returns fixed token counts, the numbers are deterministic — if the results match the expected values below, the full pipeline (stream parsing → telemetry emission → ingestion) is confirmed working. - -The field names vary by provider: Azure OpenAI uses `Usage.Prompt_Tokens` / `Usage.Completion_Tokens` / `Usage.Total_Tokens`; Anthropic uses `Usage.Input_Tokens` / `Usage.Output_Tokens`; Gemini uses `Usage.PromptTokenCount` / `Usage.CandidatesTokenCount` / `Usage.TotalTokenCount`. The queries and log checks below use the OpenAI fields — adapt the field names if you're testing a different provider. - -
-Application Insights - -Open the Log Analytics workspace linked to your Application Insights resource and run: - -```kusto -customEvents -| where timestamp > ago(1h) -| where customDimensions contains "Usage.Total_Tokens" -| project - timestamp, - UserId = tostring(customDimensions["userId"]), - Priority = tostring(customDimensions["S7P_Priority"]), - Backend = tostring(customDimensions["BackendHost"]), - PromptTokens = toint(customDimensions["Usage.Prompt_Tokens"]), - CompTokens = toint(customDimensions["Usage.Completion_Tokens"]), - TotalTokens = toint(customDimensions["Usage.Total_Tokens"]) -| summarize - Requests = count(), - TotalTokens = sum(TotalTokens), - PromptTokens = sum(PromptTokens), - CompTokens = sum(CompTokens) - by UserId, Priority -| order by TotalTokens desc -``` - -Expected result for the batch above (simulator returns 1058 tokens per call): - -| UserId | Priority | Requests | TotalTokens | PromptTokens | CompTokens | -|--------|----------|----------|-------------|--------------|------------| -| alice | 1 | 5 | 5290 | 290 | 5000 | -| bob | 1 | 3 | 3174 | 174 | 3000 | - -To break down by backend — useful when multiple deployments serve different tiers: - -```kusto -customEvents -| where timestamp > ago(1h) -| where customDimensions contains "Usage.Total_Tokens" -| summarize - TotalTokens = sum(toint(customDimensions["Usage.Total_Tokens"])), - Requests = count() - by UserId = tostring(customDimensions["userId"]), - Backend = tostring(customDimensions["BackendHost"]) -| order by TotalTokens desc -``` - -
- -
-Event Hubs - -Set `EVENT_LOGGERS=eventhub` and `EVENTHUB_CONNECTIONSTRING` to enable the Event Hubs sink. The proxy emits the same JSON envelope it writes to the file log, so every token field is present in the event body. - -**Capture to a storage account:** Enable the Event Hubs Capture feature on the hub to land events as Avro files in Azure Blob Storage automatically. This gives you a durable, queryable archive without building a consumer. - -**Query with ADX (Azure Data Explorer):** Connect ADX to the hub or to the captured Blob Storage container using an external table or continuous ingestion. Once the data is in ADX you can run the equivalent chargeback query: - -```kusto -ProxyEvents -| where UsageTotalTokens > 0 -| summarize - Requests = count(), - TotalTokens = sum(UsageTotalTokens), - PromptTokens = sum(UsagePromptTokens), - CompTokens = sum(UsageCompletionTokens) - by UserId, Priority -| order by TotalTokens desc -``` - -Other tools that work directly with Event Hubs or Blob-captured data include Fabric Real-Time Intelligence, Stream Analytics, and Azure Synapse. - -
- -
-Local File - -If `EVENT_LOGGERS=file` (the default), token data appears in `eventslog.json` immediately — no ingestion delay. Useful for a quick sanity check before querying Application Insights. - -If the proxy is deployed to Azure Container Apps, the file is written inside the container. Use the ACA console to inspect it: in the Azure portal, open the container app → **Containers** → **Console**, then run the `jq` command below. For a more durable setup, ACA supports mounting an Azure Files share as a volume — configure a storage mount in the container app and set the proxy's working directory to that path so `eventslog.json` persists across container restarts. - -```bash -cat eventslog.json | jq 'select(."Usage.Total_Tokens" != null) | {"user": .userId, "total": ."Usage.Total_Tokens", "backend": .BackendHost}' -``` - -Expected output per OpenAI request: -```json -{ - "user": "alice", - "total": "1058", - "backend": "https://.azurewebsites.net" -} -``` - -
- -
-None - -If `EVENT_LOGGERS` is not set or is set to an empty value, telemetry is turned off and nothing is captured. Requests are still proxied normally, but no token data, request events, or usage metrics are written anywhere. Set at least one logger before running this POC. - -
- ---- - -## Tuning and Further Exploration - -Once the basic data is confirmed, a few variations are worth trying: - -
-Stream Analytics + Power BI dashboard - -With `EVENT_LOGGERS=eventhub`, every request event lands in the hub in real time. Connect an Azure Stream Analytics job to the hub and project the token fields into an output — a Power BI streaming dataset works well here. You can build a live dashboard showing token consumption by user, priority tier, and backend, updating as requests arrive. This is the closest thing to a real-time chargeback view without any custom code. - -For a batch approach, use the Event Hubs Capture output (Avro files in Blob Storage) as a Power BI dataflow source or import it into a Fabric lakehouse for scheduled reporting. - -
- -
-Add a second backend by tier - -Use `acceptablePriorities` to route priority-1 to a "premium" backend and priority-3 to a "standard" one. The `BackendHost` dimension in telemetry then lets you split cost by tier automatically in any of the queries above. - -
- -
-Increase concurrency - -Raise `Workers` and send a larger burst. Watch `eventslog.json` — every line should have a `Usage.Total_Tokens` entry. Missing entries indicate the stream was closed before the final usage chunk arrived (rare with the simulator; common if a real backend is configured without `processor=OpenAI`). - -
- ---- - -## Related Documentation - -- [POC-Priority-configuration.md](POC-Priority-configuration.md) — Routing requests across backends by priority tier -- [POC-Failover-configuration.md](POC-Failover-configuration.md) — Automatic failover and retry behaviour when a backend is slow or unavailable -- [OBSERVABILITY.md](OBSERVABILITY.md) — Token metrics, telemetry channels, and event logger configuration -- [BACKEND_HOSTS.md](BACKEND_HOSTS.md) — `processor=` and other host connection string options +# POC: Token-Level Chargeback + +## Overview + +This POC shows how SimpleL7Proxy captures token usage from streaming Azure OpenAI responses and emits it as structured telemetry — without buffering the response or adding meaningful latency. Once the data is in Application Insights, a single KQL query can break down consumption by user, priority tier, or backend, giving you the raw numbers needed for internal chargeback or cost reporting. + +The goal is to verify that: + +1. Token counts (`prompt_tokens`, `completion_tokens`, `total_tokens`) are extracted from the SSE stream and appear in Application Insights custom dimensions. +2. The `userId` header flows through to telemetry, so consumption can be attributed to an individual caller. +3. A KQL query can aggregate total tokens per user over a time window. + +The LLM Simulator covers all three cases. Its sample files return real `usage` blocks in the same format Azure OpenAI uses, so the proxy's stream processor extracts and logs the same fields it would against a real endpoint. + +--- + +## How it works + +The proxy includes stream processors that read the SSE or JSON response stream on-the-fly and extract token usage without buffering the full response. 
The processor to use depends on the provider (LLM model): + +| Provider | `processor=` value | Usage fields logged | +|----------|--------------------|---------------------| +| Azure OpenAI / OpenAI | `OpenAI` | `Usage.Prompt_Tokens`, `Usage.Completion_Tokens`, `Usage.Total_Tokens` | +| Anthropic | `AllUsage-2` | `Usage.Input_Tokens`, `Usage.Output_Tokens` | +| Google Gemini | `MultiLineAllUsage` | `Usage.PromptTokenCount`, `Usage.CandidatesTokenCount`, `Usage.TotalTokenCount` | + +All processors attach their extracted values to the `ProxyEvent` for that request and write them to every configured telemetry sink — Application Insights, Event Hubs, and the local file logger all receive the same token fields. This POC focuses on Application Insights; for Event Hubs and the local file log, see the matching subsections under [Verifying the Data](#verifying-the-data) and the tuning section below. + +
+Custom dimensions emitted per request + +| Custom Dimension | Content | +|------------------|---------| +| `Usage.Prompt_Tokens` | Tokens consumed by the input prompt | +| `Usage.Completion_Tokens` | Tokens generated in the response | +| `Usage.Total_Tokens` | Sum of prompt + completion | +| `S7P_RequestId` | Unique request correlation ID | +| `S7P_Priority` | Priority queue the request was assigned to | +| `BackendHost` | Backend URL that served the request | + +
+ +The `userId` header value is forwarded to the backend and appears in request telemetry, enabling per-user attribution. + +> **Note:** Token extraction requires the appropriate `processor=` value on the backend host configuration (see table above). Without it, the proxy forwards the stream transparently but does not parse usage. + +--- + +## Prerequisites + +- SimpleL7Proxy running locally or on ACA, pointed at the LLM Simulator. +- `processor=OpenAI` set on the backend host (see [Backend Configuration](#backend-configuration) below). +- Application Insights connected via `APPINSIGHTS_CONNECTIONSTRING`. +- The LLM Simulator deployed as an Azure Function. See [`test/LLMSimulator/Readme.md`](../test/LLMSimulator/Readme.md) — the fastest path is the portal ZIP deploy. Verify it's up: + ```bash + curl https://.azurewebsites.net/api/health + # → 200 OK + ``` + +--- + +## Backend Configuration + +
+Direct backend + +Set one `Host` environment variable per provider. The `path=` prefix tells the proxy which incoming URL paths belong to that host, `processor=` selects the right token extractor, and `mode=direct` disables health probing (appropriate for Azure Functions which scale to zero). Only configure the hosts you actually need — all three are not required. + +```bash +# Azure OpenAI — handles requests to /openai/... +export Host1="host=https://.azurewebsites.net;mode=direct;path=/openai;processor=OpenAI" + +# Anthropic — handles requests to /anthropic/... +export Host2="host=https://.azurewebsites.net;mode=direct;path=/anthropic;processor=AllUsage-2" + +# Google Gemini — handles requests to /v1beta/... +export Host3="host=https://.azurewebsites.net;mode=direct;path=/v1beta;processor=MultiLineAllUsage" +``` + +
+ +
+APIM + +With APIM the processor is not set in the host config. Instead, APIM returns it as the `TOKENPROCESSOR` response header. The proxy reads that header from each `200 OK` response and selects the processor dynamically — useful when a single APIM gateway fronts multiple models and the policy knows which backend was actually called. + +The included policy already does this in its `<outbound>` block: + +```xml +<outbound> +    <set-header name="TOKENPROCESSOR" exists-action="override"> +        <value>MultiLineAllUsage</value> +    </set-header> +    ... +</outbound> +``` + +Change the value to match the model family the policy is routing to (`OpenAI`, `AllUsage-2`, or `MultiLineAllUsage`). If the policy routes to multiple providers, use a policy expression to set it conditionally based on whichever backend was selected. + +The host config does not need a `processor=` value — the header overrides it at runtime. Use `mode=apim` with a probe path so the proxy health-checks the gateway: + +```bash +export Host1="host=https://<your-apim-name>.azure-api.net;mode=apim;probe=/status-0123456789abcdef" +``` + +
+ +```bash +export APPINSIGHTS_CONNECTIONSTRING="InstrumentationKey=..." +export Workers=5 +dotnet run --project src/SimpleL7Proxy +``` + +--- + +## Sending Test Requests + +The simulator returns deterministic responses with realistic `usage` blocks — the same JSON structure the real providers return. Because the token counts are fixed, you can verify telemetry exactly: if the KQL query shows 1058 total tokens for an OpenAI call, the stream processor is working correctly end-to-end. Send at least a few requests across two different `userId` values so the chargeback query has something meaningful to aggregate. + +
+curl commands + +**Azure OpenAI (`processor=OpenAI`) — 58 prompt / 1000 completion / 1058 total:** +```bash +curl -i \ + -H "userId: alice" \ + -H "Content-Type: application/json" \ + -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hello"}],"stream":true}' \ + "http://localhost:8000/openai/deployments/gpt-4o-mini/chat/completions" +``` + +**Anthropic (`processor=AllUsage-2`) — 10 input / 35 output:** +```bash +curl -i \ + -H "userId: alice" \ + -H "Content-Type: application/json" \ + -d '{"model":"claude-sonnet-3-5","messages":[{"role":"user","content":"hello"}]}' \ + "http://localhost:8000/anthropic/v1/messages" +``` + +**Gemini (`processor=MultiLineAllUsage`) — 6 prompt / 19 candidates / 1465 total (includes thinking tokens):** +```bash +curl -i \ + -H "userId: alice" \ + -H "Content-Type: application/json" \ + -d '{"contents":[{"role":"user","parts":[{"text":"hello"}]}]}' \ + "http://localhost:8000/v1beta/models/gemini-2.5-pro:generateContent" +``` + +**Batch — two users, multiple requests (OpenAI):** +```bash +for i in {1..5}; do + curl -s -o /dev/null \ + -H "userId: alice" \ + -H "Content-Type: application/json" \ + -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hello"}],"stream":true}' \ + "http://localhost:8000/openai/deployments/gpt-4o-mini/chat/completions" & +done + +for i in {1..3}; do + curl -s -o /dev/null \ + -H "userId: bob" \ + -H "Content-Type: application/json" \ + -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hello"}],"stream":true}' \ + "http://localhost:8000/openai/deployments/gpt-4o-mini/chat/completions" & +done + +wait +echo "Done" +``` + +
+ +After a few seconds, the events will appear in Application Insights. + +--- + +## Verifying the Data + +Now that the proxy is running and sample requests have been sent, it's time to verify that token data is flowing through correctly. The proxy writes a `customEvent` to Application Insights for every completed request, with token counts in `customDimensions`. Because the simulator returns fixed token counts, the numbers are deterministic — if the results match the expected values below, the full pipeline (stream parsing → telemetry emission → ingestion) is confirmed working. + +The field names vary by provider: Azure OpenAI uses `Usage.Prompt_Tokens` / `Usage.Completion_Tokens` / `Usage.Total_Tokens`; Anthropic uses `Usage.Input_Tokens` / `Usage.Output_Tokens`; Gemini uses `Usage.PromptTokenCount` / `Usage.CandidatesTokenCount` / `Usage.TotalTokenCount`. The queries and log checks below use the OpenAI fields — adapt the field names if you're testing a different provider. + +
+Application Insights + +Open **Logs** in your Application Insights resource and run the query below (if you query the linked Log Analytics workspace directly instead, the table is `AppEvents` and the dimensions are in `Properties`): + +```kusto +customEvents +| where timestamp > ago(1h) +| where customDimensions contains "Usage.Total_Tokens" +| project + timestamp, + UserId = tostring(customDimensions["userId"]), + Priority = tostring(customDimensions["S7P_Priority"]), + Backend = tostring(customDimensions["BackendHost"]), + PromptTokens = toint(customDimensions["Usage.Prompt_Tokens"]), + CompTokens = toint(customDimensions["Usage.Completion_Tokens"]), + TotalTokens = toint(customDimensions["Usage.Total_Tokens"]) +| summarize + Requests = count(), + TotalTokens = sum(TotalTokens), + PromptTokens = sum(PromptTokens), + CompTokens = sum(CompTokens) + by UserId, Priority +| order by TotalTokens desc +``` + +Expected result for the batch above (simulator returns 1058 tokens per call): + +| UserId | Priority | Requests | TotalTokens | PromptTokens | CompTokens | +|--------|----------|----------|-------------|--------------|------------| +| alice | 1 | 5 | 5290 | 290 | 5000 | +| bob | 1 | 3 | 3174 | 174 | 3000 | + +To break down by backend — useful when multiple deployments serve different tiers: + +```kusto +customEvents +| where timestamp > ago(1h) +| where customDimensions contains "Usage.Total_Tokens" +| summarize + TotalTokens = sum(toint(customDimensions["Usage.Total_Tokens"])), + Requests = count() + by UserId = tostring(customDimensions["userId"]), + Backend = tostring(customDimensions["BackendHost"]) +| order by TotalTokens desc +``` + +
+ +
+Event Hubs + +Set `EVENT_LOGGERS=eventhub` and `EVENTHUB_CONNECTIONSTRING` to enable the Event Hubs sink. The proxy emits the same JSON envelope it writes to the file log, so every token field is present in the event body. + +**Capture to a storage account:** Enable the Event Hubs Capture feature on the hub to land events as Avro files in Azure Blob Storage automatically. This gives you a durable, queryable archive without building a consumer. + +**Query with ADX (Azure Data Explorer):** Connect ADX to the hub or to the captured Blob Storage container using an external table or continuous ingestion. Once the data is in ADX you can run the equivalent chargeback query: + +```kusto +ProxyEvents +| where UsageTotalTokens > 0 +| summarize + Requests = count(), + TotalTokens = sum(UsageTotalTokens), + PromptTokens = sum(UsagePromptTokens), + CompTokens = sum(UsageCompletionTokens) + by UserId, Priority +| order by TotalTokens desc +``` + +Other tools that work directly with Event Hubs or Blob-captured data include Fabric Real-Time Intelligence, Stream Analytics, and Azure Synapse. + +
+ +
+Local File + +If `EVENT_LOGGERS=file` (the default), token data appears in `eventslog.json` immediately — no ingestion delay. Useful for a quick sanity check before querying Application Insights. + +If the proxy is deployed to Azure Container Apps, the file is written inside the container. Use the ACA console to inspect it: in the Azure portal, open the container app → **Containers** → **Console**, then run the `jq` command below. For a more durable setup, ACA supports mounting an Azure Files share as a volume — configure a storage mount in the container app and set the proxy's working directory to that path so `eventslog.json` persists across container restarts. + +```bash +cat eventslog.json | jq 'select(."Usage.Total_Tokens" != null) | {"user": .userId, "total": ."Usage.Total_Tokens", "backend": .BackendHost}' +``` + +Expected output per OpenAI request: +```json +{ + "user": "alice", + "total": "1058", + "backend": "https://.azurewebsites.net" +} +``` + +
+ +
+None + +If `EVENT_LOGGERS` is not set or is set to an empty value, telemetry is turned off and nothing is captured. Requests are still proxied normally, but no token data, request events, or usage metrics are written anywhere. Set at least one logger before running this POC. + +
+ +--- + +## Tuning and Further Exploration + +Once the basic data is confirmed, a few variations are worth trying: + +
+Stream Analytics + Power BI dashboard + +With `EVENT_LOGGERS=eventhub`, every request event lands in the hub in real time. Connect an Azure Stream Analytics job to the hub and project the token fields into an output — a Power BI streaming dataset works well here. You can build a live dashboard showing token consumption by user, priority tier, and backend, updating as requests arrive. This is the closest thing to a real-time chargeback view without any custom code. + +For a batch approach, use the Event Hubs Capture output (Avro files in Blob Storage) as a Power BI dataflow source or import it into a Fabric lakehouse for scheduled reporting. + +
+ +
+Add a second backend by tier + +Use `acceptablePriorities` to route priority-1 to a "premium" backend and priority-3 to a "standard" one. The `BackendHost` dimension in telemetry then lets you split cost by tier automatically in any of the queries above. + +
+ +
+Increase concurrency + +Raise `Workers` and send a larger burst. Watch `eventslog.json` — every line for an OpenAI request should have a `Usage.Total_Tokens` entry. A missing entry means either that the stream was closed before the final usage chunk arrived (rare with the simulator) or that the backend is configured without a matching `processor=` value, in which case the proxy forwards the stream but never parses usage. + +
+ +--- + +## Related Documentation + +- [POC-Priority-configuration.md](POC-Priority-configuration.md) — Routing requests across backends by priority tier +- [POC-Failover-configuration.md](POC-Failover-configuration.md) — Automatic failover and retry behaviour when a backend is slow or unavailable +- [OBSERVABILITY.md](OBSERVABILITY.md) — Token metrics, telemetry channels, and event logger configuration +- [BACKEND_HOSTS.md](BACKEND_HOSTS.md) — `processor=` and other host connection string options diff --git a/docs/POC-Failover-configuration.md b/docs/POC-Failover-configuration.md index f033a26..bb919ac 100644 --- a/docs/POC-Failover-configuration.md +++ b/docs/POC-Failover-configuration.md @@ -1,472 +1,472 @@ -# POC: Failover Configuration - -**Purpose:** Show that when the primary backend returns a simulated `429`, the `Priority-with-retry-enhancedLog.xml` APIM policy marks it throttled and retries the same request against a healthy backend that returns a real OpenAI-style response. - -> [!NOTE] -> **Policy version:** This POC uses [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml). The older [`APIM-Policy/v2.0.1/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.0.1/Priority-with-retry-enhancedLog.xml) does not combine `url + path` the same way and will not produce the `backendLog` entries shown below. - -> [!IMPORTANT] -> **The rule: when Backend A returns `429`, APIM marks it throttled for `Retry-After + 2s`, retries the request against the next healthy backend, and the client still sees `200 OK`.** - -## TL;DR (< 5 minutes) - -1. Apply [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml) to your APIM API and use the exact two `listBackends` entries below. -2. Keep `retryCount: 2` so the policy has one failed attempt and one recovery attempt. -3. 
Send one OpenAI Responses request through APIM, for example `POST https://.azure-api.net//v1/responses`. - -**Expected outcome:** `200 OK`, `x-Backend-Attempts: 2`, and `backendLog` shows `Fail-429-1` throttled before `PAYGO` succeeds. - -## What you will observe - -- Request #1 is fast, shows `x-Backend-Attempts: 2`, and returns a successful response from `PAYGO`. -- Request #2 sent within the cool-down window is also successful, but shows `x-Backend-Attempts: 1` because `Fail-429-1` is skipped. -- Request #3 sent after the cool-down expires shows `x-Backend-Attempts: 2` again because `Fail-429-1` is retried and throttled again. -- The client does not see `429`; the failover stays inside APIM. - -## Reference - -
-Settings, values, units, and when each takes effect - -| Setting | Value in this POC | Unit | Set in | Takes effect | -| :--- | :--- | :--- | :--- | :--- | -| Backend A `url` | `https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/error/429` | URL | `listBackends` | after policy save | -| Backend A `path` | `/api` | path segment | `listBackends` | after policy save | -| Backend A effective URL | `.../api/error/429/api` | URL | policy normalization (`url + path`) | after policy save | -| Backend B `url` | `https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api` | URL | `listBackends` | after policy save | -| Backend B `path` | `openai` | path segment | `listBackends` | after policy save | -| Backend B effective URL | `.../api/openai` | URL | policy normalization (`url + path`) | after policy save | -| Backend B `timeout` | `20` | seconds | `listBackends` | after policy save | -| Backend A `timeout` | default `10` | seconds | policy default when omitted | after policy save | -| `429` cool-down | `Retry-After + 2` | seconds | parsed from backend response | per request | -| Simulator default `Retry-After` | `10` | seconds | `/api/error/429` default | per request | -| Effective cool-down in this POC | `12` | seconds | policy logic | per request | -| `retryCount` | `2` | attempts | `priorityCfg` | after policy save | -| Default request priority | `3` | level | policy default when header absent | per request | -| `limitConcurrency` | default `off` | mode | policy default when omitted | after policy save | -| `bufferResponse` | default `true` | boolean | policy default when omitted | after policy save | - -> [!NOTE] -> **Units used in this doc:** timeouts and cool-downs are in seconds. The policy combines `url + path` once during backend normalization and logs the combined URL in `backendLog`. - -
- -## Setup - -### Minimal prerequisites - -**What matters:** this POC uses one APIM API and one deployed LLM Simulator function; no extra infrastructure is required. - -- An APIM instance with the retry policy applied to the target API. -- The LLM Simulator function deployed at `https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net`. -- An APIM frontend route that forwards an OpenAI-style request path. The example below uses `/v1/responses`. -- Managed identity enabled for APIM if you want the config to stay identical to real Azure OpenAI backends. - -Endpoints used in this POC: - -- `GET|POST /api/error/429` returns `429` immediately and sets `Retry-After` to `10` seconds by default. -- `POST /api/openai/v1/responses` returns a real OpenAI-style response from the simulator. - -> [!NOTE] -> The simulator accepts anonymous requests. Keeping `auth: "MI"` is still useful here because it matches the production Azure OpenAI configuration shape. - -### Apply the policy - -**What matters:** apply the policy at the API level on **All operations**, not at product or global scope. Use the Azure portal for this POC; the CLI form is provided only as an automation alternative. - -#### Azure portal (recommended) - -1. Open your APIM instance in the [Azure portal](https://portal.azure.com). -2. Select **APIs** and open the target API. -3. Select **All operations**. -4. Open the **Inbound processing** policy editor (`` icon). -5. Replace the editor contents with [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml). -6. Select **Save**. - -
-Azure CLI alternative - -```bash -az apim api policy create \ - --resource-group \ - --service-name \ - --api-id \ - --value "$(cat APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml)" \ - --format xml -``` - -
- -### Configure `listBackends` - -**What matters:** Backend A must return `429` immediately and Backend B must return a normal OpenAI-style response. - -Use these exact backend entries: - -```xml - -``` - -After normalization, the policy logs these effective backend URLs: - -- `Fail-429-1` -> `https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/error/429/api` -- `PAYGO` -> `https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai` - -> [!WARNING] -> **Do not add `/openai` a second time to the backend `url`.** In v2.1+, the policy combines `url + path` once and uses that combined value as the backend base URL. - -### Configure `priorityCfg` - -**What matters:** `retryCount` must be `2` or higher so the policy can spend one cycle on the `429` and one cycle on recovery. - -```xml - -``` - -> [!TIP] -> You do not need to send `llm_proxy_priority` for this POC. Both backends accept priorities `1`, `2`, and `3`, so the default priority still exercises failover. - -## Run - -**What matters:** send one OpenAI-style request through APIM and let the policy route the same request across both backends. - -This example uses the real SimpleL7Proxy frontend route `/resp/v1/responses` on the deployed Container App. The backend entry contributes `/openai` via `path: "openai"`, so the effective backend path becomes `/api/openai/v1/responses`. - -```bash -curl -i \ - -H "Content-Type: application/json" \ - -d '{"input":"hi","model":"gpt-5-nano","max_output_tokens":500}' \ - "https://simplel7dev.wittybeach-67bb528b.eastus.azurecontainerapps.io/resp/v1/responses" -``` - -### How the URL is rewritten end-to-end - -**What matters:** the `/v1/responses` path the client sends and the `/api/openai/v1/responses` path the simulator receives are the same suffix; APIM strips its API prefix and the backend entry prepends its base path. - -| Hop | Receives | Transformation | Forwards | -| :--- | :--- | :--- | :--- | -| 1. 
`curl` | n/a | n/a | `POST https://simplel7dev.wittybeach-67bb528b.eastus.azurecontainerapps.io/resp/v1/responses` | -| 2. SimpleL7Proxy (Container App) | `POST /resp/v1/responses` | Pass-through to APIM | `POST https://.azure-api.net/resp/v1/responses` | -| 3. APIM API (suffix `/resp`) | `POST /resp/v1/responses` | Strips the API suffix `/resp`, leaving operation path `/v1/responses` | `POST /v1/responses` into the policy | -| 4. Retry policy + `PAYGO` backend | operation path `/v1/responses` | Prepends the normalized backend base `https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai` (from `url + path`) | `POST https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai/v1/responses` | -| 5. LLM Simulator function | `POST /api/openai/v1/responses` | Returns the OpenAI-style JSON | `200 OK` back up the chain | - -Visually: - -```text -client: POST .../resp/v1/responses - | - v -proxy: POST https://.azure-api.net/resp/v1/responses - | (APIM API suffix = /resp -> stripped) - v -APIM: operation path = /v1/responses - | (PAYGO base = .../api/openai , from url + path) - v -backend: POST https://simplel7fn-...azurewebsites.net/api/openai/v1/responses -``` - -> [!NOTE] -> If your APIM API already exposes `/openai` in the public route, keep that existing frontend route. The key check is the effective backend URL in `backendLog`: `PAYGO` should still resolve to `.../api/openai/...`, not `.../api/openai/openai/...`. - -## Verify - -**What matters:** use the headers and `backendLog` to confirm each stage of the failover state machine. - -- [ ] `200 OK` means APIM hid the backend `429` from the client. -- [ ] `x-Backend-Attempts: 2` means cycle 1 hit `Fail-429-1` and cycle 2 hit `PAYGO`. -- [ ] `x-PolicyCycleCounter: 2` means the retry loop executed exactly two cycles. -- [ ] `backendLog` contains `Using Fail-429-1 URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/error/429/api`. 
-- [ ] `backendLog` contains `THROTTLED: Fail-429-1 Retry-After: 00:12` (that is, `Retry-After (10s) + 2s` safety buffer). -- [ ] `backendLog` contains `Using PAYGO URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai`. -- [ ] `backendLog` ends with `CALL SUCCESSFUL`, which means the second attempt completed normally. -- [ ] `x-backend-affinity` maps to the `PAYGO` backend hash, which confirms the recovery backend served the response. - -### Verify the cool-down window - -**What matters:** the next two requests confirm the `Retry-After + 2s` throttle window is actually enforced. - -1. Send request #1. Expect `x-Backend-Attempts: 2` and a `PAYGO` response. -2. Send request #2 within 12 seconds. Expect `x-Backend-Attempts: 1` and no new `Fail-429-1` throttle event. -3. Wait more than 12 seconds and send request #3. Expect `x-Backend-Attempts: 2` again because `Fail-429-1` is retried. - -## Deep dive - -**What matters:** this POC is a two-cycle loop: pick the `429` backend, classify the `429`, mark it throttled, then retry against the healthy backend. - -### Full request flow - -```mermaid -flowchart LR - C[Client request] --> P[APIM retry policy] - P --> A[Backend A\nFail-429-1\n/api/error/429/api] - A -->|429 + Retry-After: 10| F[Classify temporary error] - F --> T[Mark Backend A throttled\n12s window] - T --> R[Retry loop reselects backend] - R --> B[Backend B\nPAYGO\n/api/openai] - B --> S[200 OK] - S --> O[Client receives success] -``` - -### Worked example - -**What matters:** the numbers below show why the first request succeeds even though the first backend fails immediately. - -| Step | Time | What happens | What it shows | -| :--- | :--- | :--- | :--- | -| 1 | `t=0.0s` | Policy selects `Fail-429-1` because it has `priorityGroup=1` and is not throttled. | Primary path wins first. | -| 2 | `t=0.1s` | Backend A returns `429` with `Retry-After: 10`. | The failure is temporary, not permanent. 
| -| 3 | `t=0.1s` | Policy calculates `10 + 2 = 12s` and marks Backend A throttled. | APIM enters the cool-down state. | -| 4 | `t=0.1s` | Retry loop runs again and selects `PAYGO`. | Recovery happens inside the same client request. | -| 5 | `t=0.3s` | Backend B returns `200 OK`. | The healthy backend is serving correctly. | -| 6 | `t=0.3s` | Response headers show `x-Backend-Attempts: 2` and `PAYGO` affinity. | The client can confirm failover from headers alone. | -| 7 | `t=5s` | A second request inside the 12-second window shows `x-Backend-Attempts: 1`. | Backend A is skipped while throttled. | -| 8 | `t=13s` | A third request shows `x-Backend-Attempts: 2` again. | The cool-down expired and Backend A re-entered selection. | - -### Reading `backendLog` from a real call - -**What matters:** every `|`-separated entry in `backendLog` is ` `. Read it top-to-bottom as a state machine: the policy lists currently throttled backends, picks an index, calls it, classifies the result, optionally throttles it, then either succeeds or starts another cycle. - -#### Call 1 - cold start, real failover (`x-Backend-Attempts: 2`) - -The `curl` command from the [Run](#run) section was sent to `https://simplel7dev.wittybeach-67bb528b.eastus.azurecontainerapps.io/resp/v1/responses`. APIM returned `200 OK` and included the `backendlog` response header recorded below. Each `|`-separated entry is ` `; the full HTTP response is in the [Full HTTP response from Call 1](#full-http-response-from-call-1) section further down. 
- -The received `backendlog` header (re-formatted one entry per line for readability): - -```text -0.001s Begin -0.001s THROTTLED: (none) -0.001s RETRIES LEFT: 4 CYCLE: 1 INDEX: 0 -0.001s Using Fail-429-1 URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/error/429/api LIMIT: off -0.322s StatusCode: 429 - Temp Error -0.323s THROTTLED: Fail-429-1 Retry-After: 00:12 -0.323s CALL INCOMPLETE, Unthrottled Backends: 2 -1.323s RETRIES LEFT: 3 CYCLE: 2 INDEX: 2 -1.323s Using PAYGO URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai LIMIT: off -1.629s StatusCode: 200 - Success -1.629s CALL SUCCESSFUL -``` - -
-Line-by-line explanation (Call 1) - -| Time | Log entry | Meaning | -| :--- | :--- | :--- | -| `0.001s` | `Begin` | Policy entered the retry loop. | -| `0.001s` | `THROTTLED: (none)` | Throttled-backend list is empty; every backend is eligible. | -| `0.001s` | `RETRIES LEFT: 4 CYCLE: 1 INDEX: 0` | Cycle 1 picked `INDEX: 0` (`Fail-429-1`, the first entry in `listBackends`). `RETRIES LEFT: 4` is the remaining budget after the cycle-1 decrement. | -| `0.001s` | `Using Fail-429-1 URL: .../api/error/429/api LIMIT: off` | Effective URL is the normalized `url + path`. `LIMIT: off` means `limitConcurrency` is not enforced. | -| `0.322s` | `StatusCode: 429 - Temp Error` | Simulator answered in 321 ms; policy classified `429` as a temporary error (`isTempError = true`). | -| `0.323s` | `THROTTLED: Fail-429-1 Retry-After: 00:12` | `Fail-429-1` is throttled for `mm:ss = 00:12` = `Retry-After (10s)` + `2s` safety buffer. | -| `0.323s` | `CALL INCOMPLETE, Unthrottled Backends: 2` | Call has not succeeded yet; two backends are still eligible (`FAIL-429-2` at index 1, `PAYGO` at index 2). | -| `1.323s` | `RETRIES LEFT: 3 CYCLE: 2 INDEX: 2` | 1 s later the policy starts cycle 2 and selects `INDEX: 2` (`PAYGO`). Index 1 is skipped because it is also a 429 backend in this deployment. | -| `1.323s` | `Using PAYGO URL: .../api/openai LIMIT: off` | PAYGO's effective URL after normalization. | -| `1.629s` | `StatusCode: 200 - Success` | PAYGO returned in 306 ms. | -| `1.629s` | `CALL SUCCESSFUL` | End of the retry loop; client sees `200 OK` after ~1.63 s. | - -The `x-backend-attempts: 2` and `x-policycyclecounter: 2` headers in the response match `CYCLE: 2` and the two `Using ...` lines. - -
- -
-Full HTTP response from Call 1 - -```http -HTTP/2 200 -request-context: appId=cid-v1:d5a7cc01-2aaa-4e64-8e84-92457137e12c -x-sample-file: gpt5-nano-response.txt -tokenprocessor: MultiLineAllUsage -x-backend-affinity: 7d2da94ea51f53af9140 -x-backend-attempts: 2 -x-policycyclecounter: 2 -backendlog: 0.001s Begin | 0.001s THROTTLED: (none) | 0.001s RETRIES LEFT: 4 CYCLE: 1 INDEX: 0 | 0.001s Using Fail-429-1 URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/error/429/api LIMIT: off | 0.322s StatusCode: 429 - Temp Error | 0.323s THROTTLED: Fail-429-1 Retry-After: 00:12 | 0.323s CALL INCOMPLETE, Unthrottled Backends: 2 | 1.323s RETRIES LEFT: 3 CYCLE: 2 INDEX: 2 | 1.323s Using PAYGO URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai LIMIT: off | 1.629s StatusCode: 200 - Success | 1.629s CALL SUCCESSFUL -x-policy-lasterror: [{"code":"init","message":"noError"}] -content-type: text/plain; charset=utf-8 -server: Microsoft-NetCore/2.0 -date: Tue, 19 May 2026 17:50:56 GMT -``` - -```json -{ - "id": "resp_68a8ac23029c81a0ac7cdc747825224603cd1485deb283f8", - "object": "response", - "created_at": 1755884579, - "status": "incomplete", - "background": false, - "content_filters": null, - "error": null, - "incomplete_details": { "reason": "max_output_tokens" }, - "instructions": null, - "max_output_tokens": 500, - "max_tool_calls": null, - "model": "gpt-5-nano", - "output": [ - { - "id": "rs_68a8ac23391481a0a7034bb6b2c6a6c403cd1485deb283f8", - "type": "reasoning", - "summary": [] - } - ], - "parallel_tool_calls": true, - "previous_response_id": null, - "prompt_cache_key": null, - "reasoning": { "effort": "high", "summary": null }, - "safety_identifier": null, - "service_tier": "default", - "store": true, - "temperature": 1.0, - "text": { "format": { "type": "text" } }, - "tool_choice": "auto", - "tools": [], - "top_p": 1.0, - "truncation": "disabled", - "usage": { - "input_tokens": 30, - "input_tokens_details": { "cached_tokens": 0 
}, - "output_tokens": 448, - "output_tokens_details": { "reasoning_tokens": 448 }, - "total_tokens": 478 - }, - "user": null, - "metadata": {} -} -``` - -
- -#### Call 2 - immediate retry inside the cool-down window (`x-Backend-Attempts: 1`) - -The same `curl` command was repeated ~7 seconds later, while `Fail-429-1` was still in its cool-down window. APIM again returned `200 OK`, but the `backendlog` header now shows a single-cycle path that skips the throttled backend entirely. - -The received `backendlog` header (re-formatted one entry per line): - -```text -0.001s Begin -0.001s THROTTLED: (Fail-429-1 - 00:05) -0.001s RETRIES LEFT: 4 CYCLE: 1 INDEX: 2 -0.001s Using PAYGO URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai LIMIT: off -0.085s StatusCode: 200 - Success -0.085s CALL SUCCESSFUL -``` - -
-Line-by-line explanation (Call 2) - -| Time | Log entry | Meaning | -| :--- | :--- | :--- | -| `0.001s` | `Begin` | Policy entered the retry loop. | -| `0.001s` | `THROTTLED: (Fail-429-1 - 00:05)` | Throttled-backend list contains `Fail-429-1` with `00:05` (5 s) remaining out of the original 12 s window. Format is `(
- -> [!TIP] -> If you re-run after the `00:12` window elapses, `Fail-429-1` returns to the eligible pool, gets picked at `INDEX: 0` again, fails with `429`, and the log pattern matches Call 1. - -## Optional variants - -### Change the throttle window - -**What matters:** the cool-down is `Retry-After + 2s`, and `Retry-After` itself can be changed two different ways without touching the APIM policy. - -#### Per-request override (query string) - -Change Backend A's URL to one of these: - -- `.../api/error/429?retryAfter=1` for a 3-second total cool-down. -- `.../api/error/429?retryAfter=30` for a 32-second total cool-down. - -#### Change the simulator default (app setting) - -The simulator reads the `ERROR429_RETRY_AFTER_DEFAULT` environment variable / Azure Functions app setting and falls back to `10` when it is unset or invalid. Updating it changes the default `Retry-After` for every `/api/error/429` request that does not pass `?retryAfter=`. - -Azure portal: - -1. Open the Function App (for example `simplel7fn-e8bscgd8h4adcjcs`). -2. Select **Settings** > **Environment variables** > **App settings**. -3. Add or edit `ERROR429_RETRY_AFTER_DEFAULT` and set the value in seconds (for example `5` or `30`). -4. Select **Apply** and confirm the restart. - -Azure CLI: - -```bash -az functionapp config appsettings set \ - --name \ - --resource-group \ - --settings ERROR429_RETRY_AFTER_DEFAULT=5 -``` - -After the function restarts, the new cool-down is `ERROR429_RETRY_AFTER_DEFAULT + 2s` and the `backendLog` entry becomes `THROTTLED: Fail-429-1 Retry-After: 00:07` for the `5` example. - - -### Swap the healthy backend to a real Azure OpenAI resource - -**What matters:** keep the `429` simulator primary and replace only the `PAYGO` backend if you want to test the same failover flow against a live Azure OpenAI response. - -Replace Backend B with your Azure OpenAI endpoint and keep `priorityGroup: 2`, `timeout: 20`, and `auth: "MI"`. 
- -### Timeout variant - -**What matters:** if you want to document the timeout branch instead of the `429` branch, point Backend A at a slow or unreachable endpoint and keep Backend B unchanged. - -Expected differences: - -- `backendLog` shows `likely timeout` instead of `isTempError=true`. -- The cool-down is the hard-coded `10s` timeout window instead of `Retry-After + 2s`. - -## Troubleshooting - -**What matters:** each symptom below maps to one concrete cause and one concrete check. - -| Symptom | Likely cause | Check | -| :--- | :--- | :--- | -| Client still receives `429` | `retryCount` is too low or Backend B is unhealthy | Confirm `priorityCfg[*].retryCount >= 2` and test Backend B directly | -| First request shows `x-Backend-Attempts: 1` and succeeds | Backend A is still throttled from a previous run | Wait more than 12 seconds and run again | -| `backendLog` shows `.../api/openai/openai/...` | `/openai` appears in both the frontend route and the backend base URL | Remove one duplicate path segment and retest | -| `backendLog` does not show `retry-after=10` | Backend A is not the simulator `429` route you expect | Confirm Backend A URL is exactly `/api/error/429` or explicitly set `?retryAfter=` | -| `x-backend-affinity` does not map to `PAYGO` | Backend URLs changed after affinity hashes were computed | Re-save the policy so the hashes are recalculated | -| Managed identity call fails before the backend request | APIM does not have a valid token for the configured resource | Verify APIM managed identity configuration or temporarily use an empty `auth` value for the simulator-only test | - -## Related documentation - -- [POC-Priority-configuration.md](POC-Priority-configuration.md) - Routing requests across backends by priority tier -- [POC-OpenAI-Failover.md](POC-OpenAI-Failover.md) - Real Azure OpenAI PTU-to-PAYGO failover -- [BACKEND_HOSTS.md](BACKEND_HOSTS.md) - Host connection string options including `timeout` and `retryCount` -- 
[OBSERVABILITY.md](OBSERVABILITY.md) - Token metrics, telemetry channels, and event logger configuration +# POC: Failover Configuration + +**Purpose:** Show that when the primary backend returns a simulated `429`, the `Priority-with-retry-enhancedLog.xml` APIM policy marks it throttled and retries the same request against a healthy backend that returns a real OpenAI-style response. + +> [!NOTE] +> **Policy version:** This POC uses [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml). The older [`APIM-Policy/v2.0.1/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.0.1/Priority-with-retry-enhancedLog.xml) does not combine `url + path` the same way and will not produce the `backendLog` entries shown below. + +> [!IMPORTANT] +> **The rule: when Backend A returns `429`, APIM marks it throttled for `Retry-After + 2s`, retries the request against the next healthy backend, and the client still sees `200 OK`.** + +## TL;DR (< 5 minutes) + +1. Apply [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml) to your APIM API and use the exact two `listBackends` entries below. +2. Keep `retryCount: 2` so the policy has one failed attempt and one recovery attempt. +3. Send one OpenAI Responses request through APIM, for example `POST https://<apim-name>.azure-api.net/<api-suffix>/v1/responses`. + +**Expected outcome:** `200 OK`, `x-Backend-Attempts: 2`, and `backendLog` shows `Fail-429-1` throttled before `PAYGO` succeeds. + +## What you will observe + +- Request #1 is fast, shows `x-Backend-Attempts: 2`, and returns a successful response from `PAYGO`. +- Request #2 sent within the cool-down window is also successful, but shows `x-Backend-Attempts: 1` because `Fail-429-1` is skipped. +- Request #3 sent after the cool-down expires shows `x-Backend-Attempts: 2` again because `Fail-429-1` is retried and throttled again. 
+- The client does not see `429`; the failover stays inside APIM. + +## Reference + +
+Settings, values, units, and when each takes effect + +| Setting | Value in this POC | Unit | Set in | Takes effect | +| :--- | :--- | :--- | :--- | :--- | +| Backend A `url` | `https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/error/429` | URL | `listBackends` | after policy save | +| Backend A `path` | `/api` | path segment | `listBackends` | after policy save | +| Backend A effective URL | `.../api/error/429/api` | URL | policy normalization (`url + path`) | after policy save | +| Backend B `url` | `https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api` | URL | `listBackends` | after policy save | +| Backend B `path` | `openai` | path segment | `listBackends` | after policy save | +| Backend B effective URL | `.../api/openai` | URL | policy normalization (`url + path`) | after policy save | +| Backend B `timeout` | `20` | seconds | `listBackends` | after policy save | +| Backend A `timeout` | default `10` | seconds | policy default when omitted | after policy save | +| `429` cool-down | `Retry-After + 2` | seconds | parsed from backend response | per request | +| Simulator default `Retry-After` | `10` | seconds | `/api/error/429` default | per request | +| Effective cool-down in this POC | `12` | seconds | policy logic | per request | +| `retryCount` | `2` | attempts | `priorityCfg` | after policy save | +| Default request priority | `3` | level | policy default when header absent | per request | +| `limitConcurrency` | default `off` | mode | policy default when omitted | after policy save | +| `bufferResponse` | default `true` | boolean | policy default when omitted | after policy save | + +> [!NOTE] +> **Units used in this doc:** timeouts and cool-downs are in seconds. The policy combines `url + path` once during backend normalization and logs the combined URL in `backendLog`. + +
+ +## Setup + +### Minimal prerequisites + +**What matters:** this POC uses one APIM API and one deployed LLM Simulator function; no extra infrastructure is required. + +- An APIM instance with the retry policy applied to the target API. +- The LLM Simulator function deployed at `https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net`. +- An APIM frontend route that forwards an OpenAI-style request path. The example below uses `/v1/responses`. +- Managed identity enabled for APIM if you want the config to stay identical to real Azure OpenAI backends. + +Endpoints used in this POC: + +- `GET|POST /api/error/429` returns `429` immediately and sets `Retry-After` to `10` seconds by default. +- `POST /api/openai/v1/responses` returns a real OpenAI-style response from the simulator. + +> [!NOTE] +> The simulator accepts anonymous requests. Keeping `auth: "MI"` is still useful here because it matches the production Azure OpenAI configuration shape. + +### Apply the policy + +**What matters:** apply the policy at the API level on **All operations**, not at product or global scope. Use the Azure portal for this POC; the CLI form is provided only as an automation alternative. + +#### Azure portal (recommended) + +1. Open your APIM instance in the [Azure portal](https://portal.azure.com). +2. Select **APIs** and open the target API. +3. Select **All operations**. +4. Open the **Inbound processing** policy editor (`</>` icon). +5. Replace the editor contents with [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml). +6. Select **Save**. + +
+Azure CLI alternative + +```bash +az apim api policy create \ + --resource-group <resource-group> \ + --service-name <apim-name> \ + --api-id <api-id> \ + --value "$(cat APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml)" \ + --format xml +``` + +
+ +### Configure `listBackends` + +**What matters:** Backend A must return `429` immediately and Backend B must return a normal OpenAI-style response. + +Use these exact backend entries: + +```xml + +``` + +After normalization, the policy logs these effective backend URLs: + +- `Fail-429-1` -> `https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/error/429/api` +- `PAYGO` -> `https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai` + +> [!WARNING] +> **Do not add `/openai` a second time to the backend `url`.** In v2.1+, the policy combines `url + path` once and uses that combined value as the backend base URL. + +### Configure `priorityCfg` + +**What matters:** `retryCount` must be `2` or higher so the policy can spend one cycle on the `429` and one cycle on recovery. + +```xml + +``` + +> [!TIP] +> You do not need to send `llm_proxy_priority` for this POC. Both backends accept priorities `1`, `2`, and `3`, so the default priority still exercises failover. + +## Run + +**What matters:** send one OpenAI-style request through APIM and let the policy route the same request across both backends. + +This example uses the real SimpleL7Proxy frontend route `/resp/v1/responses` on the deployed Container App. The backend entry contributes `/openai` via `path: "openai"`, so the effective backend path becomes `/api/openai/v1/responses`. + +```bash +curl -i \ + -H "Content-Type: application/json" \ + -d '{"input":"hi","model":"gpt-5-nano","max_output_tokens":500}' \ + "https://simplel7dev.wittybeach-67bb528b.eastus.azurecontainerapps.io/resp/v1/responses" +``` + +### How the URL is rewritten end-to-end + +**What matters:** the `/resp/v1/responses` path the client sends and the `/api/openai/v1/responses` path the simulator receives share the same `/v1/responses` suffix; APIM strips its API suffix (`/resp`) and the backend entry prepends its base path (`/api/openai`). + +| Hop | Receives | Transformation | Forwards | +| :--- | :--- | :--- | :--- | +| 1. 
`curl` | n/a | n/a | `POST https://simplel7dev.wittybeach-67bb528b.eastus.azurecontainerapps.io/resp/v1/responses` | +| 2. SimpleL7Proxy (Container App) | `POST /resp/v1/responses` | Pass-through to APIM | `POST https://<apim-name>.azure-api.net/resp/v1/responses` | +| 3. APIM API (suffix `/resp`) | `POST /resp/v1/responses` | Strips the API suffix `/resp`, leaving operation path `/v1/responses` | `POST /v1/responses` into the policy | +| 4. Retry policy + `PAYGO` backend | operation path `/v1/responses` | Prepends the normalized backend base `https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai` (from `url + path`) | `POST https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai/v1/responses` | +| 5. LLM Simulator function | `POST /api/openai/v1/responses` | Returns the OpenAI-style JSON | `200 OK` back up the chain | + +Visually: + +```text +client: POST .../resp/v1/responses + | + v +proxy: POST https://<apim-name>.azure-api.net/resp/v1/responses + | (APIM API suffix = /resp -> stripped) + v +APIM: operation path = /v1/responses + | (PAYGO base = .../api/openai , from url + path) + v +backend: POST https://simplel7fn-...azurewebsites.net/api/openai/v1/responses +``` + +> [!NOTE] +> If your APIM API already exposes `/openai` in the public route, keep that existing frontend route. The key check is the effective backend URL in `backendLog`: `PAYGO` should still resolve to `.../api/openai/...`, not `.../api/openai/openai/...`. + +## Verify + +**What matters:** use the headers and `backendLog` to confirm each stage of the failover state machine. + +- [ ] `200 OK` means APIM hid the backend `429` from the client. +- [ ] `x-Backend-Attempts: 2` means cycle 1 hit `Fail-429-1` and cycle 2 hit `PAYGO`. +- [ ] `x-PolicyCycleCounter: 2` means the retry loop executed exactly two cycles. +- [ ] `backendLog` contains `Using Fail-429-1 URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/error/429/api`. 
+- [ ] `backendLog` contains `THROTTLED: Fail-429-1 Retry-After: 00:12` (that is, `Retry-After (10s) + 2s` safety buffer). +- [ ] `backendLog` contains `Using PAYGO URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai`. +- [ ] `backendLog` ends with `CALL SUCCESSFUL`, which means the second attempt completed normally. +- [ ] `x-backend-affinity` maps to the `PAYGO` backend hash, which confirms the recovery backend served the response. + +### Verify the cool-down window + +**What matters:** the next two requests confirm the `Retry-After + 2s` throttle window is actually enforced. + +1. Send request #1. Expect `x-Backend-Attempts: 2` and a `PAYGO` response. +2. Send request #2 within 12 seconds. Expect `x-Backend-Attempts: 1` and no new `Fail-429-1` throttle event. +3. Wait more than 12 seconds and send request #3. Expect `x-Backend-Attempts: 2` again because `Fail-429-1` is retried. + +## Deep dive + +**What matters:** this POC is a two-cycle loop: pick the `429` backend, classify the `429`, mark it throttled, then retry against the healthy backend. + +### Full request flow + +```mermaid +flowchart LR + C[Client request] --> P[APIM retry policy] + P --> A[Backend A\nFail-429-1\n/api/error/429/api] + A -->|429 + Retry-After: 10| F[Classify temporary error] + F --> T[Mark Backend A throttled\n12s window] + T --> R[Retry loop reselects backend] + R --> B[Backend B\nPAYGO\n/api/openai] + B --> S[200 OK] + S --> O[Client receives success] +``` + +### Worked example + +**What matters:** the numbers below show why the first request succeeds even though the first backend fails immediately. + +| Step | Time | What happens | What it shows | +| :--- | :--- | :--- | :--- | +| 1 | `t=0.0s` | Policy selects `Fail-429-1` because it has `priorityGroup=1` and is not throttled. | Primary path wins first. | +| 2 | `t=0.1s` | Backend A returns `429` with `Retry-After: 10`. | The failure is temporary, not permanent. 
| +| 3 | `t=0.1s` | Policy calculates `10 + 2 = 12s` and marks Backend A throttled. | APIM enters the cool-down state. | +| 4 | `t=0.1s` | Retry loop runs again and selects `PAYGO`. | Recovery happens inside the same client request. | +| 5 | `t=0.3s` | Backend B returns `200 OK`. | The healthy backend is serving correctly. | +| 6 | `t=0.3s` | Response headers show `x-Backend-Attempts: 2` and `PAYGO` affinity. | The client can confirm failover from headers alone. | +| 7 | `t=5s` | A second request inside the 12-second window shows `x-Backend-Attempts: 1`. | Backend A is skipped while throttled. | +| 8 | `t=13s` | A third request shows `x-Backend-Attempts: 2` again. | The cool-down expired and Backend A re-entered selection. | + +### Reading `backendLog` from a real call + +**What matters:** every `|`-separated entry in `backendLog` is `<elapsed> <message>`, where `<elapsed>` is the time in seconds since the policy started (for example `0.001s Begin`). Read it top-to-bottom as a state machine: the policy lists currently throttled backends, picks an index, calls it, classifies the result, optionally throttles it, then either succeeds or starts another cycle. + +#### Call 1 - cold start, real failover (`x-Backend-Attempts: 2`) + +The `curl` command from the [Run](#run) section was sent to `https://simplel7dev.wittybeach-67bb528b.eastus.azurecontainerapps.io/resp/v1/responses`. APIM returned `200 OK` and included the `backendlog` response header recorded below. Each `|`-separated entry is `<elapsed> <message>`; the full HTTP response is in the [Full HTTP response from Call 1](#full-http-response-from-call-1) section further down. 
+ +The received `backendlog` header (re-formatted one entry per line for readability): + +```text +0.001s Begin +0.001s THROTTLED: (none) +0.001s RETRIES LEFT: 4 CYCLE: 1 INDEX: 0 +0.001s Using Fail-429-1 URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/error/429/api LIMIT: off +0.322s StatusCode: 429 - Temp Error +0.323s THROTTLED: Fail-429-1 Retry-After: 00:12 +0.323s CALL INCOMPLETE, Unthrottled Backends: 2 +1.323s RETRIES LEFT: 3 CYCLE: 2 INDEX: 2 +1.323s Using PAYGO URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai LIMIT: off +1.629s StatusCode: 200 - Success +1.629s CALL SUCCESSFUL +``` + +
+Line-by-line explanation (Call 1) + +| Time | Log entry | Meaning | +| :--- | :--- | :--- | +| `0.001s` | `Begin` | Policy entered the retry loop. | +| `0.001s` | `THROTTLED: (none)` | Throttled-backend list is empty; every backend is eligible. | +| `0.001s` | `RETRIES LEFT: 4 CYCLE: 1 INDEX: 0` | Cycle 1 picked `INDEX: 0` (`Fail-429-1`, the first entry in `listBackends`). `RETRIES LEFT: 4` is the remaining budget after the cycle-1 decrement. | +| `0.001s` | `Using Fail-429-1 URL: .../api/error/429/api LIMIT: off` | Effective URL is the normalized `url + path`. `LIMIT: off` means `limitConcurrency` is not enforced. | +| `0.322s` | `StatusCode: 429 - Temp Error` | Simulator answered in 321 ms; policy classified `429` as a temporary error (`isTempError = true`). | +| `0.323s` | `THROTTLED: Fail-429-1 Retry-After: 00:12` | `Fail-429-1` is throttled for `mm:ss = 00:12` = `Retry-After (10s)` + `2s` safety buffer. | +| `0.323s` | `CALL INCOMPLETE, Unthrottled Backends: 2` | Call has not succeeded yet; two backends are still eligible (`FAIL-429-2` at index 1, `PAYGO` at index 2). | +| `1.323s` | `RETRIES LEFT: 3 CYCLE: 2 INDEX: 2` | 1 s later the policy starts cycle 2 and selects `INDEX: 2` (`PAYGO`). Index 1 is skipped because it is also a 429 backend in this deployment. | +| `1.323s` | `Using PAYGO URL: .../api/openai LIMIT: off` | PAYGO's effective URL after normalization. | +| `1.629s` | `StatusCode: 200 - Success` | PAYGO returned in 306 ms. | +| `1.629s` | `CALL SUCCESSFUL` | End of the retry loop; client sees `200 OK` after ~1.63 s. | + +The `x-backend-attempts: 2` and `x-policycyclecounter: 2` headers in the response match `CYCLE: 2` and the two `Using ...` lines. + +
+ +
+Full HTTP response from Call 1 + +```http +HTTP/2 200 +request-context: appId=cid-v1:d5a7cc01-2aaa-4e64-8e84-92457137e12c +x-sample-file: gpt5-nano-response.txt +tokenprocessor: MultiLineAllUsage +x-backend-affinity: 7d2da94ea51f53af9140 +x-backend-attempts: 2 +x-policycyclecounter: 2 +backendlog: 0.001s Begin | 0.001s THROTTLED: (none) | 0.001s RETRIES LEFT: 4 CYCLE: 1 INDEX: 0 | 0.001s Using Fail-429-1 URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/error/429/api LIMIT: off | 0.322s StatusCode: 429 - Temp Error | 0.323s THROTTLED: Fail-429-1 Retry-After: 00:12 | 0.323s CALL INCOMPLETE, Unthrottled Backends: 2 | 1.323s RETRIES LEFT: 3 CYCLE: 2 INDEX: 2 | 1.323s Using PAYGO URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai LIMIT: off | 1.629s StatusCode: 200 - Success | 1.629s CALL SUCCESSFUL +x-policy-lasterror: [{"code":"init","message":"noError"}] +content-type: text/plain; charset=utf-8 +server: Microsoft-NetCore/2.0 +date: Tue, 19 May 2026 17:50:56 GMT +``` + +```json +{ + "id": "resp_68a8ac23029c81a0ac7cdc747825224603cd1485deb283f8", + "object": "response", + "created_at": 1755884579, + "status": "incomplete", + "background": false, + "content_filters": null, + "error": null, + "incomplete_details": { "reason": "max_output_tokens" }, + "instructions": null, + "max_output_tokens": 500, + "max_tool_calls": null, + "model": "gpt-5-nano", + "output": [ + { + "id": "rs_68a8ac23391481a0a7034bb6b2c6a6c403cd1485deb283f8", + "type": "reasoning", + "summary": [] + } + ], + "parallel_tool_calls": true, + "previous_response_id": null, + "prompt_cache_key": null, + "reasoning": { "effort": "high", "summary": null }, + "safety_identifier": null, + "service_tier": "default", + "store": true, + "temperature": 1.0, + "text": { "format": { "type": "text" } }, + "tool_choice": "auto", + "tools": [], + "top_p": 1.0, + "truncation": "disabled", + "usage": { + "input_tokens": 30, + "input_tokens_details": { "cached_tokens": 0 
}, + "output_tokens": 448, + "output_tokens_details": { "reasoning_tokens": 448 }, + "total_tokens": 478 + }, + "user": null, + "metadata": {} +} +``` + +
+ +#### Call 2 - immediate retry inside the cool-down window (`x-Backend-Attempts: 1`) + +The same `curl` command was repeated ~7 seconds later, while `Fail-429-1` was still in its cool-down window. APIM again returned `200 OK`, but the `backendlog` header now shows a single-cycle path that skips the throttled backend entirely. + +The received `backendlog` header (re-formatted one entry per line): + +```text +0.001s Begin +0.001s THROTTLED: (Fail-429-1 - 00:05) +0.001s RETRIES LEFT: 4 CYCLE: 1 INDEX: 2 +0.001s Using PAYGO URL: https://simplel7fn-e8bscgd8h4adcjcs.westus-01.azurewebsites.net/api/openai LIMIT: off +0.085s StatusCode: 200 - Success +0.085s CALL SUCCESSFUL +``` + +
+Line-by-line explanation (Call 2) + +| Time | Log entry | Meaning | +| :--- | :--- | :--- | +| `0.001s` | `Begin` | Policy entered the retry loop. | +| `0.001s` | `THROTTLED: (Fail-429-1 - 00:05)` | Throttled-backend list contains `Fail-429-1` with `00:05` (5 s) remaining out of the original 12 s window. Format is `(
+ +> [!TIP] +> If you re-run after the `00:12` window elapses, `Fail-429-1` returns to the eligible pool, gets picked at `INDEX: 0` again, fails with `429`, and the log pattern matches Call 1. + +## Optional variants + +### Change the throttle window + +**What matters:** the cool-down is `Retry-After + 2s`, and `Retry-After` itself can be changed two different ways without touching the APIM policy. + +#### Per-request override (query string) + +Change Backend A's URL to one of these: + +- `.../api/error/429?retryAfter=1` for a 3-second total cool-down. +- `.../api/error/429?retryAfter=30` for a 32-second total cool-down. + +#### Change the simulator default (app setting) + +The simulator reads the `ERROR429_RETRY_AFTER_DEFAULT` environment variable / Azure Functions app setting and falls back to `10` when it is unset or invalid. Updating it changes the default `Retry-After` for every `/api/error/429` request that does not pass `?retryAfter=`. + +Azure portal: + +1. Open the Function App (for example `simplel7fn-e8bscgd8h4adcjcs`). +2. Select **Settings** > **Environment variables** > **App settings**. +3. Add or edit `ERROR429_RETRY_AFTER_DEFAULT` and set the value in seconds (for example `5` or `30`). +4. Select **Apply** and confirm the restart. + +Azure CLI: + +```bash +az functionapp config appsettings set \ + --name \ + --resource-group \ + --settings ERROR429_RETRY_AFTER_DEFAULT=5 +``` + +After the function restarts, the new cool-down is `ERROR429_RETRY_AFTER_DEFAULT + 2s` and the `backendLog` entry becomes `THROTTLED: Fail-429-1 Retry-After: 00:07` for the `5` example. + + +### Swap the healthy backend to a real Azure OpenAI resource + +**What matters:** keep the `429` simulator primary and replace only the `PAYGO` backend if you want to test the same failover flow against a live Azure OpenAI response. + +Replace Backend B with your Azure OpenAI endpoint and keep `priorityGroup: 2`, `timeout: 20`, and `auth: "MI"`. 
+ +### Timeout variant + +**What matters:** if you want to document the timeout branch instead of the `429` branch, point Backend A at a slow or unreachable endpoint and keep Backend B unchanged. + +Expected differences: + +- `backendLog` shows `likely timeout` instead of `StatusCode: 429 - Temp Error`. +- The cool-down is the hard-coded `10s` timeout window instead of `Retry-After + 2s`. + +## Troubleshooting + +**What matters:** each symptom below maps to one concrete cause and one concrete check. + +| Symptom | Likely cause | Check | +| :--- | :--- | :--- | +| Client still receives `429` | `retryCount` is too low or Backend B is unhealthy | Confirm `priorityCfg[*].retryCount >= 2` and test Backend B directly | +| First request shows `x-Backend-Attempts: 1` and succeeds | Backend A is still throttled from a previous run | Wait more than 12 seconds and run again | +| `backendLog` shows `.../api/openai/openai/...` | `/openai` appears in both the frontend route and the backend base URL | Remove one duplicate path segment and retest | +| `backendLog` does not show `THROTTLED: Fail-429-1 Retry-After: 00:12` (simulator default `Retry-After` of `10` + `2s` buffer) | Backend A is not the simulator `429` route you expect | Confirm Backend A URL is exactly `/api/error/429` or explicitly set `?retryAfter=` | +| `x-backend-affinity` does not map to `PAYGO` | Backend URLs changed after affinity hashes were computed | Re-save the policy so the hashes are recalculated | +| Managed identity call fails before the backend request | APIM does not have a valid token for the configured resource | Verify APIM managed identity configuration or temporarily use an empty `auth` value for the simulator-only test | + +## Related documentation + +- [POC-Priority-configuration.md](POC-Priority-configuration.md) - Routing requests across backends by priority tier +- [POC-OpenAI-Failover.md](POC-OpenAI-Failover.md) - Real Azure OpenAI PTU-to-PAYGO failover +- [BACKEND_HOSTS.md](BACKEND_HOSTS.md) - Host connection string options including `timeout` and `retryCount` +- 
[OBSERVABILITY.md](OBSERVABILITY.md) - Token metrics, telemetry channels, and event logger configuration diff --git a/docs/POC-OpenAI-Failover.md b/docs/POC-OpenAI-Failover.md index fd11adf..339598f 100644 --- a/docs/POC-OpenAI-Failover.md +++ b/docs/POC-OpenAI-Failover.md @@ -1,899 +1,899 @@ -# POC: Azure OpenAI Failover via APIM - -**Purpose:** Show that when a backend returns `429`, the APIM policy marks it as throttled, retries the same request against the next available backend, and the client still sees `200 OK`. Any combination of `PTU` and `PAYGO` backends can be used but for the purposes of this POC, we will a single PTU and a single PAYGO backed. - -> [!IMPORTANT] -> **The rule: PTU at `priorityGroup: 1` wins when healthy; when it returns `429`, APIM throttles it for `Retry-After + 2s` and the same request retries against PAYGO at `priorityGroup: 2`. The client never sees the `429`.** - -## TL;DR (< 5 minutes) - -1. Apply [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml) to your APIM API with two backends: PTU at `priorityGroup: 1` and PAYGO at `priorityGroup: 2`, both using the same deployment name. -2. Send one healthy request, then a burst to exhaust PTU quota, then one more request while PTU is throttled. -3. Read `x-Backend-Attempts`, `x-backend-affinity`, and `backendLog` to confirm PTU → PAYGO failover. - -**Expected outcomes:** healthy = `x-Backend-Attempts: 1`, PTU affinity; failover = `x-Backend-Attempts: 2`, PAYGO affinity; cool-down = `x-Backend-Attempts: 1`, PAYGO affinity (PTU skipped). - -## What you will observe - -- A healthy PTU request returns `200 OK` with `x-Backend-Attempts: 1` and the PTU affinity hash. -- When PTU is throttled, the same request still returns `200 OK` but with `x-Backend-Attempts: 2` and the PAYGO affinity hash. 
-- Requests sent during the `Retry-After + 2s` cool-down return `200 OK` with `x-Backend-Attempts: 1` and PAYGO affinity — PTU is skipped without being tried. -- After the cool-down expires, the next request returns `x-Backend-Attempts: 1` with PTU affinity again. -- If PTU is unreachable instead of throttled, `backendLog` shows `likely timeout` and a `10s` cool-down instead of a `Retry-After`-based window. - -## Reference - -
-Settings, values, units, and when each takes effect - -| Setting | Value in this POC | Unit | Set in | Takes effect | -| :--- | :--- | :--- | :--- | :--- | -| Policy file | [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml) | — | APIM API | after policy save | -| `priorityGroup` (PTU) | `1` | group | `listBackends` | after policy save | -| `priorityGroup` (PAYGO) | `2` | group | `listBackends` | after policy save | -| `timeout` | `30` | seconds | `listBackends` | after policy save | -| `retryCount` | `2` | attempts | `priorityCfg` | after policy save | -| `defaultRetryAfter` | `10` | seconds | `listBackends` per-backend | after policy save | -| `429` cool-down | `Retry-After + 2` | seconds | parsed from Azure OpenAI response | per request | -| Timeout cool-down | `10` (hard-coded) | seconds | policy logic | per request | -| `auth` | `"MI"` | — | `listBackends` | after policy save | -| `bufferResponse` | `true` | boolean | `listBackends` | after policy save | -| `limitConcurrency` | `off` | mode | `listBackends` | after policy save | -| Deployment name | same on both resources | — | Azure OpenAI | at resource creation | -| Reload behavior | policy save | — | APIM | after policy save | - -> [!NOTE] -> **Units used in this doc:** `timeout` and `defaultRetryAfter` are in seconds. `Retry-After` is in seconds as returned by Azure OpenAI. The policy adds `2s` to `Retry-After` before applying the cool-down window. - -
- -## Setup - -### Minimal prerequisites - -**What matters:** you need two Azure OpenAI resources with the same deployment name. The PTU resource needs low enough quota to trigger real `429` responses under load. - -- An APIM instance with [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml) applied at the API level. -- Two Azure OpenAI resources with the same deployment name, for example `gpt-4o-mini`. -- A PTU primary with capacity low enough to trigger `429` responses under the burst in step 2. -- A PAYGO secondary in a separate Azure OpenAI resource. -- One auth path (choose one): - - **Managed Identity:** APIM system-assigned identity has `Cognitive Services OpenAI User` on both AOAI resources. - - **API key:** keys from each Azure OpenAI resource. -- An APIM subscription key for the target API. - -> [!NOTE] -> Older policy versions used `api-key` instead of `auth`. The v2.1.0 policy reads `auth`; `api-key` is silently ignored. - -### Apply the policy - -**What matters:** apply the policy at the API level on **All operations**, not at product or global scope. - -#### Azure portal (recommended) - -1. Open your APIM instance in the [Azure portal](https://portal.azure.com). -2. Select **APIs** and open the target API. -3. Select **All operations**. -4. Open the **Inbound processing** policy editor (`` icon). -5. Replace the editor contents with [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml). -6. Select **Save**. - -If you are using Managed Identity: - -1. In APIM, open **Identity** and enable the system-assigned identity. -2. Open each Azure OpenAI resource. -3. Go to **Access control (IAM)**. -4. Add the `Cognitive Services OpenAI User` role assignment for the APIM managed identity. - -> [!NOTE] -> Role propagation can take a few minutes. If the first request returns `401 PermissionDenied`, wait and retry. - -
-Azure CLI alternative - -```bash -az apim api policy create \ - --resource-group \ - --service-name \ - --api-id \ - --value "$(cat APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml)" \ - --format xml -``` - -Enable Managed Identity and assign AOAI access: - -```bash -PRINCIPAL_ID=$(az apim update --resource-group --name \ - --set identity.type=SystemAssigned --query identity.principalId -o tsv) - -for AOAI in ; do - SCOPE=$(az cognitiveservices account show -g -n "$AOAI" --query id -o tsv) - az role assignment create --assignee "$PRINCIPAL_ID" \ - --role "Cognitive Services OpenAI User" --scope "$SCOPE" -done -``` - -
- -### Configure `listBackends` - -**What matters:** PTU must have a lower `priorityGroup` than PAYGO. Both backends must use the same deployment name so APIM can forward the same operation path to either one. - -```xml -.openai.azure.com/" }, - { "path", "" }, - { "priorityGroup", 1 }, - { "label", "PTU" }, - { "acceptablePriorities", new JArray(1,2,3) }, - { "limitConcurrency", "off" }, - { "bufferResponse", true }, - { "timeout", 30 }, - { "auth", "MI" } - }); - - backends.Add(new JObject() - { - { "url", "https://.openai.azure.com/" }, - { "path", "" }, - { "priorityGroup", 2 }, - { "label", "PAYGO" }, - { "acceptablePriorities", new JArray(1,2,3) }, - { "limitConcurrency", "off" }, - { "bufferResponse", true }, - { "timeout", 30 }, - { "auth", "MI" } - }); - -... -}" /> -``` - -> [!TIP] -> For API key auth, replace `"MI"` with a key value or a Named Value reference such as `{{aoai-ptu-key}}`. - -### Configure `priorityCfg` - -**What matters:** `retryCount: 2` gives the policy one cycle on PTU and one cycle on PAYGO for the same request. - -```xml - -``` - -> [!WARNING] -> If `retryCount` is too low, the client receives the PTU `429` instead of a PAYGO recovery. - -## Run - -**What matters:** run the four requests in order. Replace the placeholders once and reuse them for every step. - -```bash -BASE="https://.azure-api.net/" -KEY="" -DEP="" -VER="" -``` - -> [!TIP] -> Use `2024-02-01` as the `api-version` if you are unsure which one your resources support. - -### 1. Healthy PTU request - -```bash -curl -i \ - -H "Ocp-Apim-Subscription-Key: $KEY" \ - -H "Content-Type: application/json" \ - -d '{"messages":[{"role":"user","content":"Say hi in five words."}],"max_tokens":40}' \ - "$BASE/openai/deployments/$DEP/chat/completions?api-version=$VER" -``` - -Expected: `200 OK`, `x-Backend-Attempts: 1`, `x-backend-affinity` = PTU hash, `backendLog` ends with `CALL SUCCESSFUL` for PTU URL. - -### 2. 
Force PTU throttle and fail over to PAYGO - -Send a burst to exhaust PTU quota: - -```bash -for i in {1..20}; do - curl -s -o /dev/null -w "%{http_code} %{header_x-backend-affinity}\n" \ - -H "Ocp-Apim-Subscription-Key: $KEY" \ - -H "Content-Type: application/json" \ - -d "{\"messages\":[{\"role\":\"user\",\"content\":\"$(yes word | head -n 500 | tr '\n' ' ')\"}],\"max_tokens\":200}" \ - "$BASE/openai/deployments/$DEP/chat/completions?api-version=$VER" & -done -wait -``` - -Then send one request with headers visible: - -```bash -curl -i \ - -H "Ocp-Apim-Subscription-Key: $KEY" \ - -H "Content-Type: application/json" \ - -d '{"messages":[{"role":"user","content":"Hi"}],"max_tokens":20}' \ - "$BASE/openai/deployments/$DEP/chat/completions?api-version=$VER" -``` - -Expected: `200 OK`, `x-Backend-Attempts: 2`, `x-PolicyCycleCounter: 2`, `x-backend-affinity` = PAYGO hash, `backendLog` shows `THROTTLED: PTU Retry-After: ` then `CALL SUCCESSFUL` for PAYGO. - -### 3. Confirm cool-down behavior - -Immediately send one more request while PTU is inside the cool-down window: - -```bash -curl -i \ - -H "Ocp-Apim-Subscription-Key: $KEY" \ - -H "Content-Type: application/json" \ - -d '{"messages":[{"role":"user","content":"Hi again"}],"max_tokens":20}' \ - "$BASE/openai/deployments/$DEP/chat/completions?api-version=$VER" -``` - -Expected: `200 OK`, `x-Backend-Attempts: 1`, `x-backend-affinity` = PAYGO hash, `backendLog` shows `THROTTLED: (PTU - )` and a single-cycle PAYGO call. - -### 4. Confirm recovery to PTU - -Wait for the `Retry-After` value plus `2s`, then repeat step 1. - -Expected: `200 OK`, `x-Backend-Attempts: 1`, `x-backend-affinity` switches back to PTU hash, `backendLog` shows PTU URL and `CALL SUCCESSFUL`. - -### 5. 
Optional — simulate PTU unreachable - -Point the PTU `url` at a non-routable host, then repeat step 1: - -```bash -curl -i \ - -H "Ocp-Apim-Subscription-Key: $KEY" \ - -H "Content-Type: application/json" \ - -d '{"messages":[{"role":"user","content":"Hi"}],"max_tokens":20}' \ - "$BASE/openai/deployments/$DEP/chat/completions?api-version=$VER" -``` - -Expected: `200 OK`, `x-Backend-Attempts: 2`, `backendLog` includes `likely timeout` and a `10s` cool-down, PAYGO serves the response. - -## Verify - -**What matters:** `x-Backend-Attempts`, `x-backend-affinity`, and `backendLog` together tell you exactly which backend was selected and why. - -- [ ] Step 1: `200 OK`, `x-Backend-Attempts: 1`, affinity = PTU hash. -- [ ] Step 1: `backendLog` contains `Using PTU URL:` and ends with `CALL SUCCESSFUL`. No PAYGO entry. -- [ ] Step 2: `200 OK`, `x-Backend-Attempts: 2`, `x-PolicyCycleCounter: 2`, affinity = PAYGO hash. -- [ ] Step 2: `backendLog` contains `THROTTLED: PTU Retry-After:` followed by `Using PAYGO URL:` and `CALL SUCCESSFUL`. -- [ ] Step 3: `200 OK`, `x-Backend-Attempts: 1`, affinity = PAYGO hash. `backendLog` shows `THROTTLED: (PTU - )` and a single PAYGO cycle. -- [ ] Step 4: `200 OK`, `x-Backend-Attempts: 1`, affinity returns to PTU hash after cool-down expires. -- [ ] Step 5 (optional): `backendLog` shows `likely timeout` and a `10s` window instead of `Retry-After`. - -> [!TIP] -> If you cannot explain a result using all three signals together, the test is not complete yet. - -## Deep dive - -**What matters:** the policy is a two-cycle loop — pick PTU, classify the failure, mark PTU throttled, retry against PAYGO — and the cool-down state persists across requests until the window expires. 
- -### Full request flow - -```mermaid -flowchart LR - C[Client request] --> S[APIM selects lowest healthy priority] - S --> P[Cycle 1: PTU backend] - P -->|200 OK| R1[Return PTU response] - P -->|429 + Retry-After| T[Mark PTU throttled\nRetry-After + 2s] - P -->|timeout| T2[Mark PTU throttled\n10s window] - T --> N[Retry loop: cycle 2] - T2 --> N - N --> G[Cycle 2: PAYGO backend] - G -->|200 OK| R2[Return PAYGO response] - T -. during cool-down .-> K[Next requests skip PTU] - K -. after expiry .-> S -``` - -### Worked example - -| Step | Time | Observable signal | What it shows | -| :--- | :--- | :--- | :--- | -| 1 | `t=0s` | `x-Backend-Attempts: 1`, PTU affinity | PTU is healthy; wins on cycle 1. | -| 2 | `t=4s` | PTU returns `429` with `Retry-After: 5` | PTU is overloaded. | -| 3 | `t=4s` | `backendLog`: `THROTTLED: PTU Retry-After: 00:07` | Policy applied `5 + 2 = 7s` cool-down. | -| 4 | `t=4s` | `x-Backend-Attempts: 2`, PAYGO affinity | Same client request succeeded on cycle 2. | -| 5 | `t=8s` | `x-Backend-Attempts: 1`, PAYGO affinity | PTU still in cool-down; skipped entirely. | -| 6 | `t=12s` | `x-Backend-Attempts: 1`, PTU affinity | Cool-down expired; PTU re-entered selection. | - -### How to read `backendLog` - -**What matters:** each `|`-separated entry is ` `. The effective backend URL is the normalized `url + path` after policy processing. - -Healthy PTU path: - -```text -0.001s Begin -0.001s THROTTLED: (none) -0.001s RETRIES LEFT: 2 CYCLE: 1 INDEX: 0 -0.001s Using PTU URL: https://.openai.azure.com/openai/deployments//chat/completions LIMIT: off -0.320s StatusCode: 200 - Success -0.320s CALL SUCCESSFUL -``` - -Failover after a real Azure OpenAI `429`: - -```text -0.001s Begin -0.001s THROTTLED: (none) -0.001s RETRIES LEFT: 2 CYCLE: 1 INDEX: 0 -0.001s Using PTU URL: https://.openai.azure.com/... 
LIMIT: off -0.312s StatusCode: 429 - Temp Error -0.313s THROTTLED: PTU Retry-After: 00:07 -0.313s CALL INCOMPLETE, Unthrottled Backends: 1 -1.313s RETRIES LEFT: 1 CYCLE: 2 INDEX: 1 -1.313s Using PAYGO URL: https://.openai.azure.com/... LIMIT: off -1.630s StatusCode: 200 - Success -1.630s CALL SUCCESSFUL -``` - -Cool-down request (PTU skipped from the start): - -```text -0.001s Begin -0.001s THROTTLED: (PTU - 00:04) -0.001s RETRIES LEFT: 2 CYCLE: 1 INDEX: 1 -0.001s Using PAYGO URL: https://.openai.azure.com/... LIMIT: off -0.085s StatusCode: 200 - Success -0.085s CALL SUCCESSFUL -``` - -### Auth modes - -| Mode | Header sent | When to use | -| :--- | :--- | :--- | -| Managed Identity (`"MI"`) | `Authorization: Bearer ` | Preferred for all deployments. | -| API key | `api-key: ` | Use only when MI is not available. | - -> [!WARNING] -> Do not hard-code production keys in the policy. Use APIM Named Values or Key Vault references. - -## Optional variants - -### Three-region active/active/active - -**What matters:** add a third backend at `priorityGroup: 3` and raise `retryCount` to `3` so the retry loop can walk all three regions. - -```xml -backends.Add(new JObject() { - { "url", "https://.openai.azure.com/" }, { "path", "" }, - { "priorityGroup", 3 }, { "label", "EUS2" }, - { "acceptablePriorities", new JArray(1,2,3) }, - { "limitConcurrency", "off" }, { "bufferResponse", true }, - { "timeout", 30 }, { "auth", "MI" } -}); -``` - -### Streaming chat completions - -**What matters:** set `bufferResponse: false` on each backend. Failover behavior is identical because the `429` or timeout is classified before the stream starts. - -```xml -{ "bufferResponse", false }, -``` - -Confirm the APIM gateway does not override the backend setting with global response buffering. - -### Tuning knobs - -- Lower PTU `timeout` to abandon slow PTU responses faster and reach PAYGO sooner. -- Set `limitConcurrency` on PTU to pre-empt hard throttling by capping in-flight requests. 
-- Restrict PTU to premium callers with `acceptablePriorities: [1]` and PAYGO to all callers with `acceptablePriorities: [1,2,3]`. See [POC-Priority-configuration.md](POC-Priority-configuration.md). -- Use `requeue: true` only when you want exhausted retries to return `429 + S7PREQUEUE: true` to SimpleL7Proxy instead of surfacing an error. - -## Troubleshooting - -**What matters:** each symptom maps to one concrete cause and one concrete check. - -| Symptom | Likely cause | Check | -| :--- | :--- | :--- | -| First request returns `401` | Managed Identity or API key is invalid | Confirm APIM identity has `Cognitive Services OpenAI User` on both AOAI resources; or confirm the `auth` value is correct | -| First request returns `404` | Deployment name differs between PTU and PAYGO | Verify both resources expose the same `/openai/deployments//...` path | -| Failover request returns `429` to client | `retryCount` is too low, or PAYGO is also unhealthy | Confirm `priorityCfg[*].retryCount >= 2` and test PAYGO directly | -| Every request goes to PAYGO even after waiting | PTU still throttled, or PTU keeps returning `429` | Compare the last `Retry-After` in `backendLog` with the elapsed wait; send a low-token test request | -| `backendLog` shows only PTU, no retry | PTU call succeeded or PAYGO is filtered out | Confirm the burst is large enough to trigger throttling and that PAYGO has `acceptablePriorities` covering the request | -| Streaming response does not stream | `bufferResponse` is still `true` or gateway-level buffering is active | Set `bufferResponse: false` and check APIM gateway buffering settings | -| `api-key` field silently ignored | Using old v2.0.1 field name | Rename `api-key` to `auth` in `listBackends` | - -> [!WARNING] -> Do not change multiple knobs at once while debugging. Check auth first, then deployment name, then retry settings, then load pattern. 
- -## Related documentation - -- [POC-Failover-configuration.md](POC-Failover-configuration.md) — Same failover policy using the LLM Simulator instead of real Azure OpenAI endpoints -- [POC-Priority-configuration.md](POC-Priority-configuration.md) — Priority-based backend selection by caller tier -- [POC-Chargeback.md](POC-Chargeback.md) — Token usage tracking and per-user cost attribution -- [AI_FOUNDRY_INTEGRATION.md](AI_FOUNDRY_INTEGRATION.md) — Wiring Azure OpenAI and AI Foundry endpoints into SimpleL7Proxy -- [BACKEND_HOSTS.md](BACKEND_HOSTS.md) — Host configuration options including `timeout`, `retryCount`, and `acceptablePriorities` -- [SECURITY.md](SECURITY.md) — Managed Identity, Key Vault, and secret rotation guidance - - -## TL;DR - -- Put the PTU backend at `priorityGroup: 1`, the PAYGO backend at `priorityGroup: 2`, and keep the same deployment name on both Azure OpenAI resources. -- Run one normal request, then a short burst that exhausts PTU quota; the client should keep getting `200 OK` while APIM retries from PTU to PAYGO. -- Verify with `x-Backend-Attempts`, `x-backend-affinity`, and `backendLog`: first PTU wins, then PAYGO takes over during cool-down, then traffic returns to PTU after cool-down expires. - -## What You Will Observe - -- A healthy request returns `200 OK` with `x-Backend-Attempts: 1` and the PTU affinity hash. -- A throttled PTU request still returns `200 OK`, but now with `x-Backend-Attempts: 2` and the PAYGO affinity hash. -- Requests sent during the `Retry-After + 2s` cool-down skip PTU entirely. -- After cool-down expires, the next request returns to PTU. - -[!NOTE] -Units used in this doc: `timeout` and `defaultRetryAfter` are seconds, Azure OpenAI `Retry-After` is seconds. - -## Reference - -
-Reference: defaults, units, and policy knobs used in this POC - -Use this reference when you are configuring the policy, checking whether the test is set up correctly, or explaining why APIM selected, skipped, or retried a backend. - -| Item | Value or default | What matters | -| :--- | :--- | :--- | -| Policy file | `APIM-Policy/Priority-with-retry-enhancedLog.xml` | Apply it at the API level on the API you are testing. | -| `priorityGroup` | PTU `1`, PAYGO `2` | Lower number wins when both backends are healthy. | -| `timeout` | `30` seconds in this POC | Long enough for real chat completions, short enough to surface a bad backend. | -| `retryCount` | `2` for priorities `1`, `2`, and `3` | Gives the policy one more backend to try after the first failure. | -| `defaultRetryAfter` | `10` seconds | Used for timeout-style failures when no backend `Retry-After` exists. | -| `Retry-After` handling | backend value plus `2` seconds | The policy adds a safety buffer before reusing a throttled backend. | -| `auth` | `"MI"` or a literal key | Use `"MI"` when possible; use `auth`, not `api-key`. | -| `bufferResponse` | `true` | Keep `true` for non-streaming chat completions; set `false` only for SSE streaming. | -| Deployment name | Same on both AOAI resources | APIM forwards the same `/openai/deployments//...` path to whichever backend wins. | -| Reload behavior | Policy save on APIM | Policy changes take effect after save; backend throttle state is runtime state inside the gateway. | - -
- -## Setup - -**Use two Azure OpenAI resources with the same deployment name, then put PTU first and PAYGO second in `listBackends`.** - -```xml -{ "priorityGroup", 1 }, -{ "label", "PTU" }, -{ "auth", "MI" } -``` - -[!NOTE] -Older versions of the policy use `api-key` rather than `auth`. - -### Minimal prerequisites - -- An APIM instance with [../APIM-Policy/Priority-with-retry-enhancedLog.xml](../APIM-Policy/Priority-with-retry-enhancedLog.xml) applied at the API level. -- Two Azure OpenAI resources with the same deployment name, for example `gpt-4o-mini`. -- A PTU primary with low enough capacity to trigger real `429` responses under load. -- A PAYGO secondary in a separate Azure OpenAI resource. -- One auth path: - - Managed Identity: APIM system-assigned identity has `Cognitive Services OpenAI User` on both resources. - - API key: keys from each Azure OpenAI resource. -- An APIM subscription key for the target API. - -### Apply the policy - -**Apply the policy to the target API, not globally, so the test surface is small and observable.** - -Portal steps: - -1. Open APIM in the Azure portal. -2. Go to `APIs` and select the target API. -3. Select `All operations` so the policy applies across the API. -4. In `Inbound processing`, open the policy editor. -5. Paste the contents of `APIM-Policy/Priority-with-retry-enhancedLog.xml`. -6. Save the policy. - -If you are using Managed Identity: - -1. In APIM, open `Identity` and enable the system-assigned identity. -2. Open each Azure OpenAI resource. -3. Go to `Access control (IAM)`. -4. Add the `Cognitive Services OpenAI User` role assignment for the APIM managed identity. - -
-If you prefer to use the CLI - -Use the Azure CLI to apply the policy and, if needed, enable Managed Identity and grant Azure OpenAI access. - -```bash -az apim api policy create \ - --resource-group \ - --service-name \ - --api-id \ - --value "$(cat APIM-Policy/Priority-with-retry-enhancedLog.xml)" \ - --format xml -``` - -```bash -PRINCIPAL_ID=$(az apim update --resource-group --name \ - --set identity.type=SystemAssigned --query identity.principalId -o tsv) - -for AOAI in ; do - SCOPE=$(az cognitiveservices account show -g -n "$AOAI" --query id -o tsv) - az role assignment create --assignee "$PRINCIPAL_ID" \ - --role "Cognitive Services OpenAI User" --scope "$SCOPE" -done -``` - -
- -[!NOTE] -Role propagation can take a few minutes. If the first request returns `401 PermissionDenied`, wait and retry. - -### Backend configuration - -What matters: We want to try the PTU first, so make sure that it has a lower `priorityGroup` than the PAYGO endpoint. OpenAI requres authentication so both entries must expose a valid `auth` mode ( either MI or key ). - -```xml -.openai.azure.com/" }, - { "path", "" }, - { "priorityGroup", 1 }, - { "label", "PTU" }, - { "acceptablePriorities", new JArray(1,2,3) }, - { "limitConcurrency", "off" }, - { "bufferResponse", true }, - { "timeout", 30 }, - { "auth", "MI" } - }); - - backends.Add(new JObject() - { - { "url", "https://.openai.azure.com/" }, - { "path", "" }, - { "priorityGroup", 2 }, - { "label", "PAYGO" }, - { "acceptablePriorities", new JArray(1,2,3) }, - { "limitConcurrency", "off" }, - { "bufferResponse", true }, - { "timeout", 30 }, - { "auth", "MI" } - }); - - foreach (JObject backend in backends) { - string saltedUrl = salt + backend["url"].ToString(); - backend["affinity"] = string.Concat( - System.Security.Cryptography.SHA256.Create() - .ComputeHash(System.Text.Encoding.UTF8.GetBytes(saltedUrl)) - .Take(10) - .Select(b => b.ToString("x2")) - ); - backend["isThrottling"] = false; - backend["retryAfter"] = DateTime.MinValue; - backend["defaultRetryAfter"] = 10; - } - - return backends; -}" /> -``` - -[!TIP] -For API key auth, replace `"MI"` with a key value or a Named Value reference such as `{{aoai-ptu-key}}`. - -### Retry configuration - -What matters: set `retryCount` high enough for the policy to try the next backend in the list. - -```xml - -``` - -[!WARNING] -If `retryCount` is too low, the client will see the PTU failure instead of a PAYGO recovery. 
- -## Run - -**Run the four requests in order: healthy PTU, forced PTU throttle, immediate follow-up during cool-down, and post-cool-down recovery.** - -```bash -curl -i \ - -H "Ocp-Apim-Subscription-Key: " \ - -H "Content-Type: application/json" \ - -d '{"messages":[{"role":"user","content":"Say hi in five words."}],"max_tokens":40}' \ - "https://.azure-api.net//openai/deployments//chat/completions?api-version=" -``` - -[!TIP] -Replace ``, ``, ``, ``, and `` once, then reuse the same values for every step. - -### 1. Healthy PTU request - -Send one normal chat request: - -```bash -curl -i \ - -H "Ocp-Apim-Subscription-Key: " \ - -H "Content-Type: application/json" \ - -d '{"messages":[{"role":"user","content":"Say hi in five words."}],"max_tokens":40}' \ - "https://.azure-api.net//openai/deployments//chat/completions?api-version=" -``` - -Expected outcome: - -- `200 OK` -- `x-Backend-Attempts: 1` -- `x-backend-affinity` equals the PTU backend hash -- `backendLog` contains `Using PTU backend` and `CALL SUCCESSFUL` - -### 2. 
Force PTU throttling and fail over to PAYGO - -Send a short burst that exceeds PTU TPM or RPM quota: - -```bash -for i in {1..20}; do - curl -s -o /dev/null -w "%{http_code} %{header_x-backend-affinity}\n" \ - -H "Ocp-Apim-Subscription-Key: " \ - -H "Content-Type: application/json" \ - -d "{\"messages\":[{\"role\":\"user\",\"content\":\"$(yes word | head -n 500 | tr '\n' ' ')\"}],\"max_tokens\":200}" \ - "https://.azure-api.net//openai/deployments//chat/completions?api-version=" & -done -wait -``` - -Then send one request with headers visible: - -```bash -curl -i \ - -H "Ocp-Apim-Subscription-Key: " \ - -H "Content-Type: application/json" \ - -d '{"messages":[{"role":"user","content":"Hi"}],"max_tokens":20}' \ - "https://.azure-api.net//openai/deployments//chat/completions?api-version=" -``` - -Expected outcome: - -- `200 OK` -- `x-Backend-Attempts: 2` -- `x-PolicyCycleCounter: 2` -- `x-backend-affinity` now equals the PAYGO backend hash -- `backendLog` shows `Throttling [0] by s, isTempError=true, retry-after=` followed by `Using PAYGO backend` and `CALL SUCCESSFUL` - -### 3. Confirm cool-down behavior - -Immediately send one more request while PTU is still inside the `Retry-After + 2s` window: - -```bash -curl -i \ - -H "Ocp-Apim-Subscription-Key: " \ - -H "Content-Type: application/json" \ - -d '{"messages":[{"role":"user","content":"Hi again"}],"max_tokens":20}' \ - "https://.azure-api.net//openai/deployments//chat/completions?api-version=" -``` - -Expected outcome: - -- `200 OK` -- `x-Backend-Attempts: 1` -- `x-backend-affinity` still equals the PAYGO backend hash -- `backendLog` shows PAYGO only; there is no new PTU throttle event for this request - -### 4. Confirm recovery to PTU - -Wait for the backend `Retry-After` value plus the policy's `2` second buffer, then repeat the healthy request from step 1. 
- -Expected outcome: - -- `200 OK` -- `x-Backend-Attempts: 1` -- `x-backend-affinity` switches back to the PTU backend hash -- `backendLog` shows `Using PTU backend` and `CALL SUCCESSFUL` - -### 5. Optional: simulate PTU unreachable instead of throttled - -Point the PTU backend URL at a non-routable host such as `https://nonexistent-host-xyz.openai.azure.com/`, then run the same single request again. - -```bash -curl -i \ - -H "Ocp-Apim-Subscription-Key: " \ - -H "Content-Type: application/json" \ - -d '{"messages":[{"role":"user","content":"Hi"}],"max_tokens":20}' \ - "https://.azure-api.net//openai/deployments//chat/completions?api-version=" -``` - -Expected outcome: - -- `200 OK` -- `x-Backend-Attempts: 2` -- `backendLog` includes `likely timeout` and a `10s` throttle window -- PAYGO serves the final response - -## Verify - -**Verification is a checklist: every signal below maps directly to one step in the failover state machine.** - -```text -select -> fail -> throttle -> retry -> recover -``` - -[!TIP] -If you cannot explain a result with `x-Backend-Attempts`, `x-backend-affinity`, and `backendLog`, the test is not complete yet. - -Checklist: - -- [ ] Healthy request: `200 OK` with `x-Backend-Attempts: 1` means PTU answered on cycle 1. -- [ ] Failover request: `200 OK` with `x-Backend-Attempts: 2` means PTU failed on cycle 1 and PAYGO answered on cycle 2. -- [ ] Failover request: `backendLog` contains `Throttling [0] by s` and then `Using PAYGO backend`. -- [ ] Cool-down request: `x-Backend-Attempts: 1` with PAYGO affinity means PTU was skipped because it was still throttled. -- [ ] Recovery request: PTU affinity returns after `Retry-After + 2s`, which proves the policy exited cool-down. -- [ ] Timeout variant: `backendLog` shows `likely timeout` and a `10s` cooldown instead of a backend-supplied `Retry-After`. 
- -## Deep Dive - -**The policy behavior is a two-cycle loop: pick PTU, classify the PTU failure, mark PTU throttled, then retry against PAYGO.** - -```xml -{ "priorityGroup", 1 }, -{ "priorityGroup", 2 }, -{ "retryCount", 2 } -``` - -[!NOTE] -This POC uses real Azure OpenAI `429` responses, so the same control path is exercised as production overload handling. - -### End-to-end flow - -```mermaid -flowchart LR - C[Client request] --> S[APIM selects lowest healthy priority] - S --> P[Cycle 1: PTU backend] - P -->|200 OK| R1[Return PTU response] - P -->|429 + Retry-After or timeout| T[Mark PTU throttled] - T --> N[Retry after classification] - N --> G[Cycle 2: PAYGO backend] - G -->|200 OK| R2[Return PAYGO response] - T -. during cool-down .-> K[Next requests skip PTU] - K -. after expiry .-> S -``` - -### Worked example with concrete numbers - -| Step | Time | Observable signal | Meaning | -| :--- | :--- | :--- | :--- | -| 1 | `t=0s` | `x-Backend-Attempts: 1`, PTU affinity | PTU is healthy and wins selection. | -| 2 | `t=4s` | PTU returns `429` with `Retry-After: 5` | PTU is overloaded. | -| 3 | `t=4s` | `backendLog` shows `Throttling [0] by 7s` | APIM adds the `2s` safety buffer. | -| 4 | `t=4s` | `x-Backend-Attempts: 2`, PAYGO affinity | The same client request succeeded on cycle 2. | -| 5 | `t=8s` | Next request has `x-Backend-Attempts: 1`, PAYGO affinity | PTU is still in cool-down and is skipped. | -| 6 | `t=12s` | Next request returns PTU affinity again | Cool-down expired and PTU re-entered selection. | - -### How to read `backendLog` - -Successful PTU path: - -```text -Using PTU backend: https://.openai.azure.com/openai/deployments//chat/completions ... CALL SUCCESSFUL -``` - -Failover path after a real Azure OpenAI `429`: - -```text -Throttling [0] by 7s, isTempError=true, retry-after=5 -Using PAYGO backend: https://.openai.azure.com/openai/deployments//chat/completions ... 
CALL SUCCESSFUL -``` - -What the fields mean: - -- `[0]` is the zero-based index of the PTU backend inside `listBackends`. -- `retry-after=5` is the Azure OpenAI response. -- `by 7s` is the policy's throttle window after the `2` second safety buffer is added. - -### Auth modes - -| Mode | Header sent | Recommendation | -| :--- | :--- | :--- | -| Managed Identity | `Authorization: Bearer ` | Preferred for real deployments. | -| API key | `api-key: ` | Acceptable for POCs or when MI is unavailable. | - -[!WARNING] -Do not hard-code production keys in the policy. Use APIM Named Values or Key Vault references. - -## Optional Variants - -**Once the two-backend POC works, only add one variable at a time: more regions, streaming, or tuning.** - -```xml -{ "bufferResponse", false }, -{ "limitConcurrency", "" }, -{ "acceptablePriorities", new JArray(1,2,3) } -``` - -[!TIP] -Keep the first pass to two backends only. It makes the retry path obvious in `backendLog`. - -### Three-region active/active/active - -- Add a third backend with `priorityGroup: 3`. -- Keep `acceptablePriorities: [1,2,3]` on all three. -- Raise `priorityCfg[*].retryCount` to `3` so the loop can walk all regions. - -```xml -backends.Add(new JObject() { - { "url", "https://.openai.azure.com/" }, { "path", "" }, - { "priorityGroup", 1 }, { "label", "EUS2" }, - { "acceptablePriorities", new JArray(1,2,3) }, - { "limitConcurrency", "off" }, { "bufferResponse", true }, - { "timeout", 30 }, { "auth", "MI" } -}); -``` - -### Streaming chat completions - -- Set `bufferResponse: false` on each streaming backend. -- Failover behavior is unchanged because the `429` or timeout is classified before streaming begins. -- Confirm the gateway does not override streaming with global response buffering. - -```xml -{ "bufferResponse", false }, -``` - -### Tuning knobs that matter - -- Lower PTU `timeout` only if you want to abandon slow PTU responses faster. 
-- Set `limitConcurrency` on PTU if you want excess traffic to skip PTU before it hard-throttles. -- Reserve PTU for premium traffic by setting PTU `acceptablePriorities: [1]` and PAYGO `acceptablePriorities: [2,3]`. -- Use `requeue: true` only if you intentionally want to return `429 + S7PREQUEUE: true` to SimpleL7Proxy. - -## Troubleshooting - -**Every failure here should be explainable as symptom -> cause -> check.** - -```text -symptom -> cause -> check -``` - -[!WARNING] -Do not change multiple knobs at once while debugging. Check auth first, then deployment name, then retry settings, then load pattern. - -- Symptom: First request returns `401`. - Cause: Managed Identity or API key is invalid. - Check: Confirm APIM identity has `Cognitive Services OpenAI User` on both AOAI resources, or confirm the `auth` value is correct. - -- Symptom: First request returns `404`. - Cause: Deployment name differs between PTU and PAYGO or the forwarded path is wrong. - Check: Verify both AOAI resources expose the same `/openai/deployments//...` path. - -- Symptom: Failover request returns `429` to the client instead of `200`. - Cause: `retryCount` is too low, only one backend is eligible, or PAYGO is also unhealthy. - Check: Verify `priorityCfg[*].retryCount >= 2`, `acceptablePriorities` includes the current request, and PAYGO works directly. - -- Symptom: Every request keeps going to PAYGO even after waiting. - Cause: PTU is still throttled, the gateway clock has not passed the throttle window, or PTU keeps returning `429`. - Check: Compare the last `Retry-After` value in `backendLog` with the wait time and send a low-token test request. - -- Symptom: `backendLog` shows only PTU and no retry. - Cause: The PTU call succeeded, or the second backend was filtered out. - Check: Confirm the burst is large enough to trigger PTU throttling and that PAYGO shares the same `acceptablePriorities`. - -- Symptom: Streaming response does not stream. 
- Cause: `bufferResponse` is still `true` or APIM response buffering overrides the backend setting. - Check: Set `bufferResponse: false` and verify gateway-level buffering behavior. - -## Related Documentation - -- [POC-Failover-configuration.md](POC-Failover-configuration.md) - Same failover policy against the LLM Simulator instead of real Azure OpenAI backends. -- [POC-Priority-configuration.md](POC-Priority-configuration.md) - Priority-based backend selection by caller tier. -- [POC-Chargeback.md](POC-Chargeback.md) - Token usage tracking and per-user cost attribution. -- [AI_FOUNDRY_INTEGRATION.md](AI_FOUNDRY_INTEGRATION.md) - Wiring Azure OpenAI and AI Foundry endpoints into SimpleL7Proxy. -- [BACKEND_HOSTS.md](BACKEND_HOSTS.md) - Host configuration options such as `Timeout`, `retryCount`, and `acceptablePriorities`. -- [SECURITY.md](SECURITY.md) - Managed Identity, Key Vault, and secret rotation guidance. +# POC: Azure OpenAI Failover via APIM + +**Purpose:** Show that when a backend returns `429`, the APIM policy marks it as throttled, retries the same request against the next available backend, and the client still sees `200 OK`. Any combination of `PTU` and `PAYGO` backends can be used, but for the purposes of this POC, we will use a single PTU and a single PAYGO backend. + +> [!IMPORTANT] +> **The rule: PTU at `priorityGroup: 1` wins when healthy; when it returns `429`, APIM throttles it for `Retry-After + 2s` and the same request retries against PAYGO at `priorityGroup: 2`. The client never sees the `429`.** + +## TL;DR (< 5 minutes) + +1. Apply [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml) to your APIM API with two backends: PTU at `priorityGroup: 1` and PAYGO at `priorityGroup: 2`, both using the same deployment name. +2. Send one healthy request, then a burst to exhaust PTU quota, then one more request while PTU is throttled. +3. 
Read `x-Backend-Attempts`, `x-backend-affinity`, and `backendLog` to confirm PTU → PAYGO failover. + +**Expected outcomes:** healthy = `x-Backend-Attempts: 1`, PTU affinity; failover = `x-Backend-Attempts: 2`, PAYGO affinity; cool-down = `x-Backend-Attempts: 1`, PAYGO affinity (PTU skipped). + +## What you will observe + +- A healthy PTU request returns `200 OK` with `x-Backend-Attempts: 1` and the PTU affinity hash. +- When PTU is throttled, the same request still returns `200 OK` but with `x-Backend-Attempts: 2` and the PAYGO affinity hash. +- Requests sent during the `Retry-After + 2s` cool-down return `200 OK` with `x-Backend-Attempts: 1` and PAYGO affinity — PTU is skipped without being tried. +- After the cool-down expires, the next request returns `x-Backend-Attempts: 1` with PTU affinity again. +- If PTU is unreachable instead of throttled, `backendLog` shows `likely timeout` and a `10s` cool-down instead of a `Retry-After`-based window. + +## Reference + +
+Settings, values, units, and when each takes effect + +| Setting | Value in this POC | Unit | Set in | Takes effect | +| :--- | :--- | :--- | :--- | :--- | +| Policy file | [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml) | — | APIM API | after policy save | +| `priorityGroup` (PTU) | `1` | group | `listBackends` | after policy save | +| `priorityGroup` (PAYGO) | `2` | group | `listBackends` | after policy save | +| `timeout` | `30` | seconds | `listBackends` | after policy save | +| `retryCount` | `2` | attempts | `priorityCfg` | after policy save | +| `defaultRetryAfter` | `10` | seconds | `listBackends` per-backend | after policy save | +| `429` cool-down | `Retry-After + 2` | seconds | parsed from Azure OpenAI response | per request | +| Timeout cool-down | `10` (hard-coded) | seconds | policy logic | per request | +| `auth` | `"MI"` | — | `listBackends` | after policy save | +| `bufferResponse` | `true` | boolean | `listBackends` | after policy save | +| `limitConcurrency` | `off` | mode | `listBackends` | after policy save | +| Deployment name | same on both resources | — | Azure OpenAI | at resource creation | +| Reload behavior | policy save | — | APIM | after policy save | + +> [!NOTE] +> **Units used in this doc:** `timeout` and `defaultRetryAfter` are in seconds. `Retry-After` is in seconds as returned by Azure OpenAI. The policy adds `2s` to `Retry-After` before applying the cool-down window. + +
+ +## Setup + +### Minimal prerequisites + +**What matters:** you need two Azure OpenAI resources with the same deployment name. The PTU resource needs low enough quota to trigger real `429` responses under load. + +- An APIM instance with [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml) applied at the API level. +- Two Azure OpenAI resources with the same deployment name, for example `gpt-4o-mini`. +- A PTU primary with capacity low enough to trigger `429` responses under the burst in step 2. +- A PAYGO secondary in a separate Azure OpenAI resource. +- One auth path (choose one): + - **Managed Identity:** APIM system-assigned identity has `Cognitive Services OpenAI User` on both AOAI resources. + - **API key:** keys from each Azure OpenAI resource. +- An APIM subscription key for the target API. + +> [!NOTE] +> Older policy versions used `api-key` instead of `auth`. The v2.1.0 policy reads `auth`; `api-key` is silently ignored. + +### Apply the policy + +**What matters:** apply the policy at the API level on **All operations**, not at product or global scope. + +#### Azure portal (recommended) + +1. Open your APIM instance in the [Azure portal](https://portal.azure.com). +2. Select **APIs** and open the target API. +3. Select **All operations**. +4. Open the **Inbound processing** policy editor (`</>` icon). +5. Replace the editor contents with [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml). +6. Select **Save**. + +If you are using Managed Identity: + +1. In APIM, open **Identity** and enable the system-assigned identity. +2. Open each Azure OpenAI resource. +3. Go to **Access control (IAM)**. +4. Add the `Cognitive Services OpenAI User` role assignment for the APIM managed identity. + +> [!NOTE] +> Role propagation can take a few minutes. If the first request returns `401 PermissionDenied`, wait and retry. + +
+Azure CLI alternative + +```bash +az apim api policy create \ + --resource-group \ + --service-name \ + --api-id \ + --value "$(cat APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml)" \ + --format xml +``` + +Enable Managed Identity and assign AOAI access: + +```bash +PRINCIPAL_ID=$(az apim update --resource-group --name \ + --set identity.type=SystemAssigned --query identity.principalId -o tsv) + +for AOAI in ; do + SCOPE=$(az cognitiveservices account show -g -n "$AOAI" --query id -o tsv) + az role assignment create --assignee "$PRINCIPAL_ID" \ + --role "Cognitive Services OpenAI User" --scope "$SCOPE" +done +``` + +
+ +### Configure `listBackends` + +**What matters:** PTU must have a lower `priorityGroup` than PAYGO. Both backends must use the same deployment name so APIM can forward the same operation path to either one. + +```xml +.openai.azure.com/" }, + { "path", "" }, + { "priorityGroup", 1 }, + { "label", "PTU" }, + { "acceptablePriorities", new JArray(1,2,3) }, + { "limitConcurrency", "off" }, + { "bufferResponse", true }, + { "timeout", 30 }, + { "auth", "MI" } + }); + + backends.Add(new JObject() + { + { "url", "https://.openai.azure.com/" }, + { "path", "" }, + { "priorityGroup", 2 }, + { "label", "PAYGO" }, + { "acceptablePriorities", new JArray(1,2,3) }, + { "limitConcurrency", "off" }, + { "bufferResponse", true }, + { "timeout", 30 }, + { "auth", "MI" } + }); + +... +}" /> +``` + +> [!TIP] +> For API key auth, replace `"MI"` with a key value or a Named Value reference such as `{{aoai-ptu-key}}`. + +### Configure `priorityCfg` + +**What matters:** `retryCount: 2` gives the policy one cycle on PTU and one cycle on PAYGO for the same request. + +```xml + +``` + +> [!WARNING] +> If `retryCount` is too low, the client receives the PTU `429` instead of a PAYGO recovery. + +## Run + +**What matters:** run the four requests in order. Replace the placeholders once and reuse them for every step. + +```bash +BASE="https://.azure-api.net/" +KEY="" +DEP="" +VER="" +``` + +> [!TIP] +> Use `2024-02-01` as the `api-version` if you are unsure which one your resources support. + +### 1. Healthy PTU request + +```bash +curl -i \ + -H "Ocp-Apim-Subscription-Key: $KEY" \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"Say hi in five words."}],"max_tokens":40}' \ + "$BASE/openai/deployments/$DEP/chat/completions?api-version=$VER" +``` + +Expected: `200 OK`, `x-Backend-Attempts: 1`, `x-backend-affinity` = PTU hash, `backendLog` ends with `CALL SUCCESSFUL` for PTU URL. + +### 2. 
Force PTU throttle and fail over to PAYGO + +Send a burst to exhaust PTU quota: + +```bash +for i in {1..20}; do + curl -s -o /dev/null -w "%{http_code} %{header_x-backend-affinity}\n" \ + -H "Ocp-Apim-Subscription-Key: $KEY" \ + -H "Content-Type: application/json" \ + -d "{\"messages\":[{\"role\":\"user\",\"content\":\"$(yes word | head -n 500 | tr '\n' ' ')\"}],\"max_tokens\":200}" \ + "$BASE/openai/deployments/$DEP/chat/completions?api-version=$VER" & +done +wait +``` + +Then send one request with headers visible: + +```bash +curl -i \ + -H "Ocp-Apim-Subscription-Key: $KEY" \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"Hi"}],"max_tokens":20}' \ + "$BASE/openai/deployments/$DEP/chat/completions?api-version=$VER" +``` + +Expected: `200 OK`, `x-Backend-Attempts: 2`, `x-PolicyCycleCounter: 2`, `x-backend-affinity` = PAYGO hash, `backendLog` shows `THROTTLED: PTU Retry-After: ` then `CALL SUCCESSFUL` for PAYGO. + +### 3. Confirm cool-down behavior + +Immediately send one more request while PTU is inside the cool-down window: + +```bash +curl -i \ + -H "Ocp-Apim-Subscription-Key: $KEY" \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"Hi again"}],"max_tokens":20}' \ + "$BASE/openai/deployments/$DEP/chat/completions?api-version=$VER" +``` + +Expected: `200 OK`, `x-Backend-Attempts: 1`, `x-backend-affinity` = PAYGO hash, `backendLog` shows `THROTTLED: (PTU - )` and a single-cycle PAYGO call. + +### 4. Confirm recovery to PTU + +Wait for the `Retry-After` value plus `2s`, then repeat step 1. + +Expected: `200 OK`, `x-Backend-Attempts: 1`, `x-backend-affinity` switches back to PTU hash, `backendLog` shows PTU URL and `CALL SUCCESSFUL`. + +### 5. 
Optional — simulate PTU unreachable + +Point the PTU `url` at a non-routable host, then repeat step 1: + +```bash +curl -i \ + -H "Ocp-Apim-Subscription-Key: $KEY" \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"Hi"}],"max_tokens":20}' \ + "$BASE/openai/deployments/$DEP/chat/completions?api-version=$VER" +``` + +Expected: `200 OK`, `x-Backend-Attempts: 2`, `backendLog` includes `likely timeout` and a `10s` cool-down, PAYGO serves the response. + +## Verify + +**What matters:** `x-Backend-Attempts`, `x-backend-affinity`, and `backendLog` together tell you exactly which backend was selected and why. + +- [ ] Step 1: `200 OK`, `x-Backend-Attempts: 1`, affinity = PTU hash. +- [ ] Step 1: `backendLog` contains `Using PTU URL:` and ends with `CALL SUCCESSFUL`. No PAYGO entry. +- [ ] Step 2: `200 OK`, `x-Backend-Attempts: 2`, `x-PolicyCycleCounter: 2`, affinity = PAYGO hash. +- [ ] Step 2: `backendLog` contains `THROTTLED: PTU Retry-After:` followed by `Using PAYGO URL:` and `CALL SUCCESSFUL`. +- [ ] Step 3: `200 OK`, `x-Backend-Attempts: 1`, affinity = PAYGO hash. `backendLog` shows `THROTTLED: (PTU - )` and a single PAYGO cycle. +- [ ] Step 4: `200 OK`, `x-Backend-Attempts: 1`, affinity returns to PTU hash after cool-down expires. +- [ ] Step 5 (optional): `backendLog` shows `likely timeout` and a `10s` window instead of `Retry-After`. + +> [!TIP] +> If you cannot explain a result using all three signals together, the test is not complete yet. + +## Deep dive + +**What matters:** the policy is a two-cycle loop — pick PTU, classify the failure, mark PTU throttled, retry against PAYGO — and the cool-down state persists across requests until the window expires. 
+ +### Full request flow + +```mermaid +flowchart LR + C[Client request] --> S[APIM selects lowest healthy priority] + S --> P[Cycle 1: PTU backend] + P -->|200 OK| R1[Return PTU response] + P -->|429 + Retry-After| T[Mark PTU throttled\nRetry-After + 2s] + P -->|timeout| T2[Mark PTU throttled\n10s window] + T --> N[Retry loop: cycle 2] + T2 --> N + N --> G[Cycle 2: PAYGO backend] + G -->|200 OK| R2[Return PAYGO response] + T -. during cool-down .-> K[Next requests skip PTU] + K -. after expiry .-> S +``` + +### Worked example + +| Step | Time | Observable signal | What it shows | +| :--- | :--- | :--- | :--- | +| 1 | `t=0s` | `x-Backend-Attempts: 1`, PTU affinity | PTU is healthy; wins on cycle 1. | +| 2 | `t=4s` | PTU returns `429` with `Retry-After: 5` | PTU is overloaded. | +| 3 | `t=4s` | `backendLog`: `THROTTLED: PTU Retry-After: 00:07` | Policy applied `5 + 2 = 7s` cool-down. | +| 4 | `t=4s` | `x-Backend-Attempts: 2`, PAYGO affinity | Same client request succeeded on cycle 2. | +| 5 | `t=8s` | `x-Backend-Attempts: 1`, PAYGO affinity | PTU still in cool-down; skipped entirely. | +| 6 | `t=12s` | `x-Backend-Attempts: 1`, PTU affinity | Cool-down expired; PTU re-entered selection. | + +### How to read `backendLog` + +**What matters:** each `|`-separated entry is ` `. The effective backend URL is the normalized `url + path` after policy processing. + +Healthy PTU path: + +```text +0.001s Begin +0.001s THROTTLED: (none) +0.001s RETRIES LEFT: 2 CYCLE: 1 INDEX: 0 +0.001s Using PTU URL: https://.openai.azure.com/openai/deployments//chat/completions LIMIT: off +0.320s StatusCode: 200 - Success +0.320s CALL SUCCESSFUL +``` + +Failover after a real Azure OpenAI `429`: + +```text +0.001s Begin +0.001s THROTTLED: (none) +0.001s RETRIES LEFT: 2 CYCLE: 1 INDEX: 0 +0.001s Using PTU URL: https://.openai.azure.com/... 
LIMIT: off +0.312s StatusCode: 429 - Temp Error +0.313s THROTTLED: PTU Retry-After: 00:07 +0.313s CALL INCOMPLETE, Unthrottled Backends: 1 +1.313s RETRIES LEFT: 1 CYCLE: 2 INDEX: 1 +1.313s Using PAYGO URL: https://.openai.azure.com/... LIMIT: off +1.630s StatusCode: 200 - Success +1.630s CALL SUCCESSFUL +``` + +Cool-down request (PTU skipped from the start): + +```text +0.001s Begin +0.001s THROTTLED: (PTU - 00:04) +0.001s RETRIES LEFT: 2 CYCLE: 1 INDEX: 1 +0.001s Using PAYGO URL: https://.openai.azure.com/... LIMIT: off +0.085s StatusCode: 200 - Success +0.085s CALL SUCCESSFUL +``` + +### Auth modes + +| Mode | Header sent | When to use | +| :--- | :--- | :--- | +| Managed Identity (`"MI"`) | `Authorization: Bearer ` | Preferred for all deployments. | +| API key | `api-key: ` | Use only when MI is not available. | + +> [!WARNING] +> Do not hard-code production keys in the policy. Use APIM Named Values or Key Vault references. + +## Optional variants + +### Three-region active/active/active + +**What matters:** add a third backend at `priorityGroup: 3` and raise `retryCount` to `3` so the retry loop can walk all three regions. + +```xml +backends.Add(new JObject() { + { "url", "https://.openai.azure.com/" }, { "path", "" }, + { "priorityGroup", 3 }, { "label", "EUS2" }, + { "acceptablePriorities", new JArray(1,2,3) }, + { "limitConcurrency", "off" }, { "bufferResponse", true }, + { "timeout", 30 }, { "auth", "MI" } +}); +``` + +### Streaming chat completions + +**What matters:** set `bufferResponse: false` on each backend. Failover behavior is identical because the `429` or timeout is classified before the stream starts. + +```xml +{ "bufferResponse", false }, +``` + +Confirm the APIM gateway does not override the backend setting with global response buffering. + +### Tuning knobs + +- Lower PTU `timeout` to abandon slow PTU responses faster and reach PAYGO sooner. +- Set `limitConcurrency` on PTU to pre-empt hard throttling by capping in-flight requests. 
+- Restrict PTU to premium callers with `acceptablePriorities: [1]` and PAYGO to all callers with `acceptablePriorities: [1,2,3]`. See [POC-Priority-configuration.md](POC-Priority-configuration.md). +- Use `requeue: true` only when you want exhausted retries to return `429 + S7PREQUEUE: true` to SimpleL7Proxy instead of surfacing an error. + +## Troubleshooting + +**What matters:** each symptom maps to one concrete cause and one concrete check. + +| Symptom | Likely cause | Check | +| :--- | :--- | :--- | +| First request returns `401` | Managed Identity or API key is invalid | Confirm APIM identity has `Cognitive Services OpenAI User` on both AOAI resources; or confirm the `auth` value is correct | +| First request returns `404` | Deployment name differs between PTU and PAYGO | Verify both resources expose the same `/openai/deployments//...` path | +| Failover request returns `429` to client | `retryCount` is too low, or PAYGO is also unhealthy | Confirm `priorityCfg[*].retryCount >= 2` and test PAYGO directly | +| Every request goes to PAYGO even after waiting | PTU still throttled, or PTU keeps returning `429` | Compare the last `Retry-After` in `backendLog` with the elapsed wait; send a low-token test request | +| `backendLog` shows only PTU, no retry | PTU call succeeded or PAYGO is filtered out | Confirm the burst is large enough to trigger throttling and that PAYGO has `acceptablePriorities` covering the request | +| Streaming response does not stream | `bufferResponse` is still `true` or gateway-level buffering is active | Set `bufferResponse: false` and check APIM gateway buffering settings | +| `api-key` field silently ignored | Using old v2.0.1 field name | Rename `api-key` to `auth` in `listBackends` | + +> [!WARNING] +> Do not change multiple knobs at once while debugging. Check auth first, then deployment name, then retry settings, then load pattern. 
+ +## Related documentation + +- [POC-Failover-configuration.md](POC-Failover-configuration.md) — Same failover policy using the LLM Simulator instead of real Azure OpenAI endpoints +- [POC-Priority-configuration.md](POC-Priority-configuration.md) — Priority-based backend selection by caller tier +- [POC-Chargeback.md](POC-Chargeback.md) — Token usage tracking and per-user cost attribution +- [AI_FOUNDRY_INTEGRATION.md](AI_FOUNDRY_INTEGRATION.md) — Wiring Azure OpenAI and AI Foundry endpoints into SimpleL7Proxy +- [BACKEND_HOSTS.md](BACKEND_HOSTS.md) — Host configuration options including `timeout`, `retryCount`, and `acceptablePriorities` +- [SECURITY.md](SECURITY.md) — Managed Identity, Key Vault, and secret rotation guidance + + +## TL;DR + +- Put the PTU backend at `priorityGroup: 1`, the PAYGO backend at `priorityGroup: 2`, and keep the same deployment name on both Azure OpenAI resources. +- Run one normal request, then a short burst that exhausts PTU quota; the client should keep getting `200 OK` while APIM retries from PTU to PAYGO. +- Verify with `x-Backend-Attempts`, `x-backend-affinity`, and `backendLog`: first PTU wins, then PAYGO takes over during cool-down, then traffic returns to PTU after cool-down expires. + +## What You Will Observe + +- A healthy request returns `200 OK` with `x-Backend-Attempts: 1` and the PTU affinity hash. +- A throttled PTU request still returns `200 OK`, but now with `x-Backend-Attempts: 2` and the PAYGO affinity hash. +- Requests sent during the `Retry-After + 2s` cool-down skip PTU entirely. +- After cool-down expires, the next request returns to PTU. + +[!NOTE] +Units used in this doc: `timeout` and `defaultRetryAfter` are seconds, Azure OpenAI `Retry-After` is seconds. + +## Reference + +
+Reference: defaults, units, and policy knobs used in this POC + +Use this reference when you are configuring the policy, checking whether the test is set up correctly, or explaining why APIM selected, skipped, or retried a backend. + +| Item | Value or default | What matters | +| :--- | :--- | :--- | +| Policy file | `APIM-Policy/Priority-with-retry-enhancedLog.xml` | Apply it at the API level on the API you are testing. | +| `priorityGroup` | PTU `1`, PAYGO `2` | Lower number wins when both backends are healthy. | +| `timeout` | `30` seconds in this POC | Long enough for real chat completions, short enough to surface a bad backend. | +| `retryCount` | `2` for priorities `1`, `2`, and `3` | Gives the policy one more backend to try after the first failure. | +| `defaultRetryAfter` | `10` seconds | Used for timeout-style failures when no backend `Retry-After` exists. | +| `Retry-After` handling | backend value plus `2` seconds | The policy adds a safety buffer before reusing a throttled backend. | +| `auth` | `"MI"` or a literal key | Use `"MI"` when possible; use `auth`, not `api-key`. | +| `bufferResponse` | `true` | Keep `true` for non-streaming chat completions; set `false` only for SSE streaming. | +| Deployment name | Same on both AOAI resources | APIM forwards the same `/openai/deployments//...` path to whichever backend wins. | +| Reload behavior | Policy save on APIM | Policy changes take effect after save; backend throttle state is runtime state inside the gateway. | + +
+ +## Setup + +**Use two Azure OpenAI resources with the same deployment name, then put PTU first and PAYGO second in `listBackends`.** + +```xml +{ "priorityGroup", 1 }, +{ "label", "PTU" }, +{ "auth", "MI" } +``` + +[!NOTE] +Older versions of the policy use `api-key` rather than `auth`. + +### Minimal prerequisites + +- An APIM instance with [../APIM-Policy/Priority-with-retry-enhancedLog.xml](../APIM-Policy/Priority-with-retry-enhancedLog.xml) applied at the API level. +- Two Azure OpenAI resources with the same deployment name, for example `gpt-4o-mini`. +- A PTU primary with low enough capacity to trigger real `429` responses under load. +- A PAYGO secondary in a separate Azure OpenAI resource. +- One auth path: + - Managed Identity: APIM system-assigned identity has `Cognitive Services OpenAI User` on both resources. + - API key: keys from each Azure OpenAI resource. +- An APIM subscription key for the target API. + +### Apply the policy + +**Apply the policy to the target API, not globally, so the test surface is small and observable.** + +Portal steps: + +1. Open APIM in the Azure portal. +2. Go to `APIs` and select the target API. +3. Select `All operations` so the policy applies across the API. +4. In `Inbound processing`, open the policy editor. +5. Paste the contents of `APIM-Policy/Priority-with-retry-enhancedLog.xml`. +6. Save the policy. + +If you are using Managed Identity: + +1. In APIM, open `Identity` and enable the system-assigned identity. +2. Open each Azure OpenAI resource. +3. Go to `Access control (IAM)`. +4. Add the `Cognitive Services OpenAI User` role assignment for the APIM managed identity. + +
+If you prefer to use the CLI + +Use the Azure CLI to apply the policy and, if needed, enable Managed Identity and grant Azure OpenAI access. + +```bash +az apim api policy create \ + --resource-group \ + --service-name \ + --api-id \ + --value "$(cat APIM-Policy/Priority-with-retry-enhancedLog.xml)" \ + --format xml +``` + +```bash +PRINCIPAL_ID=$(az apim update --resource-group --name \ + --set identity.type=SystemAssigned --query identity.principalId -o tsv) + +for AOAI in ; do + SCOPE=$(az cognitiveservices account show -g -n "$AOAI" --query id -o tsv) + az role assignment create --assignee "$PRINCIPAL_ID" \ + --role "Cognitive Services OpenAI User" --scope "$SCOPE" +done +``` + +
+ +> [!NOTE] +> Role propagation can take a few minutes. If the first request returns `401 PermissionDenied`, wait and retry. + +### Backend configuration + +What matters: We want to try the PTU first, so make sure that it has a lower `priorityGroup` than the PAYGO endpoint. Azure OpenAI requires authentication, so both entries must expose a valid `auth` mode (either MI or key). + +```xml +.openai.azure.com/" }, + { "path", "" }, + { "priorityGroup", 1 }, + { "label", "PTU" }, + { "acceptablePriorities", new JArray(1,2,3) }, + { "limitConcurrency", "off" }, + { "bufferResponse", true }, + { "timeout", 30 }, + { "auth", "MI" } + }); + + backends.Add(new JObject() + { + { "url", "https://.openai.azure.com/" }, + { "path", "" }, + { "priorityGroup", 2 }, + { "label", "PAYGO" }, + { "acceptablePriorities", new JArray(1,2,3) }, + { "limitConcurrency", "off" }, + { "bufferResponse", true }, + { "timeout", 30 }, + { "auth", "MI" } + }); + + foreach (JObject backend in backends) { + string saltedUrl = salt + backend["url"].ToString(); + backend["affinity"] = string.Concat( + System.Security.Cryptography.SHA256.Create() + .ComputeHash(System.Text.Encoding.UTF8.GetBytes(saltedUrl)) + .Take(10) + .Select(b => b.ToString("x2")) + ); + backend["isThrottling"] = false; + backend["retryAfter"] = DateTime.MinValue; + backend["defaultRetryAfter"] = 10; + } + + return backends; +}" /> +``` + +> [!TIP] +> For API key auth, replace `"MI"` with a key value or a Named Value reference such as `{{aoai-ptu-key}}`. + +### Retry configuration + +What matters: set `retryCount` high enough for the policy to try the next backend in the list. + +```xml + +``` + +> [!WARNING] +> If `retryCount` is too low, the client will see the PTU failure instead of a PAYGO recovery. 
+ +## Run + +**Run the four requests in order: healthy PTU, forced PTU throttle, immediate follow-up during cool-down, and post-cool-down recovery.** + +```bash +curl -i \ + -H "Ocp-Apim-Subscription-Key: <subscription-key>" \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"Say hi in five words."}],"max_tokens":40}' \ + "https://<apim-name>.azure-api.net/<api-path>/openai/deployments/<deployment>/chat/completions?api-version=<api-version>" +``` + +> [!TIP] +> Replace `<apim-name>`, `<api-path>`, `<deployment>`, `<api-version>`, and `<subscription-key>` once, then reuse the same values for every step. + +### 1. Healthy PTU request + +Send one normal chat request: + +```bash +curl -i \ + -H "Ocp-Apim-Subscription-Key: <subscription-key>" \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"Say hi in five words."}],"max_tokens":40}' \ + "https://<apim-name>.azure-api.net/<api-path>/openai/deployments/<deployment>/chat/completions?api-version=<api-version>" +``` + +Expected outcome: + +- `200 OK` +- `x-Backend-Attempts: 1` +- `x-backend-affinity` equals the PTU backend hash +- `backendLog` contains `Using PTU backend` and `CALL SUCCESSFUL` + +### 2. 
Force PTU throttling and fail over to PAYGO + +Send a short burst that exceeds PTU TPM or RPM quota: + +```bash +for i in {1..20}; do + curl -s -o /dev/null -w "%{http_code} %{header_x-backend-affinity}\n" \ + -H "Ocp-Apim-Subscription-Key: " \ + -H "Content-Type: application/json" \ + -d "{\"messages\":[{\"role\":\"user\",\"content\":\"$(yes word | head -n 500 | tr '\n' ' ')\"}],\"max_tokens\":200}" \ + "https://.azure-api.net//openai/deployments//chat/completions?api-version=" & +done +wait +``` + +Then send one request with headers visible: + +```bash +curl -i \ + -H "Ocp-Apim-Subscription-Key: " \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"Hi"}],"max_tokens":20}' \ + "https://.azure-api.net//openai/deployments//chat/completions?api-version=" +``` + +Expected outcome: + +- `200 OK` +- `x-Backend-Attempts: 2` +- `x-PolicyCycleCounter: 2` +- `x-backend-affinity` now equals the PAYGO backend hash +- `backendLog` shows `Throttling [0] by s, isTempError=true, retry-after=` followed by `Using PAYGO backend` and `CALL SUCCESSFUL` + +### 3. Confirm cool-down behavior + +Immediately send one more request while PTU is still inside the `Retry-After + 2s` window: + +```bash +curl -i \ + -H "Ocp-Apim-Subscription-Key: " \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"Hi again"}],"max_tokens":20}' \ + "https://.azure-api.net//openai/deployments//chat/completions?api-version=" +``` + +Expected outcome: + +- `200 OK` +- `x-Backend-Attempts: 1` +- `x-backend-affinity` still equals the PAYGO backend hash +- `backendLog` shows PAYGO only; there is no new PTU throttle event for this request + +### 4. Confirm recovery to PTU + +Wait for the backend `Retry-After` value plus the policy's `2` second buffer, then repeat the healthy request from step 1. 
+ +Expected outcome: + +- `200 OK` +- `x-Backend-Attempts: 1` +- `x-backend-affinity` switches back to the PTU backend hash +- `backendLog` shows `Using PTU backend` and `CALL SUCCESSFUL` + +### 5. Optional: simulate PTU unreachable instead of throttled + +Point the PTU backend URL at a hostname that does not exist, such as `https://nonexistent-host-xyz.openai.azure.com/`, then run the same single request again. + +```bash +curl -i \ + -H "Ocp-Apim-Subscription-Key: <subscription-key>" \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"Hi"}],"max_tokens":20}' \ + "https://<apim-name>.azure-api.net/<api-path>/openai/deployments/<deployment>/chat/completions?api-version=<api-version>" +``` + +Expected outcome: + +- `200 OK` +- `x-Backend-Attempts: 2` +- `backendLog` includes `likely timeout` and a `10s` throttle window +- PAYGO serves the final response + +## Verify + +**Verification is a checklist: every signal below maps directly to one step in the failover state machine.** + +```text +select -> fail -> throttle -> retry -> recover +``` + +> [!TIP] +> If you cannot explain a result with `x-Backend-Attempts`, `x-backend-affinity`, and `backendLog`, the test is not complete yet. + +Checklist: + +- [ ] Healthy request: `200 OK` with `x-Backend-Attempts: 1` means PTU answered on cycle 1. +- [ ] Failover request: `200 OK` with `x-Backend-Attempts: 2` means PTU failed on cycle 1 and PAYGO answered on cycle 2. +- [ ] Failover request: `backendLog` contains `Throttling [0] by <Retry-After + 2>s` and then `Using PAYGO backend`. +- [ ] Cool-down request: `x-Backend-Attempts: 1` with PAYGO affinity means PTU was skipped because it was still throttled. +- [ ] Recovery request: PTU affinity returns after `Retry-After + 2s`, which proves the policy exited cool-down. +- [ ] Timeout variant: `backendLog` shows `likely timeout` and a `10s` cool-down instead of a backend-supplied `Retry-After`. 
+ +## Deep Dive + +**The policy behavior is a two-cycle loop: pick PTU, classify the PTU failure, mark PTU throttled, then retry against PAYGO.** + +```xml +{ "priorityGroup", 1 }, +{ "priorityGroup", 2 }, +{ "retryCount", 2 } +``` + +> [!NOTE] +> This POC uses real Azure OpenAI `429` responses, so it exercises the same control path as production overload handling. + +### End-to-end flow + +```mermaid +flowchart LR + C[Client request] --> S[APIM selects lowest healthy priority] + S --> P[Cycle 1: PTU backend] + P -->|200 OK| R1[Return PTU response] + P -->|429 + Retry-After or timeout| T[Mark PTU throttled] + T --> N[Retry after classification] + N --> G[Cycle 2: PAYGO backend] + G -->|200 OK| R2[Return PAYGO response] + T -. during cool-down .-> K[Next requests skip PTU] + K -. after expiry .-> S +``` + +### Worked example with concrete numbers + +| Step | Time | Observable signal | Meaning | +| :--- | :--- | :--- | :--- | +| 1 | `t=0s` | `x-Backend-Attempts: 1`, PTU affinity | PTU is healthy and wins selection. | +| 2 | `t=4s` | PTU returns `429` with `Retry-After: 5` | PTU is overloaded. | +| 3 | `t=4s` | `backendLog` shows `Throttling [0] by 7s` | APIM adds the `2s` safety buffer. | +| 4 | `t=4s` | `x-Backend-Attempts: 2`, PAYGO affinity | The same client request succeeded on cycle 2. | +| 5 | `t=8s` | Next request has `x-Backend-Attempts: 1`, PAYGO affinity | PTU is still in cool-down and is skipped. | +| 6 | `t=12s` | Next request returns PTU affinity again | Cool-down expired and PTU re-entered selection. | + +### How to read `backendLog` + +Successful PTU path: + +```text +Using PTU backend: https://<ptu-resource>.openai.azure.com/openai/deployments/<deployment>/chat/completions ... CALL SUCCESSFUL +``` + +Failover path after a real Azure OpenAI `429`: + +```text +Throttling [0] by 7s, isTempError=true, retry-after=5 +Using PAYGO backend: https://<paygo-resource>.openai.azure.com/openai/deployments/<deployment>/chat/completions ... 
CALL SUCCESSFUL +``` + +What the fields mean: + +- `[0]` is the zero-based index of the PTU backend inside `listBackends`. +- `retry-after=5` is the `Retry-After` value, in seconds, returned by Azure OpenAI. +- `by 7s` is the policy's throttle window after the `2` second safety buffer is added. + +### Auth modes + +| Mode | Header sent | Recommendation | +| :--- | :--- | :--- | +| Managed Identity | `Authorization: Bearer <token>` | Preferred for real deployments. | +| API key | `api-key: <key>` | Acceptable for POCs or when MI is unavailable. | + +> [!WARNING] +> Do not hard-code production keys in the policy. Use APIM Named Values or Key Vault references. + +## Optional Variants + +**Once the two-backend POC works, only add one variable at a time: more regions, streaming, or tuning.** + +```xml +{ "bufferResponse", false }, +{ "limitConcurrency", "" }, +{ "acceptablePriorities", new JArray(1,2,3) } +``` + +> [!TIP] +> Keep the first pass to two backends only. It makes the retry path obvious in `backendLog`. + +### Three-region active/active/active + +- Add a third backend with `priorityGroup: 3`. +- Keep `acceptablePriorities: [1,2,3]` on all three. +- Raise `priorityCfg[*].retryCount` to `3` so the loop can walk all regions. + +```xml +backends.Add(new JObject() { + { "url", "https://.openai.azure.com/" }, { "path", "" }, + { "priorityGroup", 1 }, { "label", "EUS2" }, + { "acceptablePriorities", new JArray(1,2,3) }, + { "limitConcurrency", "off" }, { "bufferResponse", true }, + { "timeout", 30 }, { "auth", "MI" } +}); +``` + +### Streaming chat completions + +- Set `bufferResponse: false` on each streaming backend. +- Failover behavior is unchanged because the `429` or timeout is classified before streaming begins. +- Confirm the gateway does not override streaming with global response buffering. + +```xml +{ "bufferResponse", false }, +``` + +### Tuning knobs that matter + +- Lower PTU `timeout` only if you want to abandon slow PTU responses faster. 
+- Set `limitConcurrency` on PTU if you want excess traffic to skip PTU before it hard-throttles. +- Reserve PTU for premium traffic by setting PTU `acceptablePriorities: [1]` and PAYGO `acceptablePriorities: [2,3]`. Priority-1 requests then have no PAYGO failover; use PAYGO `acceptablePriorities: [1,2,3]` if premium traffic must still fail over. +- Use `requeue: true` only if you intentionally want to return `429 + S7PREQUEUE: true` to SimpleL7Proxy. + +## Troubleshooting + +**Every failure here should be explainable as symptom -> cause -> check.** + +```text +symptom -> cause -> check +``` + +> [!WARNING] +> Do not change multiple knobs at once while debugging. Check auth first, then deployment name, then retry settings, then load pattern. + +- Symptom: First request returns `401`. + Cause: The APIM managed identity lacks the role assignment, or the API key is invalid. + Check: Confirm APIM identity has `Cognitive Services OpenAI User` on both AOAI resources, or confirm the `auth` value is correct. + +- Symptom: First request returns `404`. + Cause: Deployment name differs between PTU and PAYGO or the forwarded path is wrong. + Check: Verify both AOAI resources expose the same `/openai/deployments/<deployment>/...` path. + +- Symptom: Failover request returns `429` to the client instead of `200`. + Cause: `retryCount` is too low, only one backend is eligible, or PAYGO is also unhealthy. + Check: Verify `priorityCfg[*].retryCount >= 2`, `acceptablePriorities` on PAYGO includes the current request's priority, and PAYGO works directly. + +- Symptom: Every request keeps going to PAYGO even after waiting. + Cause: PTU is still throttled, the gateway clock has not passed the throttle window, or PTU keeps returning `429`. + Check: Compare the last `Retry-After` value in `backendLog` with the wait time and send a low-token test request. + +- Symptom: `backendLog` shows only PTU and no retry. + Cause: The PTU call succeeded, or the second backend was filtered out. + Check: Confirm the burst is large enough to trigger PTU throttling and that PAYGO shares the same `acceptablePriorities`. + +- Symptom: Streaming response does not stream. 
+ Cause: `bufferResponse` is still `true` or APIM response buffering overrides the backend setting. + Check: Set `bufferResponse: false` and verify gateway-level buffering behavior. + +## Related Documentation + +- [POC-Failover-configuration.md](POC-Failover-configuration.md) - Same failover policy against the LLM Simulator instead of real Azure OpenAI backends. +- [POC-Priority-configuration.md](POC-Priority-configuration.md) - Priority-based backend selection by caller tier. +- [POC-Chargeback.md](POC-Chargeback.md) - Token usage tracking and per-user cost attribution. +- [AI_FOUNDRY_INTEGRATION.md](AI_FOUNDRY_INTEGRATION.md) - Wiring Azure OpenAI and AI Foundry endpoints into SimpleL7Proxy. +- [BACKEND_HOSTS.md](BACKEND_HOSTS.md) - Host configuration options such as `Timeout`, `retryCount`, and `acceptablePriorities`. +- [SECURITY.md](SECURITY.md) - Managed Identity, Key Vault, and secret rotation guidance. diff --git a/docs/POC-Priority-configuration.md b/docs/POC-Priority-configuration.md index e73dd76..d2a0225 100644 --- a/docs/POC-Priority-configuration.md +++ b/docs/POC-Priority-configuration.md @@ -1,672 +1,672 @@ -# POC: Priority Levels - -**Purpose:** Show that `acceptablePriorities` on each backend restricts the eligible candidate set per request, so a priority-1 request goes only to backends that accept it, a shared backend handles lower priorities, and a request with no eligible backend gets `503`. - -> [!IMPORTANT] -> **The rule: the policy builds the candidate set from backends whose `acceptablePriorities` includes the request priority before the retry loop starts. Backends outside the set are invisible for the lifetime of that request.** - -## TL;DR (< 5 minutes) - -1. Deploy the LLM Simulator and configure three backends: `Reserved` (priority-1 only), `Shared` (priority-2 and 3), `AlwaysFail` (priority-3, returns 500). -2. Send four requests — with `llm_proxy_priority: 1`, `2`, no header, and a modified config with no eligible backend. -3. 
Read `x-Backend-Attempts`, `x-backend-affinity`, and `backendLog` to confirm the routing decision for each case. - -**Expected outcomes:** priority-1 → `Reserved` only; priority-2 → `Shared` only; no header (defaults to 3) → `Shared` wins over `AlwaysFail`; no eligible backend → `503`. - -## What you will observe - -- A `llm_proxy_priority: 1` request returns `200 OK` with `x-Backend-Attempts: 1` and the `Reserved` affinity hash. -- A `llm_proxy_priority: 2` request returns `200 OK` with `x-Backend-Attempts: 1` and the `Shared` affinity hash. `Reserved` is never tried. -- A request with no priority header defaults to `3`; `Shared` wins over `AlwaysFail` by `priorityGroup` order. -- A priority-2 request when no backend accepts priority-2 returns `503 Service Unavailable`. -- A priority-2 request sent while `Reserved` is throttled still returns `200 OK` via `Shared` — throttle state on an out-of-set backend has no effect. - -## Reference - -
-Settings, values, units, and when each takes effect - -| Setting | Value in this POC | Unit | Set in | Takes effect | -| :--- | :--- | :--- | :--- | :--- | -| Policy file | [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml) | — | APIM API | after policy save | -| `acceptablePriorities` (Reserved) | `[1]` | priority levels | `listBackends` | after policy save | -| `acceptablePriorities` (Shared) | `[2, 3]` | priority levels | `listBackends` | after policy save | -| `acceptablePriorities` (AlwaysFail) | `[3]` | priority levels | `listBackends` | after policy save | -| `priorityGroup` (Reserved) | `1` | group | `listBackends` | after policy save | -| `priorityGroup` (Shared) | `2` | group | `listBackends` | after policy save | -| `priorityGroup` (AlwaysFail) | `3` | group | `listBackends` | after policy save | -| `retryCount` | `1` | attempts | `priorityCfg` | after policy save | -| Default request priority | `3` | level | policy default when header absent | per request | -| `llm_proxy_priority` header | `1`, `2`, or `3` | level | request header | per request | -| `limitConcurrency` | `off` | mode | policy default | after policy save | -| `bufferResponse` | `true` | boolean | policy default | after policy save | -| `timeout` | `30` | seconds | `listBackends` | after policy save | - -> [!NOTE] -> **Units used in this doc:** `timeout` is in seconds. `priorityGroup` is an integer; lower wins when multiple backends are eligible. `acceptablePriorities` is a JSON array of integer priority levels. - -
- -## Setup - -### Minimal prerequisites - -**What matters:** this POC needs one APIM API, one deployed LLM Simulator function, and the v2.1.0 policy. No real Azure OpenAI endpoints are required. - -- An APIM instance with [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml) applied at the API level. -- The LLM Simulator deployed as an Azure Function. See [`test/LLMSimulator/Readme.md`](../test/LLMSimulator/Readme.md). Verify it is running: - ```bash - curl https://.azurewebsites.net/api/health - # → 200 OK - ``` -- Note the function app hostname; you will use it in the backend list below. - -> [!WARNING] -> `priority`, `ModelType`, and `api-key` fields from older `listBackends` blocks are silently ignored by v2.1.0. Rename them to `priorityGroup`, `label`, and `auth` before running this POC. - -### Apply the policy - -**What matters:** apply the policy at the API level on **All operations**, not at product or global scope. - -#### Azure portal (recommended) - -1. Open your APIM instance in the [Azure portal](https://portal.azure.com). -2. Select **APIs** and open the target API. -3. Select **All operations**. -4. Open the **Inbound processing** policy editor (`` icon). -5. Replace the editor contents with [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml). -6. Select **Save**. - -
-Azure CLI alternative - -```bash -az apim api policy create \ - --resource-group \ - --service-name \ - --api-id \ - --value "$(cat APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml)" \ - --format xml -``` - -
- -### Configure `listBackends` - -**What matters:** each backend's `acceptablePriorities` defines which requests it will handle. A backend whose list does not contain the request priority is excluded from the candidate set before the retry loop runs. - -```xml -.azurewebsites.net/api/delay?delay=100" }, - { "priorityGroup", 1 }, - { "label", "Reserved" }, - { "acceptablePriorities", new JArray(1) }, - { "limitConcurrency", "off" }, - { "bufferResponse", true }, - { "timeout", 30 }, - { "auth", "" } - }); - - // Shared: priority-2 and priority-3 requests - backends.Add(new JObject() - { - { "url", "https://.azurewebsites.net/api/delay?delay=100" }, - { "priorityGroup", 2 }, - { "label", "Shared" }, - { "acceptablePriorities", new JArray(2, 3) }, - { "limitConcurrency", "off" }, - { "bufferResponse", true }, - { "timeout", 30 }, - { "auth", "" } - }); - - // AlwaysFail: priority-3 fallback that returns 500 — used to confirm the 503 path - backends.Add(new JObject() - { - { "url", "https://.azurewebsites.net/api/error/500" }, - { "priorityGroup", 3 }, - { "label", "AlwaysFail" }, - { "acceptablePriorities", new JArray(3) }, - { "limitConcurrency", "off" }, - { "bufferResponse", true }, - { "timeout", 30 }, - { "auth", "" } - }); - - foreach (JObject backend in backends) { - string saltedUrl = salt + backend["url"].ToString(); - backend["affinity"] = string.Concat( - System.Security.Cryptography.SHA256.Create() - .ComputeHash(System.Text.Encoding.UTF8.GetBytes(saltedUrl)) - .Take(10) - .Select(b => b.ToString("x2")) - ); - backend["isThrottling"] = false; - backend["retryAfter"] = DateTime.MinValue; - backend["defaultRetryAfter"] = 10; - } - return backends; -}" /> -``` - -### Configure `priorityCfg` - -**What matters:** `retryCount: 1` is sufficient here because priority isolation, not failover, is the focus. Each priority level gets one attempt. - -```xml - -``` - -## Run - -**What matters:** replace ``, ``, and `` once, then run the tests in order. 
- -```bash -BASE="https://.azure-api.net/" -KEY="" -``` - -### Test 1 — Priority-1 routes to `Reserved` only - -```bash -curl -i \ - -H "llm_proxy_priority: 1" \ - -H "Ocp-Apim-Subscription-Key: $KEY" \ - "$BASE/api/delay" -``` - -### Test 2 — Priority-2 routes to `Shared`, skips `Reserved` - -```bash -curl -i \ - -H "llm_proxy_priority: 2" \ - -H "Ocp-Apim-Subscription-Key: $KEY" \ - "$BASE/api/delay" -``` - -### Test 3 — No header defaults to priority-3; `Shared` wins over `AlwaysFail` - -```bash -curl -i \ - -H "Ocp-Apim-Subscription-Key: $KEY" \ - "$BASE/api/delay" -``` - -### Test 4 — No eligible backend returns 503 - -Temporarily change `Shared`'s `acceptablePriorities` to `[3]` only, then send a priority-2 request: - -```xml -{ "acceptablePriorities", new JArray(3) }, // temporary change for Test 4 -``` - -```bash -curl -i \ - -H "llm_proxy_priority: 2" \ - -H "Ocp-Apim-Subscription-Key: $KEY" \ - "$BASE/api/delay" -``` - -Restore `Shared` to `new JArray(2, 3)` after this test. - -### Test 5 — `Reserved` throttled; priority-2 request unaffected - -Change `Reserved` to a 429-returning URL to trigger throttling: - -```xml -{ "url", "https://.azurewebsites.net/api/error/429?retryAfter=30" }, -``` - -Send a priority-1 request to trigger throttling (expect `503` because no other backend accepts priority-1): - -```bash -curl -i -H "llm_proxy_priority: 1" -H "Ocp-Apim-Subscription-Key: $KEY" "$BASE/api/delay" -``` - -Immediately send a priority-2 request: - -```bash -curl -i -H "llm_proxy_priority: 2" -H "Ocp-Apim-Subscription-Key: $KEY" "$BASE/api/delay" -``` - -Restore `Reserved`'s URL to `/api/delay?delay=100` after this test. - -## Verify - -**What matters:** `x-Backend-Attempts`, `x-backend-affinity`, and `backendLog` together tell you exactly which backend was selected and why. - -- [ ] Test 1: `200 OK`, `x-Backend-Attempts: 1`, affinity = `Reserved` hash, `backendLog` ends with `CALL SUCCESSFUL` for the `/api/delay` URL. 
-- [ ] Test 1: `backendLog` contains no reference to `Shared` or `AlwaysFail`. -- [ ] Test 2: `200 OK`, `x-Backend-Attempts: 1`, affinity = `Shared` hash. -- [ ] Test 2: `backendLog` contains no reference to `Reserved`. -- [ ] Test 3: `200 OK`, `x-Backend-Attempts: 1`, affinity = `Shared` hash (not `AlwaysFail`). -- [ ] Test 4: `503 Service Unavailable` — `PriBackendIndxs` was empty for priority-2. -- [ ] Test 5: priority-2 returns `200 OK` via `Shared` even while `Reserved` is throttled. - -## Deep dive - -**What matters:** priority isolation happens before the retry loop. The policy builds `PriBackendIndxs` by filtering backends to those whose `acceptablePriorities` contains `requestPriority`; the retry loop never sees the others. - -### How priority filtering works - -```mermaid -flowchart LR - R[Request arrives] --> H{llm_proxy_priority\nheader present?} - H -->|yes| P[requestPriority = header value] - H -->|no| D[requestPriority = 3 default] - P --> F[Build PriBackendIndxs:\nfilter listBackends by acceptablePriorities] - D --> F - F -->|empty| E[Return 503] - F -->|not empty| L[Retry loop picks lowest priorityGroup\nfrom eligible set] - L --> B[Call backend] - B -->|200 OK| S[Return to client] - B -->|failure| T[Throttle / retry within eligible set] -``` - -### Worked example - -| Step | Request | `llm_proxy_priority` | `PriBackendIndxs` | Backend selected | Result | -| :--- | :--- | :--- | :--- | :--- | :--- | -| 1 | Test 1 | `1` | `[0]` (Reserved only) | Reserved (priorityGroup 1) | `200 OK` | -| 2 | Test 2 | `2` | `[1]` (Shared only) | Shared (priorityGroup 2) | `200 OK` | -| 3 | Test 3 | `3` (default) | `[1, 2]` (Shared + AlwaysFail) | Shared wins, priorityGroup 2 < 3 | `200 OK` | -| 4 | Test 4 | `2` | `[]` (empty — Shared temporarily restricted) | none | `503` | -| 5 | Test 5 (p-1) | `1` | `[0]` (Reserved, but throttled) | Reserved returns 429, no other candidate | `503` | -| 6 | Test 5 (p-2) | `2` | `[1]` (Shared only) | Shared — Reserved's throttle 
state is irrelevant | `200 OK` | - -### How to read `backendLog` - -**What matters:** only backends inside `PriBackendIndxs` appear in `backendLog`. A backend that was never in the candidate set produces no log entry. - -Single-backend success (Tests 1 and 2): - -```text -0.001s Begin -0.001s THROTTLED: (none) -0.001s RETRIES LEFT: 1 CYCLE: 1 INDEX: 0 -0.001s Using Reserved URL: https://.azurewebsites.net/api/delay?delay=100 LIMIT: off -0.105s StatusCode: 200 - Success -0.105s CALL SUCCESSFUL -``` - -No eligible backend (Test 4): - -```text -0.001s Begin -0.001s THROTTLED: (none) -0.001s PriBackendIndxs is empty for priority 2 — returning 503 -``` - -Priority-2 request while `Reserved` is throttled (Test 5, second request): - -```text -0.001s Begin -0.001s THROTTLED: (Reserved - 00:28) -0.001s RETRIES LEFT: 1 CYCLE: 1 INDEX: 1 -0.001s Using Shared URL: https://.azurewebsites.net/api/delay?delay=100 LIMIT: off -0.097s StatusCode: 200 - Success -0.097s CALL SUCCESSFUL -``` - -`Reserved` appears in the `THROTTLED` list but never in a `Using ...` line because it is not in `PriBackendIndxs` for `requestPriority = 2`. - -## Optional variants - -### Hard tier boundaries (no shared fallback) - -**What matters:** set each backend's `acceptablePriorities` to a single value. A request that misses every tier returns `503` rather than falling through to a lower tier. - -Change `Shared` to accept only priority-2: - -```xml -{ "acceptablePriorities", new JArray(2) }, -``` - -A priority-3 request now returns `503` instead of routing to `Shared`. - -### PTU-first with PAYGO overflow - -**What matters:** set the PTU backend's `acceptablePriorities` to `[1]` (premium only) and the PAYGO backend's to `[1, 2, 3]` (all). Premium requests go to PTU first; when PTU is throttled the retry loop stays within the same eligible set and falls over to PAYGO. 
- -```xml -{ "label", "PTU" }, { "acceptablePriorities", new JArray(1) }, { "priorityGroup", 1 } -{ "label", "PAYGO" }, { "acceptablePriorities", new JArray(1,2,3) }, { "priorityGroup", 2 } -``` - -See [POC-OpenAI-Failover.md](POC-OpenAI-Failover.md) for the full failover walkthrough. - -### Requeue on exhaustion - -**What matters:** set `requeue: true` for a priority level so that when the retry budget is exhausted the policy returns `429 + S7PREQUEUE: true`, signalling SimpleL7Proxy to re-enqueue the request rather than surface an error. - -```xml -cfg["3"] = new JObject { { "retryCount", 1 }, { "requeue", true } }; -``` - -## Troubleshooting - -**What matters:** each symptom maps to one concrete cause and one concrete check. - -| Symptom | Likely cause | Check | -| :--- | :--- | :--- | -| `503` on a request you expected to succeed | No backend's `acceptablePriorities` contains the request priority | Log or print `PriBackendIndxs`; confirm the header value matches an entry in at least one `acceptablePriorities` array | -| Wrong backend selected (affinity mismatch) | `priorityGroup` values are not in the expected order | Re-check `priorityGroup` on each backend; lower number wins when multiple are eligible | -| `503` even though a backend with the right priority exists | Backend is throttled and no other backend in the eligible set is healthy | Check `backendLog` for `THROTTLED:` entries; wait for cool-down or restore the backend URL | -| `priority`, `ModelType`, or `api-key` fields silently ignored | Old v2.0.1 field names used in `listBackends` | Rename to `priorityGroup`, `label`, and `auth`; see the [Reference](#reference) table | -| All requests route to the same backend regardless of header | `llm_proxy_priority` header is not reaching APIM | Confirm the header is not stripped by SimpleL7Proxy or a network layer before APIM; check APIM traces | -| Default priority-3 request hits `AlwaysFail` instead of `Shared` | `Shared`'s `acceptablePriorities` does not 
include `3` | Add `3` to `Shared`'s list or adjust `priorityGroup` so `Shared` sorts before `AlwaysFail` | - -## Related documentation - -- [POC-Failover-configuration.md](POC-Failover-configuration.md) — Automatic failover and retry when a backend returns `429` or times out -- [POC-OpenAI-Failover.md](POC-OpenAI-Failover.md) — Real Azure OpenAI PTU-to-PAYGO failover -- [POC-Chargeback.md](POC-Chargeback.md) — Token-level usage tracking and per-user cost attribution -- [BACKEND_HOSTS.md](BACKEND_HOSTS.md) — Host configuration options including `acceptablePriorities` and `priorityGroup` -- [OBSERVABILITY.md](OBSERVABILITY.md) — Token metrics, telemetry channels, and event logger configuration - - ---- - -## What the policy does - -Each backend in `listBackends` carries an `acceptablePriorities` array. Before the retry loop picks a backend, the `PriBackendIndxs` variable is built by filtering the backend list to only those whose `acceptablePriorities` contains the request's priority value: - -```csharp -if (backend["acceptablePriorities"]?.Values().Contains(requestPriority) == true) { - list.Add(i); -} -``` - -The retry loop only iterates over backends in `PriBackendIndxs`. A backend that isn't in that list is invisible for the lifetime of the request, regardless of whether it's healthy. - -The request priority is read from the `llm_proxy_priority` header. If the header is absent, the policy defaults to `3`. - ---- - -## Prerequisites - -- An APIM instance with `Priority-with-retry-enhancedLog.xml` applied to the target API. See [Applying the policy](#applying-the-policy). -- The LLM Simulator deployed as an Azure Function. See [`test/LLMSimulator/Readme.md`](../test/LLMSimulator/Readme.md) — the fastest path is the portal ZIP deploy. Verify it's running: - ```bash - curl https://.azurewebsites.net/api/health - # → 200 OK - ``` -- Note the function app hostname — you'll use it in the backend list below. 
- ---- - -## Applying the policy - -The policy file is [`APIM-Policy/Priority-with-retry-enhancedLog.xml`](../../APIM-Policy/Priority-with-retry-enhancedLog.xml). - -**Azure portal:** -1. Open your APIM instance → **APIs** → select the target API. -2. Select **All operations** in the left panel. -3. Click the `` icon in the **Inbound processing** tile. -4. Replace the editor contents with the XML file contents. -5. Click **Save**. - -**Azure CLI:** -```bash -az apim api policy create \ - --resource-group \ - --service-name \ - --api-id \ - --value "$(cat APIM-Policy/Priority-with-retry-enhancedLog.xml)" \ - --format xml -``` - ---- - -## Backend configuration - -This POC uses three backends, all pointing at the same deployed LLM Simulator but with different `acceptablePriorities` lists: - -| Name | Endpoint | `priorityGroup` | `acceptablePriorities` | Purpose | -|------|----------|-----------------|------------------------|---------| -| Backend A | `/api/delay?delay=100` | `1` | `[1]` | Reserved for priority-1 requests only | -| Backend B | `/api/delay?delay=100` | `2` | `[2, 3]` | Shared: handles priority-2 and priority-3 | -| Backend C | `/api/error/500` | `3` | `[3]` | Priority-3 fallback that always fails — used to verify the 503 path | - -> **Field-name note.** Earlier drafts of these POCs used `priority`, `ModelType`, `Timeout`, `LimitConcurrency`, `BufferResponse`, and `api-key`. The policy now reads `priorityGroup`, `label`, `timeout`, `limitConcurrency`, `bufferResponse`, and `auth`. Uppercase first-letter variants (`Timeout`, `LimitConcurrency`, `BufferResponse`) are still normalized to lowercase at policy load, so they continue to work — but `priority`, `ModelType`, and `api-key` are **silently ignored**. If you're carrying over an older `listBackends` block, rename those three. 
- -Replace `listBackends` in the policy's `` block: - -```xml -.azurewebsites.net/api/delay?delay=100" }, - { "priorityGroup", 1 }, - { "label", "PTU" }, - { "acceptablePriorities", new JArray(1) }, - { "limitConcurrency", "off" }, - { "bufferResponse", true }, - { "timeout", 30 }, - { "auth", "" } - }); - - // Backend B: shared — handles priority-2 and priority-3 - backends.Add(new JObject() - { - { "url", "https://.azurewebsites.net/api/delay?delay=100" }, - { "priorityGroup", 2 }, - { "label", "PAYGO" }, - { "acceptablePriorities", new JArray(2, 3) }, - { "limitConcurrency", "off" }, - { "bufferResponse", true }, - { "timeout", 30 }, - { "auth", "" } - }); - - // Backend C: always returns 500 — used to verify 503 when no backend is eligible - backends.Add(new JObject() - { - { "url", "https://.azurewebsites.net/api/error/500" }, - { "priorityGroup", 3 }, - { "label", "PAYGO" }, - { "acceptablePriorities", new JArray(3) }, - { "limitConcurrency", "off" }, - { "bufferResponse", true }, - { "timeout", 30 }, - { "auth", "" } - }); - - foreach (JObject backend in backends) { - string saltedUrl = salt + backend["url"].ToString(); - backend["affinity"] = string.Concat( - System.Security.Cryptography.SHA256.Create() - .ComputeHash(System.Text.Encoding.UTF8.GetBytes(saltedUrl)) - .Take(10) - .Select(b => b.ToString("x2")) - ); - backend["isThrottling"] = false; - backend["retryAfter"] = DateTime.MinValue; - backend["defaultRetryAfter"] = 10; - } - return backends; -}" /> -``` - -Also set `priorityCfg` to give each priority level one retry: - -```xml - -``` - ---- - -## Test cases - -Replace ``, ``, and `` in each command. - -### Test 1 — Priority-1 request routes to Backend A only - -```bash -curl -i \ - -H "llm_proxy_priority: 1" \ - -H "Ocp-Apim-Subscription-Key: " \ - "https://.azure-api.net//api/delay" -``` - -**Expected:** - -| Header | Value | Meaning | -|--------|-------|---------| -| HTTP status | `200 OK` | Backend A responded. 
| -| `x-Backend-Attempts` | `1` | Only one backend was tried. | -| `x-backend-affinity` | hash of Backend A's URL | Confirms Backend A was used. | -| `x-PolicyCycleCounter` | `1` | One cycle; no retry needed. | -| `backendLog` | `CALL SUCCESSFUL` for Backend A URL | No fallback occurred. | - -Backend B and Backend C were never in the candidate set — `PriBackendIndxs` contained only index 0 for `requestPriority = 1`. - ---- - -### Test 2 — Priority-2 request routes to Backend B, skips Backend A - -```bash -curl -i \ - -H "llm_proxy_priority: 2" \ - -H "Ocp-Apim-Subscription-Key: " \ - "https://.azure-api.net//api/delay" -``` - -**Expected:** - -| Header | Value | Meaning | -|--------|-------|---------| -| HTTP status | `200 OK` | Backend B responded. | -| `x-Backend-Attempts` | `1` | One backend tried. | -| `x-backend-affinity` | hash of Backend B's URL | Confirms Backend B was used. | -| `backendLog` | `CALL SUCCESSFUL` for Backend B URL | Correct. | - -Backend A (`acceptablePriorities: [1]`) was not in the candidate set. Backend C was also excluded. Only Backend B matched `requestPriority = 2`. - ---- - -### Test 3 — Default priority (no header) behaves the same as priority-3 - -```bash -curl -i \ - -H "Ocp-Apim-Subscription-Key: " \ - "https://.azure-api.net//api/delay" -``` - -The policy defaults to `requestPriority = 3` when the header is absent. Backend B (`acceptablePriorities: [2, 3]`) and Backend C (`acceptablePriorities: [3]`) are both eligible. - -Backend B (`priorityGroup: 2`) sorts before Backend C (`priorityGroup: 3`), so Backend B is tried first and responds successfully. - -**Expected:** `200 OK`, `x-Backend-Attempts: 1`, affinity pointing at Backend B. - ---- - -### Test 4 — No eligible backend returns 503 - -This test verifies what happens when a request carries a priority that no backend accepts. 
Remove Backend B's `2` from its `acceptablePriorities` list (leave it as `[3]` only), then send a priority-2 request: - -```xml -{ "acceptablePriorities", new JArray(3) }, // temporarily changed for this test -``` - -```bash -curl -i \ - -H "llm_proxy_priority: 2" \ - -H "Ocp-Apim-Subscription-Key: " \ - "https://.azure-api.net//api/delay" -``` - -**Expected:** `503 Service Unavailable`. `PriBackendIndxs` is empty for `requestPriority = 2`, so the retry loop has nothing to try. - -Restore Backend B's `acceptablePriorities` to `[2, 3]` after verifying this. - ---- - -### Test 5 — Backend A being throttled does not affect a priority-2 request - -This confirms that priority isolation works even when a backend in a different tier is actively throttled. First, trigger a throttle on Backend A by changing it to point at `/api/error/429`: - -```xml -{ "url", "https://.azurewebsites.net/api/error/429?retryAfter=30" }, -``` - -Send a priority-1 request (this will fail over and mark Backend A throttled): -```bash -curl -i -H "llm_proxy_priority: 1" -H "Ocp-Apim-Subscription-Key: " \ - "https://.azure-api.net//api/delay" -# Backend A returns 429; no other backend accepts priority-1 → 503 -``` - -Immediately send a priority-2 request: -```bash -curl -i -H "llm_proxy_priority: 2" -H "Ocp-Apim-Subscription-Key: " \ - "https://.azure-api.net//api/delay" -``` - -**Expected:** `200 OK` via Backend B. Backend A's throttle state is irrelevant because it was never in the candidate set for `requestPriority = 2`. - -Restore Backend A's URL to `/api/delay?delay=100` after verifying. - ---- - -## How to read the `backendLog` header - -Each backend attempt appends a line to `backendLog`. For a successful single-attempt request it looks like: - -``` -Using PAYGO backend: https://.azurewebsites.net/api/delay?delay=100 ... CALL SUCCESSFUL -``` - -For a request where the first attempt was skipped (because no eligible backend was found at that index), only the successful attempt appears. 
For a retry where one backend was throttled and another succeeded, you'll see two lines: - -``` -Throttling [0] by 12s, isTempError=true, retry-after=10 -Using PAYGO backend: https://.azurewebsites.net/api/delay?delay=100 ... CALL SUCCESSFUL -``` - -The `[0]` is the zero-based index of the throttled backend in `listBackends`. - ---- - -
-Tuning - -Once the basic tests pass, a few variations are worth exploring: - -- **Restrict Backend B to priority-2 only** (`acceptablePriorities: [2]`) and send a priority-3 request. With no backend accepting `3`, you'll see a `503` — useful if you want to enforce hard tier boundaries with no shared fallback. -- **Add a PTU backend at priority 1** and a PAYGO backend at priorities 1–3. Priority-1 requests will prefer the PTU backend; only when it's throttled will the policy fall back to PAYGO — the standard cost-optimized pattern for Azure OpenAI deployments. -- **Set `requeue: true` for priority-3** and `retryCount: 0`. When the shared backend is throttled and retries are exhausted, the policy returns `429 + S7PREQUEUE: true`, which tells SimpleL7Proxy to re-enqueue the request rather than return an error to the caller. -- **Use `LimitConcurrency`** to cap how many simultaneous requests each backend handles. Combine with different concurrency caps per tier to simulate a PTU deployment with a fixed token budget. - -
- ---- - -## Related Documentation - -- [POC-Failover-configuration.md](POC-Failover-configuration.md) — Automatic failover and retry behaviour when a backend is slow or unavailable -- [POC-Chargeback.md](POC-Chargeback.md) — Token-level usage tracking and per-user cost attribution -- [BACKEND_HOSTS.md](BACKEND_HOSTS.md) — Host connection string options including `acceptablePriorities` and `processor=` -- [OBSERVABILITY.md](OBSERVABILITY.md) — Token metrics, telemetry channels, and event logger configuration +# POC: Priority Levels + +**Purpose:** Show that `acceptablePriorities` on each backend restricts the eligible candidate set per request, so a priority-1 request goes only to backends that accept it, a shared backend handles lower priorities, and a request with no eligible backend gets `503`. + +> [!IMPORTANT] +> **The rule: the policy builds the candidate set from backends whose `acceptablePriorities` includes the request priority before the retry loop starts. Backends outside the set are invisible for the lifetime of that request.** + +## TL;DR (< 5 minutes) + +1. Deploy the LLM Simulator and configure three backends: `Reserved` (priority-1 only), `Shared` (priority-2 and 3), `AlwaysFail` (priority-3, returns 500). +2. Send four requests — with `llm_proxy_priority: 1`, `2`, no header, and a modified config with no eligible backend. +3. Read `x-Backend-Attempts`, `x-backend-affinity`, and `backendLog` to confirm the routing decision for each case. + +**Expected outcomes:** priority-1 → `Reserved` only; priority-2 → `Shared` only; no header (defaults to 3) → `Shared` wins over `AlwaysFail`; no eligible backend → `503`. + +## What you will observe + +- A `llm_proxy_priority: 1` request returns `200 OK` with `x-Backend-Attempts: 1` and the `Reserved` affinity hash. +- A `llm_proxy_priority: 2` request returns `200 OK` with `x-Backend-Attempts: 1` and the `Shared` affinity hash. `Reserved` is never tried. 
+- A request with no priority header defaults to `3`; `Shared` wins over `AlwaysFail` by `priorityGroup` order. +- A priority-2 request when no backend accepts priority-2 returns `503 Service Unavailable`. +- A priority-2 request sent while `Reserved` is throttled still returns `200 OK` via `Shared` — throttle state on an out-of-set backend has no effect. + +## Reference + +
+Settings, values, units, and when each takes effect + +| Setting | Value in this POC | Unit | Set in | Takes effect | +| :--- | :--- | :--- | :--- | :--- | +| Policy file | [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml) | — | APIM API | after policy save | +| `acceptablePriorities` (Reserved) | `[1]` | priority levels | `listBackends` | after policy save | +| `acceptablePriorities` (Shared) | `[2, 3]` | priority levels | `listBackends` | after policy save | +| `acceptablePriorities` (AlwaysFail) | `[3]` | priority levels | `listBackends` | after policy save | +| `priorityGroup` (Reserved) | `1` | group | `listBackends` | after policy save | +| `priorityGroup` (Shared) | `2` | group | `listBackends` | after policy save | +| `priorityGroup` (AlwaysFail) | `3` | group | `listBackends` | after policy save | +| `retryCount` | `1` | attempts | `priorityCfg` | after policy save | +| Default request priority | `3` | level | policy default when header absent | per request | +| `llm_proxy_priority` header | `1`, `2`, or `3` | level | request header | per request | +| `limitConcurrency` | `off` | mode | policy default | after policy save | +| `bufferResponse` | `true` | boolean | policy default | after policy save | +| `timeout` | `30` | seconds | `listBackends` | after policy save | + +> [!NOTE] +> **Units used in this doc:** `timeout` is in seconds. `priorityGroup` is an integer; lower wins when multiple backends are eligible. `acceptablePriorities` is a JSON array of integer priority levels. + +
+ +## Setup + +### Minimal prerequisites + +**What matters:** this POC needs one APIM API, one deployed LLM Simulator function, and the v2.1.0 policy. No real Azure OpenAI endpoints are required. + +- An APIM instance with [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml) applied at the API level. +- The LLM Simulator deployed as an Azure Function. See [`test/LLMSimulator/Readme.md`](../test/LLMSimulator/Readme.md). Verify it is running: + ```bash + curl https://.azurewebsites.net/api/health + # → 200 OK + ``` +- Note the function app hostname; you will use it in the backend list below. + +> [!WARNING] +> `priority`, `ModelType`, and `api-key` fields from older `listBackends` blocks are silently ignored by v2.1.0. Rename them to `priorityGroup`, `label`, and `auth` before running this POC. + +### Apply the policy + +**What matters:** apply the policy at the API level on **All operations**, not at product or global scope. + +#### Azure portal (recommended) + +1. Open your APIM instance in the [Azure portal](https://portal.azure.com). +2. Select **APIs** and open the target API. +3. Select **All operations**. +4. Open the **Inbound processing** policy editor (`` icon). +5. Replace the editor contents with [`APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml). +6. Select **Save**. + +
+Azure CLI alternative + +```bash +az apim api policy create \ + --resource-group <resource-group> \ + --service-name <apim-name> \ + --api-id <api-id> \ + --value "$(cat APIM-Policy/v2.1.0/Priority-with-retry-enhancedLog.xml)" \ + --format xml +``` + +
+ +### Configure `listBackends` + +**What matters:** each backend's `acceptablePriorities` defines which requests it will handle. A backend whose list does not contain the request priority is excluded from the candidate set before the retry loop runs. + +```xml +.azurewebsites.net/api/delay?delay=100" }, + { "priorityGroup", 1 }, + { "label", "Reserved" }, + { "acceptablePriorities", new JArray(1) }, + { "limitConcurrency", "off" }, + { "bufferResponse", true }, + { "timeout", 30 }, + { "auth", "" } + }); + + // Shared: priority-2 and priority-3 requests + backends.Add(new JObject() + { + { "url", "https://.azurewebsites.net/api/delay?delay=100" }, + { "priorityGroup", 2 }, + { "label", "Shared" }, + { "acceptablePriorities", new JArray(2, 3) }, + { "limitConcurrency", "off" }, + { "bufferResponse", true }, + { "timeout", 30 }, + { "auth", "" } + }); + + // AlwaysFail: priority-3 fallback that returns 500 — used to confirm the 503 path + backends.Add(new JObject() + { + { "url", "https://.azurewebsites.net/api/error/500" }, + { "priorityGroup", 3 }, + { "label", "AlwaysFail" }, + { "acceptablePriorities", new JArray(3) }, + { "limitConcurrency", "off" }, + { "bufferResponse", true }, + { "timeout", 30 }, + { "auth", "" } + }); + + foreach (JObject backend in backends) { + string saltedUrl = salt + backend["url"].ToString(); + backend["affinity"] = string.Concat( + System.Security.Cryptography.SHA256.Create() + .ComputeHash(System.Text.Encoding.UTF8.GetBytes(saltedUrl)) + .Take(10) + .Select(b => b.ToString("x2")) + ); + backend["isThrottling"] = false; + backend["retryAfter"] = DateTime.MinValue; + backend["defaultRetryAfter"] = 10; + } + return backends; +}" /> +``` + +### Configure `priorityCfg` + +**What matters:** `retryCount: 1` is sufficient here because priority isolation, not failover, is the focus. Each priority level gets one attempt. + +```xml + +``` + +## Run + +**What matters:** replace ``, ``, and `` once, then run the tests in order. 
+ +```bash +BASE="https://.azure-api.net/" +KEY="" +``` + +### Test 1 — Priority-1 routes to `Reserved` only + +```bash +curl -i \ + -H "llm_proxy_priority: 1" \ + -H "Ocp-Apim-Subscription-Key: $KEY" \ + "$BASE/api/delay" +``` + +### Test 2 — Priority-2 routes to `Shared`, skips `Reserved` + +```bash +curl -i \ + -H "llm_proxy_priority: 2" \ + -H "Ocp-Apim-Subscription-Key: $KEY" \ + "$BASE/api/delay" +``` + +### Test 3 — No header defaults to priority-3; `Shared` wins over `AlwaysFail` + +```bash +curl -i \ + -H "Ocp-Apim-Subscription-Key: $KEY" \ + "$BASE/api/delay" +``` + +### Test 4 — No eligible backend returns 503 + +Temporarily change `Shared`'s `acceptablePriorities` to `[3]` only, then send a priority-2 request: + +```xml +{ "acceptablePriorities", new JArray(3) }, // temporary change for Test 4 +``` + +```bash +curl -i \ + -H "llm_proxy_priority: 2" \ + -H "Ocp-Apim-Subscription-Key: $KEY" \ + "$BASE/api/delay" +``` + +Restore `Shared` to `new JArray(2, 3)` after this test. + +### Test 5 — `Reserved` throttled; priority-2 request unaffected + +Change `Reserved` to a 429-returning URL to trigger throttling: + +```xml +{ "url", "https://.azurewebsites.net/api/error/429?retryAfter=30" }, +``` + +Send a priority-1 request to trigger throttling (expect `503` because no other backend accepts priority-1): + +```bash +curl -i -H "llm_proxy_priority: 1" -H "Ocp-Apim-Subscription-Key: $KEY" "$BASE/api/delay" +``` + +Immediately send a priority-2 request: + +```bash +curl -i -H "llm_proxy_priority: 2" -H "Ocp-Apim-Subscription-Key: $KEY" "$BASE/api/delay" +``` + +Restore `Reserved`'s URL to `/api/delay?delay=100` after this test. + +## Verify + +**What matters:** `x-Backend-Attempts`, `x-backend-affinity`, and `backendLog` together tell you exactly which backend was selected and why. + +- [ ] Test 1: `200 OK`, `x-Backend-Attempts: 1`, affinity = `Reserved` hash, `backendLog` ends with `CALL SUCCESSFUL` for the `/api/delay` URL. 
+- [ ] Test 1: `backendLog` contains no reference to `Shared` or `AlwaysFail`. +- [ ] Test 2: `200 OK`, `x-Backend-Attempts: 1`, affinity = `Shared` hash. +- [ ] Test 2: `backendLog` contains no reference to `Reserved`. +- [ ] Test 3: `200 OK`, `x-Backend-Attempts: 1`, affinity = `Shared` hash (not `AlwaysFail`). +- [ ] Test 4: `503 Service Unavailable` — `PriBackendIndxs` was empty for priority-2. +- [ ] Test 5: priority-2 returns `200 OK` via `Shared` even while `Reserved` is throttled. + +## Deep dive + +**What matters:** priority isolation happens before the retry loop. The policy builds `PriBackendIndxs` by filtering backends to those whose `acceptablePriorities` contains `requestPriority`; the retry loop never sees the others. + +### How priority filtering works + +```mermaid +flowchart LR + R[Request arrives] --> H{llm_proxy_priority\nheader present?} + H -->|yes| P[requestPriority = header value] + H -->|no| D[requestPriority = 3 default] + P --> F[Build PriBackendIndxs:\nfilter listBackends by acceptablePriorities] + D --> F + F -->|empty| E[Return 503] + F -->|not empty| L[Retry loop picks lowest priorityGroup\nfrom eligible set] + L --> B[Call backend] + B -->|200 OK| S[Return to client] + B -->|failure| T[Throttle / retry within eligible set] +``` + +### Worked example + +| Step | Request | `llm_proxy_priority` | `PriBackendIndxs` | Backend selected | Result | +| :--- | :--- | :--- | :--- | :--- | :--- | +| 1 | Test 1 | `1` | `[0]` (Reserved only) | Reserved (priorityGroup 1) | `200 OK` | +| 2 | Test 2 | `2` | `[1]` (Shared only) | Shared (priorityGroup 2) | `200 OK` | +| 3 | Test 3 | `3` (default) | `[1, 2]` (Shared + AlwaysFail) | Shared wins, priorityGroup 2 < 3 | `200 OK` | +| 4 | Test 4 | `2` | `[]` (empty — Shared temporarily restricted) | none | `503` | +| 5 | Test 5 (p-1) | `1` | `[0]` (Reserved, but throttled) | Reserved returns 429, no other candidate | `503` | +| 6 | Test 5 (p-2) | `2` | `[1]` (Shared only) | Shared — Reserved's throttle 
state is irrelevant | `200 OK` | + +### How to read `backendLog` + +**What matters:** only backends inside `PriBackendIndxs` appear in `backendLog`. A backend that was never in the candidate set produces no log entry. + +Single-backend success (Tests 1 and 2): + +```text +0.001s Begin +0.001s THROTTLED: (none) +0.001s RETRIES LEFT: 1 CYCLE: 1 INDEX: 0 +0.001s Using Reserved URL: https://.azurewebsites.net/api/delay?delay=100 LIMIT: off +0.105s StatusCode: 200 - Success +0.105s CALL SUCCESSFUL +``` + +No eligible backend (Test 4): + +```text +0.001s Begin +0.001s THROTTLED: (none) +0.001s PriBackendIndxs is empty for priority 2 — returning 503 +``` + +Priority-2 request while `Reserved` is throttled (Test 5, second request): + +```text +0.001s Begin +0.001s THROTTLED: (Reserved - 00:28) +0.001s RETRIES LEFT: 1 CYCLE: 1 INDEX: 1 +0.001s Using Shared URL: https://.azurewebsites.net/api/delay?delay=100 LIMIT: off +0.097s StatusCode: 200 - Success +0.097s CALL SUCCESSFUL +``` + +`Reserved` appears in the `THROTTLED` list but never in a `Using ...` line because it is not in `PriBackendIndxs` for `requestPriority = 2`. + +## Optional variants + +### Hard tier boundaries (no shared fallback) + +**What matters:** set each backend's `acceptablePriorities` to a single value. A request that misses every tier returns `503` rather than falling through to a lower tier. + +Change `Shared` to accept only priority-2: + +```xml +{ "acceptablePriorities", new JArray(2) }, +``` + +A priority-3 request now returns `503` instead of routing to `Shared`. + +### PTU-first with PAYGO overflow + +**What matters:** set the PTU backend's `acceptablePriorities` to `[1]` (premium only) and the PAYGO backend's to `[1, 2, 3]` (all). Premium requests go to PTU first; when PTU is throttled the retry loop stays within the same eligible set and falls over to PAYGO. 
+ +```xml +{ "label", "PTU" }, { "acceptablePriorities", new JArray(1) }, { "priorityGroup", 1 } +{ "label", "PAYGO" }, { "acceptablePriorities", new JArray(1,2,3) }, { "priorityGroup", 2 } +``` + +See [POC-OpenAI-Failover.md](POC-OpenAI-Failover.md) for the full failover walkthrough. + +### Requeue on exhaustion + +**What matters:** set `requeue: true` for a priority level so that when the retry budget is exhausted the policy returns `429 + S7PREQUEUE: true`, signalling SimpleL7Proxy to re-enqueue the request rather than surface an error. + +```xml +cfg["3"] = new JObject { { "retryCount", 1 }, { "requeue", true } }; +``` + +## Troubleshooting + +**What matters:** each symptom maps to one concrete cause and one concrete check. + +| Symptom | Likely cause | Check | +| :--- | :--- | :--- | +| `503` on a request you expected to succeed | No backend's `acceptablePriorities` contains the request priority | Log or print `PriBackendIndxs`; confirm the header value matches an entry in at least one `acceptablePriorities` array | +| Wrong backend selected (affinity mismatch) | `priorityGroup` values are not in the expected order | Re-check `priorityGroup` on each backend; lower number wins when multiple are eligible | +| `503` even though a backend with the right priority exists | Backend is throttled and no other backend in the eligible set is healthy | Check `backendLog` for `THROTTLED:` entries; wait for cool-down or restore the backend URL | +| `priority`, `ModelType`, or `api-key` fields silently ignored | Old v2.0.1 field names used in `listBackends` | Rename to `priorityGroup`, `label`, and `auth`; see the [Reference](#reference) table | +| All requests route to the same backend regardless of header | `llm_proxy_priority` header is not reaching APIM | Confirm the header is not stripped by SimpleL7Proxy or a network layer before APIM; check APIM traces | +| Default priority-3 request hits `AlwaysFail` instead of `Shared` | `Shared`'s `acceptablePriorities` does not 
include `3` | Add `3` to `Shared`'s list or adjust `priorityGroup` so `Shared` sorts before `AlwaysFail` | + +## Related documentation + +- [POC-Failover-configuration.md](POC-Failover-configuration.md) — Automatic failover and retry when a backend returns `429` or times out +- [POC-OpenAI-Failover.md](POC-OpenAI-Failover.md) — Real Azure OpenAI PTU-to-PAYGO failover +- [POC-Chargeback.md](POC-Chargeback.md) — Token-level usage tracking and per-user cost attribution +- [BACKEND_HOSTS.md](BACKEND_HOSTS.md) — Host configuration options including `acceptablePriorities` and `priorityGroup` +- [OBSERVABILITY.md](OBSERVABILITY.md) — Token metrics, telemetry channels, and event logger configuration + + +--- + +## What the policy does + +Each backend in `listBackends` carries an `acceptablePriorities` array. Before the retry loop picks a backend, the `PriBackendIndxs` variable is built by filtering the backend list to only those whose `acceptablePriorities` contains the request's priority value: + +```csharp +if (backend["acceptablePriorities"]?.Values().Contains(requestPriority) == true) { + list.Add(i); +} +``` + +The retry loop only iterates over backends in `PriBackendIndxs`. A backend that isn't in that list is invisible for the lifetime of the request, regardless of whether it's healthy. + +The request priority is read from the `llm_proxy_priority` header. If the header is absent, the policy defaults to `3`. + +--- + +## Prerequisites + +- An APIM instance with `Priority-with-retry-enhancedLog.xml` applied to the target API. See [Applying the policy](#applying-the-policy). +- The LLM Simulator deployed as an Azure Function. See [`test/LLMSimulator/Readme.md`](../test/LLMSimulator/Readme.md) — the fastest path is the portal ZIP deploy. Verify it's running: + ```bash + curl https://.azurewebsites.net/api/health + # → 200 OK + ``` +- Note the function app hostname — you'll use it in the backend list below. 
+ +--- + +## Applying the policy + +The policy file is [`APIM-Policy/Priority-with-retry-enhancedLog.xml`](../APIM-Policy/Priority-with-retry-enhancedLog.xml). + +**Azure portal:** +1. Open your APIM instance → **APIs** → select the target API. +2. Select **All operations** in the left panel. +3. Click the `</>` icon in the **Inbound processing** tile. +4. Replace the editor contents with the XML file contents. +5. Click **Save**. + +**Azure CLI:** +```bash +az apim api policy create \ + --resource-group <resource-group> \ + --service-name <apim-name> \ + --api-id <api-id> \ + --value "$(cat APIM-Policy/Priority-with-retry-enhancedLog.xml)" \ + --format xml +``` + +--- + +## Backend configuration + +This POC uses three backends, all pointing at the same deployed LLM Simulator but with different `acceptablePriorities` lists: + +| Name | Endpoint | `priorityGroup` | `acceptablePriorities` | Purpose | +|------|----------|-----------------|------------------------|---------| +| Backend A | `/api/delay?delay=100` | `1` | `[1]` | Reserved for priority-1 requests only | +| Backend B | `/api/delay?delay=100` | `2` | `[2, 3]` | Shared: handles priority-2 and priority-3 | +| Backend C | `/api/error/500` | `3` | `[3]` | Priority-3 fallback that always fails — used to verify the 503 path | + +> **Field-name note.** Earlier drafts of these POCs used `priority`, `ModelType`, `Timeout`, `LimitConcurrency`, `BufferResponse`, and `api-key`. The policy now reads `priorityGroup`, `label`, `timeout`, `limitConcurrency`, `bufferResponse`, and `auth`. Uppercase first-letter variants (`Timeout`, `LimitConcurrency`, `BufferResponse`) are still normalized to lowercase at policy load, so they continue to work — but `priority`, `ModelType`, and `api-key` are **silently ignored**. If you're carrying over an older `listBackends` block, rename those three. 
+ +Replace `listBackends` in the policy's `` block: + +```xml +.azurewebsites.net/api/delay?delay=100" }, + { "priorityGroup", 1 }, + { "label", "PTU" }, + { "acceptablePriorities", new JArray(1) }, + { "limitConcurrency", "off" }, + { "bufferResponse", true }, + { "timeout", 30 }, + { "auth", "" } + }); + + // Backend B: shared — handles priority-2 and priority-3 + backends.Add(new JObject() + { + { "url", "https://.azurewebsites.net/api/delay?delay=100" }, + { "priorityGroup", 2 }, + { "label", "PAYGO" }, + { "acceptablePriorities", new JArray(2, 3) }, + { "limitConcurrency", "off" }, + { "bufferResponse", true }, + { "timeout", 30 }, + { "auth", "" } + }); + + // Backend C: always returns 500 — used to verify 503 when no backend is eligible + backends.Add(new JObject() + { + { "url", "https://.azurewebsites.net/api/error/500" }, + { "priorityGroup", 3 }, + { "label", "PAYGO" }, + { "acceptablePriorities", new JArray(3) }, + { "limitConcurrency", "off" }, + { "bufferResponse", true }, + { "timeout", 30 }, + { "auth", "" } + }); + + foreach (JObject backend in backends) { + string saltedUrl = salt + backend["url"].ToString(); + backend["affinity"] = string.Concat( + System.Security.Cryptography.SHA256.Create() + .ComputeHash(System.Text.Encoding.UTF8.GetBytes(saltedUrl)) + .Take(10) + .Select(b => b.ToString("x2")) + ); + backend["isThrottling"] = false; + backend["retryAfter"] = DateTime.MinValue; + backend["defaultRetryAfter"] = 10; + } + return backends; +}" /> +``` + +Also set `priorityCfg` to give each priority level one retry: + +```xml + +``` + +--- + +## Test cases + +Replace ``, ``, and `` in each command. + +### Test 1 — Priority-1 request routes to Backend A only + +```bash +curl -i \ + -H "llm_proxy_priority: 1" \ + -H "Ocp-Apim-Subscription-Key: " \ + "https://.azure-api.net//api/delay" +``` + +**Expected:** + +| Header | Value | Meaning | +|--------|-------|---------| +| HTTP status | `200 OK` | Backend A responded. 
| +| `x-Backend-Attempts` | `1` | Only one backend was tried. | +| `x-backend-affinity` | hash of Backend A's URL | Confirms Backend A was used. | +| `x-PolicyCycleCounter` | `1` | One cycle; no retry needed. | +| `backendLog` | `CALL SUCCESSFUL` for Backend A URL | No fallback occurred. | + +Backend B and Backend C were never in the candidate set — `PriBackendIndxs` contained only index 0 for `requestPriority = 1`. + +--- + +### Test 2 — Priority-2 request routes to Backend B, skips Backend A + +```bash +curl -i \ + -H "llm_proxy_priority: 2" \ + -H "Ocp-Apim-Subscription-Key: " \ + "https://.azure-api.net//api/delay" +``` + +**Expected:** + +| Header | Value | Meaning | +|--------|-------|---------| +| HTTP status | `200 OK` | Backend B responded. | +| `x-Backend-Attempts` | `1` | One backend tried. | +| `x-backend-affinity` | hash of Backend B's URL | Confirms Backend B was used. | +| `backendLog` | `CALL SUCCESSFUL` for Backend B URL | Correct. | + +Backend A (`acceptablePriorities: [1]`) was not in the candidate set. Backend C was also excluded. Only Backend B matched `requestPriority = 2`. + +--- + +### Test 3 — Default priority (no header) behaves the same as priority-3 + +```bash +curl -i \ + -H "Ocp-Apim-Subscription-Key: " \ + "https://.azure-api.net//api/delay" +``` + +The policy defaults to `requestPriority = 3` when the header is absent. Backend B (`acceptablePriorities: [2, 3]`) and Backend C (`acceptablePriorities: [3]`) are both eligible. + +Backend B (`priorityGroup: 2`) sorts before Backend C (`priorityGroup: 3`), so Backend B is tried first and responds successfully. + +**Expected:** `200 OK`, `x-Backend-Attempts: 1`, affinity pointing at Backend B. + +--- + +### Test 4 — No eligible backend returns 503 + +This test verifies what happens when a request carries a priority that no backend accepts. 
Remove Backend B's `2` from its `acceptablePriorities` list (leave it as `[3]` only), then send a priority-2 request: + +```xml +{ "acceptablePriorities", new JArray(3) }, // temporarily changed for this test +``` + +```bash +curl -i \ + -H "llm_proxy_priority: 2" \ + -H "Ocp-Apim-Subscription-Key: " \ + "https://.azure-api.net//api/delay" +``` + +**Expected:** `503 Service Unavailable`. `PriBackendIndxs` is empty for `requestPriority = 2`, so the retry loop has nothing to try. + +Restore Backend B's `acceptablePriorities` to `[2, 3]` after verifying this. + +--- + +### Test 5 — Backend A being throttled does not affect a priority-2 request + +This confirms that priority isolation works even when a backend in a different tier is actively throttled. First, trigger a throttle on Backend A by changing it to point at `/api/error/429`: + +```xml +{ "url", "https://.azurewebsites.net/api/error/429?retryAfter=30" }, +``` + +Send a priority-1 request (this will fail over and mark Backend A throttled): +```bash +curl -i -H "llm_proxy_priority: 1" -H "Ocp-Apim-Subscription-Key: " \ + "https://.azure-api.net//api/delay" +# Backend A returns 429; no other backend accepts priority-1 → 503 +``` + +Immediately send a priority-2 request: +```bash +curl -i -H "llm_proxy_priority: 2" -H "Ocp-Apim-Subscription-Key: " \ + "https://.azure-api.net//api/delay" +``` + +**Expected:** `200 OK` via Backend B. Backend A's throttle state is irrelevant because it was never in the candidate set for `requestPriority = 2`. + +Restore Backend A's URL to `/api/delay?delay=100` after verifying. + +--- + +## How to read the `backendLog` header + +Each backend attempt appends a line to `backendLog`. For a successful single-attempt request it looks like: + +``` +Using PAYGO backend: https://.azurewebsites.net/api/delay?delay=100 ... CALL SUCCESSFUL +``` + +For a request where the first attempt was skipped (because no eligible backend was found at that index), only the successful attempt appears. 
For a retry where one backend was throttled and another succeeded, you'll see two lines: + +``` +Throttling [0] by 12s, isTempError=true, retry-after=10 +Using PAYGO backend: https://.azurewebsites.net/api/delay?delay=100 ... CALL SUCCESSFUL +``` + +The `[0]` is the zero-based index of the throttled backend in `listBackends`. + +--- + +
 +Tuning + +Once the basic tests pass, a few variations are worth exploring: + +- **Restrict Backend B to priority-2 only** (`acceptablePriorities: [2]`) and send a priority-3 request. Backend C is then the only backend accepting `3`, so the request routes there with no shared fallback; if Backend C is also throttled or removed, you'll see a `503` — useful if you want to enforce hard tier boundaries with no shared fallback. +- **Add a PTU backend at priority 1** and a PAYGO backend at priorities 1–3. Priority-1 requests will prefer the PTU backend; only when it's throttled will the policy fall back to PAYGO — the standard cost-optimized pattern for Azure OpenAI deployments. +- **Set `requeue: true` for priority-3** and `retryCount: 0`. When the shared backend is throttled and retries are exhausted, the policy returns `429 + S7PREQUEUE: true`, which tells SimpleL7Proxy to re-enqueue the request rather than return an error to the caller. +- **Use `LimitConcurrency`** to cap how many simultaneous requests each backend handles. Combine with different concurrency caps per tier to simulate a PTU deployment with a fixed token budget. + +
+ +--- + +## Related Documentation + +- [POC-Failover-configuration.md](POC-Failover-configuration.md) — Automatic failover and retry behaviour when a backend is slow or unavailable +- [POC-Chargeback.md](POC-Chargeback.md) — Token-level usage tracking and per-user cost attribution +- [BACKEND_HOSTS.md](BACKEND_HOSTS.md) — Host connection string options including `acceptablePriorities` and `processor=` +- [OBSERVABILITY.md](OBSERVABILITY.md) — Token metrics, telemetry channels, and event logger configuration diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md index 65676c1..8622b05 100644 --- a/docs/QUICKSTART.md +++ b/docs/QUICKSTART.md @@ -1,82 +1,82 @@ -# Quick Start - -> [!IMPORTANT] -> Current deployment scripts are Docker-based. `deploy.sh` and `deploy.ps1` build and push images using local Docker. -> If Docker is unavailable, use the remote ACR build workflow in [CONTAINER_DEPLOYMENT.md](CONTAINER_DEPLOYMENT.md), then deploy using the resulting image tags. - -## Deploy to Azure Container Apps - -### Prerequisites - -- [.NET 10 SDK](https://dotnet.microsoft.com/download) -- [Docker](https://docs.docker.com/get-docker/) (optional; only needed for local container builds) -- [Azure Developer CLI (azd)](https://learn.microsoft.com/en-us/azure/developer/azure-developer-cli/install-azd) -- [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli) -- Azure subscription with Container Apps enabled - -### Windows - -```powershell -.\.azure\setup.ps1 -azd provision -.\.azure\deploy.ps1 -``` - -### Linux / macOS - -```bash -chmod +x .azure/setup.sh .azure/deploy.sh -./.azure/setup.sh && azd provision && ./.azure/deploy.sh -``` - -The setup script will prompt for a deployment scenario: - -| Scenario | Description | -|----------|-------------| -| `local-proxy-public-apim` | Proxy runs locally; backends on public APIM | -| `aca-proxy-public-apim` | Proxy deployed as ACA; backends on public APIM | -| `vnet-proxy-deployment` | Proxy inside a VNet | - -→ For 
detailed ACA deployment steps and options, see [CONTAINER_DEPLOYMENT.md](CONTAINER_DEPLOYMENT.md). - ---- - -## Run Locally (2 commands) - -```bash -git clone https://github.com/your-org/SimpleL7Proxy.git -dotnet run --project src/SimpleL7Proxy -``` - -The proxy starts on port 8000. Set at least one backend via `Host1` before sending traffic: - -```bash -export Host1=host=https://api.example.com;probe=/health -dotnet run --project src/SimpleL7Proxy -``` - -→ Need a mock backend to test against? See [DUMMY_BACKEND.md](DUMMY_BACKEND.md). - ---- - -## Local Development Paths - -### Fastest: Port + Backend Only - -```bash -export Port=8080 -export Host1=http://localhost:3000 -dotnet run --project src/SimpleL7Proxy -``` - -### Using Azure App Configuration - -```bash -export AZURE_APPCONFIG_ENDPOINT=https://your-appconfig.azconfig.io -export AZURE_APPCONFIG_LABEL=dev -dotnet run --project src/SimpleL7Proxy -``` - -→ For a full walkthrough of local setup options, see [BEGINNER_DEVELOPMENT.md](BEGINNER_DEVELOPMENT.md). -→ Need help diagnosing issues? See [TroubleshootTOC.md](TroubleshootTOC.md). - +# Quick Start + +> [!IMPORTANT] +> Current deployment scripts are Docker-based. `deploy.sh` and `deploy.ps1` build and push images using local Docker. +> If Docker is unavailable, use the remote ACR build workflow in [CONTAINER_DEPLOYMENT.md](CONTAINER_DEPLOYMENT.md), then deploy using the resulting image tags. 
+ +## Deploy to Azure Container Apps + +### Prerequisites + +- [.NET 10 SDK](https://dotnet.microsoft.com/download) +- [Docker](https://docs.docker.com/get-docker/) (optional; only needed for local container builds) +- [Azure Developer CLI (azd)](https://learn.microsoft.com/en-us/azure/developer/azure-developer-cli/install-azd) +- [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli) +- Azure subscription with Container Apps enabled + +### Windows + +```powershell +.\.azure\setup.ps1 +azd provision +.\.azure\deploy.ps1 +``` + +### Linux / macOS + +```bash +chmod +x .azure/setup.sh .azure/deploy.sh +./.azure/setup.sh && azd provision && ./.azure/deploy.sh +``` + +The setup script will prompt for a deployment scenario: + +| Scenario | Description | +|----------|-------------| +| `local-proxy-public-apim` | Proxy runs locally; backends on public APIM | +| `aca-proxy-public-apim` | Proxy deployed as ACA; backends on public APIM | +| `vnet-proxy-deployment` | Proxy inside a VNet | + +→ For detailed ACA deployment steps and options, see [CONTAINER_DEPLOYMENT.md](CONTAINER_DEPLOYMENT.md). + +--- + +## Run Locally (2 commands) + +```bash +git clone https://github.com/your-org/SimpleL7Proxy.git && cd SimpleL7Proxy +dotnet run --project src/SimpleL7Proxy +``` + +The proxy starts on port 8000. Set at least one backend via `Host1` before sending traffic: + +```bash +export Host1="host=https://api.example.com;probe=/health" +dotnet run --project src/SimpleL7Proxy +``` + +→ Need a mock backend to test against? See [DUMMY_BACKEND.md](DUMMY_BACKEND.md). 
+ +--- + +## Local Development Paths + +### Fastest: Port + Backend Only + +```bash +export Port=8080 +export Host1=http://localhost:3000 +dotnet run --project src/SimpleL7Proxy +``` + +### Using Azure App Configuration + +```bash +export AZURE_APPCONFIG_ENDPOINT=https://your-appconfig.azconfig.io +export AZURE_APPCONFIG_LABEL=dev +dotnet run --project src/SimpleL7Proxy +``` + +→ For a full walkthrough of local setup options, see [BEGINNER_DEVELOPMENT.md](BEGINNER_DEVELOPMENT.md). +→ Need help diagnosing issues? See [TroubleshootTOC.md](TroubleshootTOC.md). + diff --git a/docs/TABLE_OF_CONTENTS.md b/docs/TABLE_OF_CONTENTS.md index ace9e1b..6226145 100644 --- a/docs/TABLE_OF_CONTENTS.md +++ b/docs/TABLE_OF_CONTENTS.md @@ -1,207 +1,207 @@ -# SimpleL7Proxy — Documentation Table of Contents - -All documentation is organized by conceptual domain. Each entry links to the authoritative document for that concept. - -> **Machine-readable taxonomy:** `SimpleL7Proxy/taxonomy/concepts.json` contains the full concept graph with IDs, relationships, settings cross-references, and response code mappings used to generate and validate documentation. - ---- - -## Request Lifecycle - -The end-to-end path a request travels from client to backend and back. 
- -| Concept | Document | -|---------|----------| -| Ingress, listener port, worker count | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md) | -| Priority queue, priority levels, default priority | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md#priority-management) | -| TTL (Time-to-Live) and per-request overrides | [TIMEOUTS.md](TIMEOUTS.md) | -| Worker dispatch, concurrent thread count | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md) | -| Response headers injected by the proxy | [RESPONSE_CODES.md](RESPONSE_CODES.md) | -| All proxy-originated and pass-through response codes | [RESPONSE_CODES.md](RESPONSE_CODES.md) | -| Request and response header reference (S7P* headers) | [RESPONSE_CODES.md](RESPONSE_CODES.md) | - ---- - -## Backend Management - -How the proxy discovers, probes, and selects backend hosts. - -| Concept | Document | -|---------|----------| -| Backend host configuration (Host1–Host9) | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | -| Connection string format (all keys) | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | -| Legacy per-variable format (deprecated) | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | -| Path-based routing, catch-all hosts, strip prefix | [BACKEND_HOSTS.md](BACKEND_HOSTS.md#path-based-routing) | -| Direct mode (serverless / no probing) | [BACKEND_HOSTS.md](BACKEND_HOSTS.md#direct-mode) | -| Stream processor (`processor=OpenAI`) | [AI_FOUNDRY_INTEGRATION.md](AI_FOUNDRY_INTEGRATION.md) | -| IP override, DNS bypass | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | -| Health polling, success rate, active pool | [BACKEND_HOSTS.md](BACKEND_HOSTS.md#health-polling) | -| Load balance modes (roundrobin / latency / random) | [LOAD_BALANCING.md](LOAD_BALANCING.md) | -| Backend selection pipeline (path → order → circuit gate) | [LOAD_BALANCING.md](LOAD_BALANCING.md) | -| Shared iterators | [LOAD_BALANCING.md](LOAD_BALANCING.md) | -| Iteration mode, max attempts | [LOAD_BALANCING.md](LOAD_BALANCING.md) | - ---- - -## Reliability - -Mechanisms that 
prevent failures from propagating to clients. - -| Concept | Document | -|---------|----------| -| Circuit breaker: CLOSED / OPEN states | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | -| Sliding window, auto-recovery | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | -| Progressive delay (50%–90% threshold) | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | -| Global blocked check (all circuits OPEN → 503) | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | -| Acceptable status codes | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | -| Retry across backends | [LOAD_BALANCING.md](LOAD_BALANCING.md#retrying-across-backends) | -| Requeue on 429 + S7PREQUEUE | [LOAD_BALANCING.md](LOAD_BALANCING.md#handling-responses) | -| TTL (total request budget) | [TIMEOUTS.md](TIMEOUTS.md) | -| Per-host Timeout | [TIMEOUTS.md](TIMEOUTS.md#synchronous-requests) | -| Per-request override headers (S7PTTL, S7PTimeout) | [TIMEOUTS.md](TIMEOUTS.md#per-request-overrides) | -| AsyncTriggerTimeout / AsyncTimeout / AsyncTTLSecs | [TIMEOUTS.md](TIMEOUTS.md#async-requests) | - ---- - -## Request Governance - -Validation and priority rules applied before a request enters the queue. 
- -| Concept | Document | -|---------|----------| -| Validation pipeline execution order | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md) | -| App ID validation (Entra allowlist, step 1) | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md#scenario-4-validate-caller-app-ids) | -| Header stripping (DisallowedHeaders, step 2) | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md#scenario-2-strip-internal-headers-before-forwarding) | -| Required headers (step 4, returns 417) | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md#scenario-1-require-specific-headers-on-every-request) | -| Header value validation rules (step 5) | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md#scenario-3-validate-header-values-against-a-per-user-allowlist) | -| User profiles: structure, fields, refresh | [USER_PROFILES.md](USER_PROFILES.md) | -| User ID field, profile lookup | [USER_PROFILES.md](USER_PROFILES.md) | -| Suspended users | [USER_PROFILES.md](USER_PROFILES.md#user-suspension) | -| async-config profile field | [USER_PROFILES.md](USER_PROFILES.md) | -| Priority mapping (PriorityKeys / PriorityValues / PriorityWorkers) | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md#priority-management) | -| Per-user throttling (UserPriorityThreshold) | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md#user-governance) | - ---- - -## Async Mode - -Long-running request handling that decouples client wait from backend processing. 
- -| Concept | Document | -|---------|----------| -| Async mode overview, three-level enablement | [AsyncOperation.md](AsyncOperation.md) | -| AsyncTriggerTimeout and async upgrade (202 response) | [TIMEOUTS.md](TIMEOUTS.md#async-requests) | -| Azure Service Bus configuration and status events | [AsyncOperation.md](AsyncOperation.md#azure-service-bus-configuration) | -| Result blob storage, SAS token lifetime | [AsyncOperation.md](AsyncOperation.md) | -| Blob retention and lifecycle management | [StorageBlobConfig.md](StorageBlobConfig.md) | -| Async processing variables (full reference) | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md#async-processing-variables) | - ---- - -## Observability - -How the proxy exposes telemetry about its own operation. - -| Concept | Document | -|---------|----------| -| ProxyEvent model, event headers, fan-out architecture | [OBSERVABILITY.md](OBSERVABILITY.md) | -| Application Insights (primary production sink) | [OBSERVABILITY.md](OBSERVABILITY.md#telemetry-channels) | -| Event Hubs (high-volume streaming) | [OBSERVABILITY.md](OBSERVABILITY.md#telemetry-channels) | -| Local log file (development) | [OBSERVABILITY.md](OBSERVABILITY.md#telemetry-channels) | -| Token telemetry from SSE streams (`processor=OpenAI`) | [OBSERVABILITY.md](OBSERVABILITY.md) | -| Custom event logger (IEventClient + IHostedService) | [OBSERVABILITY.md](OBSERVABILITY.md#custom-event-loggers) | -| Health endpoints: /liveness, /readiness, /startup | [HEALTH_CHECKING.md](HEALTH_CHECKING.md) | -| Sidecar mode health isolation (port 9000) | [HEALTH_CHECKING.md](HEALTH_CHECKING.md#2-sidecar-mode-high-performance) | -| Observability logging variables | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md#logging--monitoring-variables) | - ---- - -## Configuration Management - -How settings reach the proxy and when they take effect. 
- -| Concept | Document | -|---------|----------| -| Warm / Cold / Hidden setting classification | [CONFIGURATION_SETTINGS.md](CONFIGURATION_SETTINGS.md) | -| Settings organized by frequency of use (Essential / Common / Advanced) | [CONFIGURATION_CATEGORIES.md](CONFIGURATION_CATEGORIES.md) | -| Azure App Configuration: setup, RBAC, Sentinel pattern | [AZURE_APP_CONFIGURATION.md](AZURE_APP_CONFIGURATION.md) | -| Composite connection string format | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | -| All environment variables (exhaustive reference) | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md) | -| Minimum required configuration | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md#minimum-required-configuration) | -| Copy-paste configurations for common scenarios | [SCENARIOS.md](SCENARIOS.md) | - ---- - -## Authentication and Security - -How the proxy authenticates to backends and restricts inbound callers. - -| Concept | Document | -|---------|----------| -| Managed Identity for backends (`usemi=true`) | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | -| Keyless auth for Azure OpenAI / AI Foundry | [AI_FOUNDRY_INTEGRATION.md](AI_FOUNDRY_INTEGRATION.md) | -| OAuth2 Bearer token attachment | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | -| App ID allowlist (Entra, >13 app IDs) | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md) | -| Header stripping to prevent internal header leakage | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md) | -| VNet deployment and sovereign cloud | [OVERVIEW.md](OVERVIEW.md) | -| Security overview and responsible disclosure | [SECURITY.md](SECURITY.md) | -| Securing the proxy with APIM | [POC-Secure-the-proxy.md](POC-Secure-the-proxy.md) | - ---- - -## Deployment Architecture - -How the proxy is packaged and run on Azure. 
- -| Concept | Document | -|---------|----------| -| Azure Container Apps deployment | [CONTAINER_DEPLOYMENT.md](CONTAINER_DEPLOYMENT.md) | -| Sidecar deployment (proxy + health probe containers) | [SIDECAR_DEPLOYMENT.md](SIDECAR_DEPLOYMENT.md) | -| Build and deploy scripts, parameters file | [SIDECAR_DEPLOYMENT.md](SIDECAR_DEPLOYMENT.md) | -| Azure AI Foundry / OpenAI integration patterns | [AI_FOUNDRY_INTEGRATION.md](AI_FOUNDRY_INTEGRATION.md) | -| APIM integration and reference policy | [../APIM-Policy/readme.md](../APIM-Policy/readme.md) | -| Day-2 operations | [../deployment/DAY2_OPERATIONS.md](../deployment/DAY2_OPERATIONS.md) | - ---- - -## Protocol and Headers - -Named HTTP signals that cross the client-proxy and proxy-backend boundaries. - -| Header | Direction | Document | -|--------|-----------|----------| -| `S7PPriorityKey` | Client → proxy | [RESPONSE_CODES.md](RESPONSE_CODES.md#request-headers-proxy-reads-these) | -| `S7PTTL` | Client → proxy | [TIMEOUTS.md](TIMEOUTS.md#per-request-overrides) | -| `S7PTimeout` | Client → proxy | [TIMEOUTS.md](TIMEOUTS.md#per-request-overrides) | -| `S7PAsyncMode` | Client → proxy | [AsyncOperation.md](AsyncOperation.md) | -| `S7PDEBUG` | Client → proxy | [RESPONSE_CODES.md](RESPONSE_CODES.md#request-headers-proxy-reads-these) | -| `S7PREQUEUE` | Backend → proxy | [RESPONSE_CODES.md](RESPONSE_CODES.md#backend-429-and-requeue) | -| `x-Request-Queue-Duration` | Proxy → client | [RESPONSE_CODES.md](RESPONSE_CODES.md#response-headers-proxy-adds-these) | -| `x-Request-Process-Duration` | Proxy → client | [RESPONSE_CODES.md](RESPONSE_CODES.md#response-headers-proxy-adds-these) | -| `x-Request-Worker` | Proxy → client | [RESPONSE_CODES.md](RESPONSE_CODES.md#response-headers-proxy-adds-these) | -| `BackendHost` | Proxy → client | [RESPONSE_CODES.md](RESPONSE_CODES.md#response-headers-proxy-adds-these) | -| `Total-Latency` | Proxy → client | [RESPONSE_CODES.md](RESPONSE_CODES.md#response-headers-proxy-adds-these) | - ---- - 
-## Getting Started - -| Goal | Start here | -|------|-----------| -| First deployment | [QUICKSTART.md](QUICKSTART.md) | -| Local development | [BEGINNER_DEVELOPMENT.md](BEGINNER_DEVELOPMENT.md) | -| Advanced development & performance tuning | [ADVANCED_DEVELOPMENT.md](ADVANCED_DEVELOPMENT.md) | -| Understanding the architecture | [OVERVIEW.md](OVERVIEW.md) | -| Diagnosing a problem | [TroubleshootTOC.md](TroubleshootTOC.md) | -| Code structure and internals | [design.md](design.md) | - -## Proof-of-Concept Guides - -| Scenario | Document | -|----------|----------| -| OpenAI failover across regions | [POC-OpenAI-Failover.md](POC-OpenAI-Failover.md) | -| Failover configuration | [POC-Failover-configuration.md](POC-Failover-configuration.md) | -| Priority-based routing | [POC-Priority-configuration.md](POC-Priority-configuration.md) | -| Securing the proxy | [POC-Secure-the-proxy.md](POC-Secure-the-proxy.md) | -| Securing APIM | [POC-security-the-apim.md](POC-security-the-apim.md) | -| Chargeback and token tracking | [POC-Chargeback.md](POC-Chargeback.md) | +# SimpleL7Proxy — Documentation Table of Contents + +All documentation is organized by conceptual domain. Each entry links to the authoritative document for that concept. + +> **Machine-readable taxonomy:** `SimpleL7Proxy/taxonomy/concepts.json` contains the full concept graph with IDs, relationships, settings cross-references, and response code mappings used to generate and validate documentation. + +--- + +## Request Lifecycle + +The end-to-end path a request travels from client to backend and back. 
+ +| Concept | Document | +|---------|----------| +| Ingress, listener port, worker count | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md) | +| Priority queue, priority levels, default priority | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md#priority-management) | +| TTL (Time-to-Live) and per-request overrides | [TIMEOUTS.md](TIMEOUTS.md) | +| Worker dispatch, concurrent thread count | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md) | +| Response headers injected by the proxy | [RESPONSE_CODES.md](RESPONSE_CODES.md) | +| All proxy-originated and pass-through response codes | [RESPONSE_CODES.md](RESPONSE_CODES.md) | +| Request and response header reference (S7P* headers) | [RESPONSE_CODES.md](RESPONSE_CODES.md) | + +--- + +## Backend Management + +How the proxy discovers, probes, and selects backend hosts. + +| Concept | Document | +|---------|----------| +| Backend host configuration (Host1–Host9) | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | +| Connection string format (all keys) | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | +| Legacy per-variable format (deprecated) | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | +| Path-based routing, catch-all hosts, strip prefix | [BACKEND_HOSTS.md](BACKEND_HOSTS.md#path-based-routing) | +| Direct mode (serverless / no probing) | [BACKEND_HOSTS.md](BACKEND_HOSTS.md#direct-mode) | +| Stream processor (`processor=OpenAI`) | [AI_FOUNDRY_INTEGRATION.md](AI_FOUNDRY_INTEGRATION.md) | +| IP override, DNS bypass | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | +| Health polling, success rate, active pool | [BACKEND_HOSTS.md](BACKEND_HOSTS.md#health-polling) | +| Load balance modes (roundrobin / latency / random) | [LOAD_BALANCING.md](LOAD_BALANCING.md) | +| Backend selection pipeline (path → order → circuit gate) | [LOAD_BALANCING.md](LOAD_BALANCING.md) | +| Shared iterators | [LOAD_BALANCING.md](LOAD_BALANCING.md) | +| Iteration mode, max attempts | [LOAD_BALANCING.md](LOAD_BALANCING.md) | + +--- + +## Reliability + +Mechanisms that 
prevent failures from propagating to clients. + +| Concept | Document | +|---------|----------| +| Circuit breaker: CLOSED / OPEN states | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | +| Sliding window, auto-recovery | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | +| Progressive delay (50%–90% threshold) | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | +| Global blocked check (all circuits OPEN → 503) | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | +| Acceptable status codes | [CIRCUIT_BREAKER.md](CIRCUIT_BREAKER.md) | +| Retry across backends | [LOAD_BALANCING.md](LOAD_BALANCING.md#retrying-across-backends) | +| Requeue on 429 + S7PREQUEUE | [LOAD_BALANCING.md](LOAD_BALANCING.md#handling-responses) | +| TTL (total request budget) | [TIMEOUTS.md](TIMEOUTS.md) | +| Per-host Timeout | [TIMEOUTS.md](TIMEOUTS.md#synchronous-requests) | +| Per-request override headers (S7PTTL, S7PTimeout) | [TIMEOUTS.md](TIMEOUTS.md#per-request-overrides) | +| AsyncTriggerTimeout / AsyncTimeout / AsyncTTLSecs | [TIMEOUTS.md](TIMEOUTS.md#async-requests) | + +--- + +## Request Governance + +Validation and priority rules applied before a request enters the queue. 
+ +| Concept | Document | +|---------|----------| +| Validation pipeline execution order | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md) | +| App ID validation (Entra allowlist, step 1) | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md#scenario-4-validate-caller-app-ids) | +| Header stripping (DisallowedHeaders, step 2) | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md#scenario-2-strip-internal-headers-before-forwarding) | +| Required headers (step 4, returns 417) | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md#scenario-1-require-specific-headers-on-every-request) | +| Header value validation rules (step 5) | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md#scenario-3-validate-header-values-against-a-per-user-allowlist) | +| User profiles: structure, fields, refresh | [USER_PROFILES.md](USER_PROFILES.md) | +| User ID field, profile lookup | [USER_PROFILES.md](USER_PROFILES.md) | +| Suspended users | [USER_PROFILES.md](USER_PROFILES.md#user-suspension) | +| async-config profile field | [USER_PROFILES.md](USER_PROFILES.md) | +| Priority mapping (PriorityKeys / PriorityValues / PriorityWorkers) | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md#priority-management) | +| Per-user throttling (UserPriorityThreshold) | [ADVANCED_CONFIGURATION.md](ADVANCED_CONFIGURATION.md#user-governance) | + +--- + +## Async Mode + +Long-running request handling that decouples client wait from backend processing. 
+ +| Concept | Document | +|---------|----------| +| Async mode overview, three-level enablement | [AsyncOperation.md](AsyncOperation.md) | +| AsyncTriggerTimeout and async upgrade (202 response) | [TIMEOUTS.md](TIMEOUTS.md#async-requests) | +| Azure Service Bus configuration and status events | [AsyncOperation.md](AsyncOperation.md#azure-service-bus-configuration) | +| Result blob storage, SAS token lifetime | [AsyncOperation.md](AsyncOperation.md) | +| Blob retention and lifecycle management | [StorageBlobConfig.md](StorageBlobConfig.md) | +| Async processing variables (full reference) | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md#async-processing-variables) | + +--- + +## Observability + +How the proxy exposes telemetry about its own operation. + +| Concept | Document | +|---------|----------| +| ProxyEvent model, event headers, fan-out architecture | [OBSERVABILITY.md](OBSERVABILITY.md) | +| Application Insights (primary production sink) | [OBSERVABILITY.md](OBSERVABILITY.md#telemetry-channels) | +| Event Hubs (high-volume streaming) | [OBSERVABILITY.md](OBSERVABILITY.md#telemetry-channels) | +| Local log file (development) | [OBSERVABILITY.md](OBSERVABILITY.md#telemetry-channels) | +| Token telemetry from SSE streams (`processor=OpenAI`) | [OBSERVABILITY.md](OBSERVABILITY.md) | +| Custom event logger (IEventClient + IHostedService) | [OBSERVABILITY.md](OBSERVABILITY.md#custom-event-loggers) | +| Health endpoints: /liveness, /readiness, /startup | [HEALTH_CHECKING.md](HEALTH_CHECKING.md) | +| Sidecar mode health isolation (port 9000) | [HEALTH_CHECKING.md](HEALTH_CHECKING.md#2-sidecar-mode-high-performance) | +| Observability logging variables | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md#logging--monitoring-variables) | + +--- + +## Configuration Management + +How settings reach the proxy and when they take effect. 
+ +| Concept | Document | +|---------|----------| +| Warm / Cold / Hidden setting classification | [CONFIGURATION_SETTINGS.md](CONFIGURATION_SETTINGS.md) | +| Settings organized by frequency of use (Essential / Common / Advanced) | [CONFIGURATION_CATEGORIES.md](CONFIGURATION_CATEGORIES.md) | +| Azure App Configuration: setup, RBAC, Sentinel pattern | [AZURE_APP_CONFIGURATION.md](AZURE_APP_CONFIGURATION.md) | +| Composite connection string format | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | +| All environment variables (exhaustive reference) | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md) | +| Minimum required configuration | [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md#minimum-required-configuration) | +| Copy-paste configurations for common scenarios | [SCENARIOS.md](SCENARIOS.md) | + +--- + +## Authentication and Security + +How the proxy authenticates to backends and restricts inbound callers. + +| Concept | Document | +|---------|----------| +| Managed Identity for backends (`usemi=true`) | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | +| Keyless auth for Azure OpenAI / AI Foundry | [AI_FOUNDRY_INTEGRATION.md](AI_FOUNDRY_INTEGRATION.md) | +| OAuth2 Bearer token attachment | [BACKEND_HOSTS.md](BACKEND_HOSTS.md) | +| App ID allowlist (Entra, >13 app IDs) | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md) | +| Header stripping to prevent internal header leakage | [REQUEST_VALIDATION.md](REQUEST_VALIDATION.md) | +| VNet deployment and sovereign cloud | [OVERVIEW.md](OVERVIEW.md) | +| Security overview and responsible disclosure | [SECURITY.md](SECURITY.md) | +| Securing the proxy with APIM | [POC-Secure-the-proxy.md](POC-Secure-the-proxy.md) | + +--- + +## Deployment Architecture + +How the proxy is packaged and run on Azure. 
+ +| Concept | Document | +|---------|----------| +| Azure Container Apps deployment | [CONTAINER_DEPLOYMENT.md](CONTAINER_DEPLOYMENT.md) | +| Sidecar deployment (proxy + health probe containers) | [SIDECAR_DEPLOYMENT.md](SIDECAR_DEPLOYMENT.md) | +| Build and deploy scripts, parameters file | [SIDECAR_DEPLOYMENT.md](SIDECAR_DEPLOYMENT.md) | +| Azure AI Foundry / OpenAI integration patterns | [AI_FOUNDRY_INTEGRATION.md](AI_FOUNDRY_INTEGRATION.md) | +| APIM integration and reference policy | [../APIM-Policy/readme.md](../APIM-Policy/readme.md) | +| Day-2 operations | [../deployment/DAY2_OPERATIONS.md](../deployment/DAY2_OPERATIONS.md) | + +--- + +## Protocol and Headers + +Named HTTP signals that cross the client-proxy and proxy-backend boundaries. + +| Header | Direction | Document | +|--------|-----------|----------| +| `S7PPriorityKey` | Client → proxy | [RESPONSE_CODES.md](RESPONSE_CODES.md#request-headers-proxy-reads-these) | +| `S7PTTL` | Client → proxy | [TIMEOUTS.md](TIMEOUTS.md#per-request-overrides) | +| `S7PTimeout` | Client → proxy | [TIMEOUTS.md](TIMEOUTS.md#per-request-overrides) | +| `S7PAsyncMode` | Client → proxy | [AsyncOperation.md](AsyncOperation.md) | +| `S7PDEBUG` | Client → proxy | [RESPONSE_CODES.md](RESPONSE_CODES.md#request-headers-proxy-reads-these) | +| `S7PREQUEUE` | Backend → proxy | [RESPONSE_CODES.md](RESPONSE_CODES.md#backend-429-and-requeue) | +| `x-Request-Queue-Duration` | Proxy → client | [RESPONSE_CODES.md](RESPONSE_CODES.md#response-headers-proxy-adds-these) | +| `x-Request-Process-Duration` | Proxy → client | [RESPONSE_CODES.md](RESPONSE_CODES.md#response-headers-proxy-adds-these) | +| `x-Request-Worker` | Proxy → client | [RESPONSE_CODES.md](RESPONSE_CODES.md#response-headers-proxy-adds-these) | +| `BackendHost` | Proxy → client | [RESPONSE_CODES.md](RESPONSE_CODES.md#response-headers-proxy-adds-these) | +| `Total-Latency` | Proxy → client | [RESPONSE_CODES.md](RESPONSE_CODES.md#response-headers-proxy-adds-these) | + +--- + 
+## Getting Started + +| Goal | Start here | +|------|-----------| +| First deployment | [QUICKSTART.md](QUICKSTART.md) | +| Local development | [BEGINNER_DEVELOPMENT.md](BEGINNER_DEVELOPMENT.md) | +| Advanced development & performance tuning | [ADVANCED_DEVELOPMENT.md](ADVANCED_DEVELOPMENT.md) | +| Understanding the architecture | [OVERVIEW.md](OVERVIEW.md) | +| Diagnosing a problem | [TroubleshootTOC.md](TroubleshootTOC.md) | +| Code structure and internals | [design.md](design.md) | + +## Proof-of-Concept Guides + +| Scenario | Document | +|----------|----------| +| OpenAI failover across regions | [POC-OpenAI-Failover.md](POC-OpenAI-Failover.md) | +| Failover configuration | [POC-Failover-configuration.md](POC-Failover-configuration.md) | +| Priority-based routing | [POC-Priority-configuration.md](POC-Priority-configuration.md) | +| Securing the proxy | [POC-Secure-the-proxy.md](POC-Secure-the-proxy.md) | +| Securing APIM | [POC-security-the-apim.md](POC-security-the-apim.md) | +| Chargeback and token tracking | [POC-Chargeback.md](POC-Chargeback.md) | diff --git a/taxonomy/concepts.json b/taxonomy/concepts.json index a51a6cb..4b05837 100644 --- a/taxonomy/concepts.json +++ b/taxonomy/concepts.json @@ -1,1405 +1,1405 @@ -{ - "meta": { - "schema_version": "1.0", - "generated_date": "2026-05-21", - "application": "SimpleL7Proxy", - "description": "Machine-readable concept taxonomy derived from SimpleL7Proxy documentation. Not intended for human consumption.", - "source_docs_path": "SimpleL7Proxy/docs/" - }, - - "domains": [ - { "id": "d01", "name": "Request Lifecycle", "description": "End-to-end path a request travels from client to backend and back." }, - { "id": "d02", "name": "Backend Management", "description": "How the proxy discovers, probes, and selects backend hosts." }, - { "id": "d03", "name": "Reliability", "description": "Mechanisms that prevent failures from propagating to clients." 
}, - { "id": "d04", "name": "Request Governance", "description": "Validation and priority rules applied before a request enters the queue." }, - { "id": "d05", "name": "Async Mode", "description": "Long-running request handling that decouples client wait from backend processing." }, - { "id": "d06", "name": "Observability", "description": "How the proxy exposes telemetry about its own operation." }, - { "id": "d07", "name": "Configuration Management", "description": "How settings reach the proxy and when they take effect." }, - { "id": "d08", "name": "Authentication and Security","description": "How the proxy authenticates to backends and restricts inbound callers." }, - { "id": "d09", "name": "Deployment Architecture", "description": "How the proxy is packaged and run on Azure." }, - { "id": "d10", "name": "Protocol and Headers", "description": "Named HTTP signals that cross the client-proxy and proxy-backend boundaries." } - ], - - "subdomains": [ - { "id": "sd01-01", "domain_id": "d01", "name": "Ingress" }, - { "id": "sd01-02", "domain_id": "d01", "name": "Priority Queue" }, - { "id": "sd01-03", "domain_id": "d01", "name": "Worker Dispatch" }, - { "id": "sd01-04", "domain_id": "d01", "name": "Response" }, - { "id": "sd02-01", "domain_id": "d02", "name": "Host Configuration" }, - { "id": "sd02-02", "domain_id": "d02", "name": "Health Polling" }, - { "id": "sd02-03", "domain_id": "d02", "name": "Backend Selection Pipeline" }, - { "id": "sd03-01", "domain_id": "d03", "name": "Circuit Breaker" }, - { "id": "sd03-02", "domain_id": "d03", "name": "Retry and Requeue" }, - { "id": "sd03-03", "domain_id": "d03", "name": "Timeout Model" }, - { "id": "sd04-01", "domain_id": "d04", "name": "Validation Pipeline" }, - { "id": "sd04-02", "domain_id": "d04", "name": "Validation Settings" }, - { "id": "sd04-03", "domain_id": "d04", "name": "User Profiles" }, - { "id": "sd04-04", "domain_id": "d04", "name": "Priority Mapping" }, - { "id": "sd06-01", "domain_id": "d06", "name": 
"Event Model" }, - { "id": "sd06-02", "domain_id": "d06", "name": "Telemetry Sinks" }, - { "id": "sd06-03", "domain_id": "d06", "name": "Health Endpoints" } - ], - - "concepts": [ - - { - "id": "c-listener", - "name": "Listener", - "code_file": "Server.cs", - "domain_id": "d01", - "subdomain_id": "sd01-01", - "definition": "Accepts inbound HTTP connections and submits each request to the priority queue.", - "key_settings": ["Port", "Workers"], - "related_concept_ids": ["c-priority-queue", "c-enqueue", "c-validation-pipeline"], - "docs": ["design.md", "ENVIRONMENT_VARIABLES.md"] - }, - { - "id": "c-request-data", - "name": "RequestData", - "code_file": "RequestData.cs", - "domain_id": "d01", - "subdomain_id": "sd01-01", - "definition": "Runtime object holding all data for one inbound request: headers, body, path, assigned priority, and TTL expiry.", - "key_settings": [], - "related_concept_ids": ["c-proxy-data", "c-enqueue"], - "docs": ["design.md"] - }, - { - "id": "c-enqueue", - "name": "Enqueue", - "domain_id": "d01", - "subdomain_id": "sd01-01", - "definition": "Submitting a RequestData into the priority queue. The TTL clock starts at this moment.", - "key_settings": ["MaxQueueLength"], - "related_concept_ids": ["c-ttl", "c-priority-queue", "c-max-queue-length", "c-validation-pipeline"], - "docs": ["RESPONSE_CODES.md", "TIMEOUTS.md"] - }, - { - "id": "c-max-queue-length", - "name": "Max Queue Length", - "domain_id": "d01", - "subdomain_id": "sd01-01", - "definition": "Hard cap on queued requests. Requests arriving when the queue is full receive 429 Too Many Requests.", - "key_settings": ["MaxQueueLength"], - "related_concept_ids": ["c-enqueue"], - "docs": ["ENVIRONMENT_VARIABLES.md", "RESPONSE_CODES.md"] - }, - { - "id": "c-priority-queue", - "name": "Priority Queue", - "code_file": "PriorityQueue.cs", - "domain_id": "d01", - "subdomain_id": "sd01-02", - "definition": "Min-heap data structure ordered by priority level integer. 
Lower integer exits first.", - "key_settings": ["DefaultPriority"], - "related_concept_ids": ["c-priority-level", "c-proxy-worker"], - "docs": ["design.md", "ADVANCED_CONFIGURATION.md"] - }, - { - "id": "c-priority-level", - "name": "Priority Level", - "domain_id": "d01", - "subdomain_id": "sd01-02", - "definition": "Integer assigned to every request. Lower number means higher dispatch precedence in the queue.", - "key_settings": ["PriorityValues", "DefaultPriority"], - "related_concept_ids": ["c-priority-queue", "c-priority-mapping", "c-default-priority"], - "docs": ["ADVANCED_CONFIGURATION.md", "ENVIRONMENT_VARIABLES.md"] - }, - { - "id": "c-default-priority", - "name": "Default Priority", - "domain_id": "d01", - "subdomain_id": "sd01-02", - "definition": "Priority level assigned when the request carries no matching PriorityKeyHeader value.", - "key_settings": ["DefaultPriority"], - "default_value": 2, - "related_concept_ids": ["c-priority-level", "c-priority-mapping"], - "docs": ["ADVANCED_CONFIGURATION.md", "CONFIGURATION_SETTINGS.md"] - }, - { - "id": "c-ttl", - "name": "TTL (Time-to-Live)", - "domain_id": "d01", - "subdomain_id": "sd01-02", - "definition": "Wall-clock budget for the entire request lifecycle covering queue wait plus all retry attempts. 
Expiry returns 412.", - "key_settings": ["DefaultTTLSecs"], - "per_request_override_header": "S7PTTL", - "unit": "seconds", - "related_concept_ids": ["c-timeout", "c-enqueue"], - "docs": ["TIMEOUTS.md", "RESPONSE_CODES.md"] - }, - { - "id": "c-proxy-worker", - "name": "ProxyWorker", - "code_file": "ProxyWorker.cs", - "domain_id": "d01", - "subdomain_id": "sd01-03", - "definition": "Thread that dequeues a request and drives the backend selection, send, and response write cycle.", - "key_settings": ["Workers"], - "related_concept_ids": ["c-priority-queue", "c-backend-selection-pipeline"], - "docs": ["design.md"] - }, - { - "id": "c-workers", - "name": "Workers", - "domain_id": "d01", - "subdomain_id": "sd01-03", - "definition": "Count of concurrent proxy worker threads. Cold setting; default 10 is for local testing only.", - "key_settings": ["Workers"], - "reload_type": "Cold", - "default_value": 10, - "related_concept_ids": ["c-proxy-worker", "c-priority-workers"], - "docs": ["ENVIRONMENT_VARIABLES.md", "CONFIGURATION_SETTINGS.md"] - }, - { - "id": "c-proxy-data", - "name": "ProxyData", - "code_file": "ProxyData.cs", - "domain_id": "d01", - "subdomain_id": "sd01-04", - "definition": "Runtime object holding a backend response: HTTP status code, response headers, and body.", - "key_settings": [], - "related_concept_ids": ["c-request-data", "c-pass-through"], - "docs": ["design.md"] - }, - { - "id": "c-pass-through", - "name": "Pass-through", - "domain_id": "d01", - "subdomain_id": "sd01-04", - "definition": "Any backend status code in AcceptableStatusCodes is forwarded directly to the client without triggering retry or circuit recording.", - "key_settings": ["AcceptableStatusCodes"], - "related_concept_ids": ["c-acceptable-status-codes", "c-circuit-breaker"], - "docs": ["RESPONSE_CODES.md", "CIRCUIT_BREAKER.md"] - }, - { - "id": "c-response-headers-injected", - "name": "Injected Response Headers", - "domain_id": "d01", - "subdomain_id": "sd01-04", - "definition": 
"HTTP headers the proxy appends to every successfully proxied response before returning to the client.", - "header_names": [ - "x-Request-Queue-Duration", - "x-Request-Process-Duration", - "x-Request-Worker", - "BackendHost", - "Total-Latency" - ], - "related_concept_ids": ["c-proxy-event"], - "docs": ["RESPONSE_CODES.md"] - }, - - { - "id": "c-backend-host", - "name": "Backend Host", - "domain_id": "d02", - "subdomain_id": "sd02-01", - "definition": "A single upstream endpoint the proxy can forward requests to. Configured as Host1 through Host9.", - "key_settings": ["Host1","Host2","Host3","Host4","Host5","Host6","Host7","Host8","Host9"], - "connection_string_keys": ["host","probe","path","mode","processor","usemi","useoauth","audience","ipaddress","stripprefix","retryafter"], - "related_concept_ids": ["c-connection-string-format", "c-health-poller", "c-circuit-breaker"], - "docs": ["BACKEND_HOSTS.md", "ENVIRONMENT_VARIABLES.md"] - }, - { - "id": "c-connection-string-format", - "name": "Connection String Format", - "domain_id": "d02", - "subdomain_id": "sd02-01", - "definition": "Preferred per-host configuration using a semicolon-delimited key=value string. Only this format supports path, mode, usemi, processor, and other modern options.", - "example": "host=https://api.example.com;probe=/health;path=/api;usemi=true", - "related_concept_ids": ["c-legacy-format", "c-composite-connection-string"], - "docs": ["BACKEND_HOSTS.md"] - }, - { - "id": "c-legacy-format", - "name": "Legacy Format", - "domain_id": "d02", - "subdomain_id": "sd02-01", - "definition": "Deprecated per-variable host configuration. Cannot express path, mode, usemi, or other modern per-host options. 
Must not be used in new deployments.", - "deprecated": true, - "deprecated_vars": ["Probe_path1","Probe_path2","Probe_path3","IP1","IP2"], - "related_concept_ids": ["c-connection-string-format"], - "docs": ["BACKEND_HOSTS.md", "ENVIRONMENT_VARIABLES.md"] - }, - { - "id": "c-path-prefix", - "name": "Path Prefix", - "domain_id": "d02", - "subdomain_id": "sd02-01", - "definition": "URL prefix configured on a host via the path connection string key. Requests whose URL starts with this prefix are routed to this host.", - "connection_string_key": "path", - "related_concept_ids": ["c-path-filter", "c-catch-all-host", "c-strip-prefix"], - "docs": ["BACKEND_HOSTS.md", "LOAD_BALANCING.md"] - }, - { - "id": "c-catch-all-host", - "name": "Catch-all Host", - "domain_id": "d02", - "subdomain_id": "sd02-01", - "definition": "A host with path=/ or no path configured. Receives requests that match no specific-path host.", - "connection_string_key": "path", - "related_concept_ids": ["c-path-prefix", "c-path-filter"], - "docs": ["BACKEND_HOSTS.md", "LOAD_BALANCING.md"] - }, - { - "id": "c-ip-override", - "name": "IP Override", - "domain_id": "d02", - "subdomain_id": "sd02-01", - "definition": "Forces all requests to a specific IP address, bypassing DNS resolution for that host.", - "connection_string_key": "ipaddress", - "related_concept_ids": ["c-backend-host"], - "docs": ["BACKEND_HOSTS.md"] - }, - { - "id": "c-strip-prefix", - "name": "Strip Prefix", - "domain_id": "d02", - "subdomain_id": "sd02-01", - "definition": "When true (default), the matched path prefix is removed from the URL before forwarding. 
Set stripprefix=false to preserve the full original path.", - "connection_string_key": "stripprefix", - "default_value": true, - "related_concept_ids": ["c-path-prefix"], - "docs": ["BACKEND_HOSTS.md"] - }, - { - "id": "c-processor", - "name": "Processor", - "domain_id": "d02", - "subdomain_id": "sd02-01", - "definition": "Named stream processor applied to the backend response after receipt. The built-in OpenAI processor extracts token usage counts from SSE streams.", - "connection_string_key": "processor", - "known_values": ["OpenAI"], - "related_concept_ids": ["c-token-telemetry", "c-direct-mode"], - "docs": ["AI_FOUNDRY_INTEGRATION.md", "OBSERVABILITY.md"] - }, - { - "id": "c-direct-mode", - "name": "Direct Mode", - "domain_id": "d02", - "subdomain_id": "sd02-01", - "definition": "Backend mode where the host is always treated as healthy. No probe is ever sent. In latency mode the host sorts first (0 ms average latency). Use for serverless or on-demand backends that must not be woken by probes.", - "connection_string_key": "mode", - "connection_string_value": "mode=direct", - "related_concept_ids": ["c-health-poller", "c-processor", "c-backend-host"], - "docs": ["BACKEND_HOSTS.md", "AI_FOUNDRY_INTEGRATION.md"] - }, - - { - "id": "c-health-poller", - "name": "Health Poller", - "code_file": "Backends.cs", - "domain_id": "d02", - "subdomain_id": "sd02-02", - "definition": "Background loop that probes each configured host at PollInterval ms intervals and tracks rolling success rate and average latency.", - "key_settings": ["PollInterval", "PollTimeout"], - "related_concept_ids": ["c-probe-path", "c-success-rate", "c-active-pool", "c-average-latency"], - "docs": ["BACKEND_HOSTS.md", "HEALTH_CHECKING.md"] - }, - { - "id": "c-probe-path", - "name": "Probe Path", - "domain_id": "d02", - "subdomain_id": "sd02-02", - "definition": "URL path sent as a GET request to test backend health. 
Ignored when mode=direct.", - "connection_string_key": "probe", - "default_value": "echo/resource?param1=sample", - "related_concept_ids": ["c-health-poller", "c-direct-mode"], - "docs": ["BACKEND_HOSTS.md"] - }, - { - "id": "c-success-rate", - "name": "Success Rate", - "domain_id": "d02", - "subdomain_id": "sd02-02", - "definition": "Rolling percentage of successful probe responses for a host. Hosts below the SuccessRate threshold are removed from the active pool until they recover.", - "key_settings": ["SuccessRate"], - "default_value": 80, - "unit": "percent", - "related_concept_ids": ["c-active-pool", "c-health-poller"], - "docs": ["BACKEND_HOSTS.md"] - }, - { - "id": "c-active-pool", - "name": "Active Pool", - "domain_id": "d02", - "subdomain_id": "sd02-02", - "definition": "Set of backend hosts currently eligible to receive traffic, filtered by rolling success rate threshold.", - "key_settings": ["SuccessRate"], - "related_concept_ids": ["c-success-rate", "c-health-poller", "c-circuit-breaker"], - "docs": ["BACKEND_HOSTS.md", "LOAD_BALANCING.md"] - }, - { - "id": "c-average-latency", - "name": "Average Latency", - "domain_id": "d02", - "subdomain_id": "sd02-02", - "definition": "Rolling average probe response time per host. Used to order hosts in latency load-balance mode. 
Direct-mode hosts report 0 ms and sort first.", - "key_settings": [], - "related_concept_ids": ["c-health-poller", "c-latency-iterator"], - "docs": ["BACKEND_HOSTS.md", "LOAD_BALANCING.md"] - }, - { - "id": "c-base-host-health", - "name": "BaseHostHealth", - "code_file": "BaseHostHealth.cs", - "domain_id": "d02", - "subdomain_id": "sd02-02", - "definition": "Runtime object holding a single host's health metrics: rolling success rate, average latency, and circuit breaker state.", - "key_settings": [], - "related_concept_ids": ["c-circuit-breaker", "c-success-rate", "c-average-latency"], - "docs": ["design.md"] - }, - - { - "id": "c-backend-selection-pipeline", - "name": "Backend Selection Pipeline", - "domain_id": "d02", - "subdomain_id": "sd02-03", - "definition": "Three-stage process run on every request: (1) path filter, (2) load-balance ordering, (3) per-host circuit-breaker gate.", - "stages": ["path-filter", "load-balance-mode", "circuit-breaker-gate"], - "related_concept_ids": ["c-path-filter", "c-load-balance-mode", "c-circuit-breaker"], - "docs": ["LOAD_BALANCING.md", "design.md"] - }, - { - "id": "c-iterator-factory", - "name": "IteratorFactory", - "code_file": "IteratorFactory.cs", - "domain_id": "d02", - "subdomain_id": "sd02-03", - "definition": "Creates a load-balance iterator for a given request path based on LoadBalanceMode configuration.", - "key_settings": ["LoadBalanceMode"], - "related_concept_ids": ["c-load-balance-mode", "c-shared-iterator"], - "docs": ["design.md", "LOAD_BALANCING.md"] - }, - { - "id": "c-path-filter", - "name": "Path Filter", - "domain_id": "d02", - "subdomain_id": "sd02-03", - "definition": "Stage 1 of backend selection. Specific-path hosts are checked first; if none match, catch-all hosts are used. 
Specific paths always win.", - "key_settings": ["path"], - "related_concept_ids": ["c-path-prefix", "c-catch-all-host", "c-backend-selection-pipeline"], - "docs": ["LOAD_BALANCING.md", "BACKEND_HOSTS.md"] - }, - { - "id": "c-load-balance-mode", - "name": "Load Balance Mode", - "domain_id": "d02", - "subdomain_id": "sd02-03", - "definition": "Stage 2 of backend selection. Determines the ordering of hosts within the matched set for each request.", - "key_settings": ["LoadBalanceMode"], - "allowed_values": ["roundrobin", "latency", "random"], - "default_value": "latency", - "reload_type": "Warm", - "related_concept_ids": ["c-round-robin-iterator", "c-latency-iterator", "c-random-iterator"], - "docs": ["LOAD_BALANCING.md"] - }, - { - "id": "c-round-robin-iterator", - "name": "Round-Robin Iterator", - "code_file": "RoundRobinHostIterator.cs", - "domain_id": "d02", - "subdomain_id": "sd02-03", - "definition": "Distributes requests evenly across active hosts using a global atomic counter.", - "config_value": "LoadBalanceMode=roundrobin", - "related_concept_ids": ["c-load-balance-mode", "c-shared-iterator"], - "docs": ["LOAD_BALANCING.md"] - }, - { - "id": "c-latency-iterator", - "name": "Latency Iterator", - "code_file": "LatencyHostIterator.cs", - "domain_id": "d02", - "subdomain_id": "sd02-03", - "definition": "Orders hosts by ascending average probe latency per request. 
Direct-mode hosts always sort first at 0 ms.", - "config_value": "LoadBalanceMode=latency", - "related_concept_ids": ["c-load-balance-mode", "c-average-latency", "c-direct-mode"], - "docs": ["LOAD_BALANCING.md"] - }, - { - "id": "c-random-iterator", - "name": "Random Iterator", - "code_file": "RandomHostIterator.cs", - "domain_id": "d02", - "subdomain_id": "sd02-03", - "definition": "Shuffles active hosts randomly on each request, producing no predictable access pattern.", - "config_value": "LoadBalanceMode=random", - "related_concept_ids": ["c-load-balance-mode"], - "docs": ["LOAD_BALANCING.md"] - }, - { - "id": "c-shared-iterator", - "name": "Shared Iterator", - "code_file": "SharedIteratorRegistry.cs", - "domain_id": "d02", - "subdomain_id": "sd02-03", - "definition": "Single iterator shared across all concurrent requests to the same path. Enables strict round-robin fairness across parallel workers.", - "key_settings": ["UseSharedIterators", "SharedIteratorTTLSeconds", "SharedIteratorCleanupIntervalSeconds"], - "related_concept_ids": ["c-round-robin-iterator", "c-iterator-factory"], - "docs": ["LOAD_BALANCING.md"] - }, - { - "id": "c-iteration-mode", - "name": "Iteration Mode", - "domain_id": "d02", - "subdomain_id": "sd02-03", - "definition": "Controls retry breadth. SinglePass tries each host at most once. MultiPass cycles through all hosts up to MaxAttempts total.", - "key_settings": ["IterationMode"], - "allowed_values": ["SinglePass", "MultiPass"], - "default_value": "SinglePass", - "reload_type": "Warm", - "related_concept_ids": ["c-max-attempts", "c-retry"], - "docs": ["LOAD_BALANCING.md"] - }, - { - "id": "c-max-attempts", - "name": "Max Attempts", - "domain_id": "d02", - "subdomain_id": "sd02-03", - "definition": "Maximum total host attempts in MultiPass mode. 
Hosts skipped because their circuit is OPEN do not consume an attempt.", - "key_settings": ["MaxAttempts"], - "default_value": 10, - "reload_type": "Warm", - "related_concept_ids": ["c-iteration-mode", "c-circuit-breaker"], - "docs": ["LOAD_BALANCING.md", "CONFIGURATION_SETTINGS.md"] - }, - - { - "id": "c-circuit-breaker", - "name": "Circuit Breaker", - "code_file": "CircuitBreaker.cs", - "domain_id": "d03", - "subdomain_id": "sd03-01", - "definition": "Per-host failure counter with a sliding time window. Opens when the failure count in the window exceeds CBErrorThreshold.", - "key_settings": ["CBErrorThreshold", "CBTimeslice", "AcceptableStatusCodes"], - "states": ["CLOSED", "OPEN"], - "reload_type": "Warm", - "related_concept_ids": ["c-circuit-closed", "c-circuit-open", "c-sliding-window", "c-auto-recovery", "c-progressive-delay", "c-global-blocked-check"], - "docs": ["CIRCUIT_BREAKER.md"] - }, - { - "id": "c-circuit-closed", - "name": "CLOSED State", - "domain_id": "d03", - "subdomain_id": "sd03-01", - "definition": "Normal circuit breaker state. Requests are forwarded to the host.", - "related_concept_ids": ["c-circuit-breaker", "c-circuit-open"], - "docs": ["CIRCUIT_BREAKER.md"] - }, - { - "id": "c-circuit-open", - "name": "OPEN State", - "domain_id": "d03", - "subdomain_id": "sd03-01", - "definition": "Circuit tripped. Host is skipped by the load-balance iterator. Skipped hosts do not consume the MaxAttempts budget.", - "related_concept_ids": ["c-circuit-breaker", "c-circuit-closed", "c-global-blocked-check"], - "docs": ["CIRCUIT_BREAKER.md"] - }, - { - "id": "c-sliding-window", - "name": "Sliding Window", - "domain_id": "d03", - "subdomain_id": "sd03-01", - "definition": "Only failures timestamped within the last CBTimeslice seconds count toward the threshold. 
Older entries are pruned automatically.", - "key_settings": ["CBTimeslice"], - "related_concept_ids": ["c-circuit-breaker", "c-auto-recovery"], - "docs": ["CIRCUIT_BREAKER.md"] - }, - { - "id": "c-auto-recovery", - "name": "Auto-Recovery", - "domain_id": "d03", - "subdomain_id": "sd03-01", - "definition": "Circuit closes automatically when all failures age out of the sliding window. No manual intervention required.", - "related_concept_ids": ["c-circuit-breaker", "c-sliding-window"], - "docs": ["CIRCUIT_BREAKER.md"] - }, - { - "id": "c-progressive-delay", - "name": "Progressive Delay", - "domain_id": "d03", - "subdomain_id": "sd03-01", - "definition": "Artificial per-request delay (100-500 ms) added as a host's failure count approaches CBErrorThreshold. Not configurable.", - "delay_table": [ - { "failure_pct_of_threshold": 50, "delay_ms": 100 }, - { "failure_pct_of_threshold": 60, "delay_ms": 200 }, - { "failure_pct_of_threshold": 70, "delay_ms": 300 }, - { "failure_pct_of_threshold": 80, "delay_ms": 400 }, - { "failure_pct_of_threshold": 90, "delay_ms": 500 } - ], - "related_concept_ids": ["c-circuit-breaker"], - "docs": ["CIRCUIT_BREAKER.md"] - }, - { - "id": "c-global-blocked-check", - "name": "Global Blocked Check", - "code_symbol": "AreAllCircuitBreakersBlocked()", - "domain_id": "d03", - "subdomain_id": "sd03-01", - "definition": "When all registered circuit breakers are OPEN simultaneously, the proxy returns 503 immediately without attempting any host.", - "related_concept_ids": ["c-circuit-open"], - "docs": ["CIRCUIT_BREAKER.md"] - }, - { - "id": "c-acceptable-status-codes", - "name": "Acceptable Status Codes", - "domain_id": "d03", - "subdomain_id": "sd03-01", - "definition": "HTTP status codes from backends that are not counted as circuit-breaker failures and are forwarded to the client.", - "key_settings": ["AcceptableStatusCodes"], - "default_value": [200, 202, 400, 401, 403, 404, 408, 410, 412, 417], - "reload_type": "Warm", - "related_concept_ids": 
["c-circuit-breaker", "c-pass-through"], - "docs": ["CIRCUIT_BREAKER.md", "RESPONSE_CODES.md"] - }, - - { - "id": "c-retry", - "name": "Retry", - "domain_id": "d03", - "subdomain_id": "sd03-02", - "definition": "Advancing to the next host in the iterator after a non-acceptable backend response. Does not apply when the circuit is OPEN.", - "key_settings": ["IterationMode", "MaxAttempts"], - "related_concept_ids": ["c-iteration-mode", "c-max-attempts", "c-requeue"], - "docs": ["LOAD_BALANCING.md"] - }, - { - "id": "c-requeue", - "name": "Requeue", - "domain_id": "d03", - "subdomain_id": "sd03-02", - "definition": "Returning a request to the priority queue after all backends returned 429 with S7PREQUEUE:true. Uses the shortest retry-after delay seen across all backends.", - "related_concept_ids": ["c-retry", "c-s7prequeue-header", "c-retry-after"], - "docs": ["LOAD_BALANCING.md", "RESPONSE_CODES.md"] - }, - { - "id": "c-s7prequeue-header", - "name": "S7PREQUEUE", - "header_direction": "backend-response", - "domain_id": "d03", - "subdomain_id": "sd03-02", - "definition": "Response header a backend sets on a 429 reply to signal the proxy should requeue the request rather than try the next host.", - "related_concept_ids": ["c-requeue", "c-retry-after"], - "docs": ["RESPONSE_CODES.md", "LOAD_BALANCING.md"] - }, - { - "id": "c-retry-after", - "name": "Retry-After", - "domain_id": "d03", - "subdomain_id": "sd03-02", - "definition": "Delay value from the backend's Retry-After response header. When all backends return S7PREQUEUE, the proxy uses the shortest retry-after seen.", - "related_concept_ids": ["c-requeue", "c-s7prequeue-header"], - "docs": ["LOAD_BALANCING.md"] - }, - - { - "id": "c-timeout", - "name": "Timeout", - "domain_id": "d03", - "subdomain_id": "sd03-03", - "definition": "Per-host-attempt window in milliseconds. Resets on each retry. 
Effective deadline per attempt = min(remaining TTL, Timeout).", - "key_settings": ["Timeout"], - "per_request_override_header": "S7PTimeout", - "unit": "milliseconds", - "default_value": 1200000, - "reload_type": "Cold", - "related_concept_ids": ["c-ttl", "c-async-trigger-timeout"], - "docs": ["TIMEOUTS.md"] - }, - { - "id": "c-async-trigger-timeout", - "name": "AsyncTriggerTimeout", - "domain_id": "d03", - "subdomain_id": "sd03-03", - "definition": "Milliseconds after enqueue before the proxy releases the client with a 202 response and continues processing in the background.", - "key_settings": ["AsyncTriggerTimeout"], - "unit": "milliseconds", - "default_value": 10000, - "reload_type": "Warm", - "related_concept_ids": ["c-timeout", "c-async-mode", "c-async-timeout"], - "docs": ["TIMEOUTS.md", "AsyncOperation.md"] - }, - { - "id": "c-async-timeout", - "name": "AsyncTimeout", - "domain_id": "d03", - "subdomain_id": "sd03-03", - "definition": "Maximum backend processing time in milliseconds once a request is in async mode.", - "key_settings": ["AsyncTimeout"], - "unit": "milliseconds", - "default_value": 1800000, - "reload_type": "Warm", - "related_concept_ids": ["c-async-trigger-timeout", "c-async-ttl-secs"], - "docs": ["TIMEOUTS.md", "AsyncOperation.md"] - }, - { - "id": "c-async-ttl-secs", - "name": "AsyncTTLSecs", - "domain_id": "d03", - "subdomain_id": "sd03-03", - "definition": "Retention period in seconds for the async result blob in Azure Blob Storage after processing completes.", - "key_settings": ["AsyncTTLSecs"], - "unit": "seconds", - "default_value": 86400, - "reload_type": "Warm", - "related_concept_ids": ["c-result-blob", "c-async-timeout"], - "docs": ["TIMEOUTS.md", "AsyncOperation.md", "StorageBlobConfig.md"] - }, - - { - "id": "c-validation-pipeline", - "name": "Validation Pipeline", - "domain_id": "d04", - "subdomain_id": "sd04-01", - "definition": "Ordered sequence of validation checks applied to every non-probe inbound request before it is 
enqueued. The execution order is fixed.", - "steps": [ - { "order": 1, "concept_id": "c-app-id-validation", "on_failure_code": 403, "on_failure_label": "DisallowedAppID" }, - { "order": 2, "concept_id": "c-header-stripping", "on_failure_code": null, "on_failure_label": "silent" }, - { "order": 3, "concept_id": "c-user-profile", "on_failure_code": 403, "on_failure_label": "UnknownProfile" }, - { "order": 4, "concept_id": "c-required-headers", "on_failure_code": 417, "on_failure_label": "IncompleteHeaders" }, - { "order": 5, "concept_id": "c-header-validation-rule","on_failure_code": 417, "on_failure_label": "InvalidHeader" } - ], - "related_concept_ids": ["c-app-id-validation","c-header-stripping","c-user-profile","c-required-headers","c-header-validation-rule"], - "docs": ["REQUEST_VALIDATION.md"] - }, - { - "id": "c-app-id-validation", - "name": "App ID Validation", - "domain_id": "d04", - "subdomain_id": "sd04-01", - "definition": "Step 1 of the validation pipeline. Checks the caller's Entra Application ID against a remote allowlist before any other check. Designed for deployments with more than 13 App IDs (Entra's built-in limit).", - "key_settings": ["ValidateAuthAppID", "ValidateAuthAppIDUrl", "ValidateAuthAppIDHeader", "ValidateAuthAppFieldName"], - "on_failure_code": 403, - "reload_type": "Warm", - "related_concept_ids": ["c-app-id-allowlist", "c-validation-pipeline"], - "docs": ["REQUEST_VALIDATION.md", "ADVANCED_CONFIGURATION.md"] - }, - { - "id": "c-header-stripping", - "name": "Header Stripping", - "domain_id": "d04", - "subdomain_id": "sd04-01", - "definition": "Step 2 of the validation pipeline. Removes headers listed in DisallowedHeaders silently. 
No error is returned to the caller.", - "key_settings": ["DisallowedHeaders"], - "related_concept_ids": ["c-disallowed-headers", "c-validation-pipeline"], - "docs": ["REQUEST_VALIDATION.md"] - }, - { - "id": "c-required-headers", - "name": "Required Headers", - "domain_id": "d04", - "subdomain_id": "sd04-02", - "definition": "List of header names that must be non-empty on every request. The first missing header returns 417.", - "key_settings": ["RequiredHeaders"], - "on_failure_code": 417, - "on_failure_label": "IncompleteHeaders", - "reload_type": "Warm", - "related_concept_ids": ["c-validation-pipeline", "c-header-validation-rule"], - "docs": ["REQUEST_VALIDATION.md"] - }, - { - "id": "c-disallowed-headers", - "name": "Disallowed Headers", - "domain_id": "d04", - "subdomain_id": "sd04-02", - "definition": "Headers stripped silently from the request before forwarding. Automatically includes the allowlist header when ValidateHeaders is configured.", - "key_settings": ["DisallowedHeaders"], - "auto_populated_by": ["c-header-validation-rule"], - "related_concept_ids": ["c-header-stripping", "c-header-validation-rule"], - "docs": ["REQUEST_VALIDATION.md"] - }, - { - "id": "c-header-validation-rule", - "name": "Header Validation Rule", - "domain_id": "d04", - "subdomain_id": "sd04-02", - "definition": "ValidateHeaders mapping SourceHeader=AllowlistHeader. The source header value must appear in the comma-separated allowlist header injected from the user profile. 
Supports * suffix for prefix matching.", - "key_settings": ["ValidateHeaders"], - "on_failure_code": 417, - "on_failure_label": "InvalidHeader", - "reload_type": "Warm", - "side_effects": [ - "auto-adds SourceHeader and AllowlistHeader to RequiredHeaders", - "auto-adds AllowlistHeader to DisallowedHeaders" - ], - "related_concept_ids": ["c-user-profile", "c-required-headers", "c-disallowed-headers"], - "docs": ["REQUEST_VALIDATION.md", "ADVANCED_CONFIGURATION.md"] - }, - { - "id": "c-app-id-allowlist", - "name": "App ID Allowlist", - "domain_id": "d04", - "subdomain_id": "sd04-02", - "definition": "File or URL returning the list of permitted Entra Application IDs. Enforced at step 1 of the validation pipeline.", - "key_settings": ["ValidateAuthAppIDUrl", "ValidateAuthAppFieldName"], - "related_concept_ids": ["c-app-id-validation"], - "docs": ["REQUEST_VALIDATION.md", "ENVIRONMENT_VARIABLES.md"] - }, - - { - "id": "c-user-profile", - "name": "User Profile", - "domain_id": "d04", - "subdomain_id": "sd04-03", - "definition": "Per-user JSON object loaded periodically from a URL or file. Drives priority assignment, async configuration, and custom header injection.", - "key_settings": ["UseProfiles", "UserConfigUrl"], - "profile_fields": [ - { "field": "userId", "required": true, "description": "Unique identifier matched against the incoming request header." }, - { "field": "S7PPriorityKey", "required": false, "description": "Priority tier value mapped via PriorityKeys." }, - { "field": "async-config", "required": false, "description": "Enables async mode and sets per-user blob container and Service Bus topic." }, - { "field": "[CustomHeader]", "required": false, "description": "Any other key is injected as an HTTP request header." 
} - ], - "related_concept_ids": ["c-user-id", "c-s7p-priority-key-field", "c-async-config-field", "c-profile-refresh", "c-suspended-user"], - "docs": ["USER_PROFILES.md"] - }, - { - "id": "c-user-id", - "name": "User ID", - "domain_id": "d04", - "subdomain_id": "sd04-03", - "definition": "Unique identifier extracted from a configurable request header and used to look up the user profile record.", - "key_settings": ["UserIDFieldName"], - "default_value": "userId", - "related_concept_ids": ["c-user-profile"], - "docs": ["USER_PROFILES.md"] - }, - { - "id": "c-s7p-priority-key-field", - "name": "S7PPriorityKey Profile Field", - "domain_id": "d04", - "subdomain_id": "sd04-03", - "definition": "User profile field whose value is matched against PriorityKeys to assign a priority tier to the user's requests.", - "related_concept_ids": ["c-user-profile", "c-priority-mapping"], - "docs": ["USER_PROFILES.md", "ADVANCED_CONFIGURATION.md"] - }, - { - "id": "c-async-config-field", - "name": "async-config Profile Field", - "domain_id": "d04", - "subdomain_id": "sd04-03", - "definition": "User profile field enabling async mode for a specific user. Specifies the user's blob container name and Service Bus topic.", - "key_settings": ["AsyncClientConfigFieldName"], - "required_subfields": ["enabled", "containername", "topic"], - "optional_subfields": ["timeout"], - "related_concept_ids": ["c-user-profile", "c-async-mode"], - "docs": ["USER_PROFILES.md", "AsyncOperation.md"] - }, - { - "id": "c-profile-refresh", - "name": "Profile Refresh", - "domain_id": "d04", - "subdomain_id": "sd04-03", - "definition": "Proxy reloads the user profile file at the configured interval (default 1 hour). 
Enables live user management without restart.", - "key_settings": ["UserConfigRefreshIntervalSecs"], - "default_value": 3600, - "unit": "seconds", - "related_concept_ids": ["c-user-profile"], - "docs": ["USER_PROFILES.md", "ENVIRONMENT_VARIABLES.md"] - }, - { - "id": "c-suspended-user", - "name": "Suspended User", - "domain_id": "d04", - "subdomain_id": "sd04-03", - "definition": "User whose ID appears in the suspended users list. Returns 403 immediately, regardless of profile content.", - "key_settings": ["SuspendedUserConfigUrl"], - "on_failure_code": 403, - "related_concept_ids": ["c-user-profile"], - "docs": ["USER_PROFILES.md"] - }, - - { - "id": "c-priority-mapping", - "name": "Priority Mapping", - "domain_id": "d04", - "subdomain_id": "sd04-04", - "definition": "Maps a request header value to an internal priority integer and allocates dedicated worker threads to that priority level.", - "key_settings": ["PriorityKeyHeader", "PriorityKeys", "PriorityValues", "PriorityWorkers"], - "constraint": "Count of PriorityKeys MUST equal count of PriorityValues. PriorityWorkers MUST reference only levels in PriorityValues.", - "related_concept_ids": ["c-priority-key-header", "c-priority-keys", "c-priority-values", "c-priority-workers"], - "docs": ["ADVANCED_CONFIGURATION.md"] - }, - { - "id": "c-priority-key-header", - "name": "Priority Key Header", - "domain_id": "d04", - "subdomain_id": "sd04-04", - "definition": "Name of the request header the proxy inspects to determine priority tier. Default: S7PPriorityKey.", - "key_settings": ["PriorityKeyHeader"], - "default_value": "S7PPriorityKey", - "reload_type": "Warm", - "related_concept_ids": ["c-priority-mapping"], - "docs": ["ADVANCED_CONFIGURATION.md", "RESPONSE_CODES.md"] - }, - { - "id": "c-priority-keys", - "name": "Priority Keys", - "domain_id": "d04", - "subdomain_id": "sd04-04", - "definition": "Comma-separated list of expected header values. 
Each entry maps 1:1 to the corresponding entry in PriorityValues.", - "key_settings": ["PriorityKeys"], - "reload_type": "Warm", - "related_concept_ids": ["c-priority-mapping", "c-priority-values"], - "docs": ["ADVANCED_CONFIGURATION.md"] - }, - { - "id": "c-priority-values", - "name": "Priority Values", - "domain_id": "d04", - "subdomain_id": "sd04-04", - "definition": "Comma-separated list of internal priority integers, in 1:1 correspondence with PriorityKeys.", - "key_settings": ["PriorityValues"], - "reload_type": "Warm", - "related_concept_ids": ["c-priority-mapping", "c-priority-keys"], - "docs": ["ADVANCED_CONFIGURATION.md"] - }, - { - "id": "c-priority-workers", - "name": "Priority Workers", - "domain_id": "d04", - "subdomain_id": "sd04-04", - "definition": "PriorityLevel:WorkerCount pairs that reserve dedicated worker threads for each priority level.", - "key_settings": ["PriorityWorkers"], - "format": "level:count,level:count", - "reload_type": "Warm", - "related_concept_ids": ["c-priority-mapping", "c-workers"], - "docs": ["ADVANCED_CONFIGURATION.md"] - }, - { - "id": "c-per-user-throttling", - "name": "Per-user Throttling", - "domain_id": "d04", - "subdomain_id": "sd04-04", - "definition": "When a user's active requests exceed UserPriorityThreshold as a fraction of the total queue, that user's new requests are deprioritized.", - "key_settings": ["UserPriorityThreshold"], - "value_type": "float", - "value_range": "0.0 to 1.0", - "default_value": 0.1, - "reload_type": "Warm", - "related_concept_ids": ["c-priority-mapping", "c-user-profile"], - "docs": ["ADVANCED_CONFIGURATION.md", "USER_PROFILES.md"] - }, - - { - "id": "c-async-mode", - "name": "Async Mode", - "domain_id": "d05", - "definition": "System-wide feature enabling asynchronous request processing. 
Must be enabled at three levels: proxy (AsyncModeEnabled), user profile (async-config field), and per-request (opt-in header).", - "key_settings": ["AsyncModeEnabled"], - "reload_type": "Cold", - "enablement_levels": [ - "AsyncModeEnabled=true (proxy-wide switch, Cold)", - "async-config field in user profile", - "AsyncClientRequestHeader on the request" - ], - "related_concept_ids": ["c-opt-in-header", "c-async-upgrade", "c-result-blob", "c-service-bus-status"], - "docs": ["AsyncOperation.md"] - }, - { - "id": "c-opt-in-header", - "name": "Opt-in Header", - "domain_id": "d05", - "definition": "Per-request header clients send to request async processing for that call. Default header name: S7PAsyncMode.", - "key_settings": ["AsyncClientRequestHeader"], - "default_value": "S7PAsyncMode", - "reload_type": "Warm", - "related_concept_ids": ["c-async-mode"], - "docs": ["AsyncOperation.md"] - }, - { - "id": "c-async-upgrade", - "name": "Async Upgrade", - "domain_id": "d05", - "definition": "After AsyncTriggerTimeout ms, the proxy returns 202 to the client containing blob URIs and continues backend processing in the background.", - "key_settings": ["AsyncTriggerTimeout"], - "related_concept_ids": ["c-async-trigger-timeout", "c-202-response", "c-result-blob"], - "docs": ["AsyncOperation.md", "TIMEOUTS.md"] - }, - { - "id": "c-202-response", - "name": "202 Accepted Response", - "domain_id": "d05", - "definition": "HTTP 202 returned to the client immediately after async upgrade. Body contains result blob URIs for polling.", - "http_status": 202, - "related_concept_ids": ["c-async-upgrade", "c-result-blob"], - "docs": ["AsyncOperation.md"] - }, - { - "id": "c-result-blob", - "name": "Result Blob", - "domain_id": "d05", - "definition": "Azure Blob Storage object written when the backend completes an async request. URI is included in the 202 response. 
Expires after AsyncTTLSecs.", - "key_settings": ["AsyncBlobStorageConfig", "StorageDbContainerName", "AsyncTTLSecs"], - "related_concept_ids": ["c-blob-sas-token", "c-blob-lifecycle-policy", "c-async-blob-worker"], - "docs": ["AsyncOperation.md", "StorageBlobConfig.md"] - }, - { - "id": "c-blob-sas-token", - "name": "Blob SAS Token", - "domain_id": "d05", - "definition": "Time-limited access token generated for the result blob. Valid for AsyncTTLSecs seconds.", - "key_settings": ["AsyncTTLSecs"], - "related_concept_ids": ["c-result-blob"], - "docs": ["AsyncOperation.md"] - }, - { - "id": "c-blob-lifecycle-policy", - "name": "Blob Lifecycle Policy", - "domain_id": "d05", - "definition": "Azure Storage lifecycle management rule that automatically deletes blobs after BlobRetentionDays. Must be configured in the storage account independently; the BlobRetentionDays proxy setting alone does not delete blobs.", - "key_settings": ["BlobRetentionDays", "StorageDbContainerName"], - "related_concept_ids": ["c-result-blob"], - "docs": ["StorageBlobConfig.md"] - }, - { - "id": "c-service-bus-status", - "name": "Service Bus Status", - "domain_id": "d05", - "definition": "Real-time async lifecycle events sent to a per-user Azure Service Bus topic as requests move through processing states.", - "key_settings": ["AsyncSBConfig"], - "related_concept_ids": ["c-status-events", "c-async-mode"], - "docs": ["AsyncOperation.md"] - }, - { - "id": "c-status-events", - "name": "Status Events", - "domain_id": "d05", - "definition": "Lifecycle notification values sent over Service Bus.", - "values": ["InQueue", "RetryAfterDelay", "ReQueued", "Processing", "Processed", "Failed", "Expired"], - "related_concept_ids": ["c-service-bus-status"], - "docs": ["AsyncOperation.md"] - }, - { - "id": "c-async-blob-worker", - "name": "Async Blob Worker", - "domain_id": "d05", - "definition": "Background thread that writes completed async request results to Azure Blob Storage.", - "key_settings": 
["AsyncBlobWorkerCount"], - "default_value": 2, - "reload_type": "Cold", - "related_concept_ids": ["c-result-blob"], - "docs": ["AsyncOperation.md"] - }, - - { - "id": "c-proxy-event", - "name": "ProxyEvent", - "domain_id": "d06", - "subdomain_id": "sd06-01", - "definition": "Key/value dictionary capturing all dimensions of a single proxied request: HTTP status, timing, token counts, headers, and other metadata.", - "related_concept_ids": ["c-event-headers", "c-composite-event-client"], - "docs": ["OBSERVABILITY.md"] - }, - { - "id": "c-event-headers", - "name": "Event Headers", - "code_interface": "ICommonEventData", - "domain_id": "d06", - "subdomain_id": "sd06-01", - "definition": "Common fields injected into every ProxyEvent: application version, container name, and revision. Supplied by the ICommonEventData implementation.", - "key_settings": ["EVENT_HEADERS"], - "related_concept_ids": ["c-proxy-event"], - "docs": ["OBSERVABILITY.md"] - }, - { - "id": "c-composite-event-client", - "name": "CompositeEventClient", - "code_file": "CompositeEventClient.cs", - "domain_id": "d06", - "subdomain_id": "sd06-01", - "definition": "Fan-out dispatcher that sends every serialized ProxyEvent to all registered event logger backends. Uses FrozenDictionary for zero-lock dispatch on the hot path.", - "related_concept_ids": ["c-event-logger-backend", "c-proxy-event"], - "docs": ["OBSERVABILITY.md"] - }, - { - "id": "c-event-logger-backend", - "name": "Event Logger Backend", - "domain_id": "d06", - "subdomain_id": "sd06-01", - "definition": "Named sink that receives serialized JSON ProxyEvents. Built-in values: file, eventhub. 
Custom backends can be registered by implementing IEventClient + IHostedService.", - "key_settings": ["EVENT_LOGGERS"], - "built_in_values": ["file", "eventhub"], - "custom_value_format": "", - "related_concept_ids": ["c-composite-event-client", "c-log-file", "c-event-hubs", "c-custom-event-logger"], - "docs": ["OBSERVABILITY.md"] - }, - { - "id": "c-custom-event-logger", - "name": "Custom Event Logger", - "domain_id": "d06", - "subdomain_id": "sd06-01", - "definition": "User-supplied class implementing IEventClient and IHostedService. Calls CompositeEventClient.Add(this) at startup to self-register with the fan-out.", - "related_concept_ids": ["c-event-logger-backend", "c-composite-event-client"], - "docs": ["OBSERVABILITY.md"] - }, - { - "id": "c-application-insights", - "name": "Application Insights", - "domain_id": "d06", - "subdomain_id": "sd06-02", - "definition": "Primary production telemetry sink. Receives structured telemetry (requests, dependencies, exceptions) via TelemetryClient.", - "key_settings": ["APPINSIGHTS_CONNECTIONSTRING"], - "related_concept_ids": ["c-event-logger-backend"], - "docs": ["OBSERVABILITY.md"] - }, - { - "id": "c-event-hubs", - "name": "Event Hubs", - "domain_id": "d06", - "subdomain_id": "sd06-02", - "definition": "High-volume streaming telemetry sink for JSON event lines. Supports connection-string auth or Managed Identity.", - "key_settings": ["EVENTHUB_CONNECTIONSTRING", "EVENTHUB_NAMESPACE", "EVENTHUB_NAME"], - "related_concept_ids": ["c-event-logger-backend"], - "docs": ["OBSERVABILITY.md"] - }, - { - "id": "c-log-file", - "name": "Log File", - "domain_id": "d06", - "subdomain_id": "sd06-02", - "definition": "Local JSON-lines file sink for development and testing. 
Buffers in a ConcurrentQueue and flushes in batches every 500 ms.", - "key_settings": ["LOGFILE_NAME"], - "related_concept_ids": ["c-event-logger-backend"], - "docs": ["OBSERVABILITY.md"] - }, - { - "id": "c-token-telemetry", - "name": "Token Telemetry", - "domain_id": "d06", - "subdomain_id": "sd06-02", - "definition": "Prompt and completion token counts extracted from SSE (Server-Sent Events) streams and logged per request. Requires processor=OpenAI on the backend host.", - "key_settings": ["processor"], - "connection_string_value": "processor=OpenAI", - "related_concept_ids": ["c-processor", "c-proxy-event"], - "docs": ["OBSERVABILITY.md", "AI_FOUNDRY_INTEGRATION.md"] - }, - { - "id": "c-liveness-endpoint", - "name": "/liveness Endpoint", - "domain_id": "d06", - "subdomain_id": "sd06-03", - "definition": "Returns 200 OK if the process is running. Never returns 503.", - "path": "/liveness", - "responses": { "200": "process running" }, - "related_concept_ids": ["c-readiness-endpoint", "c-sidecar-mode"], - "docs": ["HEALTH_CHECKING.md"] - }, - { - "id": "c-readiness-endpoint", - "name": "/readiness Endpoint", - "domain_id": "d06", - "subdomain_id": "sd06-03", - "definition": "Returns 200 if at least one backend host is in the active pool. Returns 503 if no hosts are active.", - "path": "/readiness", - "responses": { "200": "at least one active host", "503": "no active hosts" }, - "related_concept_ids": ["c-liveness-endpoint", "c-active-pool"], - "docs": ["HEALTH_CHECKING.md"] - }, - { - "id": "c-startup-endpoint", - "name": "/startup Endpoint", - "domain_id": "d06", - "subdomain_id": "sd06-03", - "definition": "Returns 200 if the health poller has completed its first probe pass. 
Returns 503 during initialization.", - "path": "/startup", - "responses": { "200": "poller has run at least once", "503": "still initializing" }, - "related_concept_ids": ["c-health-poller"], - "docs": ["HEALTH_CHECKING.md"] - }, - { - "id": "c-sidecar-mode", - "name": "Sidecar Mode", - "domain_id": "d06", - "subdomain_id": "sd06-03", - "definition": "Isolation pattern where a separate HealthProbe container on port 9000 handles Kubernetes probes. The proxy pushes its health state to the sidecar every second. If no update arrives for 10 seconds the sidecar fails probes.", - "key_settings": ["HealthProbeSidecar"], - "ports": { "proxy": 8000, "sidecar": 9000 }, - "related_concept_ids": ["c-liveness-endpoint", "c-readiness-endpoint", "c-startup-endpoint"], - "docs": ["HEALTH_CHECKING.md", "SIDECAR_DEPLOYMENT.md"] - }, - - { - "id": "c-warm-setting", - "name": "Warm Setting", - "domain_id": "d07", - "definition": "Configuration value hot-reloaded from Azure App Configuration within ~30 seconds when the Sentinel key changes. No container restart required.", - "label_in_app_config": "none or AZURE_APPCONFIG_LABEL", - "key_prefix": "Warm:", - "related_concept_ids": ["c-cold-setting", "c-hidden-setting", "c-sentinel"], - "docs": ["CONFIGURATION_SETTINGS.md", "AZURE_APP_CONFIGURATION.md"] - }, - { - "id": "c-cold-setting", - "name": "Cold Setting", - "domain_id": "d07", - "definition": "Configuration value published to Azure App Configuration but taking effect only after a container restart.", - "label_in_app_config": "Cold", - "key_prefix": "Warm:", - "related_concept_ids": ["c-warm-setting", "c-hidden-setting"], - "docs": ["CONFIGURATION_SETTINGS.md", "AZURE_APP_CONFIGURATION.md"] - }, - { - "id": "c-hidden-setting", - "name": "Hidden Setting", - "domain_id": "d07", - "definition": "Configuration value never published to Azure App Configuration. 
Runtime-derived from composite connection strings or infrastructure state.", - "related_concept_ids": ["c-warm-setting", "c-cold-setting"], - "docs": ["CONFIGURATION_SETTINGS.md"] - }, - { - "id": "c-sentinel", - "name": "Sentinel", - "domain_id": "d07", - "definition": "Special key (Warm:Sentinel) in Azure App Configuration. Bumping its value to any new value triggers hot-reload of all Warm settings across all running proxy instances.", - "app_config_key": "Warm:Sentinel", - "related_concept_ids": ["c-warm-setting"], - "docs": ["AZURE_APP_CONFIGURATION.md"] - }, - { - "id": "c-composite-connection-string", - "name": "Composite Connection String", - "domain_id": "d07", - "definition": "Semicolon or comma-delimited key=value string encoding multiple related settings in one environment variable.", - "examples": ["Host1", "AsyncBlobStorageConfig", "AsyncSBConfig", "HealthProbeSidecar"], - "related_concept_ids": ["c-connection-string-format"], - "docs": ["BACKEND_HOSTS.md", "AsyncOperation.md"] - }, - { - "id": "c-app-config-label", - "name": "App Configuration Label", - "domain_id": "d07", - "definition": "Label filter applied when reading from Azure App Configuration. Allows isolated setting namespaces per environment.", - "key_settings": ["AZURE_APPCONFIG_LABEL"], - "related_concept_ids": ["c-warm-setting"], - "docs": ["AZURE_APP_CONFIGURATION.md"] - }, - { - "id": "c-refresh-interval", - "name": "Refresh Interval", - "domain_id": "d07", - "definition": "How often in seconds the proxy polls the Sentinel key in Azure App Configuration.", - "key_settings": ["AZURE_APPCONFIG_REFRESH_INTERVAL_SECONDS"], - "default_value": 30, - "unit": "seconds", - "related_concept_ids": ["c-sentinel", "c-warm-setting"], - "docs": ["AZURE_APP_CONFIGURATION.md", "CONFIGURATION_SETTINGS.md"] - }, - - { - "id": "c-managed-identity", - "name": "Managed Identity", - "domain_id": "d08", - "definition": "Azure-managed credential attached to the container. 
Used for keyless authentication to backends, App Configuration, Event Hubs, Blob Storage, and Service Bus.", - "connection_string_key": "usemi", - "connection_string_value": "usemi=true", - "related_concept_ids": ["c-keyless-auth", "c-oauth2-bearer"], - "docs": ["AI_FOUNDRY_INTEGRATION.md", "BACKEND_HOSTS.md"] - }, - { - "id": "c-oauth2-bearer", - "name": "OAuth2 Bearer Token", - "domain_id": "d08", - "definition": "Proxy acquires an Entra ID token and attaches it as Authorization: Bearer on outbound backend requests.", - "connection_string_key": "useoauth", - "connection_string_value": "useoauth=true", - "related_concept_ids": ["c-managed-identity"], - "docs": ["BACKEND_HOSTS.md", "AI_FOUNDRY_INTEGRATION.md"] - }, - { - "id": "c-keyless-auth", - "name": "Keyless Auth", - "domain_id": "d08", - "definition": "Configuration pattern using usemi=true and audience=https://cognitiveservices.azure.com to eliminate API key management for Azure OpenAI and AI Foundry backends.", - "connection_string_example": "host=…;usemi=true;audience=https://cognitiveservices.azure.com", - "related_concept_ids": ["c-managed-identity", "c-oauth2-bearer"], - "docs": ["AI_FOUNDRY_INTEGRATION.md"] - }, - { - "id": "c-vnet-deployment", - "name": "VNet Deployment", - "domain_id": "d08", - "definition": "Deployment pattern where the proxy runs entirely inside a Virtual Network with no external data plane dependencies. Supports sovereign and government cloud regions.", - "related_concept_ids": ["c-managed-identity"], - "docs": ["OVERVIEW.md", "CONTAINER_DEPLOYMENT.md"] - }, - - { - "id": "c-azure-container-apps", - "name": "Azure Container Apps", - "domain_id": "d09", - "definition": "Primary hosting platform for SimpleL7Proxy. 
Provides ingress, auto-scaling, managed identity, and revision management.", - "related_concept_ids": ["c-sidecar-deployment", "c-container-image"], - "docs": ["CONTAINER_DEPLOYMENT.md", "SIDECAR_DEPLOYMENT.md"] - }, - { - "id": "c-container-image", - "name": "Container Image", - "domain_id": "d09", - "definition": "Docker image pushed to Azure Container Registry (ACR) and deployed to Azure Container Apps.", - "key_settings": ["ACR", "REGISTRY_SERVER"], - "related_concept_ids": ["c-azure-container-apps"], - "docs": ["CONTAINER_DEPLOYMENT.md", "SIDECAR_DEPLOYMENT.md"] - }, - { - "id": "c-sidecar-deployment", - "name": "Sidecar Deployment", - "domain_id": "d09", - "definition": "Two-container ACA revision: proxy container on port 8000 and health probe container on port 9000. Both containers share localhost networking within the revision.", - "key_settings": ["HealthProbeSidecar"], - "ports": { "proxy": 8000, "health_probe": 9000 }, - "related_concept_ids": ["c-sidecar-mode", "c-azure-container-apps"], - "docs": ["SIDECAR_DEPLOYMENT.md"] - }, - { - "id": "c-parameters-file", - "name": "Parameters File", - "code_file": "deploy.parameters.sh", - "domain_id": "d09", - "definition": "Single source of truth for all build and deploy scripts in the sidecar deployment. Set values once; all scripts read from this file.", - "path": "deployment/proxy-with-sidecar/deploy.parameters.sh", - "related_concept_ids": ["c-sidecar-deployment"], - "docs": ["SIDECAR_DEPLOYMENT.md"] - }, - { - "id": "c-apim-integration", - "name": "APIM Integration", - "domain_id": "d09", - "definition": "Azure API Management as a front-end to SimpleL7Proxy. 
Adds semantic caching, PII scrubbing, TPM-based rate limiting, quota enforcement, and mTLS.", - "related_concept_ids": ["c-ai-foundry-integration"], - "docs": ["AI_FOUNDRY_INTEGRATION.md", "OVERVIEW.md"] - }, - { - "id": "c-ai-foundry-integration", - "name": "AI Foundry Integration", - "domain_id": "d09", - "definition": "Direct or APIM-mediated connection to Azure OpenAI and AI Foundry model endpoints. Requires processor=OpenAI for token count extraction.", - "key_settings": ["processor", "usemi"], - "related_concept_ids": ["c-processor", "c-apim-integration", "c-keyless-auth"], - "docs": ["AI_FOUNDRY_INTEGRATION.md"] - } - ], - - "settings": [ - { "name": "AcceptableStatusCodes", "type": "int[]", "reload": "Warm", "default": [200,202,400,401,403,404,408,410,412,417], "concept_ids": ["c-acceptable-status-codes","c-circuit-breaker","c-pass-through"] }, - { "name": "AsyncBlobStorageConfig","type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-result-blob","c-async-mode"] }, - { "name": "AsyncBlobWorkerCount", "type": "int", "reload": "Cold", "default": 2, "concept_ids": ["c-async-blob-worker"] }, - { "name": "AsyncClientConfigFieldName","type":"string","reload":"Warm", "default": "async-config", "concept_ids": ["c-async-config-field"] }, - { "name": "AsyncClientRequestHeader","type":"string","reload":"Warm", "default": "S7PAsyncMode", "concept_ids": ["c-opt-in-header"] }, - { "name": "AsyncModeEnabled", "type": "bool", "reload": "Cold", "default": false, "concept_ids": ["c-async-mode"] }, - { "name": "AsyncSBConfig", "type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-service-bus-status"] }, - { "name": "AsyncTimeout", "type": "int", "reload": "Warm", "default": 1800000, "unit": "ms", "concept_ids": ["c-async-timeout"] }, - { "name": "AsyncTriggerTimeout", "type": "int", "reload": "Warm", "default": 10000, "unit": "ms", "concept_ids": ["c-async-trigger-timeout"] }, - { "name": "AsyncTTLSecs", "type": "int", "reload": "Warm", 
"default": 86400, "unit": "s", "concept_ids": ["c-async-ttl-secs"] }, - { "name": "APPINSIGHTS_CONNECTIONSTRING","type":"string","reload":"Cold","default": null, "concept_ids": ["c-application-insights"] }, - { "name": "AZURE_APPCONFIG_CONNECTION_STRING","type":"string","reload":"Cold","default":null,"concept_ids": ["c-warm-setting","c-cold-setting"] }, - { "name": "AZURE_APPCONFIG_ENDPOINT","type":"string","reload":"Cold", "default": null, "concept_ids": ["c-warm-setting","c-cold-setting"] }, - { "name": "AZURE_APPCONFIG_LABEL", "type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-app-config-label"] }, - { "name": "AZURE_APPCONFIG_REFRESH_INTERVAL_SECONDS","type":"int","reload":"Cold","default":30,"unit":"s","concept_ids":["c-refresh-interval"] }, - { "name": "BlobRetentionDays", "type": "int", "reload": "Warm", "default": 7, "unit": "days","concept_ids": ["c-blob-lifecycle-policy"] }, - { "name": "CBErrorThreshold", "type": "int", "reload": "Warm", "default": 50, "concept_ids": ["c-circuit-breaker","c-sliding-window"] }, - { "name": "CBTimeslice", "type": "int", "reload": "Warm", "default": 60, "unit": "s", "concept_ids": ["c-circuit-breaker","c-sliding-window"] }, - { "name": "DefaultPriority", "type": "int", "reload": "Warm", "default": 2, "concept_ids": ["c-default-priority","c-priority-level"] }, - { "name": "DefaultTTLSecs", "type": "int", "reload": "Warm", "default": 300, "unit": "s", "concept_ids": ["c-ttl"] }, - { "name": "DisallowedHeaders", "type": "string", "reload": "Warm", "default": null, "concept_ids": ["c-disallowed-headers","c-header-stripping"] }, - { "name": "DnsRefreshTimeout", "type": "int", "reload": "Cold", "default": 120000, "unit": "ms", "concept_ids": ["c-backend-host"] }, - { "name": "EVENT_HEADERS", "type": "string", "reload": "Cold", "default": "SimpleL7Proxy.Events.CommonEventHeaders", "concept_ids": ["c-event-headers"] }, - { "name": "EVENT_LOGGERS", "type": "string", "reload": "Cold", "default": "file", 
"concept_ids": ["c-event-logger-backend"] }, - { "name": "EVENTHUB_CONNECTIONSTRING","type":"string","reload":"Cold", "default": null, "concept_ids": ["c-event-hubs"] }, - { "name": "EVENTHUB_NAME", "type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-event-hubs"] }, - { "name": "EVENTHUB_NAMESPACE", "type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-event-hubs"] }, - { "name": "HealthProbeSidecar", "type": "string", "reload": "Warm", "default": "Enabled=false;url=http://localhost:9000", "concept_ids": ["c-sidecar-mode","c-sidecar-deployment"] }, - { "name": "HEALTHPROBE_PORT", "type": "int", "reload": "Cold", "default": 9000, "concept_ids": ["c-sidecar-deployment"] }, - { "name": "Host1", "type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-backend-host","c-connection-string-format"] }, - { "name": "IterationMode", "type": "string", "reload": "Warm", "default": "SinglePass", "allowed_values":["SinglePass","MultiPass"], "concept_ids": ["c-iteration-mode"] }, - { "name": "KeepAliveInitialDelaySecs","type":"int", "reload":"Cold", "default": 60, "unit":"s", "concept_ids": ["c-backend-host"] }, - { "name": "KeepAlivePingIntervalSecs","type":"int", "reload":"Cold", "default": 60, "unit":"s", "concept_ids": ["c-backend-host"] }, - { "name": "LoadBalanceMode", "type": "string", "reload": "Warm", "default": "latency","allowed_values":["roundrobin","latency","random"], "concept_ids": ["c-load-balance-mode"] }, - { "name": "LOGFILE_NAME", "type": "string", "reload": "Cold", "default": "eventslog.json", "concept_ids": ["c-log-file"] }, - { "name": "LogHeaders", "type": "string[]","reload":"Warm", "default": [], "concept_ids": ["c-proxy-event"] }, - { "name": "LogAllRequestHeaders", "type": "bool", "reload": "Warm", "default": false, "concept_ids": ["c-proxy-event"] }, - { "name": "LogAllResponseHeaders", "type": "bool", "reload": "Warm", "default": false, "concept_ids": ["c-proxy-event"] }, - { "name": "MaxAttempts", 
"type": "int", "reload": "Warm", "default": 10, "concept_ids": ["c-max-attempts","c-retry"] }, - { "name": "MaxEvents", "type": "int", "reload": "Cold", "default": 100000, "concept_ids": ["c-proxy-event"] }, - { "name": "MaxQueueLength", "type": "int", "reload": "Cold", "default": 1000, "concept_ids": ["c-max-queue-length","c-enqueue"] }, - { "name": "MultiConnMaxConns", "type": "int", "reload": "Cold", "default": 4000, "concept_ids": ["c-backend-host"] }, - { "name": "PollInterval", "type": "int", "reload": "Cold", "default": 15000, "unit": "ms", "concept_ids": ["c-health-poller"] }, - { "name": "PollTimeout", "type": "int", "reload": "Cold", "default": 3000, "unit": "ms", "concept_ids": ["c-health-poller"] }, - { "name": "Port", "type": "int", "reload": "Cold", "default": 80, "concept_ids": ["c-listener"] }, - { "name": "PriorityKeyHeader", "type": "string", "reload": "Warm", "default": "S7PPriorityKey", "concept_ids": ["c-priority-key-header","c-priority-mapping"] }, - { "name": "PriorityKeys", "type": "string[]","reload":"Warm", "default": null, "concept_ids": ["c-priority-keys","c-priority-mapping"] }, - { "name": "PriorityValues", "type": "int[]", "reload": "Warm", "default": null, "concept_ids": ["c-priority-values","c-priority-mapping"] }, - { "name": "PriorityWorkers", "type": "string", "reload": "Warm", "default": "2:1,3:1","format":"level:count,level:count", "concept_ids": ["c-priority-workers"] }, - { "name": "RequiredHeaders", "type": "string[]","reload":"Warm", "default": [], "concept_ids": ["c-required-headers"] }, - { "name": "SharedIteratorCleanupIntervalSeconds","type":"int","reload":"Cold","default":30,"unit":"s","concept_ids":["c-shared-iterator"] }, - { "name": "SharedIteratorTTLSeconds","type":"int", "reload": "Cold", "default": 60, "unit":"s", "concept_ids": ["c-shared-iterator"] }, - { "name": "StorageDbContainerName","type": "string", "reload": "Warm", "default": "Requests","concept_ids": ["c-result-blob","c-blob-lifecycle-policy"] }, - { 
"name": "SuccessRate", "type": "int", "reload": "Cold", "default": 80, "unit":"percent","concept_ids": ["c-success-rate","c-active-pool"] }, - { "name": "SuspendedUserConfigUrl","type": "string", "reload": "Warm", "default": null, "concept_ids": ["c-suspended-user"] }, - { "name": "Timeout", "type": "int", "reload": "Cold", "default": 1200000, "unit":"ms", "concept_ids": ["c-timeout"] }, - { "name": "TimeoutHeader", "type": "string", "reload": "Cold", "default": "S7PTimeout", "concept_ids": ["c-timeout"] }, - { "name": "TTLHeader", "type": "string", "reload": "Cold", "default": "S7PTTL","concept_ids": ["c-ttl"] }, - { "name": "UseProfiles", "type": "bool", "reload": "Cold", "default": false, "concept_ids": ["c-user-profile"] }, - { "name": "UseSharedIterators", "type": "bool", "reload": "Cold", "default": true, "concept_ids": ["c-shared-iterator"] }, - { "name": "UserConfigRefreshIntervalSecs","type":"int","reload":"Cold", "default": 3600, "unit":"s", "concept_ids": ["c-profile-refresh"] }, - { "name": "UserConfigRequired", "type": "bool", "reload": "Cold", "default": false, "concept_ids": ["c-user-profile"] }, - { "name": "UserConfigUrl", "type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-user-profile"] }, - { "name": "UserIDFieldName", "type": "string", "reload": "Cold", "default": "userId","concept_ids": ["c-user-id"] }, - { "name": "UserPriorityThreshold", "type": "float", "reload": "Warm", "default": 0.1, "range":"0.0-1.0", "concept_ids": ["c-per-user-throttling"] }, - { "name": "UserSoftDeleteTTLMinutes","type":"int", "reload":"Cold", "default": 360, "unit":"min", "concept_ids": ["c-suspended-user"] }, - { "name": "ValidateAuthAppFieldName","type":"string","reload":"Warm", "default": "authAppID","concept_ids":["c-app-id-allowlist"] }, - { "name": "ValidateAuthAppID", "type": "bool", "reload": "Warm", "default": false, "concept_ids": ["c-app-id-validation"] }, - { "name": "ValidateAuthAppIDHeader","type":"string", "reload": "Warm", 
"default": "X-MS-CLIENT-PRINCIPAL-ID","concept_ids":["c-app-id-validation"] }, - { "name": "ValidateAuthAppIDUrl", "type": "string", "reload": "Warm", "default": "file:auth.json","concept_ids":["c-app-id-validation","c-app-id-allowlist"] }, - { "name": "ValidateHeaders", "type": "string", "reload": "Warm", "default": null, "format":"SourceHeader=AllowlistHeader", "concept_ids":["c-header-validation-rule"] }, - { "name": "Workers", "type": "int", "reload": "Cold", "default": 10, "concept_ids": ["c-workers","c-proxy-worker"] } - ], - - "response_codes": [ - { "code": 202, "name": "Accepted", "origin": "proxy", "cause": "Async upgrade: client released early with blob URIs", "concept_ids": ["c-202-response","c-async-upgrade"] }, - { "code": 400, "name": "Bad Request", "origin": "proxy", "cause": "Malformed S7PTTL header value (error: InvalidTTL)", "concept_ids": ["c-ttl"] }, - { "code": 403, "name": "Forbidden", "origin": "proxy", "cause": "App ID not in allowlist (DisallowedAppID); user not in profile or suspended (UnknownProfile)", "concept_ids": ["c-app-id-validation","c-suspended-user","c-user-profile"] }, - { "code": 408, "name": "Request Timeout", "origin": "proxy", "cause": "IO exception or task cancellation on backend communication", "concept_ids": ["c-timeout"] }, - { "code": 412, "name": "Precondition Failed", "origin": "proxy", "cause": "Request TTL expired before dispatch (TTLExpired)", "concept_ids": ["c-ttl"] }, - { "code": 417, "name": "Expectation Failed", "origin": "proxy", "cause": "Required header missing (IncompleteHeaders); header validation rule failed (InvalidHeader)", "concept_ids": ["c-required-headers","c-header-validation-rule"] }, - { "code": 429, "name": "Too Many Requests", "origin": "proxy", "cause": "Queue full; all circuits OPEN; no active hosts; max events exceeded", "concept_ids": ["c-max-queue-length","c-global-blocked-check","c-active-pool"] }, - { "code": 500, "name": "Internal Server Error", "origin": "proxy", "cause": "Unhandled 
exception; request body too large (ContentTooLarge)", "concept_ids": [] }, - { "code": 503, "name": "Service Unavailable", "origin": "proxy", "cause": "All backends exhausted; exception during enqueue; all circuit breakers blocked", "concept_ids": ["c-global-blocked-check","c-retry","c-active-pool"] } - ], - - "request_headers": [ - { "name": "S7PPriorityKey", "direction": "client-inbound", "configurable_name_via": "PriorityKeyHeader", "definition": "Priority tier value. Looked up in PriorityKeys to determine the request's priority level.", "concept_ids": ["c-priority-key-header","c-priority-mapping"] }, - { "name": "S7PTTL", "direction": "client-inbound", "configurable_name_via": "TTLHeader", "definition": "Per-request TTL override in seconds. Formats: integer, decimal, absolute Unix timestamp, ISO 8601.", "concept_ids": ["c-ttl"] }, - { "name": "S7PTimeout", "direction": "client-inbound", "configurable_name_via": "TimeoutHeader", "definition": "Per-request per-host timeout override in milliseconds.", "concept_ids": ["c-timeout"] }, - { "name": "S7PAsyncMode", "direction": "client-inbound", "configurable_name_via": "AsyncClientRequestHeader", "definition": "Opt-in header enabling async mode for this specific request.", "concept_ids": ["c-opt-in-header","c-async-mode"] }, - { "name": "S7PDEBUG", "direction": "client-inbound", "configurable_name_via": null, "definition": "Set to true to enable per-request debug tracing in logs.", "concept_ids": [] }, - { "name": "S7PREQUEUE", "direction": "backend-response", "configurable_name_via": null, "definition": "Set by a backend on a 429 response to signal the proxy should requeue the request.", "concept_ids": ["c-s7prequeue-header","c-requeue"] } - ], - - "response_headers": [ - { "name": "x-Request-Queue-Duration", "definition": "Milliseconds the request spent in the priority queue before a worker picked it up.", "concept_ids": ["c-priority-queue"] }, - { "name": "x-Request-Process-Duration", "definition": "Milliseconds 
spent in the proxy worker from dequeue to final response write.", "concept_ids": ["c-proxy-worker"] }, - { "name": "x-Request-Worker", "definition": "Numeric ID of the worker thread that handled the request.", "concept_ids": ["c-workers"] }, - { "name": "BackendHost", "definition": "Hostname of the backend that served the successful response.", "concept_ids": ["c-backend-host"] }, - { "name": "Total-Latency", "definition": "Total milliseconds from enqueue to response complete, covering both queue wait and processing.", "concept_ids": ["c-proxy-event"] } - ], - - "relationships": [ - { "from": "c-ttl", "to": "c-timeout", "type": "bounds", "description": "TTL caps total request lifetime; Timeout caps each individual attempt." }, - { "from": "c-async-trigger-timeout", "to": "c-async-timeout", "type": "succeeds", "description": "AsyncTriggerTimeout fires first to release the client; AsyncTimeout governs the subsequent backend phase." }, - { "from": "c-path-filter", "to": "c-load-balance-mode", "type": "precedes", "description": "Path filter runs before load-balance ordering in the backend selection pipeline." }, - { "from": "c-load-balance-mode", "to": "c-circuit-breaker", "type": "precedes", "description": "Load balancer orders hosts; circuit breaker gates each host before attempt." }, - { "from": "c-health-poller", "to": "c-active-pool", "type": "populates", "description": "Health poller results determine which hosts are in the active pool." }, - { "from": "c-active-pool", "to": "c-circuit-breaker", "type": "feeds", "description": "Active pool hosts are also subject to per-host circuit breaker evaluation." }, - { "from": "c-circuit-breaker", "to": "c-max-attempts", "type": "bypasses", "description": "OPEN circuits are skipped without consuming a MaxAttempts budget entry." }, - { "from": "c-sentinel", "to": "c-warm-setting", "type": "triggers", "description": "Bumping Sentinel triggers hot-reload of all Warm settings." 
}, - { "from": "c-validation-pipeline", "to": "c-enqueue", "type": "precedes", "description": "Validation pipeline must pass before a request is enqueued." }, - { "from": "c-user-profile", "to": "c-header-validation-rule","type":"provides_data", "description": "User profile supplies the allowlist header values consumed by header validation rules." }, - { "from": "c-async-upgrade", "to": "c-result-blob", "type": "produces", "description": "Async upgrade initiates background processing that ultimately produces a result blob." }, - { "from": "c-result-blob", "to": "c-blob-lifecycle-policy","type":"governed_by", "description": "Result blobs are deleted by the Azure Storage lifecycle policy, not by the proxy setting alone." }, - { "from": "c-processor", "to": "c-token-telemetry", "type": "enables", "description": "processor=OpenAI is required for the proxy to extract token counts from SSE streams." }, - { "from": "c-composite-event-client","to": "c-event-logger-backend","type": "dispatches_to", "description": "CompositeEventClient fans out every ProxyEvent to all registered backends." }, - { "from": "c-connection-string-format","to":"c-legacy-format", "type": "supersedes", "description": "Connection string format supports all modern options; legacy format is deprecated." }, - { "from": "c-direct-mode", "to": "c-latency-iterator", "type": "influences", "description": "Direct-mode hosts report 0 ms average latency and always sort first in latency mode." } - ] -} +{ + "meta": { + "schema_version": "1.0", + "generated_date": "2026-05-21", + "application": "SimpleL7Proxy", + "description": "Machine-readable concept taxonomy derived from SimpleL7Proxy documentation. Not intended for human consumption.", + "source_docs_path": "SimpleL7Proxy/docs/" + }, + + "domains": [ + { "id": "d01", "name": "Request Lifecycle", "description": "End-to-end path a request travels from client to backend and back." 
}, + { "id": "d02", "name": "Backend Management", "description": "How the proxy discovers, probes, and selects backend hosts." }, + { "id": "d03", "name": "Reliability", "description": "Mechanisms that prevent failures from propagating to clients." }, + { "id": "d04", "name": "Request Governance", "description": "Validation and priority rules applied before a request enters the queue." }, + { "id": "d05", "name": "Async Mode", "description": "Long-running request handling that decouples client wait from backend processing." }, + { "id": "d06", "name": "Observability", "description": "How the proxy exposes telemetry about its own operation." }, + { "id": "d07", "name": "Configuration Management", "description": "How settings reach the proxy and when they take effect." }, + { "id": "d08", "name": "Authentication and Security","description": "How the proxy authenticates to backends and restricts inbound callers." }, + { "id": "d09", "name": "Deployment Architecture", "description": "How the proxy is packaged and run on Azure." }, + { "id": "d10", "name": "Protocol and Headers", "description": "Named HTTP signals that cross the client-proxy and proxy-backend boundaries." 
} + ], + + "subdomains": [ + { "id": "sd01-01", "domain_id": "d01", "name": "Ingress" }, + { "id": "sd01-02", "domain_id": "d01", "name": "Priority Queue" }, + { "id": "sd01-03", "domain_id": "d01", "name": "Worker Dispatch" }, + { "id": "sd01-04", "domain_id": "d01", "name": "Response" }, + { "id": "sd02-01", "domain_id": "d02", "name": "Host Configuration" }, + { "id": "sd02-02", "domain_id": "d02", "name": "Health Polling" }, + { "id": "sd02-03", "domain_id": "d02", "name": "Backend Selection Pipeline" }, + { "id": "sd03-01", "domain_id": "d03", "name": "Circuit Breaker" }, + { "id": "sd03-02", "domain_id": "d03", "name": "Retry and Requeue" }, + { "id": "sd03-03", "domain_id": "d03", "name": "Timeout Model" }, + { "id": "sd04-01", "domain_id": "d04", "name": "Validation Pipeline" }, + { "id": "sd04-02", "domain_id": "d04", "name": "Validation Settings" }, + { "id": "sd04-03", "domain_id": "d04", "name": "User Profiles" }, + { "id": "sd04-04", "domain_id": "d04", "name": "Priority Mapping" }, + { "id": "sd06-01", "domain_id": "d06", "name": "Event Model" }, + { "id": "sd06-02", "domain_id": "d06", "name": "Telemetry Sinks" }, + { "id": "sd06-03", "domain_id": "d06", "name": "Health Endpoints" } + ], + + "concepts": [ + + { + "id": "c-listener", + "name": "Listener", + "code_file": "Server.cs", + "domain_id": "d01", + "subdomain_id": "sd01-01", + "definition": "Accepts inbound HTTP connections and submits each request to the priority queue.", + "key_settings": ["Port", "Workers"], + "related_concept_ids": ["c-priority-queue", "c-enqueue", "c-validation-pipeline"], + "docs": ["design.md", "ENVIRONMENT_VARIABLES.md"] + }, + { + "id": "c-request-data", + "name": "RequestData", + "code_file": "RequestData.cs", + "domain_id": "d01", + "subdomain_id": "sd01-01", + "definition": "Runtime object holding all data for one inbound request: headers, body, path, assigned priority, and TTL expiry.", + "key_settings": [], + "related_concept_ids": ["c-proxy-data", "c-enqueue"], 
+ "docs": ["design.md"] + }, + { + "id": "c-enqueue", + "name": "Enqueue", + "domain_id": "d01", + "subdomain_id": "sd01-01", + "definition": "Submitting a RequestData into the priority queue. The TTL clock starts at this moment.", + "key_settings": ["MaxQueueLength"], + "related_concept_ids": ["c-ttl", "c-priority-queue", "c-max-queue-length", "c-validation-pipeline"], + "docs": ["RESPONSE_CODES.md", "TIMEOUTS.md"] + }, + { + "id": "c-max-queue-length", + "name": "Max Queue Length", + "domain_id": "d01", + "subdomain_id": "sd01-01", + "definition": "Hard cap on queued requests. Requests arriving when the queue is full receive 429 Too Many Requests.", + "key_settings": ["MaxQueueLength"], + "related_concept_ids": ["c-enqueue"], + "docs": ["ENVIRONMENT_VARIABLES.md", "RESPONSE_CODES.md"] + }, + { + "id": "c-priority-queue", + "name": "Priority Queue", + "code_file": "PriorityQueue.cs", + "domain_id": "d01", + "subdomain_id": "sd01-02", + "definition": "Min-heap data structure ordered by priority level integer. Lower integer exits first.", + "key_settings": ["DefaultPriority"], + "related_concept_ids": ["c-priority-level", "c-proxy-worker"], + "docs": ["design.md", "ADVANCED_CONFIGURATION.md"] + }, + { + "id": "c-priority-level", + "name": "Priority Level", + "domain_id": "d01", + "subdomain_id": "sd01-02", + "definition": "Integer assigned to every request. 
Lower number means higher dispatch precedence in the queue.", + "key_settings": ["PriorityValues", "DefaultPriority"], + "related_concept_ids": ["c-priority-queue", "c-priority-mapping", "c-default-priority"], + "docs": ["ADVANCED_CONFIGURATION.md", "ENVIRONMENT_VARIABLES.md"] + }, + { + "id": "c-default-priority", + "name": "Default Priority", + "domain_id": "d01", + "subdomain_id": "sd01-02", + "definition": "Priority level assigned when the request carries no matching PriorityKeyHeader value.", + "key_settings": ["DefaultPriority"], + "default_value": 2, + "related_concept_ids": ["c-priority-level", "c-priority-mapping"], + "docs": ["ADVANCED_CONFIGURATION.md", "CONFIGURATION_SETTINGS.md"] + }, + { + "id": "c-ttl", + "name": "TTL (Time-to-Live)", + "domain_id": "d01", + "subdomain_id": "sd01-02", + "definition": "Wall-clock budget for the entire request lifecycle covering queue wait plus all retry attempts. Expiry returns 412.", + "key_settings": ["DefaultTTLSecs"], + "per_request_override_header": "S7PTTL", + "unit": "seconds", + "related_concept_ids": ["c-timeout", "c-enqueue"], + "docs": ["TIMEOUTS.md", "RESPONSE_CODES.md"] + }, + { + "id": "c-proxy-worker", + "name": "ProxyWorker", + "code_file": "ProxyWorker.cs", + "domain_id": "d01", + "subdomain_id": "sd01-03", + "definition": "Thread that dequeues a request and drives the backend selection, send, and response write cycle.", + "key_settings": ["Workers"], + "related_concept_ids": ["c-priority-queue", "c-backend-selection-pipeline"], + "docs": ["design.md"] + }, + { + "id": "c-workers", + "name": "Workers", + "domain_id": "d01", + "subdomain_id": "sd01-03", + "definition": "Count of concurrent proxy worker threads. 
Cold setting; default 10 is for local testing only.", + "key_settings": ["Workers"], + "reload_type": "Cold", + "default_value": 10, + "related_concept_ids": ["c-proxy-worker", "c-priority-workers"], + "docs": ["ENVIRONMENT_VARIABLES.md", "CONFIGURATION_SETTINGS.md"] + }, + { + "id": "c-proxy-data", + "name": "ProxyData", + "code_file": "ProxyData.cs", + "domain_id": "d01", + "subdomain_id": "sd01-04", + "definition": "Runtime object holding a backend response: HTTP status code, response headers, and body.", + "key_settings": [], + "related_concept_ids": ["c-request-data", "c-pass-through"], + "docs": ["design.md"] + }, + { + "id": "c-pass-through", + "name": "Pass-through", + "domain_id": "d01", + "subdomain_id": "sd01-04", + "definition": "Any backend status code in AcceptableStatusCodes is forwarded directly to the client without triggering retry or circuit recording.", + "key_settings": ["AcceptableStatusCodes"], + "related_concept_ids": ["c-acceptable-status-codes", "c-circuit-breaker"], + "docs": ["RESPONSE_CODES.md", "CIRCUIT_BREAKER.md"] + }, + { + "id": "c-response-headers-injected", + "name": "Injected Response Headers", + "domain_id": "d01", + "subdomain_id": "sd01-04", + "definition": "HTTP headers the proxy appends to every successfully proxied response before returning to the client.", + "header_names": [ + "x-Request-Queue-Duration", + "x-Request-Process-Duration", + "x-Request-Worker", + "BackendHost", + "Total-Latency" + ], + "related_concept_ids": ["c-proxy-event"], + "docs": ["RESPONSE_CODES.md"] + }, + + { + "id": "c-backend-host", + "name": "Backend Host", + "domain_id": "d02", + "subdomain_id": "sd02-01", + "definition": "A single upstream endpoint the proxy can forward requests to. 
Configured as Host1 through Host9.", + "key_settings": ["Host1","Host2","Host3","Host4","Host5","Host6","Host7","Host8","Host9"], + "connection_string_keys": ["host","probe","path","mode","processor","usemi","useoauth","audience","ipaddress","stripprefix","retryafter"], + "related_concept_ids": ["c-connection-string-format", "c-health-poller", "c-circuit-breaker"], + "docs": ["BACKEND_HOSTS.md", "ENVIRONMENT_VARIABLES.md"] + }, + { + "id": "c-connection-string-format", + "name": "Connection String Format", + "domain_id": "d02", + "subdomain_id": "sd02-01", + "definition": "Preferred per-host configuration using a semicolon-delimited key=value string. Only this format supports path, mode, usemi, processor, and other modern options.", + "example": "host=https://api.example.com;probe=/health;path=/api;usemi=true", + "related_concept_ids": ["c-legacy-format", "c-composite-connection-string"], + "docs": ["BACKEND_HOSTS.md"] + }, + { + "id": "c-legacy-format", + "name": "Legacy Format", + "domain_id": "d02", + "subdomain_id": "sd02-01", + "definition": "Deprecated per-variable host configuration. Cannot express path, mode, usemi, or other modern per-host options. Must not be used in new deployments.", + "deprecated": true, + "deprecated_vars": ["Probe_path1","Probe_path2","Probe_path3","IP1","IP2"], + "related_concept_ids": ["c-connection-string-format"], + "docs": ["BACKEND_HOSTS.md", "ENVIRONMENT_VARIABLES.md"] + }, + { + "id": "c-path-prefix", + "name": "Path Prefix", + "domain_id": "d02", + "subdomain_id": "sd02-01", + "definition": "URL prefix configured on a host via the path connection string key. 
Requests whose URL starts with this prefix are routed to this host.", + "connection_string_key": "path", + "related_concept_ids": ["c-path-filter", "c-catch-all-host", "c-strip-prefix"], + "docs": ["BACKEND_HOSTS.md", "LOAD_BALANCING.md"] + }, + { + "id": "c-catch-all-host", + "name": "Catch-all Host", + "domain_id": "d02", + "subdomain_id": "sd02-01", + "definition": "A host with path=/ or no path configured. Receives requests that match no specific-path host.", + "connection_string_key": "path", + "related_concept_ids": ["c-path-prefix", "c-path-filter"], + "docs": ["BACKEND_HOSTS.md", "LOAD_BALANCING.md"] + }, + { + "id": "c-ip-override", + "name": "IP Override", + "domain_id": "d02", + "subdomain_id": "sd02-01", + "definition": "Forces all requests to a specific IP address, bypassing DNS resolution for that host.", + "connection_string_key": "ipaddress", + "related_concept_ids": ["c-backend-host"], + "docs": ["BACKEND_HOSTS.md"] + }, + { + "id": "c-strip-prefix", + "name": "Strip Prefix", + "domain_id": "d02", + "subdomain_id": "sd02-01", + "definition": "When true (default), the matched path prefix is removed from the URL before forwarding. Set stripprefix=false to preserve the full original path.", + "connection_string_key": "stripprefix", + "default_value": true, + "related_concept_ids": ["c-path-prefix"], + "docs": ["BACKEND_HOSTS.md"] + }, + { + "id": "c-processor", + "name": "Processor", + "domain_id": "d02", + "subdomain_id": "sd02-01", + "definition": "Named stream processor applied to the backend response after receipt. 
The built-in OpenAI processor extracts token usage counts from SSE streams.", + "connection_string_key": "processor", + "known_values": ["OpenAI"], + "related_concept_ids": ["c-token-telemetry", "c-direct-mode"], + "docs": ["AI_FOUNDRY_INTEGRATION.md", "OBSERVABILITY.md"] + }, + { + "id": "c-direct-mode", + "name": "Direct Mode", + "domain_id": "d02", + "subdomain_id": "sd02-01", + "definition": "Backend mode where the host is always treated as healthy. No probe is ever sent. In latency mode the host sorts first (0 ms average latency). Use for serverless or on-demand backends that must not be woken by probes.", + "connection_string_key": "mode", + "connection_string_value": "mode=direct", + "related_concept_ids": ["c-health-poller", "c-processor", "c-backend-host"], + "docs": ["BACKEND_HOSTS.md", "AI_FOUNDRY_INTEGRATION.md"] + }, + + { + "id": "c-health-poller", + "name": "Health Poller", + "code_file": "Backends.cs", + "domain_id": "d02", + "subdomain_id": "sd02-02", + "definition": "Background loop that probes each configured host at PollInterval ms intervals and tracks rolling success rate and average latency.", + "key_settings": ["PollInterval", "PollTimeout"], + "related_concept_ids": ["c-probe-path", "c-success-rate", "c-active-pool", "c-average-latency"], + "docs": ["BACKEND_HOSTS.md", "HEALTH_CHECKING.md"] + }, + { + "id": "c-probe-path", + "name": "Probe Path", + "domain_id": "d02", + "subdomain_id": "sd02-02", + "definition": "URL path sent as a GET request to test backend health. Ignored when mode=direct.", + "connection_string_key": "probe", + "default_value": "echo/resource?param1=sample", + "related_concept_ids": ["c-health-poller", "c-direct-mode"], + "docs": ["BACKEND_HOSTS.md"] + }, + { + "id": "c-success-rate", + "name": "Success Rate", + "domain_id": "d02", + "subdomain_id": "sd02-02", + "definition": "Rolling percentage of successful probe responses for a host. 
Hosts below the SuccessRate threshold are removed from the active pool until they recover.", + "key_settings": ["SuccessRate"], + "default_value": 80, + "unit": "percent", + "related_concept_ids": ["c-active-pool", "c-health-poller"], + "docs": ["BACKEND_HOSTS.md"] + }, + { + "id": "c-active-pool", + "name": "Active Pool", + "domain_id": "d02", + "subdomain_id": "sd02-02", + "definition": "Set of backend hosts currently eligible to receive traffic, filtered by rolling success rate threshold.", + "key_settings": ["SuccessRate"], + "related_concept_ids": ["c-success-rate", "c-health-poller", "c-circuit-breaker"], + "docs": ["BACKEND_HOSTS.md", "LOAD_BALANCING.md"] + }, + { + "id": "c-average-latency", + "name": "Average Latency", + "domain_id": "d02", + "subdomain_id": "sd02-02", + "definition": "Rolling average probe response time per host. Used to order hosts in latency load-balance mode. Direct-mode hosts report 0 ms and sort first.", + "key_settings": [], + "related_concept_ids": ["c-health-poller", "c-latency-iterator"], + "docs": ["BACKEND_HOSTS.md", "LOAD_BALANCING.md"] + }, + { + "id": "c-base-host-health", + "name": "BaseHostHealth", + "code_file": "BaseHostHealth.cs", + "domain_id": "d02", + "subdomain_id": "sd02-02", + "definition": "Runtime object holding a single host's health metrics: rolling success rate, average latency, and circuit breaker state.", + "key_settings": [], + "related_concept_ids": ["c-circuit-breaker", "c-success-rate", "c-average-latency"], + "docs": ["design.md"] + }, + + { + "id": "c-backend-selection-pipeline", + "name": "Backend Selection Pipeline", + "domain_id": "d02", + "subdomain_id": "sd02-03", + "definition": "Three-stage process run on every request: (1) path filter, (2) load-balance ordering, (3) per-host circuit-breaker gate.", + "stages": ["path-filter", "load-balance-mode", "circuit-breaker-gate"], + "related_concept_ids": ["c-path-filter", "c-load-balance-mode", "c-circuit-breaker"], + "docs": ["LOAD_BALANCING.md", 
"design.md"] + }, + { + "id": "c-iterator-factory", + "name": "IteratorFactory", + "code_file": "IteratorFactory.cs", + "domain_id": "d02", + "subdomain_id": "sd02-03", + "definition": "Creates a load-balance iterator for a given request path based on LoadBalanceMode configuration.", + "key_settings": ["LoadBalanceMode"], + "related_concept_ids": ["c-load-balance-mode", "c-shared-iterator"], + "docs": ["design.md", "LOAD_BALANCING.md"] + }, + { + "id": "c-path-filter", + "name": "Path Filter", + "domain_id": "d02", + "subdomain_id": "sd02-03", + "definition": "Stage 1 of backend selection. Specific-path hosts are checked first; if none match, catch-all hosts are used. Specific paths always win.", + "key_settings": ["path"], + "related_concept_ids": ["c-path-prefix", "c-catch-all-host", "c-backend-selection-pipeline"], + "docs": ["LOAD_BALANCING.md", "BACKEND_HOSTS.md"] + }, + { + "id": "c-load-balance-mode", + "name": "Load Balance Mode", + "domain_id": "d02", + "subdomain_id": "sd02-03", + "definition": "Stage 2 of backend selection. 
Determines the ordering of hosts within the matched set for each request.", + "key_settings": ["LoadBalanceMode"], + "allowed_values": ["roundrobin", "latency", "random"], + "default_value": "latency", + "reload_type": "Warm", + "related_concept_ids": ["c-round-robin-iterator", "c-latency-iterator", "c-random-iterator"], + "docs": ["LOAD_BALANCING.md"] + }, + { + "id": "c-round-robin-iterator", + "name": "Round-Robin Iterator", + "code_file": "RoundRobinHostIterator.cs", + "domain_id": "d02", + "subdomain_id": "sd02-03", + "definition": "Distributes requests evenly across active hosts using a global atomic counter.", + "config_value": "LoadBalanceMode=roundrobin", + "related_concept_ids": ["c-load-balance-mode", "c-shared-iterator"], + "docs": ["LOAD_BALANCING.md"] + }, + { + "id": "c-latency-iterator", + "name": "Latency Iterator", + "code_file": "LatencyHostIterator.cs", + "domain_id": "d02", + "subdomain_id": "sd02-03", + "definition": "Orders hosts by ascending average probe latency per request. Direct-mode hosts always sort first at 0 ms.", + "config_value": "LoadBalanceMode=latency", + "related_concept_ids": ["c-load-balance-mode", "c-average-latency", "c-direct-mode"], + "docs": ["LOAD_BALANCING.md"] + }, + { + "id": "c-random-iterator", + "name": "Random Iterator", + "code_file": "RandomHostIterator.cs", + "domain_id": "d02", + "subdomain_id": "sd02-03", + "definition": "Shuffles active hosts randomly on each request, producing no predictable access pattern.", + "config_value": "LoadBalanceMode=random", + "related_concept_ids": ["c-load-balance-mode"], + "docs": ["LOAD_BALANCING.md"] + }, + { + "id": "c-shared-iterator", + "name": "Shared Iterator", + "code_file": "SharedIteratorRegistry.cs", + "domain_id": "d02", + "subdomain_id": "sd02-03", + "definition": "Single iterator shared across all concurrent requests to the same path. 
Enables strict round-robin fairness across parallel workers.", + "key_settings": ["UseSharedIterators", "SharedIteratorTTLSeconds", "SharedIteratorCleanupIntervalSeconds"], + "related_concept_ids": ["c-round-robin-iterator", "c-iterator-factory"], + "docs": ["LOAD_BALANCING.md"] + }, + { + "id": "c-iteration-mode", + "name": "Iteration Mode", + "domain_id": "d02", + "subdomain_id": "sd02-03", + "definition": "Controls retry breadth. SinglePass tries each host at most once. MultiPass cycles through all hosts up to MaxAttempts total.", + "key_settings": ["IterationMode"], + "allowed_values": ["SinglePass", "MultiPass"], + "default_value": "SinglePass", + "reload_type": "Warm", + "related_concept_ids": ["c-max-attempts", "c-retry"], + "docs": ["LOAD_BALANCING.md"] + }, + { + "id": "c-max-attempts", + "name": "Max Attempts", + "domain_id": "d02", + "subdomain_id": "sd02-03", + "definition": "Maximum total host attempts in MultiPass mode. Hosts skipped because their circuit is OPEN do not consume an attempt.", + "key_settings": ["MaxAttempts"], + "default_value": 10, + "reload_type": "Warm", + "related_concept_ids": ["c-iteration-mode", "c-circuit-breaker"], + "docs": ["LOAD_BALANCING.md", "CONFIGURATION_SETTINGS.md"] + }, + + { + "id": "c-circuit-breaker", + "name": "Circuit Breaker", + "code_file": "CircuitBreaker.cs", + "domain_id": "d03", + "subdomain_id": "sd03-01", + "definition": "Per-host failure counter with a sliding time window. 
Opens when the failure count in the window exceeds CBErrorThreshold.", + "key_settings": ["CBErrorThreshold", "CBTimeslice", "AcceptableStatusCodes"], + "states": ["CLOSED", "OPEN"], + "reload_type": "Warm", + "related_concept_ids": ["c-circuit-closed", "c-circuit-open", "c-sliding-window", "c-auto-recovery", "c-progressive-delay", "c-global-blocked-check"], + "docs": ["CIRCUIT_BREAKER.md"] + }, + { + "id": "c-circuit-closed", + "name": "CLOSED State", + "domain_id": "d03", + "subdomain_id": "sd03-01", + "definition": "Normal circuit breaker state. Requests are forwarded to the host.", + "related_concept_ids": ["c-circuit-breaker", "c-circuit-open"], + "docs": ["CIRCUIT_BREAKER.md"] + }, + { + "id": "c-circuit-open", + "name": "OPEN State", + "domain_id": "d03", + "subdomain_id": "sd03-01", + "definition": "Circuit tripped. Host is skipped by the load-balance iterator. Skipped hosts do not consume the MaxAttempts budget.", + "related_concept_ids": ["c-circuit-breaker", "c-circuit-closed", "c-global-blocked-check"], + "docs": ["CIRCUIT_BREAKER.md"] + }, + { + "id": "c-sliding-window", + "name": "Sliding Window", + "domain_id": "d03", + "subdomain_id": "sd03-01", + "definition": "Only failures timestamped within the last CBTimeslice seconds count toward the threshold. Older entries are pruned automatically.", + "key_settings": ["CBTimeslice"], + "related_concept_ids": ["c-circuit-breaker", "c-auto-recovery"], + "docs": ["CIRCUIT_BREAKER.md"] + }, + { + "id": "c-auto-recovery", + "name": "Auto-Recovery", + "domain_id": "d03", + "subdomain_id": "sd03-01", + "definition": "Circuit closes automatically when all failures age out of the sliding window. 
No manual intervention required.", + "related_concept_ids": ["c-circuit-breaker", "c-sliding-window"], + "docs": ["CIRCUIT_BREAKER.md"] + }, + { + "id": "c-progressive-delay", + "name": "Progressive Delay", + "domain_id": "d03", + "subdomain_id": "sd03-01", + "definition": "Artificial per-request delay (100-500 ms) added as a host's failure count approaches CBErrorThreshold. Not configurable.", + "delay_table": [ + { "failure_pct_of_threshold": 50, "delay_ms": 100 }, + { "failure_pct_of_threshold": 60, "delay_ms": 200 }, + { "failure_pct_of_threshold": 70, "delay_ms": 300 }, + { "failure_pct_of_threshold": 80, "delay_ms": 400 }, + { "failure_pct_of_threshold": 90, "delay_ms": 500 } + ], + "related_concept_ids": ["c-circuit-breaker"], + "docs": ["CIRCUIT_BREAKER.md"] + }, + { + "id": "c-global-blocked-check", + "name": "Global Blocked Check", + "code_symbol": "AreAllCircuitBreakersBlocked()", + "domain_id": "d03", + "subdomain_id": "sd03-01", + "definition": "When all registered circuit breakers are OPEN simultaneously, the proxy returns 503 immediately without attempting any host.", + "related_concept_ids": ["c-circuit-open"], + "docs": ["CIRCUIT_BREAKER.md"] + }, + { + "id": "c-acceptable-status-codes", + "name": "Acceptable Status Codes", + "domain_id": "d03", + "subdomain_id": "sd03-01", + "definition": "HTTP status codes from backends that are not counted as circuit-breaker failures and are forwarded to the client.", + "key_settings": ["AcceptableStatusCodes"], + "default_value": [200, 202, 400, 401, 403, 404, 408, 410, 412, 417], + "reload_type": "Warm", + "related_concept_ids": ["c-circuit-breaker", "c-pass-through"], + "docs": ["CIRCUIT_BREAKER.md", "RESPONSE_CODES.md"] + }, + + { + "id": "c-retry", + "name": "Retry", + "domain_id": "d03", + "subdomain_id": "sd03-02", + "definition": "Advancing to the next host in the iterator after a non-acceptable backend response. 
Does not apply when the circuit is OPEN.", + "key_settings": ["IterationMode", "MaxAttempts"], + "related_concept_ids": ["c-iteration-mode", "c-max-attempts", "c-requeue"], + "docs": ["LOAD_BALANCING.md"] + }, + { + "id": "c-requeue", + "name": "Requeue", + "domain_id": "d03", + "subdomain_id": "sd03-02", + "definition": "Returning a request to the priority queue after all backends returned 429 with S7PREQUEUE:true. Uses the shortest retry-after delay seen across all backends.", + "related_concept_ids": ["c-retry", "c-s7prequeue-header", "c-retry-after"], + "docs": ["LOAD_BALANCING.md", "RESPONSE_CODES.md"] + }, + { + "id": "c-s7prequeue-header", + "name": "S7PREQUEUE", + "header_direction": "backend-response", + "domain_id": "d03", + "subdomain_id": "sd03-02", + "definition": "Response header a backend sets on a 429 reply to signal the proxy should requeue the request rather than try the next host.", + "related_concept_ids": ["c-requeue", "c-retry-after"], + "docs": ["RESPONSE_CODES.md", "LOAD_BALANCING.md"] + }, + { + "id": "c-retry-after", + "name": "Retry-After", + "domain_id": "d03", + "subdomain_id": "sd03-02", + "definition": "Delay value from the backend's Retry-After response header. When all backends return S7PREQUEUE, the proxy uses the shortest retry-after seen.", + "related_concept_ids": ["c-requeue", "c-s7prequeue-header"], + "docs": ["LOAD_BALANCING.md"] + }, + + { + "id": "c-timeout", + "name": "Timeout", + "domain_id": "d03", + "subdomain_id": "sd03-03", + "definition": "Per-host-attempt window in milliseconds. Resets on each retry. 
Effective deadline per attempt = min(remaining TTL, Timeout).", + "key_settings": ["Timeout"], + "per_request_override_header": "S7PTimeout", + "unit": "milliseconds", + "default_value": 1200000, + "reload_type": "Cold", + "related_concept_ids": ["c-ttl", "c-async-trigger-timeout"], + "docs": ["TIMEOUTS.md"] + }, + { + "id": "c-async-trigger-timeout", + "name": "AsyncTriggerTimeout", + "domain_id": "d03", + "subdomain_id": "sd03-03", + "definition": "Milliseconds after enqueue before the proxy releases the client with a 202 response and continues processing in the background.", + "key_settings": ["AsyncTriggerTimeout"], + "unit": "milliseconds", + "default_value": 10000, + "reload_type": "Warm", + "related_concept_ids": ["c-timeout", "c-async-mode", "c-async-timeout"], + "docs": ["TIMEOUTS.md", "AsyncOperation.md"] + }, + { + "id": "c-async-timeout", + "name": "AsyncTimeout", + "domain_id": "d03", + "subdomain_id": "sd03-03", + "definition": "Maximum backend processing time in milliseconds once a request is in async mode.", + "key_settings": ["AsyncTimeout"], + "unit": "milliseconds", + "default_value": 1800000, + "reload_type": "Warm", + "related_concept_ids": ["c-async-trigger-timeout", "c-async-ttl-secs"], + "docs": ["TIMEOUTS.md", "AsyncOperation.md"] + }, + { + "id": "c-async-ttl-secs", + "name": "AsyncTTLSecs", + "domain_id": "d03", + "subdomain_id": "sd03-03", + "definition": "Retention period in seconds for the async result blob in Azure Blob Storage after processing completes.", + "key_settings": ["AsyncTTLSecs"], + "unit": "seconds", + "default_value": 86400, + "reload_type": "Warm", + "related_concept_ids": ["c-result-blob", "c-async-timeout"], + "docs": ["TIMEOUTS.md", "AsyncOperation.md", "StorageBlobConfig.md"] + }, + + { + "id": "c-validation-pipeline", + "name": "Validation Pipeline", + "domain_id": "d04", + "subdomain_id": "sd04-01", + "definition": "Ordered sequence of validation checks applied to every non-probe inbound request before it is 
enqueued. The execution order is fixed.", + "steps": [ + { "order": 1, "concept_id": "c-app-id-validation", "on_failure_code": 403, "on_failure_label": "DisallowedAppID" }, + { "order": 2, "concept_id": "c-header-stripping", "on_failure_code": null, "on_failure_label": "silent" }, + { "order": 3, "concept_id": "c-user-profile", "on_failure_code": 403, "on_failure_label": "UnknownProfile" }, + { "order": 4, "concept_id": "c-required-headers", "on_failure_code": 417, "on_failure_label": "IncompleteHeaders" }, + { "order": 5, "concept_id": "c-header-validation-rule","on_failure_code": 417, "on_failure_label": "InvalidHeader" } + ], + "related_concept_ids": ["c-app-id-validation","c-header-stripping","c-user-profile","c-required-headers","c-header-validation-rule"], + "docs": ["REQUEST_VALIDATION.md"] + }, + { + "id": "c-app-id-validation", + "name": "App ID Validation", + "domain_id": "d04", + "subdomain_id": "sd04-01", + "definition": "Step 1 of the validation pipeline. Checks the caller's Entra Application ID against a remote allowlist before any other check. Designed for deployments with more than 13 App IDs (Entra's built-in limit).", + "key_settings": ["ValidateAuthAppID", "ValidateAuthAppIDUrl", "ValidateAuthAppIDHeader", "ValidateAuthAppFieldName"], + "on_failure_code": 403, + "reload_type": "Warm", + "related_concept_ids": ["c-app-id-allowlist", "c-validation-pipeline"], + "docs": ["REQUEST_VALIDATION.md", "ADVANCED_CONFIGURATION.md"] + }, + { + "id": "c-header-stripping", + "name": "Header Stripping", + "domain_id": "d04", + "subdomain_id": "sd04-01", + "definition": "Step 2 of the validation pipeline. Removes headers listed in DisallowedHeaders silently. 
No error is returned to the caller.", + "key_settings": ["DisallowedHeaders"], + "related_concept_ids": ["c-disallowed-headers", "c-validation-pipeline"], + "docs": ["REQUEST_VALIDATION.md"] + }, + { + "id": "c-required-headers", + "name": "Required Headers", + "domain_id": "d04", + "subdomain_id": "sd04-02", + "definition": "List of header names that must be non-empty on every request. The first missing header returns 417.", + "key_settings": ["RequiredHeaders"], + "on_failure_code": 417, + "on_failure_label": "IncompleteHeaders", + "reload_type": "Warm", + "related_concept_ids": ["c-validation-pipeline", "c-header-validation-rule"], + "docs": ["REQUEST_VALIDATION.md"] + }, + { + "id": "c-disallowed-headers", + "name": "Disallowed Headers", + "domain_id": "d04", + "subdomain_id": "sd04-02", + "definition": "Headers stripped silently from the request before forwarding. Automatically includes the allowlist header when ValidateHeaders is configured.", + "key_settings": ["DisallowedHeaders"], + "auto_populated_by": ["c-header-validation-rule"], + "related_concept_ids": ["c-header-stripping", "c-header-validation-rule"], + "docs": ["REQUEST_VALIDATION.md"] + }, + { + "id": "c-header-validation-rule", + "name": "Header Validation Rule", + "domain_id": "d04", + "subdomain_id": "sd04-02", + "definition": "ValidateHeaders mapping SourceHeader=AllowlistHeader. The source header value must appear in the comma-separated allowlist header injected from the user profile. 
Supports * suffix for prefix matching.", + "key_settings": ["ValidateHeaders"], + "on_failure_code": 417, + "on_failure_label": "InvalidHeader", + "reload_type": "Warm", + "side_effects": [ + "auto-adds SourceHeader and AllowlistHeader to RequiredHeaders", + "auto-adds AllowlistHeader to DisallowedHeaders" + ], + "related_concept_ids": ["c-user-profile", "c-required-headers", "c-disallowed-headers"], + "docs": ["REQUEST_VALIDATION.md", "ADVANCED_CONFIGURATION.md"] + }, + { + "id": "c-app-id-allowlist", + "name": "App ID Allowlist", + "domain_id": "d04", + "subdomain_id": "sd04-02", + "definition": "File or URL returning the list of permitted Entra Application IDs. Enforced at step 1 of the validation pipeline.", + "key_settings": ["ValidateAuthAppIDUrl", "ValidateAuthAppFieldName"], + "related_concept_ids": ["c-app-id-validation"], + "docs": ["REQUEST_VALIDATION.md", "ENVIRONMENT_VARIABLES.md"] + }, + + { + "id": "c-user-profile", + "name": "User Profile", + "domain_id": "d04", + "subdomain_id": "sd04-03", + "definition": "Per-user JSON object loaded periodically from a URL or file. Drives priority assignment, async configuration, and custom header injection.", + "key_settings": ["UseProfiles", "UserConfigUrl"], + "profile_fields": [ + { "field": "userId", "required": true, "description": "Unique identifier matched against the incoming request header." }, + { "field": "S7PPriorityKey", "required": false, "description": "Priority tier value mapped via PriorityKeys." }, + { "field": "async-config", "required": false, "description": "Enables async mode and sets per-user blob container and Service Bus topic." }, + { "field": "[CustomHeader]", "required": false, "description": "Any other key is injected as an HTTP request header." 
} + ], + "related_concept_ids": ["c-user-id", "c-s7p-priority-key-field", "c-async-config-field", "c-profile-refresh", "c-suspended-user"], + "docs": ["USER_PROFILES.md"] + }, + { + "id": "c-user-id", + "name": "User ID", + "domain_id": "d04", + "subdomain_id": "sd04-03", + "definition": "Unique identifier extracted from a configurable request header and used to look up the user profile record.", + "key_settings": ["UserIDFieldName"], + "default_value": "userId", + "related_concept_ids": ["c-user-profile"], + "docs": ["USER_PROFILES.md"] + }, + { + "id": "c-s7p-priority-key-field", + "name": "S7PPriorityKey Profile Field", + "domain_id": "d04", + "subdomain_id": "sd04-03", + "definition": "User profile field whose value is matched against PriorityKeys to assign a priority tier to the user's requests.", + "related_concept_ids": ["c-user-profile", "c-priority-mapping"], + "docs": ["USER_PROFILES.md", "ADVANCED_CONFIGURATION.md"] + }, + { + "id": "c-async-config-field", + "name": "async-config Profile Field", + "domain_id": "d04", + "subdomain_id": "sd04-03", + "definition": "User profile field enabling async mode for a specific user. Specifies the user's blob container name and Service Bus topic.", + "key_settings": ["AsyncClientConfigFieldName"], + "required_subfields": ["enabled", "containername", "topic"], + "optional_subfields": ["timeout"], + "related_concept_ids": ["c-user-profile", "c-async-mode"], + "docs": ["USER_PROFILES.md", "AsyncOperation.md"] + }, + { + "id": "c-profile-refresh", + "name": "Profile Refresh", + "domain_id": "d04", + "subdomain_id": "sd04-03", + "definition": "Proxy reloads the user profile file at the configured interval (default 1 hour). 
Enables live user management without restart.", + "key_settings": ["UserConfigRefreshIntervalSecs"], + "default_value": 3600, + "unit": "seconds", + "related_concept_ids": ["c-user-profile"], + "docs": ["USER_PROFILES.md", "ENVIRONMENT_VARIABLES.md"] + }, + { + "id": "c-suspended-user", + "name": "Suspended User", + "domain_id": "d04", + "subdomain_id": "sd04-03", + "definition": "User whose ID appears in the suspended users list. Returns 403 immediately, regardless of profile content.", + "key_settings": ["SuspendedUserConfigUrl"], + "on_failure_code": 403, + "related_concept_ids": ["c-user-profile"], + "docs": ["USER_PROFILES.md"] + }, + + { + "id": "c-priority-mapping", + "name": "Priority Mapping", + "domain_id": "d04", + "subdomain_id": "sd04-04", + "definition": "Maps a request header value to an internal priority integer and allocates dedicated worker threads to that priority level.", + "key_settings": ["PriorityKeyHeader", "PriorityKeys", "PriorityValues", "PriorityWorkers"], + "constraint": "Count of PriorityKeys MUST equal count of PriorityValues. PriorityWorkers MUST reference only levels in PriorityValues.", + "related_concept_ids": ["c-priority-key-header", "c-priority-keys", "c-priority-values", "c-priority-workers"], + "docs": ["ADVANCED_CONFIGURATION.md"] + }, + { + "id": "c-priority-key-header", + "name": "Priority Key Header", + "domain_id": "d04", + "subdomain_id": "sd04-04", + "definition": "Name of the request header the proxy inspects to determine priority tier. Default: S7PPriorityKey.", + "key_settings": ["PriorityKeyHeader"], + "default_value": "S7PPriorityKey", + "reload_type": "Warm", + "related_concept_ids": ["c-priority-mapping"], + "docs": ["ADVANCED_CONFIGURATION.md", "RESPONSE_CODES.md"] + }, + { + "id": "c-priority-keys", + "name": "Priority Keys", + "domain_id": "d04", + "subdomain_id": "sd04-04", + "definition": "Comma-separated list of expected header values. 
Each entry maps 1:1 to the corresponding entry in PriorityValues.", + "key_settings": ["PriorityKeys"], + "reload_type": "Warm", + "related_concept_ids": ["c-priority-mapping", "c-priority-values"], + "docs": ["ADVANCED_CONFIGURATION.md"] + }, + { + "id": "c-priority-values", + "name": "Priority Values", + "domain_id": "d04", + "subdomain_id": "sd04-04", + "definition": "Comma-separated list of internal priority integers, in 1:1 correspondence with PriorityKeys.", + "key_settings": ["PriorityValues"], + "reload_type": "Warm", + "related_concept_ids": ["c-priority-mapping", "c-priority-keys"], + "docs": ["ADVANCED_CONFIGURATION.md"] + }, + { + "id": "c-priority-workers", + "name": "Priority Workers", + "domain_id": "d04", + "subdomain_id": "sd04-04", + "definition": "PriorityLevel:WorkerCount pairs that reserve dedicated worker threads for each priority level.", + "key_settings": ["PriorityWorkers"], + "format": "level:count,level:count", + "reload_type": "Warm", + "related_concept_ids": ["c-priority-mapping", "c-workers"], + "docs": ["ADVANCED_CONFIGURATION.md"] + }, + { + "id": "c-per-user-throttling", + "name": "Per-user Throttling", + "domain_id": "d04", + "subdomain_id": "sd04-04", + "definition": "When a user's active requests exceed UserPriorityThreshold as a fraction of the total queue, that user's new requests are deprioritized.", + "key_settings": ["UserPriorityThreshold"], + "value_type": "float", + "value_range": "0.0 to 1.0", + "default_value": 0.1, + "reload_type": "Warm", + "related_concept_ids": ["c-priority-mapping", "c-user-profile"], + "docs": ["ADVANCED_CONFIGURATION.md", "USER_PROFILES.md"] + }, + + { + "id": "c-async-mode", + "name": "Async Mode", + "domain_id": "d05", + "definition": "System-wide feature enabling asynchronous request processing. 
Must be enabled at three levels: proxy (AsyncModeEnabled), user profile (async-config field), and per-request (opt-in header).", + "key_settings": ["AsyncModeEnabled"], + "reload_type": "Cold", + "enablement_levels": [ + "AsyncModeEnabled=true (proxy-wide switch, Cold)", + "async-config field in user profile", + "AsyncClientRequestHeader on the request" + ], + "related_concept_ids": ["c-opt-in-header", "c-async-upgrade", "c-result-blob", "c-service-bus-status"], + "docs": ["AsyncOperation.md"] + }, + { + "id": "c-opt-in-header", + "name": "Opt-in Header", + "domain_id": "d05", + "definition": "Per-request header clients send to request async processing for that call. Default header name: S7PAsyncMode.", + "key_settings": ["AsyncClientRequestHeader"], + "default_value": "S7PAsyncMode", + "reload_type": "Warm", + "related_concept_ids": ["c-async-mode"], + "docs": ["AsyncOperation.md"] + }, + { + "id": "c-async-upgrade", + "name": "Async Upgrade", + "domain_id": "d05", + "definition": "After AsyncTriggerTimeout ms, the proxy returns 202 to the client containing blob URIs and continues backend processing in the background.", + "key_settings": ["AsyncTriggerTimeout"], + "related_concept_ids": ["c-async-trigger-timeout", "c-202-response", "c-result-blob"], + "docs": ["AsyncOperation.md", "TIMEOUTS.md"] + }, + { + "id": "c-202-response", + "name": "202 Accepted Response", + "domain_id": "d05", + "definition": "HTTP 202 returned to the client immediately after async upgrade. Body contains result blob URIs for polling.", + "http_status": 202, + "related_concept_ids": ["c-async-upgrade", "c-result-blob"], + "docs": ["AsyncOperation.md"] + }, + { + "id": "c-result-blob", + "name": "Result Blob", + "domain_id": "d05", + "definition": "Azure Blob Storage object written when the backend completes an async request. URI is included in the 202 response. 
Expires after AsyncTTLSecs.", + "key_settings": ["AsyncBlobStorageConfig", "StorageDbContainerName", "AsyncTTLSecs"], + "related_concept_ids": ["c-blob-sas-token", "c-blob-lifecycle-policy", "c-async-blob-worker"], + "docs": ["AsyncOperation.md", "StorageBlobConfig.md"] + }, + { + "id": "c-blob-sas-token", + "name": "Blob SAS Token", + "domain_id": "d05", + "definition": "Time-limited access token generated for the result blob. Valid for AsyncTTLSecs seconds.", + "key_settings": ["AsyncTTLSecs"], + "related_concept_ids": ["c-result-blob"], + "docs": ["AsyncOperation.md"] + }, + { + "id": "c-blob-lifecycle-policy", + "name": "Blob Lifecycle Policy", + "domain_id": "d05", + "definition": "Azure Storage lifecycle management rule that automatically deletes blobs after BlobRetentionDays. Must be configured in the storage account independently; the BlobRetentionDays proxy setting alone does not delete blobs.", + "key_settings": ["BlobRetentionDays", "StorageDbContainerName"], + "related_concept_ids": ["c-result-blob"], + "docs": ["StorageBlobConfig.md"] + }, + { + "id": "c-service-bus-status", + "name": "Service Bus Status", + "domain_id": "d05", + "definition": "Real-time async lifecycle events sent to a per-user Azure Service Bus topic as requests move through processing states.", + "key_settings": ["AsyncSBConfig"], + "related_concept_ids": ["c-status-events", "c-async-mode"], + "docs": ["AsyncOperation.md"] + }, + { + "id": "c-status-events", + "name": "Status Events", + "domain_id": "d05", + "definition": "Lifecycle notification values sent over Service Bus.", + "values": ["InQueue", "RetryAfterDelay", "ReQueued", "Processing", "Processed", "Failed", "Expired"], + "related_concept_ids": ["c-service-bus-status"], + "docs": ["AsyncOperation.md"] + }, + { + "id": "c-async-blob-worker", + "name": "Async Blob Worker", + "domain_id": "d05", + "definition": "Background thread that writes completed async request results to Azure Blob Storage.", + "key_settings": 
["AsyncBlobWorkerCount"], + "default_value": 2, + "reload_type": "Cold", + "related_concept_ids": ["c-result-blob"], + "docs": ["AsyncOperation.md"] + }, + + { + "id": "c-proxy-event", + "name": "ProxyEvent", + "domain_id": "d06", + "subdomain_id": "sd06-01", + "definition": "Key/value dictionary capturing all dimensions of a single proxied request: HTTP status, timing, token counts, headers, and other metadata.", + "related_concept_ids": ["c-event-headers", "c-composite-event-client"], + "docs": ["OBSERVABILITY.md"] + }, + { + "id": "c-event-headers", + "name": "Event Headers", + "code_interface": "ICommonEventData", + "domain_id": "d06", + "subdomain_id": "sd06-01", + "definition": "Common fields injected into every ProxyEvent: application version, container name, and revision. Supplied by the ICommonEventData implementation.", + "key_settings": ["EVENT_HEADERS"], + "related_concept_ids": ["c-proxy-event"], + "docs": ["OBSERVABILITY.md"] + }, + { + "id": "c-composite-event-client", + "name": "CompositeEventClient", + "code_file": "CompositeEventClient.cs", + "domain_id": "d06", + "subdomain_id": "sd06-01", + "definition": "Fan-out dispatcher that sends every serialized ProxyEvent to all registered event logger backends. Uses FrozenDictionary for zero-lock dispatch on the hot path.", + "related_concept_ids": ["c-event-logger-backend", "c-proxy-event"], + "docs": ["OBSERVABILITY.md"] + }, + { + "id": "c-event-logger-backend", + "name": "Event Logger Backend", + "domain_id": "d06", + "subdomain_id": "sd06-01", + "definition": "Named sink that receives serialized JSON ProxyEvents. Built-in values: file, eventhub. 
Custom backends can be registered by implementing IEventClient + IHostedService.", + "key_settings": ["EVENT_LOGGERS"], + "built_in_values": ["file", "eventhub"], + "custom_value_format": "", + "related_concept_ids": ["c-composite-event-client", "c-log-file", "c-event-hubs", "c-custom-event-logger"], + "docs": ["OBSERVABILITY.md"] + }, + { + "id": "c-custom-event-logger", + "name": "Custom Event Logger", + "domain_id": "d06", + "subdomain_id": "sd06-01", + "definition": "User-supplied class implementing IEventClient and IHostedService. Calls CompositeEventClient.Add(this) at startup to self-register with the fan-out.", + "related_concept_ids": ["c-event-logger-backend", "c-composite-event-client"], + "docs": ["OBSERVABILITY.md"] + }, + { + "id": "c-application-insights", + "name": "Application Insights", + "domain_id": "d06", + "subdomain_id": "sd06-02", + "definition": "Primary production telemetry sink. Receives structured telemetry (requests, dependencies, exceptions) via TelemetryClient.", + "key_settings": ["APPINSIGHTS_CONNECTIONSTRING"], + "related_concept_ids": ["c-event-logger-backend"], + "docs": ["OBSERVABILITY.md"] + }, + { + "id": "c-event-hubs", + "name": "Event Hubs", + "domain_id": "d06", + "subdomain_id": "sd06-02", + "definition": "High-volume streaming telemetry sink for JSON event lines. Supports connection-string auth or Managed Identity.", + "key_settings": ["EVENTHUB_CONNECTIONSTRING", "EVENTHUB_NAMESPACE", "EVENTHUB_NAME"], + "related_concept_ids": ["c-event-logger-backend"], + "docs": ["OBSERVABILITY.md"] + }, + { + "id": "c-log-file", + "name": "Log File", + "domain_id": "d06", + "subdomain_id": "sd06-02", + "definition": "Local JSON-lines file sink for development and testing. 
Buffers in a ConcurrentQueue and flushes in batches every 500 ms.", + "key_settings": ["LOGFILE_NAME"], + "related_concept_ids": ["c-event-logger-backend"], + "docs": ["OBSERVABILITY.md"] + }, + { + "id": "c-token-telemetry", + "name": "Token Telemetry", + "domain_id": "d06", + "subdomain_id": "sd06-02", + "definition": "Prompt and completion token counts extracted from SSE (Server-Sent Events) streams and logged per request. Requires processor=OpenAI on the backend host.", + "key_settings": ["processor"], + "connection_string_value": "processor=OpenAI", + "related_concept_ids": ["c-processor", "c-proxy-event"], + "docs": ["OBSERVABILITY.md", "AI_FOUNDRY_INTEGRATION.md"] + }, + { + "id": "c-liveness-endpoint", + "name": "/liveness Endpoint", + "domain_id": "d06", + "subdomain_id": "sd06-03", + "definition": "Returns 200 OK if the process is running. Never returns 503.", + "path": "/liveness", + "responses": { "200": "process running" }, + "related_concept_ids": ["c-readiness-endpoint", "c-sidecar-mode"], + "docs": ["HEALTH_CHECKING.md"] + }, + { + "id": "c-readiness-endpoint", + "name": "/readiness Endpoint", + "domain_id": "d06", + "subdomain_id": "sd06-03", + "definition": "Returns 200 if at least one backend host is in the active pool. Returns 503 if no hosts are active.", + "path": "/readiness", + "responses": { "200": "at least one active host", "503": "no active hosts" }, + "related_concept_ids": ["c-liveness-endpoint", "c-active-pool"], + "docs": ["HEALTH_CHECKING.md"] + }, + { + "id": "c-startup-endpoint", + "name": "/startup Endpoint", + "domain_id": "d06", + "subdomain_id": "sd06-03", + "definition": "Returns 200 if the health poller has completed its first probe pass. 
Returns 503 during initialization.", + "path": "/startup", + "responses": { "200": "poller has run at least once", "503": "still initializing" }, + "related_concept_ids": ["c-health-poller"], + "docs": ["HEALTH_CHECKING.md"] + }, + { + "id": "c-sidecar-mode", + "name": "Sidecar Mode", + "domain_id": "d06", + "subdomain_id": "sd06-03", + "definition": "Isolation pattern where a separate HealthProbe container on port 9000 handles Kubernetes probes. The proxy pushes its health state to the sidecar every second. If no update arrives for 10 seconds, the sidecar fails probes.", + "key_settings": ["HealthProbeSidecar"], + "ports": { "proxy": 8000, "sidecar": 9000 }, + "related_concept_ids": ["c-liveness-endpoint", "c-readiness-endpoint", "c-startup-endpoint"], + "docs": ["HEALTH_CHECKING.md", "SIDECAR_DEPLOYMENT.md"] + }, + + { + "id": "c-warm-setting", + "name": "Warm Setting", + "domain_id": "d07", + "definition": "Configuration value hot-reloaded from Azure App Configuration within ~30 seconds when the Sentinel key changes. No container restart required.", + "label_in_app_config": "none or AZURE_APPCONFIG_LABEL", + "key_prefix": "Warm:", + "related_concept_ids": ["c-cold-setting", "c-hidden-setting", "c-sentinel"], + "docs": ["CONFIGURATION_SETTINGS.md", "AZURE_APP_CONFIGURATION.md"] + }, + { + "id": "c-cold-setting", + "name": "Cold Setting", + "domain_id": "d07", + "definition": "Configuration value published to Azure App Configuration but taking effect only after a container restart.", + "label_in_app_config": "Cold", + "key_prefix": "Warm:", + "related_concept_ids": ["c-warm-setting", "c-hidden-setting"], + "docs": ["CONFIGURATION_SETTINGS.md", "AZURE_APP_CONFIGURATION.md"] + }, + { + "id": "c-hidden-setting", + "name": "Hidden Setting", + "domain_id": "d07", + "definition": "Configuration value never published to Azure App Configuration. 
Runtime-derived from composite connection strings or infrastructure state.", + "related_concept_ids": ["c-warm-setting", "c-cold-setting"], + "docs": ["CONFIGURATION_SETTINGS.md"] + }, + { + "id": "c-sentinel", + "name": "Sentinel", + "domain_id": "d07", + "definition": "Special key (Warm:Sentinel) in Azure App Configuration. Bumping its value to any new value triggers hot-reload of all Warm settings across all running proxy instances.", + "app_config_key": "Warm:Sentinel", + "related_concept_ids": ["c-warm-setting"], + "docs": ["AZURE_APP_CONFIGURATION.md"] + }, + { + "id": "c-composite-connection-string", + "name": "Composite Connection String", + "domain_id": "d07", + "definition": "Semicolon- or comma-delimited key=value string encoding multiple related settings in one environment variable.", + "examples": ["Host1", "AsyncBlobStorageConfig", "AsyncSBConfig", "HealthProbeSidecar"], + "related_concept_ids": ["c-connection-string-format"], + "docs": ["BACKEND_HOSTS.md", "AsyncOperation.md"] + }, + { + "id": "c-app-config-label", + "name": "App Configuration Label", + "domain_id": "d07", + "definition": "Label filter applied when reading from Azure App Configuration. Allows isolated setting namespaces per environment.", + "key_settings": ["AZURE_APPCONFIG_LABEL"], + "related_concept_ids": ["c-warm-setting"], + "docs": ["AZURE_APP_CONFIGURATION.md"] + }, + { + "id": "c-refresh-interval", + "name": "Refresh Interval", + "domain_id": "d07", + "definition": "How often in seconds the proxy polls the Sentinel key in Azure App Configuration.", + "key_settings": ["AZURE_APPCONFIG_REFRESH_INTERVAL_SECONDS"], + "default_value": 30, + "unit": "seconds", + "related_concept_ids": ["c-sentinel", "c-warm-setting"], + "docs": ["AZURE_APP_CONFIGURATION.md", "CONFIGURATION_SETTINGS.md"] + }, + + { + "id": "c-managed-identity", + "name": "Managed Identity", + "domain_id": "d08", + "definition": "Azure-managed credential attached to the container. 
Used for keyless authentication to backends, App Configuration, Event Hubs, Blob Storage, and Service Bus.", + "connection_string_key": "usemi", + "connection_string_value": "usemi=true", + "related_concept_ids": ["c-keyless-auth", "c-oauth2-bearer"], + "docs": ["AI_FOUNDRY_INTEGRATION.md", "BACKEND_HOSTS.md"] + }, + { + "id": "c-oauth2-bearer", + "name": "OAuth2 Bearer Token", + "domain_id": "d08", + "definition": "Proxy acquires an Entra ID token and attaches it as Authorization: Bearer on outbound backend requests.", + "connection_string_key": "useoauth", + "connection_string_value": "useoauth=true", + "related_concept_ids": ["c-managed-identity"], + "docs": ["BACKEND_HOSTS.md", "AI_FOUNDRY_INTEGRATION.md"] + }, + { + "id": "c-keyless-auth", + "name": "Keyless Auth", + "domain_id": "d08", + "definition": "Configuration pattern using usemi=true and audience=https://cognitiveservices.azure.com to eliminate API key management for Azure OpenAI and AI Foundry backends.", + "connection_string_example": "host=…;usemi=true;audience=https://cognitiveservices.azure.com", + "related_concept_ids": ["c-managed-identity", "c-oauth2-bearer"], + "docs": ["AI_FOUNDRY_INTEGRATION.md"] + }, + { + "id": "c-vnet-deployment", + "name": "VNet Deployment", + "domain_id": "d08", + "definition": "Deployment pattern where the proxy runs entirely inside a Virtual Network with no external data plane dependencies. Supports sovereign and government cloud regions.", + "related_concept_ids": ["c-managed-identity"], + "docs": ["OVERVIEW.md", "CONTAINER_DEPLOYMENT.md"] + }, + + { + "id": "c-azure-container-apps", + "name": "Azure Container Apps", + "domain_id": "d09", + "definition": "Primary hosting platform for SimpleL7Proxy. 
Provides ingress, auto-scaling, managed identity, and revision management.", + "related_concept_ids": ["c-sidecar-deployment", "c-container-image"], + "docs": ["CONTAINER_DEPLOYMENT.md", "SIDECAR_DEPLOYMENT.md"] + }, + { + "id": "c-container-image", + "name": "Container Image", + "domain_id": "d09", + "definition": "Docker image pushed to Azure Container Registry (ACR) and deployed to Azure Container Apps.", + "key_settings": ["ACR", "REGISTRY_SERVER"], + "related_concept_ids": ["c-azure-container-apps"], + "docs": ["CONTAINER_DEPLOYMENT.md", "SIDECAR_DEPLOYMENT.md"] + }, + { + "id": "c-sidecar-deployment", + "name": "Sidecar Deployment", + "domain_id": "d09", + "definition": "Two-container ACA revision: proxy container on port 8000 and health probe container on port 9000. Both containers share localhost networking within the revision.", + "key_settings": ["HealthProbeSidecar"], + "ports": { "proxy": 8000, "health_probe": 9000 }, + "related_concept_ids": ["c-sidecar-mode", "c-azure-container-apps"], + "docs": ["SIDECAR_DEPLOYMENT.md"] + }, + { + "id": "c-parameters-file", + "name": "Parameters File", + "code_file": "deploy.parameters.sh", + "domain_id": "d09", + "definition": "Single source of truth for all build and deploy scripts in the sidecar deployment. Set values once; all scripts read from this file.", + "path": "deployment/proxy-with-sidecar/deploy.parameters.sh", + "related_concept_ids": ["c-sidecar-deployment"], + "docs": ["SIDECAR_DEPLOYMENT.md"] + }, + { + "id": "c-apim-integration", + "name": "APIM Integration", + "domain_id": "d09", + "definition": "Azure API Management as a front-end to SimpleL7Proxy. 
Adds semantic caching, PII scrubbing, TPM-based rate limiting, quota enforcement, and mTLS.", + "related_concept_ids": ["c-ai-foundry-integration"], + "docs": ["AI_FOUNDRY_INTEGRATION.md", "OVERVIEW.md"] + }, + { + "id": "c-ai-foundry-integration", + "name": "AI Foundry Integration", + "domain_id": "d09", + "definition": "Direct or APIM-mediated connection to Azure OpenAI and AI Foundry model endpoints. Requires processor=OpenAI for token count extraction.", + "key_settings": ["processor", "usemi"], + "related_concept_ids": ["c-processor", "c-apim-integration", "c-keyless-auth"], + "docs": ["AI_FOUNDRY_INTEGRATION.md"] + } + ], + + "settings": [ + { "name": "AcceptableStatusCodes", "type": "int[]", "reload": "Warm", "default": [200,202,400,401,403,404,408,410,412,417], "concept_ids": ["c-acceptable-status-codes","c-circuit-breaker","c-pass-through"] }, + { "name": "AsyncBlobStorageConfig","type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-result-blob","c-async-mode"] }, + { "name": "AsyncBlobWorkerCount", "type": "int", "reload": "Cold", "default": 2, "concept_ids": ["c-async-blob-worker"] }, + { "name": "AsyncClientConfigFieldName","type":"string","reload":"Warm", "default": "async-config", "concept_ids": ["c-async-config-field"] }, + { "name": "AsyncClientRequestHeader","type":"string","reload":"Warm", "default": "S7PAsyncMode", "concept_ids": ["c-opt-in-header"] }, + { "name": "AsyncModeEnabled", "type": "bool", "reload": "Cold", "default": false, "concept_ids": ["c-async-mode"] }, + { "name": "AsyncSBConfig", "type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-service-bus-status"] }, + { "name": "AsyncTimeout", "type": "int", "reload": "Warm", "default": 1800000, "unit": "ms", "concept_ids": ["c-async-timeout"] }, + { "name": "AsyncTriggerTimeout", "type": "int", "reload": "Warm", "default": 10000, "unit": "ms", "concept_ids": ["c-async-trigger-timeout"] }, + { "name": "AsyncTTLSecs", "type": "int", "reload": "Warm", 
"default": 86400, "unit": "s", "concept_ids": ["c-async-ttl-secs"] }, + { "name": "APPINSIGHTS_CONNECTIONSTRING","type":"string","reload":"Cold","default": null, "concept_ids": ["c-application-insights"] }, + { "name": "AZURE_APPCONFIG_CONNECTION_STRING","type":"string","reload":"Cold","default":null,"concept_ids": ["c-warm-setting","c-cold-setting"] }, + { "name": "AZURE_APPCONFIG_ENDPOINT","type":"string","reload":"Cold", "default": null, "concept_ids": ["c-warm-setting","c-cold-setting"] }, + { "name": "AZURE_APPCONFIG_LABEL", "type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-app-config-label"] }, + { "name": "AZURE_APPCONFIG_REFRESH_INTERVAL_SECONDS","type":"int","reload":"Cold","default":30,"unit":"s","concept_ids":["c-refresh-interval"] }, + { "name": "BlobRetentionDays", "type": "int", "reload": "Warm", "default": 7, "unit": "days","concept_ids": ["c-blob-lifecycle-policy"] }, + { "name": "CBErrorThreshold", "type": "int", "reload": "Warm", "default": 50, "concept_ids": ["c-circuit-breaker","c-sliding-window"] }, + { "name": "CBTimeslice", "type": "int", "reload": "Warm", "default": 60, "unit": "s", "concept_ids": ["c-circuit-breaker","c-sliding-window"] }, + { "name": "DefaultPriority", "type": "int", "reload": "Warm", "default": 2, "concept_ids": ["c-default-priority","c-priority-level"] }, + { "name": "DefaultTTLSecs", "type": "int", "reload": "Warm", "default": 300, "unit": "s", "concept_ids": ["c-ttl"] }, + { "name": "DisallowedHeaders", "type": "string", "reload": "Warm", "default": null, "concept_ids": ["c-disallowed-headers","c-header-stripping"] }, + { "name": "DnsRefreshTimeout", "type": "int", "reload": "Cold", "default": 120000, "unit": "ms", "concept_ids": ["c-backend-host"] }, + { "name": "EVENT_HEADERS", "type": "string", "reload": "Cold", "default": "SimpleL7Proxy.Events.CommonEventHeaders", "concept_ids": ["c-event-headers"] }, + { "name": "EVENT_LOGGERS", "type": "string", "reload": "Cold", "default": "file", 
"concept_ids": ["c-event-logger-backend"] }, + { "name": "EVENTHUB_CONNECTIONSTRING","type":"string","reload":"Cold", "default": null, "concept_ids": ["c-event-hubs"] }, + { "name": "EVENTHUB_NAME", "type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-event-hubs"] }, + { "name": "EVENTHUB_NAMESPACE", "type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-event-hubs"] }, + { "name": "HealthProbeSidecar", "type": "string", "reload": "Warm", "default": "Enabled=false;url=http://localhost:9000", "concept_ids": ["c-sidecar-mode","c-sidecar-deployment"] }, + { "name": "HEALTHPROBE_PORT", "type": "int", "reload": "Cold", "default": 9000, "concept_ids": ["c-sidecar-deployment"] }, + { "name": "Host1", "type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-backend-host","c-connection-string-format"] }, + { "name": "IterationMode", "type": "string", "reload": "Warm", "default": "SinglePass", "allowed_values":["SinglePass","MultiPass"], "concept_ids": ["c-iteration-mode"] }, + { "name": "KeepAliveInitialDelaySecs","type":"int", "reload":"Cold", "default": 60, "unit":"s", "concept_ids": ["c-backend-host"] }, + { "name": "KeepAlivePingIntervalSecs","type":"int", "reload":"Cold", "default": 60, "unit":"s", "concept_ids": ["c-backend-host"] }, + { "name": "LoadBalanceMode", "type": "string", "reload": "Warm", "default": "latency","allowed_values":["roundrobin","latency","random"], "concept_ids": ["c-load-balance-mode"] }, + { "name": "LOGFILE_NAME", "type": "string", "reload": "Cold", "default": "eventslog.json", "concept_ids": ["c-log-file"] }, + { "name": "LogHeaders", "type": "string[]","reload":"Warm", "default": [], "concept_ids": ["c-proxy-event"] }, + { "name": "LogAllRequestHeaders", "type": "bool", "reload": "Warm", "default": false, "concept_ids": ["c-proxy-event"] }, + { "name": "LogAllResponseHeaders", "type": "bool", "reload": "Warm", "default": false, "concept_ids": ["c-proxy-event"] }, + { "name": "MaxAttempts", 
"type": "int", "reload": "Warm", "default": 10, "concept_ids": ["c-max-attempts","c-retry"] }, + { "name": "MaxEvents", "type": "int", "reload": "Cold", "default": 100000, "concept_ids": ["c-proxy-event"] }, + { "name": "MaxQueueLength", "type": "int", "reload": "Cold", "default": 1000, "concept_ids": ["c-max-queue-length","c-enqueue"] }, + { "name": "MultiConnMaxConns", "type": "int", "reload": "Cold", "default": 4000, "concept_ids": ["c-backend-host"] }, + { "name": "PollInterval", "type": "int", "reload": "Cold", "default": 15000, "unit": "ms", "concept_ids": ["c-health-poller"] }, + { "name": "PollTimeout", "type": "int", "reload": "Cold", "default": 3000, "unit": "ms", "concept_ids": ["c-health-poller"] }, + { "name": "Port", "type": "int", "reload": "Cold", "default": 80, "concept_ids": ["c-listener"] }, + { "name": "PriorityKeyHeader", "type": "string", "reload": "Warm", "default": "S7PPriorityKey", "concept_ids": ["c-priority-key-header","c-priority-mapping"] }, + { "name": "PriorityKeys", "type": "string[]","reload":"Warm", "default": null, "concept_ids": ["c-priority-keys","c-priority-mapping"] }, + { "name": "PriorityValues", "type": "int[]", "reload": "Warm", "default": null, "concept_ids": ["c-priority-values","c-priority-mapping"] }, + { "name": "PriorityWorkers", "type": "string", "reload": "Warm", "default": "2:1,3:1","format":"level:count,level:count", "concept_ids": ["c-priority-workers"] }, + { "name": "RequiredHeaders", "type": "string[]","reload":"Warm", "default": [], "concept_ids": ["c-required-headers"] }, + { "name": "SharedIteratorCleanupIntervalSeconds","type":"int","reload":"Cold","default":30,"unit":"s","concept_ids":["c-shared-iterator"] }, + { "name": "SharedIteratorTTLSeconds","type":"int", "reload": "Cold", "default": 60, "unit":"s", "concept_ids": ["c-shared-iterator"] }, + { "name": "StorageDbContainerName","type": "string", "reload": "Warm", "default": "Requests","concept_ids": ["c-result-blob","c-blob-lifecycle-policy"] }, + { 
"name": "SuccessRate", "type": "int", "reload": "Cold", "default": 80, "unit":"percent","concept_ids": ["c-success-rate","c-active-pool"] }, + { "name": "SuspendedUserConfigUrl","type": "string", "reload": "Warm", "default": null, "concept_ids": ["c-suspended-user"] }, + { "name": "Timeout", "type": "int", "reload": "Cold", "default": 1200000, "unit":"ms", "concept_ids": ["c-timeout"] }, + { "name": "TimeoutHeader", "type": "string", "reload": "Cold", "default": "S7PTimeout", "concept_ids": ["c-timeout"] }, + { "name": "TTLHeader", "type": "string", "reload": "Cold", "default": "S7PTTL","concept_ids": ["c-ttl"] }, + { "name": "UseProfiles", "type": "bool", "reload": "Cold", "default": false, "concept_ids": ["c-user-profile"] }, + { "name": "UseSharedIterators", "type": "bool", "reload": "Cold", "default": true, "concept_ids": ["c-shared-iterator"] }, + { "name": "UserConfigRefreshIntervalSecs","type":"int","reload":"Cold", "default": 3600, "unit":"s", "concept_ids": ["c-profile-refresh"] }, + { "name": "UserConfigRequired", "type": "bool", "reload": "Cold", "default": false, "concept_ids": ["c-user-profile"] }, + { "name": "UserConfigUrl", "type": "string", "reload": "Cold", "default": null, "concept_ids": ["c-user-profile"] }, + { "name": "UserIDFieldName", "type": "string", "reload": "Cold", "default": "userId","concept_ids": ["c-user-id"] }, + { "name": "UserPriorityThreshold", "type": "float", "reload": "Warm", "default": 0.1, "range":"0.0-1.0", "concept_ids": ["c-per-user-throttling"] }, + { "name": "UserSoftDeleteTTLMinutes","type":"int", "reload":"Cold", "default": 360, "unit":"min", "concept_ids": ["c-suspended-user"] }, + { "name": "ValidateAuthAppFieldName","type":"string","reload":"Warm", "default": "authAppID","concept_ids":["c-app-id-allowlist"] }, + { "name": "ValidateAuthAppID", "type": "bool", "reload": "Warm", "default": false, "concept_ids": ["c-app-id-validation"] }, + { "name": "ValidateAuthAppIDHeader","type":"string", "reload": "Warm", 
"default": "X-MS-CLIENT-PRINCIPAL-ID","concept_ids":["c-app-id-validation"] }, + { "name": "ValidateAuthAppIDUrl", "type": "string", "reload": "Warm", "default": "file:auth.json","concept_ids":["c-app-id-validation","c-app-id-allowlist"] }, + { "name": "ValidateHeaders", "type": "string", "reload": "Warm", "default": null, "format":"SourceHeader=AllowlistHeader", "concept_ids":["c-header-validation-rule"] }, + { "name": "Workers", "type": "int", "reload": "Cold", "default": 10, "concept_ids": ["c-workers","c-proxy-worker"] } + ], + + "response_codes": [ + { "code": 202, "name": "Accepted", "origin": "proxy", "cause": "Async upgrade: client released early with blob URIs", "concept_ids": ["c-202-response","c-async-upgrade"] }, + { "code": 400, "name": "Bad Request", "origin": "proxy", "cause": "Malformed S7PTTL header value (error: InvalidTTL)", "concept_ids": ["c-ttl"] }, + { "code": 403, "name": "Forbidden", "origin": "proxy", "cause": "App ID not in allowlist (DisallowedAppID); user not in profile or suspended (UnknownProfile)", "concept_ids": ["c-app-id-validation","c-suspended-user","c-user-profile"] }, + { "code": 408, "name": "Request Timeout", "origin": "proxy", "cause": "IO exception or task cancellation on backend communication", "concept_ids": ["c-timeout"] }, + { "code": 412, "name": "Precondition Failed", "origin": "proxy", "cause": "Request TTL expired before dispatch (TTLExpired)", "concept_ids": ["c-ttl"] }, + { "code": 417, "name": "Expectation Failed", "origin": "proxy", "cause": "Required header missing (IncompleteHeaders); header validation rule failed (InvalidHeader)", "concept_ids": ["c-required-headers","c-header-validation-rule"] }, + { "code": 429, "name": "Too Many Requests", "origin": "proxy", "cause": "Queue full; all circuits OPEN; no active hosts; max events exceeded", "concept_ids": ["c-max-queue-length","c-global-blocked-check","c-active-pool"] }, + { "code": 500, "name": "Internal Server Error", "origin": "proxy", "cause": "Unhandled 
exception; request body too large (ContentTooLarge)", "concept_ids": [] }, + { "code": 503, "name": "Service Unavailable", "origin": "proxy", "cause": "All backends exhausted; exception during enqueue; all circuit breakers blocked", "concept_ids": ["c-global-blocked-check","c-retry","c-active-pool"] } + ], + + "request_headers": [ + { "name": "S7PPriorityKey", "direction": "client-inbound", "configurable_name_via": "PriorityKeyHeader", "definition": "Priority tier value. Looked up in PriorityKeys to determine the request's priority level.", "concept_ids": ["c-priority-key-header","c-priority-mapping"] }, + { "name": "S7PTTL", "direction": "client-inbound", "configurable_name_via": "TTLHeader", "definition": "Per-request TTL override in seconds. Formats: integer, decimal, absolute Unix timestamp, ISO 8601.", "concept_ids": ["c-ttl"] }, + { "name": "S7PTimeout", "direction": "client-inbound", "configurable_name_via": "TimeoutHeader", "definition": "Per-request per-host timeout override in milliseconds.", "concept_ids": ["c-timeout"] }, + { "name": "S7PAsyncMode", "direction": "client-inbound", "configurable_name_via": "AsyncClientRequestHeader", "definition": "Opt-in header enabling async mode for this specific request.", "concept_ids": ["c-opt-in-header","c-async-mode"] }, + { "name": "S7PDEBUG", "direction": "client-inbound", "configurable_name_via": null, "definition": "Set to true to enable per-request debug tracing in logs.", "concept_ids": [] }, + { "name": "S7PREQUEUE", "direction": "backend-response", "configurable_name_via": null, "definition": "Set by a backend on a 429 response to signal the proxy should requeue the request.", "concept_ids": ["c-s7prequeue-header","c-requeue"] } + ], + + "response_headers": [ + { "name": "x-Request-Queue-Duration", "definition": "Milliseconds the request spent in the priority queue before a worker picked it up.", "concept_ids": ["c-priority-queue"] }, + { "name": "x-Request-Process-Duration", "definition": "Milliseconds 
spent in the proxy worker from dequeue to final response write.", "concept_ids": ["c-proxy-worker"] }, + { "name": "x-Request-Worker", "definition": "Numeric ID of the worker thread that handled the request.", "concept_ids": ["c-workers"] }, + { "name": "BackendHost", "definition": "Hostname of the backend that served the successful response.", "concept_ids": ["c-backend-host"] }, + { "name": "Total-Latency", "definition": "Total milliseconds from enqueue to response complete, covering both queue wait and processing.", "concept_ids": ["c-proxy-event"] } + ], + + "relationships": [ + { "from": "c-ttl", "to": "c-timeout", "type": "bounds", "description": "TTL caps total request lifetime; Timeout caps each individual attempt." }, + { "from": "c-async-trigger-timeout", "to": "c-async-timeout", "type": "succeeds", "description": "AsyncTriggerTimeout fires first to release the client; AsyncTimeout governs the subsequent backend phase." }, + { "from": "c-path-filter", "to": "c-load-balance-mode", "type": "precedes", "description": "Path filter runs before load-balance ordering in the backend selection pipeline." }, + { "from": "c-load-balance-mode", "to": "c-circuit-breaker", "type": "precedes", "description": "Load balancer orders hosts; circuit breaker gates each host before attempt." }, + { "from": "c-health-poller", "to": "c-active-pool", "type": "populates", "description": "Health poller results determine which hosts are in the active pool." }, + { "from": "c-active-pool", "to": "c-circuit-breaker", "type": "feeds", "description": "Active pool hosts are also subject to per-host circuit breaker evaluation." }, + { "from": "c-circuit-breaker", "to": "c-max-attempts", "type": "bypasses", "description": "OPEN circuits are skipped without consuming a MaxAttempts budget entry." }, + { "from": "c-sentinel", "to": "c-warm-setting", "type": "triggers", "description": "Bumping Sentinel triggers hot-reload of all Warm settings." 
}, + { "from": "c-validation-pipeline", "to": "c-enqueue", "type": "precedes", "description": "Validation pipeline must pass before a request is enqueued." }, + { "from": "c-user-profile", "to": "c-header-validation-rule","type":"provides_data", "description": "User profile supplies the allowlist header values consumed by header validation rules." }, + { "from": "c-async-upgrade", "to": "c-result-blob", "type": "produces", "description": "Async upgrade initiates background processing that ultimately produces a result blob." }, + { "from": "c-result-blob", "to": "c-blob-lifecycle-policy","type":"governed_by", "description": "Result blobs are deleted by the Azure Storage lifecycle policy, not by the proxy setting alone." }, + { "from": "c-processor", "to": "c-token-telemetry", "type": "enables", "description": "processor=OpenAI is required for the proxy to extract token counts from SSE streams." }, + { "from": "c-composite-event-client","to": "c-event-logger-backend","type": "dispatches_to", "description": "CompositeEventClient fans out every ProxyEvent to all registered backends." }, + { "from": "c-connection-string-format","to":"c-legacy-format", "type": "supersedes", "description": "Connection string format supports all modern options; legacy format is deprecated." }, + { "from": "c-direct-mode", "to": "c-latency-iterator", "type": "influences", "description": "Direct-mode hosts report 0 ms average latency and always sort first in latency mode." 
} + ] +} From 70231019627733ca6b1fe7e720196e9e264eeafb Mon Sep 17 00:00:00 2001 From: Nagendra Mishr Date: Tue, 26 May 2026 23:49:26 -0400 Subject: [PATCH 13/16] simplify with a script --- deployment/POC/readme.md | 194 ++++++++ deployment/POC/secureProxy.sh | 884 ++++++++++++++++++++++++++++++++++ docs/POC-Secure-the-proxy.md | 198 ++------ 3 files changed, 1131 insertions(+), 145 deletions(-) create mode 100644 deployment/POC/readme.md create mode 100644 deployment/POC/secureProxy.sh diff --git a/deployment/POC/readme.md b/deployment/POC/readme.md new file mode 100644 index 0000000..56e0779 --- /dev/null +++ b/deployment/POC/readme.md @@ -0,0 +1,194 @@ +### Step 1 — Set Variables + +Set `ENTRA_APP_NAME`, `CONTAINER_APP_NAME`, and `RG` to match your environment. + +```bash +export ENTRA_APP_NAME="aca-proxy" # Display name for the Entra app registration +export CONTAINER_APP_NAME="" # Container App name +export RG="" # Container App resource group +``` + +> [!NOTE] +> In Windows/WSL environments, sanitize `-o tsv` outputs with `tr -d '\r\n'` before reusing values in later CLI calls. + +### Step 2 — Create the Entra App Registration and enable EasyAuth + +Save the generated `APP_ID` and `CLIENT_SECRET` variables for troubleshooting. + +```bash + +# Lookup tenant and app fqdn +export TENANT_ID="$(az account show --query tenantId -o tsv | tr -d '\r\n')" +export APP_FQDN="https://$(az containerapp show --name "$CONTAINER_APP_NAME" --resource-group "$RG" --query properties.configuration.ingress.fqdn -o tsv | tr -d '\r\n')" +export HEALTH_URL="$APP_FQDN/health" + +export APP_ID=$(az ad app create \ + --display-name "$ENTRA_APP_NAME" \ + --sign-in-audience AzureADMyOrg \ + --query appId -o tsv | tr -d '\r\n') +echo "APP_ID=$APP_ID" + +# Required so az token requests to api://$APP_ID can resolve the resource principal.
+az ad app update --id "$APP_ID" --identifier-uris "api://$APP_ID" + +# Create service principal +az ad sp create --id "$APP_ID" 1>/dev/null + +# Create delegated scope +if [ -z "$APP_ID" ]; then + echo "APP_ID is empty. Re-run Step 2 app creation or app lookup first." + exit 1 +fi + +SCOPE_ID="$(uuidgen | tr '[:upper:]' '[:lower:]')" +API_OBJ="$(az ad app show --id "$APP_ID" --query api -o json)" +UPDATED_API_OBJ="$(echo "$API_OBJ" | jq --arg id "$SCOPE_ID" '.oauth2PermissionScopes = [{ + adminConsentDescription: "Access the API", + adminConsentDisplayName: "Admin Access", + id: $id, + isEnabled: true, + type: "Admin", + userConsentDescription: "Access the API", + userConsentDisplayName: "User Access", + value: "api.access" +}]')" +az ad app update --id "$APP_ID" --set api="$UPDATED_API_OBJ" + +# Enable ID token issuance +az ad app update --id "$APP_ID" --enable-id-token-issuance true + +# Create client secret +export CLIENT_SECRET=$(az ad app credential reset \ + --id "$APP_ID" \ + --display-name "proxy-auth-secret" \ + --end-date "$(date -d '+30 days' '+%Y-%m-%d')" \ + --query password -o tsv | tr -d '\r\n') + +# Do not print or commit secret values. Keep them in memory only. + + +# Enable EasyAuth +az containerapp auth microsoft update \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --client-id "$APP_ID" \ + --client-secret "$CLIENT_SECRET" \ + --tenant-id "$TENANT_ID" \ + --yes + +# Verify identifier URI is set correctly. +az ad app show --id "$APP_ID" --query "{appId:appId,identifierUris:identifierUris,scopes:api.oauth2PermissionScopes[].value}" -o table + +# Optional hygiene: clear secret from shell after auth configuration is complete. +# unset CLIENT_SECRET +``` + +> [!WARNING] +> You may need to grant admin consent in the Azure portal before token acquisition works.
+> If `az account get-access-token --resource "api://$APP_ID"` returns `AADSTS65001` (`consent_required`), ask a tenant admin to grant consent for your client app/API scope in Entra ID: +> **App registrations** -> your client app -> **API permissions** -> **Grant admin consent**. +> +> For better secret hygiene, avoid sharing terminal output that includes auth commands and never paste secret values into tickets, PR comments, or chat logs. + +### Step 3 — Verify Container App + +Run these checks to ensure auth is enabled and the Microsoft identity provider is registered. + +```bash +ENABLED="$(az containerapp auth show \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --query "platform.enabled" -o tsv | tr -d '\r\n')" + +AAD_CLIENT_ID="$(az containerapp auth show \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --query "identityProviders.azureActiveDirectory.registration.clientId" -o tsv | tr -d '\r\n')" + +AUDIENCE="$(az containerapp auth show \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --query "identityProviders.azureActiveDirectory.validation.allowedAudiences[0]" -o tsv | tr -d '\r\n')" + +echo "enabled=$ENABLED" +echo "aad_client_id=$AAD_CLIENT_ID" +echo "allowed_audience=$AUDIENCE" +``` + +Expected: + +- enabled=true +- aad_client_id= +- allowed_audience=api:// + +If `aad_client_id` or `allowed_audience` is empty, or `enabled` is not `true`, re-run the auth microsoft update command above and do not continue to Step 4. + +### Step 4 — Set the Unauthenticated Action + +Rejects unauthenticated requests outright — callers get a `401` with no redirect. + +```bash +az containerapp auth update \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --enabled true \ + --unauthenticated-client-action Return401 +``` + +### Step 5 — Align Allowed Audiences with v2.0 Tokens + +The default Microsoft provider configuration registers `api://` as the allowed audience. 
Entra v2.0 access tokens (issued by `https://login.microsoftonline.com//v2.0`) carry the **bare GUID** in their `aud` claim — not the `api://` form. Without this step, valid tokens are rejected with `403`. + +Replace the allowed audience with the bare GUID: + +```bash +az containerapp auth microsoft update \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --allowed-audiences "$APP_ID" +``` + +Verify: + +```bash +az containerapp auth show \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --query "identityProviders.azureActiveDirectory.validation.allowedAudiences" +``` + +Expected output: + +```json +[ + "" +] +``` + +> [!NOTE] +> `--allowed-audiences` **replaces** the existing list and only accepts one value per invocation. If you also need to accept v1-style `api://` audiences (for legacy callers), patch the list directly via `az containerapp auth update --set identityProviders.azureActiveDirectory.validation.allowedAudiences=...` with a JSON array, taking care with shell quoting. + +### Step 6 — Restrict to Trusted Client Applications + +Even with a valid token, EasyAuth's `defaultAuthorizationPolicy.allowedApplications` decides which client apps may call the proxy. An empty list combined with EasyAuth's MISE evaluation results in `403` with `"this principal does not match any of the allowed applications"`. + +For this POC we explicitly trust the Microsoft Azure CLI client (`04b07795-8ddb-461a-bbee-02f9e1bf7b46`) so you can verify with `az account get-access-token`. In production, replace this with the client app IDs of the real callers (your console SP, APIM, another service, etc.). 
+ +```bash +az containerapp auth update \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --set identityProviders.azureActiveDirectory.validation.defaultAuthorizationPolicy.allowedApplications='["04b07795-8ddb-461a-bbee-02f9e1bf7b46"]' +``` + +Verify: + +```bash +az containerapp auth show \ + --name "$CONTAINER_APP_NAME" \ + --resource-group "$RG" \ + --query "identityProviders.azureActiveDirectory.validation.defaultAuthorizationPolicy.allowedApplications" +``` + +> [!NOTE] +> EasyAuth caches authorization decisions per principal for roughly 60 seconds. After changing `allowedApplications` or `allowedAudiences`, wait a minute (or acquire a fresh token) before re-testing — otherwise you'll see a cached deny. diff --git a/deployment/POC/secureProxy.sh b/deployment/POC/secureProxy.sh new file mode 100644 index 0000000..b484226 --- /dev/null +++ b/deployment/POC/secureProxy.sh @@ -0,0 +1,884 @@ +#!/usr/bin/env bash +# secureProxy.sh +# +# Configure Entra ID app registration for a proxy Container App. +# Idempotent: safe to re-run. Each step checks current state before modifying. +# +# Three modes: +# +# -m ACA (default) +# Creates app registration + delegated scope 'api.access' + client secret +# and enables Container Apps EasyAuth (Microsoft provider) on the proxy. +# Use when the platform should enforce auth on the proxy's ingress. +# +# -m APIM +# Creates app registration + app role 'API.Caller' and assigns that role +# to the proxy Container App's managed identity. ACA auth is NOT modified. +# Use when the proxy calls APIM with its managed identity and APIM +# validates the token via policy. +# +# -m ADDCLIENT (or pass -z with no -m) +# Adds an arbitrary client app ID to the API app's preAuthorizedApplications +# for the 'api.access' scope. If -n and -g are also given, also adds the +# client ID to the Container App's EasyAuth allowedApplications so the +# client's tokens are accepted at runtime (otherwise the proxy returns 403). 
#
# Usage:
#   secureProxy.sh -a <entra-app-name> [-n <container-app>] [-g <resource-group>] [-m <ACA|APIM|ADDCLIENT>] [-z <client-app-id>]

set -euo pipefail

# ----------------------------------------------------------------------------
# Logging helpers
# ----------------------------------------------------------------------------
# Each helper prints one colour-tagged line to stderr.
log()  { printf '\033[0;36m[%s]\033[0m %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
ok()   { printf '\033[0;32m[ OK ]\033[0m %s\n' "$*" >&2; }
warn() { printf '\033[0;33m[WARN]\033[0m %s\n' "$*" >&2; }
err()  { printf '\033[0;31m[FAIL]\033[0m %s\n' "$*" >&2; }

# die <message> [remediation hint...]
# Prints the message via err, then every remaining argument on its own
# arrow-prefixed line (printf re-applies the format once per argument),
# and exits 1.
die() {
  err "$1"
  shift
  if [[ $# -gt 0 ]]; then
    printf '\033[0;31m \u2192 %s\033[0m\n' "$@" >&2
  fi
  exit 1
}

# run_az <description> <command...>
# Runs the command with stderr captured to a temp file. On success the temp
# file is removed and nothing is printed. On failure it prints the description,
# the command line, and the captured stderr verbatim, then exits 1.
#
# The echoed command line is redacted: the value following --client-secret or
# --password (or given as --flag=value) is replaced with '***' so a failing
# call never writes a credential to the terminal or to CI logs.
run_az() {
  local description="$1"; shift
  local stderr_file
  stderr_file="$(mktemp)"
  if ! "$@" 2>"$stderr_file"; then
    local arg redact_next=0
    local shown=()
    for arg in "$@"; do
      if (( redact_next )); then
        shown+=('***')
        redact_next=0
        continue
      fi
      case "$arg" in
        --client-secret|--password)     shown+=("$arg"); redact_next=1 ;;
        --client-secret=*|--password=*) shown+=("${arg%%=*}=***") ;;
        *)                              shown+=("$arg") ;;
      esac
    done
    err "$description failed"
    printf '\033[0;31m command: %s\033[0m\n' "${shown[*]}" >&2
    printf '\033[0;31m azure error:\033[0m\n' >&2
    sed 's/^/ /' "$stderr_file" >&2
    rm -f "$stderr_file"
    exit 1
  fi
  rm -f "$stderr_file"
}

# Report the line number and command whenever the ERR trap fires.
trap 'err "Aborted at line $LINENO while running: $BASH_COMMAND"' ERR

# ============================================================================
# Global state — populated by functions; intentionally module-scoped so each
# step is small and readable. Keep this list as the single source of truth.
# ============================================================================

# Inputs (set by parse_args)
CONTAINER_APP_NAME=""
RG=""
ENTRA_APP_NAME=""
MODE="ACA"
EXTRA_CLIENT_ID=""   # set by -z / --authorize; client app ID to pre-authorize

# Azure context (set by require_logged_in)
CURRENT_SUB=""
TENANT_ID=""

# Container App facts (set by discover_container_app)
APP_FQDN=""
HEALTH_URL=""
MI_PRINCIPAL_ID=""
MI_TYPE=""

# App registration (set by ensure_app_registration / ensure_service_principal)
APP_ID=""
SP_OID=""

# ACA-only state
CLIENT_SECRET=""

# APIM-only state
readonly APP_ROLE_VALUE="API.Caller"
readonly APP_ROLE_DISPLAY="API Caller"
readonly APP_ROLE_DESC="Applications assigned this role may invoke the proxy via APIM."
APP_ROLE_ID=""
API_SP_OID=""

# ============================================================================
# CLI plumbing
# ============================================================================

# Prints the full option reference to stderr and exits 1.
# The heredoc delimiter is unquoted so $(basename "$0") is expanded.
usage() {
  cat <<EOF >&2
Configures an Entra ID app registration that secures a SimpleL7Proxy Container App.
Pick a mode to:
  - secure inbound traffic to the proxy via ACA EasyAuth (ACA mode)
  - secure outbound calls from the proxy to APIM, where the proxy authenticates
    with its managed identity and APIM validates the token (APIM mode)
  - pre-authorize additional client apps to call the proxy (ADDCLIENT mode)
Idempotent: safe to re-run.

Usage:
  $(basename "$0") -a <entra-app-name> [-n <container-app>] [-g <resource-group>]
                   [-m <ACA|APIM|ADDCLIENT>] [-z <client-app-id>]

Options:
  -a, --app-name <name>          Display name for the Entra app registration to create/reuse.
  -n, --container-app <name>     Container App name (the proxy). Required for ACA and APIM.
  -g, --resource-group <name>    Resource group of the Container App. Required for ACA and APIM.
  -m, --mode <mode>              One of: ACA (default), APIM, ADDCLIENT.
                                   ACA       configure ACA EasyAuth (Microsoft provider) on the proxy.
                                   APIM      create app role and assign it to the proxy managed identity;
                                             ACA auth is left untouched (APIM validates the token).
                                   ADDCLIENT add a client app ID to the API app's preAuthorizedApplications
                                             for the 'api.access' scope, and (if -n/-g are also given) to
                                             the Container App's EasyAuth allowedApplications list.
  -z, --authorize <client-app-id> Client app ID (GUID) to pre-authorize. Implies --mode ADDCLIENT.
  -h, --help                     Show this help.
EOF
  exit 1
}

# Parses the command line into the "Inputs" globals and validates them.
# Exits through die (or usage) on an unknown option, a missing option value,
# a missing required argument for the selected mode, an invalid mode, or a
# -z value that is not a GUID.
parse_args() {
  while [[ $# -gt 0 ]]; do
    # Every value-taking option needs a following argument; without this
    # guard 'shift 2' fails and the script aborts with no useful message.
    case "$1" in
      -n|--container-app|-g|--resource-group|-a|--app-name|-m|--mode|-z|--authorize)
        [[ $# -ge 2 ]] || die "Option '$1' requires a value." \
          "Run '$(basename "$0") -h' for the full option reference."
        ;;
    esac
    case "$1" in
      -n|--container-app)  CONTAINER_APP_NAME="${2:-}"; shift 2 ;;
      -g|--resource-group) RG="${2:-}"; shift 2 ;;
      -a|--app-name)       ENTRA_APP_NAME="${2:-}"; shift 2 ;;
      -m|--mode)           MODE="${2:-}"; shift 2 ;;
      -z|--authorize)      EXTRA_CLIENT_ID="${2:-}"; shift 2 ;;
      -h|--help)           usage ;;
      --)                  shift; break ;;
      -*)                  err "Unknown option: $1"; usage ;;
      *)                   err "Unexpected positional argument: $1"; usage ;;
    esac
  done

  # -z implies ADDCLIENT mode when -m wasn't explicitly given a non-default value.
  if [[ -n "$EXTRA_CLIENT_ID" && "$MODE" == "ACA" ]]; then
    MODE="ADDCLIENT"
  fi

  local script_name
  script_name="$(basename "$0")"
  # Each line below becomes its own '→ ...' remediation hint.
  local ex_desc1="Configures an Entra ID app registration that secures a SimpleL7Proxy Container App." \
        ex_desc2="Pick a mode:" \
        ex_desc3=" ACA - secure inbound traffic to the proxy via EasyAuth." \
        ex_desc4=" APIM - secure outbound calls from the proxy to APIM via managed identity." \
        ex_desc5=" ADDCLIENT - pre-authorize additional client apps to call the proxy." \
        ex_blank="" \
        ex_header="Examples:" \
        ex_aca=" ACA : $script_name -a <entra-app-name> -n <container-app> -g <resource-group>" \
        ex_apimmi=" APIM : $script_name -a <entra-app-name> -n <container-app> -g <resource-group> -m APIM" \
        ex_authorize=" ADDCLIENT : $script_name -a <entra-app-name> -z <client-app-id>" \
        ex_help="Run '$script_name -h' for the full option reference."

  local desc_args=("$ex_desc1" "$ex_desc2" "$ex_desc3" "$ex_desc4" "$ex_desc5" "$ex_blank")

  [[ -z "$ENTRA_APP_NAME" ]] && die "Missing required argument: -a <entra-app-name>" \
    "${desc_args[@]}" "$ex_header" "$ex_aca" "$ex_apimmi" "$ex_authorize" "$ex_help"

  case "$MODE" in
    ACA|APIM)
      [[ -z "$CONTAINER_APP_NAME" ]] && die "Missing required argument: -n <container-app> (mode=$MODE)" \
        "${desc_args[@]}" "$ex_header" "$ex_aca" "$ex_apimmi" "$ex_help"
      [[ -z "$RG" ]] && die "Missing required argument: -g <resource-group> (mode=$MODE)" \
        "${desc_args[@]}" "$ex_header" "$ex_aca" "$ex_apimmi" "$ex_help"
      ;;
    ADDCLIENT)
      [[ -z "$EXTRA_CLIENT_ID" ]] && die "Mode 'ADDCLIENT' requires -z <client-app-id>" \
        "${desc_args[@]}" "$ex_header" "$ex_authorize" "$ex_help"
      if [[ ! "$EXTRA_CLIENT_ID" =~ ^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$ ]]; then
        die "--authorize value is not a GUID: '$EXTRA_CLIENT_ID'" \
          "Pass the client's appId (a GUID), not its display name." \
          "$ex_authorize"
      fi
      ;;
    *) die "Invalid mode: '$MODE'" "Valid values: ACA, APIM, ADDCLIENT" ;;
  esac
  log "Mode: $MODE"
}

# ============================================================================
# Environment / context
# ============================================================================

# Verifies that az, jq and uuidgen are on PATH; dies with an install hint otherwise.
check_prerequisites() {
  command -v az >/dev/null || die \
    "Azure CLI ('az') is not installed or not on PATH." \
    "Install from https://learn.microsoft.com/cli/azure/install-azure-cli"

  command -v jq >/dev/null || die \
    "'jq' is required to manipulate app-registration JSON but was not found." \
    "Install with: sudo apt-get install jq (or: brew install jq)"

  command -v uuidgen >/dev/null || die \
    "'uuidgen' is required to generate scope IDs but was not found." \
    "Install the 'uuid-runtime' package (Debian/Ubuntu) or use a host that ships it."
}

# Sets CURRENT_SUB and TENANT_ID from the active az context.
# Dies if 'az account show' fails or returns no tenantId.
require_logged_in() {
  if ! az account show >/dev/null 2>&1; then
    die "Not logged in to Azure (or token expired)." \
      "Run: az login" \
      "If you have multiple subscriptions: az account set --subscription <name-or-id>"
  fi
  CURRENT_SUB="$(az account show --query name -o tsv | tr -d '\r\n')"
  TENANT_ID="$(az account show --query tenantId -o tsv | tr -d '\r\n')"
  [[ -z "$TENANT_ID" ]] && die \
    "Could not read tenantId from the current az context." \
    "Try: az logout && az login"
  log "Active subscription: $CURRENT_SUB"
}

# Dies if the resource group given with -g cannot be shown in the current subscription.
require_resource_group() {
  [[ -z "$RG" ]] && return   # ADDCLIENT mode does not need a resource group
  if ! az group show --name "$RG" >/dev/null 2>&1; then
    die "Resource group '$RG' not found in subscription '$CURRENT_SUB'." \
      "List available groups: az group list --query \"[].name\" -o tsv" \
      "Switch subscription: az account set --subscription <name-or-id>"
  fi
}

# Sets APP_FQDN, HEALTH_URL, MI_PRINCIPAL_ID, MI_TYPE.
# Skipped entirely in ADDCLIENT mode. Dies if the Container App cannot be
# read, if ACA mode finds no ingress FQDN, or if APIM mode finds no managed
# identity.
discover_container_app() {
  if [[ "$MODE" == "ADDCLIENT" ]]; then
    log "Skipping Container App discovery (mode=ADDCLIENT)"
    return
  fi
  log "Looking up Container App '$CONTAINER_APP_NAME'..."
  local stderr_file ca_json fqdn user_assigned_first
  stderr_file="$(mktemp)"
  # '|| true' keeps set -e from aborting so the Azure error can be reported below.
  ca_json="$(az containerapp show \
    --name "$CONTAINER_APP_NAME" \
    --resource-group "$RG" \
    -o json 2>"$stderr_file" || true)"

  if [[ -z "$ca_json" || "$ca_json" == "null" ]]; then
    local ca_err; ca_err="$(cat "$stderr_file")"; rm -f "$stderr_file"
    die "Container App '$CONTAINER_APP_NAME' does not exist in resource group '$RG'." \
      "List Container Apps in this group: az containerapp list -g '$RG' --query \"[].name\" -o tsv" \
      "Azure error: $ca_err"
  fi
  rm -f "$stderr_file"

  fqdn="$(echo "$ca_json" | jq -r '.properties.configuration.ingress.fqdn // empty')"
  MI_PRINCIPAL_ID="$(echo "$ca_json" | jq -r '.identity.principalId // empty')"
  MI_TYPE="$(echo "$ca_json" | jq -r '.identity.type // "None"')"
  user_assigned_first="$(echo "$ca_json" | jq -r '(.identity.userAssignedIdentities // {}) | to_entries | .[0].value.principalId // empty')"

  if [[ -n "$fqdn" ]]; then
    APP_FQDN="https://$fqdn"
    HEALTH_URL="$APP_FQDN/health"
    ok "Container App FQDN: $APP_FQDN"
  elif [[ "$MODE" == "ACA" ]]; then
    die "Container App '$CONTAINER_APP_NAME' has no ingress FQDN configured." \
      "EasyAuth requires ingress. Enable it and re-run, e.g.:" \
      " az containerapp ingress enable -n '$CONTAINER_APP_NAME' -g '$RG' --type external --target-port <port>"
  else
    warn "Container App has no ingress FQDN (OK for APIM mode; proxy is callee-only)."
  fi

  # In APIM mode, prefer system-assigned identity; fall back to first user-assigned.
  if [[ "$MODE" == "APIM" ]]; then
    [[ -z "$MI_PRINCIPAL_ID" ]] && MI_PRINCIPAL_ID="$user_assigned_first"
    if [[ -z "$MI_PRINCIPAL_ID" ]]; then
      die "Container App '$CONTAINER_APP_NAME' has no managed identity (identity.type='$MI_TYPE')." \
        "Enable a managed identity and re-run, e.g.:" \
        " az containerapp identity assign -n '$CONTAINER_APP_NAME' -g '$RG' --system-assigned"
    fi
    ok "Proxy managed identity principalId: $MI_PRINCIPAL_ID (identity.type=$MI_TYPE)"
  fi
}

# ============================================================================
# Entra app registration (shared by both modes)
# ============================================================================

# Sets APP_ID. Creates the app registration if it does not exist.
# Dies when more than one registration shares the display name, or when
# creation returns no appId.
ensure_app_registration() {
  log "Looking up Entra app registration '$ENTRA_APP_NAME'..."

  local match_count
  match_count="$(az ad app list --display-name "$ENTRA_APP_NAME" --query "length(@)" -o tsv 2>/dev/null | tr -d '\r\n' || echo 0)"
  if [[ "$match_count" -gt 1 ]]; then
    # Hints are printed through '%s', so escape sequences are NOT interpreted
    # there; the arrow must be a literal character.
    die "Multiple ($match_count) Entra app registrations are named '$ENTRA_APP_NAME'." \
      "This script cannot safely pick one. Resolve the ambiguity in the portal" \
      "(Entra ID → App registrations) or pass a different -a value."
  fi

  APP_ID="$(az ad app list --display-name "$ENTRA_APP_NAME" --query "[0].appId" -o tsv 2>/dev/null | tr -d '\r\n' || true)"
  if [[ -n "$APP_ID" ]]; then
    ok "Reusing existing app registration. APP_ID=$APP_ID"
    return
  fi

  log "Creating Entra app registration '$ENTRA_APP_NAME'..."
  local stderr_file err_msg
  stderr_file="$(mktemp)"
  APP_ID="$(az ad app create \
    --display-name "$ENTRA_APP_NAME" \
    --sign-in-audience AzureADMyOrg \
    --query appId -o tsv 2>"$stderr_file" | tr -d '\r\n' || true)"
  if [[ -z "$APP_ID" ]]; then
    err_msg="$(cat "$stderr_file")"; rm -f "$stderr_file"
    die "Failed to create Entra app registration '$ENTRA_APP_NAME'." \
      "You likely lack 'Application.ReadWrite.All' or equivalent in this tenant." \
      "Ask a tenant admin to either create the app or grant you the role." \
      "Azure error: $err_msg"
  fi
  rm -f "$stderr_file"
  ok "Created app registration. APP_ID=$APP_ID"
}

# Ensures api://$APP_ID is among the app registration's identifier URIs.
ensure_identifier_uri() {
  local expected="api://$APP_ID"
  local current
  current="$(az ad app show --id "$APP_ID" --query "identifierUris" -o json)"
  if echo "$current" | jq -e --arg u "$expected" 'index($u)' >/dev/null; then
    ok "Identifier URI already set: $expected"
    return
  fi
  log "Setting identifier URI: $expected"
  run_az "Setting identifier URI on $APP_ID" \
    az ad app update --id "$APP_ID" --identifier-uris "$expected"
  ok "Identifier URI set"
}

# Sets SP_OID. Creates the service principal if missing.
ensure_service_principal() {
  SP_OID="$(az ad sp list --filter "appId eq '$APP_ID'" --query "[0].id" -o tsv 2>/dev/null | tr -d '\r\n' || true)"
  if [[ -n "$SP_OID" ]]; then
    ok "Service principal already exists ($SP_OID)"
    return
  fi
  log "Creating service principal for $APP_ID..."
  run_az "Creating service principal for $APP_ID" \
    az ad sp create --id "$APP_ID" --output none
  # Re-query to pick up the object id of the principal that was just created.
  SP_OID="$(az ad sp list --filter "appId eq '$APP_ID'" --query "[0].id" -o tsv 2>/dev/null | tr -d '\r\n' || true)"
  ok "Service principal created ($SP_OID)"
}

# ============================================================================
# ACA mode steps
# ============================================================================

# Ensures the app registration exposes the 'api.access' delegated scope.
# No-op when a scope with that value already exists. Otherwise the new scope
# is APPENDED to oauth2PermissionScopes so that any other scopes already
# defined on a reused app registration are preserved.
ensure_delegated_scope() {
  local api_obj has_scope scope_id updated
  api_obj="$(az ad app show --id "$APP_ID" --query api -o json)"
  has_scope="$(echo "$api_obj" | jq -r '[.oauth2PermissionScopes[]? | select(.value == "api.access")] | length')"
  if [[ "$has_scope" != "0" ]]; then
    ok "Scope 'api.access' already exists"
    return
  fi

  log "Adding 'api.access' delegated scope..."
  scope_id="$(uuidgen | tr '[:upper:]' '[:lower:]')"
  updated="$(echo "$api_obj" | jq --arg id "$scope_id" '.oauth2PermissionScopes = ((.oauth2PermissionScopes // []) + [{
    adminConsentDescription: "Access the API",
    adminConsentDisplayName: "Admin Access",
    id: $id,
    isEnabled: true,
    type: "Admin",
    userConsentDescription: "Access the API",
    userConsentDisplayName: "User Access",
    value: "api.access"
  }])')"
  run_az "Adding 'api.access' scope to $APP_ID" \
    az ad app update --id "$APP_ID" --set api="$updated"
  ok "Scope 'api.access' added"
}

# Ensure the app registration issues v2.0 access tokens.
#
# Without requestedAccessTokenVersion=2, Entra issues v1.0 tokens when a caller
# requests them via `--resource api://<APP_ID>`, with aud="api://<APP_ID>". The ACA
# EasyAuth config we set uses the bare GUID as allowedAudiences, which only
# matches v2.0 tokens (aud=<APP_ID>). Setting v2 here aligns the two so a token
# acquired via either `--resource api://<APP_ID>` or `--scope api://<APP_ID>/.default`
# is accepted by EasyAuth.
ensure_v2_access_tokens() {
  local current api_obj updated
  # Strip CR/LF like every other tsv lookup in this script; a trailing '\r'
  # would make the comparison below fail and force a redundant update.
  current="$(az ad app show --id "$APP_ID" --query "api.requestedAccessTokenVersion" -o tsv 2>/dev/null | tr -d '\r\n' || echo "")"
  if [[ "$current" == "2" ]]; then
    ok "App registration already issues v2.0 access tokens"
    return
  fi

  log "Setting requestedAccessTokenVersion=2 on app registration..."
  api_obj="$(az ad app show --id "$APP_ID" --query api -o json)"
  updated="$(echo "$api_obj" | jq '.requestedAccessTokenVersion = 2')"
  run_az "Setting requestedAccessTokenVersion=2 on $APP_ID" \
    az ad app update --id "$APP_ID" --set api="$updated"
  ok "App registration set to issue v2.0 access tokens (aud will be bare GUID)"
}

# Pre-authorize a client app ID on the API's 'api.access' delegated scope.
# Used by ACA mode (for Azure CLI) and by ADDCLIENT mode (for arbitrary clients).
#
# Args: $1 = client app ID; $2 = label used in log messages (defaults to $1).
# Dies if the 'api.access' scope is not present on the app registration.
preauthorize_client() {
  local client_id="$1"
  local client_label="${2:-$client_id}"
  local api_obj scope_id existing updated
  api_obj="$(az ad app show --id "$APP_ID" --query api -o json)"
  scope_id="$(echo "$api_obj" | jq -r '[.oauth2PermissionScopes[]? | select(.value == "api.access")][0].id')"
  if [[ -z "$scope_id" || "$scope_id" == "null" ]]; then
    die "Cannot pre-authorize $client_label: 'api.access' scope not found on $APP_ID" \
      "Re-run ensure_delegated_scope, then retry"
  fi
  # Count entries where this client already holds this scope id.
  existing="$(echo "$api_obj" | jq -r --arg cli "$client_id" --arg sid "$scope_id" \
    '[.preAuthorizedApplications[]? | select(.appId == $cli) | .delegatedPermissionIds[]? | select(. == $sid)] | length')"
  if [[ "$existing" != "0" ]]; then
    ok "$client_label already pre-authorized for 'api.access'"
    return
  fi
  log "Pre-authorizing $client_label for 'api.access' scope..."
  # Drop any existing entry for this client, then add one carrying the scope id.
  updated="$(echo "$api_obj" | jq --arg cli "$client_id" --arg sid "$scope_id" '
    .preAuthorizedApplications = ((.preAuthorizedApplications // []) | map(select(.appId != $cli))) + [{
      appId: $cli,
      delegatedPermissionIds: [$sid]
    }]')"
  run_az "Pre-authorizing $client_label on $APP_ID" \
    az ad app update --id "$APP_ID" --set api="$updated"
  ok "$client_label pre-authorized for 'api.access'"
}

# Well-known Azure CLI public client ID. Used both as a pre-authorized client
# on the app registration (so `az account get-access-token` works without
# interactive consent) and as an allowed calling application on EasyAuth (so
# the bearer token's azp claim is accepted at authorization time).
AZURE_CLI_CLIENT_ID="04b07795-8ddb-461a-bbee-02f9e1bf7b46"

ensure_azure_cli_preauthorized() {
  # Pre-authorize Azure CLI on the api.access scope so token acquisition does
  # not trigger interactive consent (which fails with AADSTS650057 on a fresh
  # app reg).
  preauthorize_client "$AZURE_CLI_CLIENT_ID" "Azure CLI"
}

# Pre-authorizes the client app ID supplied with -z / --authorize.
authorize_extra_client() {
  preauthorize_client "$EXTRA_CLIENT_ID" "client $EXTRA_CLIENT_ID"
}

# Enables ID token issuance on the app registration unless it is already enabled.
ensure_id_token_issuance() {
  local enabled
  enabled="$(az ad app show --id "$APP_ID" --query "web.implicitGrantSettings.enableIdTokenIssuance" -o tsv | tr -d '\r\n')"
  if [[ "$enabled" == "true" ]]; then
    ok "ID token issuance already enabled"
    return
  fi
  log "Enabling ID token issuance..."
  run_az "Enabling ID token issuance on $APP_ID" \
    az ad app update --id "$APP_ID" --enable-id-token-issuance true
  ok "ID token issuance enabled"
}

# Sets CLIENT_SECRET. Intentionally NOT idempotent — credential reset --append
# always creates a new secret value; existing secrets are preserved.
create_client_secret() {
  log "Creating a new client secret (valid 30 days)..."
  local stderr_file err_msg end_date
  # 'date -d' is GNU-only; fall back to the BSD/macOS '-v' form so the script
  # also works on hosts where jq was installed via Homebrew.
  end_date="$(date -d '+30 days' '+%Y-%m-%d' 2>/dev/null || date -v+30d '+%Y-%m-%d' 2>/dev/null || true)"
  if [[ -z "$end_date" ]]; then
    die "Could not compute the client secret expiry date." \
      "Neither 'date -d' (GNU) nor 'date -v' (BSD/macOS) is supported by this host's 'date'."
  fi
  stderr_file="$(mktemp)"
  CLIENT_SECRET="$(az ad app credential reset \
    --id "$APP_ID" \
    --display-name "proxy-auth-secret" \
    --append \
    --end-date "$end_date" \
    --query password -o tsv 2>"$stderr_file" | tr -d '\r\n' || true)"
  if [[ -z "$CLIENT_SECRET" ]]; then
    err_msg="$(cat "$stderr_file")"; rm -f "$stderr_file"
    die "Failed to create client secret for $APP_ID." \
      "Common causes: insufficient permissions on the app, or the tenant blocks secret creation." \
      "Azure error: $err_msg"
  fi
  rm -f "$stderr_file"
  ok "Client secret created (held in memory only)"
}

# Step 3 in the POC doc: register Microsoft provider with bare-GUID audience.
configure_aca_provider() {
  log "Enabling EasyAuth (Microsoft provider) on Container App..."
  run_az "Enabling EasyAuth on Container App '$CONTAINER_APP_NAME'" \
    az containerapp auth microsoft update \
      --name "$CONTAINER_APP_NAME" \
      --resource-group "$RG" \
      --client-id "$APP_ID" \
      --client-secret "$CLIENT_SECRET" \
      --tenant-id "$TENANT_ID" \
      --allowed-audiences "$APP_ID" \
      --yes --output none
  ok "EasyAuth Microsoft provider configured (audience=$APP_ID)"
}

# After EasyAuth validates the bearer token, it enforces an authorization
# policy: by default only the registered clientId itself is an allowed
# 'calling application' (azp claim). Tokens minted by other clients (e.g. the
# Azure CLI, or an arbitrary client added via ADDCLIENT mode) are rejected
# with HTTP 403 / SubStatusCode 76 even though authentication succeeded,
# unless their app ID is on this list.
#
# Idempotent merge: reads the existing list, appends the given client ID if
# absent, dedupes, and writes back. Existing entries are preserved across
# repeated invocations (e.g. ACA mode adds the CLI, ADDCLIENT mode then adds
# another client without erasing the CLI).
#
# Args: [