From efbd26f841fb11a4804207db77c4bd500e58c5f3 Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Mon, 8 Jun 2026 19:28:47 -0500 Subject: [PATCH 01/15] Add signed batch run context --- .../cwms/cda/security/BatchJobContext.java | 160 ++++++++++++++++++ .../cwms/cda/security/CdaAccessManager.java | 1 + .../OpenIdConnectIdentitityProvider.java | 8 +- 3 files changed, 167 insertions(+), 2 deletions(-) create mode 100644 cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java diff --git a/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java b/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java new file mode 100644 index 0000000000..268ee7bce6 --- /dev/null +++ b/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java @@ -0,0 +1,160 @@ +package cwms.cda.security; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.flogger.FluentLogger; +import cwms.cda.ApiServlet; +import cwms.cda.datasource.ConnectionPreparingDataSource; +import cwms.cda.datasource.ConnectionPreparer; +import cwms.cda.datasource.DelegatingConnectionPreparer; +import cwms.cda.datasource.SessionOfficePreparer; +import io.javalin.http.Context; +import io.jsonwebtoken.Claims; +import io.jsonwebtoken.JwtException; +import io.jsonwebtoken.Jwts; +import io.jsonwebtoken.security.Keys; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.security.Key; +import java.util.Base64; +import java.util.Locale; +import java.util.Map; +import javax.sql.DataSource; +import javax.servlet.http.HttpServletResponse; + +public final class BatchJobContext { + private static final FluentLogger logger = FluentLogger.forEnclosingClass(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static final String HEADER = "X-CWMS-Job-Context"; + public static final String RUN_AS_OFFICE_ATTR = "BatchRunAsOffice"; + public static final String JOB_ID_ATTR = "BatchJobId"; + public static 
final String REQUESTED_BY_ATTR = "BatchRequestedBy"; + public static final String DISPATCH_SOURCE_ATTR = "BatchDispatchSource"; + + public static final String SECRET_PROPERTY = "cwms.dataapi.batch.jobContext.secret"; + public static final String PREVIOUS_SECRET_PROPERTY = "cwms.dataapi.batch.jobContext.previousSecret"; + public static final String KEY_ID_PROPERTY = "cwms.dataapi.batch.jobContext.keyId"; + public static final String ISSUER_PROPERTY = "cwms.dataapi.batch.jobContext.issuer"; + public static final String AUDIENCE_PROPERTY = "cwms.dataapi.batch.jobContext.audience"; + public static final String MACHINE_USERS_PROPERTY = "cwms.dataapi.batch.machineUsers"; + + private static final String DEFAULT_ISSUER = "cwms-batch-events"; + private static final String DEFAULT_AUDIENCE = "cwms-data-api"; + private static final String DEFAULT_MACHINE_USERS = ""; + + private BatchJobContext() { + } + + public static boolean isBatchMachineUser(String username) { + if (username == null) { + return false; + } + String machineUsers = readSetting(MACHINE_USERS_PROPERTY, DEFAULT_MACHINE_USERS); + if (machineUsers.isBlank()) { + return false; + } + for (String machineUser : machineUsers.split(",")) { + if (username.equalsIgnoreCase(machineUser.trim())) { + return true; + } + } + return false; + } + + public static void prepareContext(Context ctx, DataApiPrincipal principal) throws CwmsAuthException { + if (!isBatchMachineUser(principal.getName())) { + return; + } + + String token = ctx.header(HEADER); + if (token == null || token.isBlank()) { + throw new CwmsAuthException("Batch machine request missing signed job context", + HttpServletResponse.SC_UNAUTHORIZED); + } + + try { + Claims claims = parse(token); + String office = claims.get("run_as_office", String.class); + if (office == null || office.isBlank()) { + office = claims.get("office", String.class); + } + if (office == null || office.isBlank()) { + throw new CwmsAuthException("Batch job context missing run_as_office", + 
HttpServletResponse.SC_UNAUTHORIZED); + } + ctx.attribute(RUN_AS_OFFICE_ATTR, office.toUpperCase(Locale.ROOT)); + ctx.attribute(JOB_ID_ATTR, claims.get("job_id", String.class)); + ctx.attribute(REQUESTED_BY_ATTR, claims.get("requested_by", String.class)); + ctx.attribute(DISPATCH_SOURCE_ATTR, claims.get("dispatch_source", String.class)); + } catch (JwtException | IllegalArgumentException ex) { + logger.atFine().withCause(ex).log("Batch job context token validation failed."); + throw new CwmsAuthException("Batch job context token not valid", ex, + HttpServletResponse.SC_UNAUTHORIZED); + } + } + + public static void applyRunContext(Context ctx) { + String runAsOffice = ctx.attribute(RUN_AS_OFFICE_ATTR); + if (runAsOffice == null || runAsOffice.isBlank()) { + return; + } + + DataSource dataSource = ctx.attribute(ApiServlet.DATA_SOURCE); + ConnectionPreparer officePreparer = new SessionOfficePreparer(runAsOffice); + if (dataSource instanceof ConnectionPreparingDataSource) { + ConnectionPreparingDataSource preparingDataSource = (ConnectionPreparingDataSource) dataSource; + preparingDataSource.setPreparer(new DelegatingConnectionPreparer( + preparingDataSource.getPreparer(), officePreparer)); + } else { + ctx.attribute(ApiServlet.DATA_SOURCE, + new ConnectionPreparingDataSource(officePreparer, dataSource)); + } + } + + private static Claims parse(String token) { + String secret = secretForToken(token); + if (secret.length() < 32) { + throw new IllegalArgumentException("Batch job context secret must be at least 32 characters"); + } + Key key = Keys.hmacShaKeyFor(secret.getBytes(StandardCharsets.UTF_8)); + return Jwts.parserBuilder() + .requireIssuer(readSetting(ISSUER_PROPERTY, DEFAULT_ISSUER)) + .requireAudience(readSetting(AUDIENCE_PROPERTY, DEFAULT_AUDIENCE)) + .setSigningKey(key) + .build() + .parseClaimsJws(token) + .getBody(); + } + + private static String secretForToken(String token) { + String expectedKeyId = readSetting(KEY_ID_PROPERTY, "current"); + String keyId 
= keyIdForToken(token); + if (keyId == null || keyId.isBlank() || expectedKeyId.equals(keyId)) { + return readSetting(SECRET_PROPERTY, ""); + } + if ("previous".equals(keyId)) { + return readSetting(PREVIOUS_SECRET_PROPERTY, ""); + } + throw new IllegalArgumentException("Batch job context key id is not recognized"); + } + + private static String keyIdForToken(String token) { + String[] parts = token.split("\\."); + if (parts.length != 3) { + throw new IllegalArgumentException("Batch job context token is malformed"); + } + try { + byte[] headerBytes = Base64.getUrlDecoder().decode(parts[0]); + Map header = OBJECT_MAPPER.readValue(headerBytes, Map.class); + Object keyId = header.get("kid"); + return keyId instanceof String ? (String) keyId : null; + } catch (IllegalArgumentException | IOException e) { + throw new IllegalArgumentException("Batch job context token header is malformed", e); + } + } + + private static String readSetting(String key, String defaultValue) { + String value = System.getProperty(key, System.getenv(key)); + return value == null || value.isBlank() ? 
defaultValue : value; + } +} diff --git a/cwms-data-api/src/main/java/cwms/cda/security/CdaAccessManager.java b/cwms-data-api/src/main/java/cwms/cda/security/CdaAccessManager.java index ffb5c82c16..533073a7cf 100644 --- a/cwms-data-api/src/main/java/cwms/cda/security/CdaAccessManager.java +++ b/cwms-data-api/src/main/java/cwms/cda/security/CdaAccessManager.java @@ -41,6 +41,7 @@ public void manage(Handler handler, Context ctx, Set routeRoles) thr } checkRateLimit(ctx); prepareContext(ctx, principal); + BatchJobContext.applyRunContext(ctx); handler.handle(ctx); } diff --git a/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java b/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java index 48d1c8940c..340033dd3e 100644 --- a/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java +++ b/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java @@ -105,12 +105,16 @@ private DataApiPrincipal getUserFromToken(Context ctx) throws CwmsAuthException AuthDao dao = AuthDao.getInstance(JooqDao.getDslContext(ctx), ctx.attribute(ApiServlet.OFFICE_ID)); Optional principal = dao.getPrincipalFromPrincipal(oidcPrincipal); if (principal.isPresent()) { - return principal.get(); + DataApiPrincipal dataApiPrincipal = principal.get(); + BatchJobContext.prepareContext(ctx, dataApiPrincipal); + return dataApiPrincipal; } else if (CREATE_USERS) { final String preferredUserName = claims.get(PREFERRED_USERNAME_CLAIM, String.class); final String givenName = claims.get(GIVEN_NAME_CLAIM, String.class); final String email = claims.get(EMAIL_CLAIM, String.class); - return dao.createUser(preferredUserName, oidcPrincipal, givenName, email); + DataApiPrincipal dataApiPrincipal = dao.createUser(preferredUserName, oidcPrincipal, givenName, email); + BatchJobContext.prepareContext(ctx, dataApiPrincipal); + return dataApiPrincipal; } else { throw new CwmsAuthException("Not 
Authorized",HttpServletResponse.SC_UNAUTHORIZED); } From c7833eacc14d2797c790f5767bbe730a8a4f95b5 Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Mon, 8 Jun 2026 19:59:19 -0500 Subject: [PATCH 02/15] Report expired batch job context tokens --- .../src/main/java/cwms/cda/security/BatchJobContext.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java b/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java index 268ee7bce6..a9d96620bf 100644 --- a/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java +++ b/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java @@ -9,6 +9,7 @@ import cwms.cda.datasource.SessionOfficePreparer; import io.javalin.http.Context; import io.jsonwebtoken.Claims; +import io.jsonwebtoken.ExpiredJwtException; import io.jsonwebtoken.JwtException; import io.jsonwebtoken.Jwts; import io.jsonwebtoken.security.Keys; @@ -86,6 +87,10 @@ public static void prepareContext(Context ctx, DataApiPrincipal principal) throw ctx.attribute(JOB_ID_ATTR, claims.get("job_id", String.class)); ctx.attribute(REQUESTED_BY_ATTR, claims.get("requested_by", String.class)); ctx.attribute(DISPATCH_SOURCE_ATTR, claims.get("dispatch_source", String.class)); + } catch (ExpiredJwtException ex) { + logger.atFine().withCause(ex).log("Batch job context token expired."); + throw new CwmsAuthException("Batch job context token expired", ex, + HttpServletResponse.SC_UNAUTHORIZED); } catch (JwtException | IllegalArgumentException ex) { logger.atFine().withCause(ex).log("Batch job context token validation failed."); throw new CwmsAuthException("Batch job context token not valid", ex, From aec1c952ff85ada2e8dea8980aadcf35691736f5 Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Mon, 8 Jun 2026 21:27:32 -0500 Subject: [PATCH 03/15] Document signed batch machine run context --- .../0009-batch-machine-run-context.md | 198 ++++++++++++++++++ 
docs/source/decisions/index.rst | 1 + 2 files changed, 199 insertions(+) create mode 100644 docs/source/decisions/0009-batch-machine-run-context.md diff --git a/docs/source/decisions/0009-batch-machine-run-context.md b/docs/source/decisions/0009-batch-machine-run-context.md new file mode 100644 index 0000000000..1ce2374cc4 --- /dev/null +++ b/docs/source/decisions/0009-batch-machine-run-context.md @@ -0,0 +1,198 @@ +# Signed Batch Machine Run Context + +| Status | Proposed | +| :------------- | :---------------------- | +| **ADR #** | 0009 | +| **Author(s)** | CWBI Batch Runtime Team | +| **Sponsor** | HEC/USACE | +| **Date** | 6/8/2026 | +| **Supersedes** | N/A | + +## Objective + +Provide CWMS Data API with a trusted batch run context for jobs that execute through a shared machine identity. + +Batch runtimes will authenticate to CDA with a service account (via Keycloak). Each job will also provide a signed context token that identifies the authorized job launch context, including the office for which the scheduler or API approved the run. + +The signed context is **not** a replacement for normal CDA or database authorization. It establishes **who** launched the machine runtime and why. CDA and the CWMS database remain responsible for deciding whether the machine principal may read or write the requested resource office. + +## Motivation + +A single machine service account reduces Keycloak and AWS Batch configuration requirements. It also removes the natural office-specific identity that existed when each office had its own API key or job role in AWS. CDA therefore needs trusted context from the dispatcher so scripts cannot choose their own run authority by changing an environment variable, URI parameter, or request body. + +This is needed because CDA request office fields describe resource ownership, not caller authority. 
For example, a job approved for SWT (Tulsa District) may write data owned by another office when the mapped machine user has the required database roles. The request office (e.g. `&office=SWT` in the URI) identifies the target data; it does not identify who the job is running as. + +## User Benefit + +### For Batch Operators + +- Runtime job definitions can be managed by language or image instead of by office/image combinations. +- One machine service account can support scheduled and ad hoc batch execution. +- Office launch context is still available for audit and policy decisions. + +### For Script Authors + +- Scripts call CDA with standard bearer-token authentication. +- Scripts do not need per-office CDA API keys. +- Scripts can still read and write resource offices allowed by the mapped CDA database user. + +### For Security and Operations + +- CDA rejects machine requests that lack a signed, dispatcher-issued run context. +- Request parameters and payload fields are not trusted as caller authority. +- CDA audit records can include both the machine principal and the signed job context. 
+ +## Design Proposal + +### Batch Run Flow + +```mermaid +sequenceDiagram + participant Caller as Airflow or Batch Events API + participant Dispatcher as Batch Dispatcher + participant Runner as Batch Runtime + participant Keycloak as Keycloak + participant CDA as CWMS Data API + participant DB as CWMS Database + + Caller->>Dispatcher: Request job for office and script + Dispatcher->>Dispatcher: Authorize request and create job record + Dispatcher->>Dispatcher: Sign short-lived run context JWT + Dispatcher->>Runner: Start runtime with job id and brokered env + Runner->>Keycloak: Request machine token with client_credentials + Keycloak-->>Runner: Machine bearer token + Runner->>CDA: Request with bearer token and X-CWMS-Job-Context + CDA->>CDA: Validate machine principal and signed run context + CDA->>DB: Execute normal CDA/CWMS authorization + DB-->>CDA: Authorized result or denial + CDA-->>Runner: CDA response +``` + +### Run Context Token + +The dispatcher signs a short-lived JWT from the authoritative job record. The runner sends the token to CDA in the `X-CWMS-Job-Context` header. + +The token contains: + +| Claim | Description | +| ---------------------------- | ----------------------------------------------------- | +| `iss` | Trusted dispatcher issuer | +| `aud` | CDA audience | +| `iat` | Issued-at time | +| `exp` | Expiration time | +| `job_id` | Batch job identifier | +| `script_id` or `script_slug` | Script identity | +| `run_as_office` | Office context authorized for the job launch | +| `requested_by` | User or system that requested the job, when available | +| `dispatch_source` | Source such as `airflow` or `api` | + +The `run_as_office` claim represents the authorized launch context. It is not the same as a resource office on a CDA endpoint. + +### CDA Behavior + +CDA validates signed run context only for configured batch machine users. + +For those users, CDA will: + +- Require `X-CWMS-Job-Context`. 
+- Validate signature, issuer, audience, and expiration. +- Read job and run context from signed claims. +- Reject missing, expired, forged, or wrong-audience tokens. +- Make the run context available for audit logging and default session context where appropriate. + +CDA will not: + +- Treat request `office`, `office-id`, or body office fields as caller authority. +- Reject a request solely because the target resource office differs from `run_as_office`. +- Use signed run context to bypass route roles or database office roles. + +Normal CDA route authorization and CWMS database permissions determine whether the machine user can act on the requested resource office. + +### Configuration + +The Java API is configured with system properties or environment variables: + +| Setting | Description | +| ---------------------------------------- | -------------------------------------------------------------- | +| `cwms.dataapi.batch.jobContext.secret` | Signing secret for validating HS256 job context tokens | +| `cwms.dataapi.batch.jobContext.issuer` | Expected dispatcher issuer | +| `cwms.dataapi.batch.jobContext.audience` | Expected CDA audience | +| `cwms.dataapi.batch.machineUsers` | Comma-separated CDA users allowed to present batch run context | + +The signing secret belongs in a managed secret store. A later hardening step should use asymmetric signing or KMS-backed verification so CDA can verify run context without sharing the signing key. + +## Alternatives Considered + +### Per-Office Keycloak Service Accounts + +Create one service account per office or trust boundary. + +- **Pros**: Office context is represented directly by the service account. +- **Cons**: Recreates service-account and secret sprawl as offices and runtimes grow. +- **Rejected**: The design goal is one machine identity for batch runtimes. + +### Per-Office Batch Job Definitions and API Keys + +Continue using separate Batch definitions and CDA API secrets per office/runtime combination. 
+ +- **Pros**: Uses the existing model. +- **Cons**: Requires hard-coded expansion across offices, runtimes, images, and secrets. +- **Rejected**: The dynamic runtime model is intended to remove this duplication. + +### Trust Request or Environment Office + +Use `OFFICE`, URI parameters, query parameters, or request body fields to decide who the job is running as. + +- **Pros**: Simple to pass through the runtime. +- **Cons**: These values are controlled by scripts and often identify target data ownership rather than caller authority. +- **Rejected**: Request office is resource context, not trusted run context. + +### Signed Dispatcher-Issued Run Context + +Use one machine identity and require a short-lived signed token from the trusted dispatcher. + +- **Pros**: Reduces runtime duplication while preserving trusted job launch context and normal CDA/DB authorization. +- **Cons**: Requires token validation and signing key management. +- **Selected**: Provides the required trust boundary without per-office machine identities. + +## Compatibility + +Existing user API key and user OIDC flows are unchanged. + +Non-machine users do not need `X-CWMS-Job-Context`. Configured batch machine users must provide a valid signed run context. + +Endpoint resource-office semantics are unchanged. Controllers and DAOs may continue to use request office values to retrieve or store CWMS resources. The database remains the source of truth for whether the active CDA user has roles for those resources. + +## Implementation Status + +### Proposed + +- Add CDA validation for `X-CWMS-Job-Context` on configured batch machine users. +- Preserve signed run context separately from request resource office. +- Expose run context for audit logging and default session behavior where appropriate. +- Add dispatcher-side signing in the batch events service. +- Add runner support for forwarding `X-CWMS-Job-Context` with CDA requests. 
+ +## Criteria + +### Functional Requirements + +- A batch job launched through Airflow or the ad hoc API can call CDA using the shared machine Keycloak service account. +- CDA rejects configured machine-user requests that omit or forge signed run context. +- CDA records signed job/run context for audit. +- Resource office access remains controlled by CDA route roles and CWMS database roles. +- A job with `run_as_office=SWT` can act on another office's resource data only when the mapped machine user has the required roles for that resource office. + +### Test Scenarios + +- **Direct API path**: An authorized user submits an SWT job through the batch events API; CDA accepts the valid signed run context. +- **Airflow path**: Airflow triggers the same job path; CDA receives and validates the signed run context. +- **Cross-office allowed**: Signed run context is SWT, target resource office is MVS or SPK, and the request succeeds when the machine user has the needed DB roles. +- **Cross-office denied**: Signed run context is SWT, target resource office is unauthorized for the machine user, and CDA/DB returns `403`. +- **Forgery denied**: A script changes `OFFICE` or request `office` without a valid signed context; CDA rejects the request. + +## Conclusion + +Signed batch run context allows CDA to support dynamic batch runtimes with one Keycloak service account while preserving the existing CWMS authorization model. + +The token represents trusted job launch context. It does not redefine resource office semantics and does not bypass normal CDA or database authorization for the data being read or written. diff --git a/docs/source/decisions/index.rst b/docs/source/decisions/index.rst index 0dc16c28f5..f614a295ce 100644 --- a/docs/source/decisions/index.rst +++ b/docs/source/decisions/index.rst @@ -25,3 +25,4 @@ Some decisions may also be a proposal and marked appropriately. 
CDA Authorization Filtering <./0006-cda-authorization-filtering.md> Access Management Clients <./0007-access-management-clients.md> Timeseries CSV Format <./0008-timeseries-csv-format.rst> + Signed Batch Machine Run Context <./0009-batch-machine-run-context.md> From db72d62ee1f7df9ae4b36f8e97aa8f1cfa0a6646 Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Mon, 8 Jun 2026 23:38:14 -0500 Subject: [PATCH 04/15] Test signed batch job context --- .../cda/security/BatchJobContextTest.java | 283 ++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java diff --git a/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java b/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java new file mode 100644 index 0000000000..4166d08fd4 --- /dev/null +++ b/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java @@ -0,0 +1,283 @@ +package cwms.cda.security; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.doAnswer; + +import cwms.cda.ApiServlet; +import cwms.cda.datasource.ConnectionPreparingDataSource; +import io.javalin.http.Context; +import io.jsonwebtoken.Jwts; +import io.jsonwebtoken.SignatureAlgorithm; +import io.jsonwebtoken.security.Keys; +import java.nio.charset.StandardCharsets; +import java.io.PrintWriter; +import java.security.Key; +import java.sql.Connection; +import java.sql.SQLException; +import java.time.Instant; +import java.util.Date; +import java.util.HashMap; +import 
java.util.Map; +import java.util.Set; +import java.util.logging.Logger; +import javax.sql.DataSource; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; +import org.mockito.stubbing.Answer; + +class BatchJobContextTest { + private static final String SECRET = "test-batch-context-secret-32-characters"; + private static final String OTHER_SECRET = "other-batch-context-secret-32-chars"; + private static final String ISSUER = "cwms-batch-events"; + private static final String AUDIENCE = "cwms-data-api"; + private static final String MACHINE_USER = "SERVICE-ACCOUNT-CWMS-BATCH-RUNNER"; + + @AfterEach + void clearProperties() { + System.clearProperty(BatchJobContext.SECRET_PROPERTY); + System.clearProperty(BatchJobContext.PREVIOUS_SECRET_PROPERTY); + System.clearProperty(BatchJobContext.KEY_ID_PROPERTY); + System.clearProperty(BatchJobContext.ISSUER_PROPERTY); + System.clearProperty(BatchJobContext.AUDIENCE_PROPERTY); + System.clearProperty(BatchJobContext.MACHINE_USERS_PROPERTY); + } + + @Test + void machineUserDetectionReturnsFalseWhenUnset() { + assertFalse(BatchJobContext.isBatchMachineUser(MACHINE_USER)); + } + + @Test + void machineUserDetectionMatchesConfiguredUsersCaseInsensitively() { + System.setProperty(BatchJobContext.MACHINE_USERS_PROPERTY, + "some-user, service-account-cwms-batch-runner "); + + assertTrue(BatchJobContext.isBatchMachineUser(MACHINE_USER)); + } + + @Test + void prepareContextDoesNotRequireTokenForNormalUsers() throws CwmsAuthException { + Context ctx = contextWithHeaders(Map.of()); + DataApiPrincipal principal = new DataApiPrincipal("normal-user", Set.of()); + + BatchJobContext.prepareContext(ctx, principal); + + assertNull(ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR)); + } + + @Test + void validTokenSetsBatchContextAttributes() throws CwmsAuthException { + configureBatchContext(); + String token = token(Map.of( + 
"run_as_office", "swt", + "job_id", "job-123", + "requested_by", "m5hectest", + "dispatch_source", "api" + ), SECRET, ISSUER, AUDIENCE, Instant.now().plusSeconds(300)); + Context ctx = contextWithHeaders(Map.of(BatchJobContext.HEADER, token)); + + BatchJobContext.prepareContext(ctx, machinePrincipal()); + + assertEquals("SWT", ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR)); + assertEquals("job-123", ctx.attribute(BatchJobContext.JOB_ID_ATTR)); + assertEquals("m5hectest", ctx.attribute(BatchJobContext.REQUESTED_BY_ATTR)); + assertEquals("api", ctx.attribute(BatchJobContext.DISPATCH_SOURCE_ATTR)); + } + + @Test + void missingTokenForMachineUserThrowsUnauthorized() { + configureBatchContext(); + Context ctx = contextWithHeaders(Map.of()); + + CwmsAuthException ex = assertThrows(CwmsAuthException.class, + () -> BatchJobContext.prepareContext(ctx, machinePrincipal())); + + assertEquals(HttpServletResponse.SC_UNAUTHORIZED, ex.getAuthFailCode()); + assertEquals("Batch machine request missing signed job context", ex.getMessage()); + } + + @Test + void expiredTokenThrowsSpecificUnauthorizedMessage() { + configureBatchContext(); + String token = token(Map.of("run_as_office", "SWT"), SECRET, ISSUER, AUDIENCE, + Instant.now().minusSeconds(60)); + Context ctx = contextWithHeaders(Map.of(BatchJobContext.HEADER, token)); + + CwmsAuthException ex = assertThrows(CwmsAuthException.class, + () -> BatchJobContext.prepareContext(ctx, machinePrincipal())); + + assertEquals(HttpServletResponse.SC_UNAUTHORIZED, ex.getAuthFailCode()); + assertEquals("Batch job context token expired", ex.getMessage()); + } + + @Test + void forgedTokenThrowsInvalidMessage() { + configureBatchContext(); + String token = token(Map.of("run_as_office", "SWT"), OTHER_SECRET, ISSUER, AUDIENCE, + Instant.now().plusSeconds(300)); + Context ctx = contextWithHeaders(Map.of(BatchJobContext.HEADER, token)); + + CwmsAuthException ex = assertThrows(CwmsAuthException.class, + () -> BatchJobContext.prepareContext(ctx, 
machinePrincipal())); + + assertEquals(HttpServletResponse.SC_UNAUTHORIZED, ex.getAuthFailCode()); + assertEquals("Batch job context token not valid", ex.getMessage()); + } + + @Test + void wrongIssuerOrAudienceIsRejected() { + configureBatchContext(); + String wrongIssuer = token(Map.of("run_as_office", "SWT"), SECRET, "other-issuer", + AUDIENCE, Instant.now().plusSeconds(300)); + String wrongAudience = token(Map.of("run_as_office", "SWT"), SECRET, ISSUER, + "other-audience", Instant.now().plusSeconds(300)); + + CwmsAuthException issuerEx = assertThrows(CwmsAuthException.class, + () -> BatchJobContext.prepareContext( + contextWithHeaders(Map.of(BatchJobContext.HEADER, wrongIssuer)), machinePrincipal())); + CwmsAuthException audienceEx = assertThrows(CwmsAuthException.class, + () -> BatchJobContext.prepareContext( + contextWithHeaders(Map.of(BatchJobContext.HEADER, wrongAudience)), machinePrincipal())); + + assertEquals("Batch job context token not valid", issuerEx.getMessage()); + assertEquals("Batch job context token not valid", audienceEx.getMessage()); + } + + @Test + void legacyOfficeClaimIsUsedWhenRunAsOfficeIsMissing() throws CwmsAuthException { + configureBatchContext(); + String token = token(Map.of("office", "spk"), SECRET, ISSUER, AUDIENCE, + Instant.now().plusSeconds(300)); + Context ctx = contextWithHeaders(Map.of(BatchJobContext.HEADER, token)); + + BatchJobContext.prepareContext(ctx, machinePrincipal()); + + assertEquals("SPK", ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR)); + } + + @Test + void applyRunContextWrapsDataSourceWhenOfficeIsPresent() { + Context ctx = contextWithAttributes(); + DataSource dataSource = new StubDataSource(); + ctx.attribute(ApiServlet.DATA_SOURCE, dataSource); + ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR, "SWT"); + + BatchJobContext.applyRunContext(ctx); + + assertInstanceOf(ConnectionPreparingDataSource.class, ctx.attribute(ApiServlet.DATA_SOURCE)); + } + + @Test + void 
applyRunContextLeavesDataSourceUnchangedWhenOfficeIsMissing() { + Context ctx = contextWithAttributes(); + DataSource dataSource = new StubDataSource(); + ctx.attribute(ApiServlet.DATA_SOURCE, dataSource); + + BatchJobContext.applyRunContext(ctx); + + assertSame(dataSource, ctx.attribute(ApiServlet.DATA_SOURCE)); + } + + private static void configureBatchContext() { + System.setProperty(BatchJobContext.SECRET_PROPERTY, SECRET); + System.setProperty(BatchJobContext.ISSUER_PROPERTY, ISSUER); + System.setProperty(BatchJobContext.AUDIENCE_PROPERTY, AUDIENCE); + System.setProperty(BatchJobContext.MACHINE_USERS_PROPERTY, MACHINE_USER); + } + + private static DataApiPrincipal machinePrincipal() { + return new DataApiPrincipal(MACHINE_USER, Set.of()); + } + + private static String token(Map claims, String secret, String issuer, + String audience, Instant expiration) { + Key key = Keys.hmacShaKeyFor(secret.getBytes(StandardCharsets.UTF_8)); + return Jwts.builder() + .setHeaderParam("kid", "current") + .setIssuer(issuer) + .setAudience(audience) + .setIssuedAt(Date.from(Instant.now())) + .setExpiration(Date.from(expiration)) + .addClaims(claims) + .signWith(key, SignatureAlgorithm.HS256) + .compact(); + } + + private static Context contextWithHeaders(Map headers) { + return context(headers); + } + + private static Context contextWithAttributes() { + return context(Map.of()); + } + + private static Context context(Map headers) { + HttpServletRequest request = mock(HttpServletRequest.class); + HttpServletResponse response = mock(HttpServletResponse.class); + Map attributes = new HashMap<>(); + org.mockito.Mockito.when(request.getHeader(org.mockito.ArgumentMatchers.anyString())) + .thenAnswer((Answer) invocation -> headers.get(invocation.getArgument(0))); + org.mockito.Mockito.when(request.getAttribute(org.mockito.ArgumentMatchers.anyString())) + .thenAnswer((Answer) invocation -> attributes.get(invocation.getArgument(0))); + doAnswer((Answer) invocation -> { + 
attributes.put(invocation.getArgument(0), invocation.getArgument(1)); + return null; + }) + .when(request).setAttribute(org.mockito.ArgumentMatchers.anyString(), org.mockito.ArgumentMatchers.any()); + return new Context(request, response, new HashMap<>()); + } + + private static final class StubDataSource implements DataSource { + @Override + public Connection getConnection() throws SQLException { + throw new SQLException("not used"); + } + + @Override + public Connection getConnection(String username, String password) throws SQLException { + throw new SQLException("not used"); + } + + @Override + public PrintWriter getLogWriter() { + return null; + } + + @Override + public void setLogWriter(PrintWriter out) { + } + + @Override + public void setLoginTimeout(int seconds) { + } + + @Override + public int getLoginTimeout() { + return 0; + } + + @Override + public Logger getParentLogger() { + return Logger.getGlobal(); + } + + @Override + public T unwrap(Class iface) throws SQLException { + throw new SQLException("not used"); + } + + @Override + public boolean isWrapperFor(Class iface) { + return false; + } + } +} From bc40f8b7afe3923dcda0dfa7a258f1a6efd0df0d Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Wed, 10 Jun 2026 07:54:28 -0500 Subject: [PATCH 05/15] Harden batch machine context handling --- .../cwms/cda/security/BatchJobContext.java | 6 ----- .../OpenIdConnectIdentitityProvider.java | 4 ++++ .../cda/security/BatchJobContextTest.java | 5 +--- .../0009-batch-machine-run-context.md | 23 ++++++++++++++----- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java b/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java index a9d96620bf..0fd3ca08e3 100644 --- a/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java +++ b/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java @@ -28,9 +28,6 @@ public final class BatchJobContext { public static 
final String HEADER = "X-CWMS-Job-Context"; public static final String RUN_AS_OFFICE_ATTR = "BatchRunAsOffice"; - public static final String JOB_ID_ATTR = "BatchJobId"; - public static final String REQUESTED_BY_ATTR = "BatchRequestedBy"; - public static final String DISPATCH_SOURCE_ATTR = "BatchDispatchSource"; public static final String SECRET_PROPERTY = "cwms.dataapi.batch.jobContext.secret"; public static final String PREVIOUS_SECRET_PROPERTY = "cwms.dataapi.batch.jobContext.previousSecret"; @@ -84,9 +81,6 @@ public static void prepareContext(Context ctx, DataApiPrincipal principal) throw HttpServletResponse.SC_UNAUTHORIZED); } ctx.attribute(RUN_AS_OFFICE_ATTR, office.toUpperCase(Locale.ROOT)); - ctx.attribute(JOB_ID_ATTR, claims.get("job_id", String.class)); - ctx.attribute(REQUESTED_BY_ATTR, claims.get("requested_by", String.class)); - ctx.attribute(DISPATCH_SOURCE_ATTR, claims.get("dispatch_source", String.class)); } catch (ExpiredJwtException ex) { logger.atFine().withCause(ex).log("Batch job context token expired."); throw new CwmsAuthException("Batch job context token expired", ex, diff --git a/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java b/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java index 340033dd3e..4339536f19 100644 --- a/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java +++ b/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java @@ -110,6 +110,10 @@ private DataApiPrincipal getUserFromToken(Context ctx) throws CwmsAuthException return dataApiPrincipal; } else if (CREATE_USERS) { final String preferredUserName = claims.get(PREFERRED_USERNAME_CLAIM, String.class); + if (BatchJobContext.isBatchMachineUser(preferredUserName)) { + throw new CwmsAuthException("Batch machine principal is not registered", + HttpServletResponse.SC_UNAUTHORIZED); + } final String givenName = claims.get(GIVEN_NAME_CLAIM, String.class); final 
String email = claims.get(EMAIL_CLAIM, String.class); DataApiPrincipal dataApiPrincipal = dao.createUser(preferredUserName, oidcPrincipal, givenName, email); diff --git a/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java b/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java index 4166d08fd4..e28cfd1ddf 100644 --- a/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java +++ b/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java @@ -75,7 +75,7 @@ void prepareContextDoesNotRequireTokenForNormalUsers() throws CwmsAuthException } @Test - void validTokenSetsBatchContextAttributes() throws CwmsAuthException { + void validTokenSetsRunContextAttribute() throws CwmsAuthException { configureBatchContext(); String token = token(Map.of( "run_as_office", "swt", @@ -88,9 +88,6 @@ void validTokenSetsBatchContextAttributes() throws CwmsAuthException { BatchJobContext.prepareContext(ctx, machinePrincipal()); assertEquals("SWT", ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR)); - assertEquals("job-123", ctx.attribute(BatchJobContext.JOB_ID_ATTR)); - assertEquals("m5hectest", ctx.attribute(BatchJobContext.REQUESTED_BY_ATTR)); - assertEquals("api", ctx.attribute(BatchJobContext.DISPATCH_SOURCE_ATTR)); } @Test diff --git a/docs/source/decisions/0009-batch-machine-run-context.md b/docs/source/decisions/0009-batch-machine-run-context.md index 1ce2374cc4..707d1e05df 100644 --- a/docs/source/decisions/0009-batch-machine-run-context.md +++ b/docs/source/decisions/0009-batch-machine-run-context.md @@ -12,9 +12,9 @@ Provide CWMS Data API with a trusted batch run context for jobs that execute through a shared machine identity. -Batch runtimes will authenticate to CDA with a service account (via Keycloak). Each job will also provide a signed context token that identifies the authorized job launch context, including the office for which the scheduler or API approved the run. 
+Batch runtimes will authenticate to CDA with a service account (via Keycloak). Each job will also provide trusted launch context, including the office for which the scheduler or API approved the run. The preferred long-term shape is for Keycloak to mint that context into the access token when it can safely receive per-job values. The signed dispatcher context described here is the fallback proposal when Keycloak cannot provide dynamic job context without a custom extension. -The signed context is **not** a replacement for normal CDA or database authorization. It establishes **who** launched the machine runtime and why. CDA and the CWMS database remain responsible for deciding whether the machine principal may read or write the requested resource office. +The signed context is **not** a replacement for normal CDA or database authorization. It establishes **who** launched the machine runtime and why. CDA and the CWMS database remain responsible for deciding whether the machine principal may read or write the requested resource office. The machine principal must already be registered in CDA and the CWMS database; CDA must not auto-create batch machine users. ## Motivation @@ -96,15 +96,16 @@ For those users, CDA will: - Require `X-CWMS-Job-Context`. - Validate signature, issuer, audience, and expiration. -- Read job and run context from signed claims. +- Read only the run office needed to establish session context from signed claims. - Reject missing, expired, forged, or wrong-audience tokens. -- Make the run context available for audit logging and default session context where appropriate. +- Make additional job context available to logging only when a logging-specific mechanism exists. CDA will not: - Treat request `office`, `office-id`, or body office fields as caller authority. - Reject a request solely because the target resource office differs from `run_as_office`. - Use signed run context to bypass route roles or database office roles. 
+- Expose job identifiers or requester metadata as general request attributes for downstream controllers. Normal CDA route authorization and CWMS database permissions determine whether the machine user can act on the requested resource office. @@ -153,7 +154,15 @@ Use one machine identity and require a short-lived signed token from the trusted - **Pros**: Reduces runtime duplication while preserving trusted job launch context and normal CDA/DB authorization. - **Cons**: Requires token validation and signing key management. -- **Selected**: Provides the required trust boundary without per-office machine identities. +- **Fallback proposal**: Provides the required trust boundary without per-office machine identities if Keycloak cannot mint dynamic job context into the access token. + +### Keycloak-Minted Job Context Claims + +Have Keycloak receive trusted per-job context during token minting and include that context in the normal access token. + +- **Pros**: CDA validates one JWT from one issuer and does not need a second signing secret. +- **Cons**: Requires proof that Keycloak can safely receive dynamic per-job values such as `run_as_office` and `job_id` without a custom extension. +- **Preferred if feasible**: This will be investigated before the signed dispatcher context is adopted for production. ## Compatibility @@ -169,9 +178,10 @@ Endpoint resource-office semantics are unchanged. Controllers and DAOs may conti - Add CDA validation for `X-CWMS-Job-Context` on configured batch machine users. - Preserve signed run context separately from request resource office. -- Expose run context for audit logging and default session behavior where appropriate. +- Use run context only for session behavior in CDA; reserve job/requester metadata for future logging. - Add dispatcher-side signing in the batch events service. - Add runner support for forwarding `X-CWMS-Job-Context` with CDA requests. 
+- Investigate Keycloak-minted dynamic job context claims before production adoption. ## Criteria @@ -179,6 +189,7 @@ Endpoint resource-office semantics are unchanged. Controllers and DAOs may conti - A batch job launched through Airflow or the ad hoc API can call CDA using the shared machine Keycloak service account. - CDA rejects configured machine-user requests that omit or forge signed run context. +- CDA rejects configured batch machine principals that are not already registered in CDA/DB. - CDA records signed job/run context for audit. - Resource office access remains controlled by CDA route roles and CWMS database roles. - A job with `run_as_office=SWT` can act on another office's resource data only when the mapped machine user has the required roles for that resource office. From 819a2a18dfa156eb317698420ce8496ed162858f Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Wed, 10 Jun 2026 08:31:18 -0500 Subject: [PATCH 06/15] Support Keycloak batch run context claims --- .../cwms/cda/security/BatchJobContext.java | 52 +++++++++++++--- .../OpenIdConnectIdentitityProvider.java | 6 +- .../cda/security/BatchJobContextTest.java | 51 ++++++++++++++++ .../0009-batch-machine-run-context.md | 60 ++++++++++++------- 4 files changed, 134 insertions(+), 35 deletions(-) diff --git a/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java b/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java index 0fd3ca08e3..b3a45c4207 100644 --- a/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java +++ b/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java @@ -28,6 +28,9 @@ public final class BatchJobContext { public static final String HEADER = "X-CWMS-Job-Context"; public static final String RUN_AS_OFFICE_ATTR = "BatchRunAsOffice"; + public static final String MACHINE_AUTH_CLAIM = "machine_auth"; + public static final String RUN_AS_OFFICE_CLAIM = "run_as_office"; + public static final String LEGACY_OFFICE_CLAIM = "office"; 
public static final String SECRET_PROPERTY = "cwms.dataapi.batch.jobContext.secret"; public static final String PREVIOUS_SECRET_PROPERTY = "cwms.dataapi.batch.jobContext.previousSecret"; @@ -59,6 +62,19 @@ public static boolean isBatchMachineUser(String username) { return false; } + public static boolean isBatchMachinePrincipal(String username, Claims claims) { + return hasMachineAuthClaim(claims) || isBatchMachineUser(username); + } + + public static void prepareContext(Context ctx, DataApiPrincipal principal, Claims claims) + throws CwmsAuthException { + if (hasMachineAuthClaim(claims)) { + setRunOfficeFromClaims(ctx, claims); + return; + } + prepareContext(ctx, principal); + } + public static void prepareContext(Context ctx, DataApiPrincipal principal) throws CwmsAuthException { if (!isBatchMachineUser(principal.getName())) { return; @@ -72,15 +88,7 @@ public static void prepareContext(Context ctx, DataApiPrincipal principal) throw try { Claims claims = parse(token); - String office = claims.get("run_as_office", String.class); - if (office == null || office.isBlank()) { - office = claims.get("office", String.class); - } - if (office == null || office.isBlank()) { - throw new CwmsAuthException("Batch job context missing run_as_office", - HttpServletResponse.SC_UNAUTHORIZED); - } - ctx.attribute(RUN_AS_OFFICE_ATTR, office.toUpperCase(Locale.ROOT)); + setRunOfficeFromClaims(ctx, claims); } catch (ExpiredJwtException ex) { logger.atFine().withCause(ex).log("Batch job context token expired."); throw new CwmsAuthException("Batch job context token expired", ex, @@ -92,6 +100,32 @@ public static void prepareContext(Context ctx, DataApiPrincipal principal) throw } } + private static boolean hasMachineAuthClaim(Claims claims) { + if (claims == null) { + return false; + } + Object value = claims.get(MACHINE_AUTH_CLAIM); + if (value instanceof Boolean) { + return (Boolean) value; + } + if (value instanceof String) { + return Boolean.parseBoolean((String) value); + } + 
return false; + } + + private static void setRunOfficeFromClaims(Context ctx, Claims claims) throws CwmsAuthException { + String office = claims.get(RUN_AS_OFFICE_CLAIM, String.class); + if (office == null || office.isBlank()) { + office = claims.get(LEGACY_OFFICE_CLAIM, String.class); + } + if (office == null || office.isBlank()) { + throw new CwmsAuthException("Batch job context missing run_as_office", + HttpServletResponse.SC_UNAUTHORIZED); + } + ctx.attribute(RUN_AS_OFFICE_ATTR, office.toUpperCase(Locale.ROOT)); + } + public static void applyRunContext(Context ctx) { String runAsOffice = ctx.attribute(RUN_AS_OFFICE_ATTR); if (runAsOffice == null || runAsOffice.isBlank()) { diff --git a/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java b/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java index 4339536f19..f636724712 100644 --- a/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java +++ b/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java @@ -106,18 +106,18 @@ private DataApiPrincipal getUserFromToken(Context ctx) throws CwmsAuthException Optional principal = dao.getPrincipalFromPrincipal(oidcPrincipal); if (principal.isPresent()) { DataApiPrincipal dataApiPrincipal = principal.get(); - BatchJobContext.prepareContext(ctx, dataApiPrincipal); + BatchJobContext.prepareContext(ctx, dataApiPrincipal, claims); return dataApiPrincipal; } else if (CREATE_USERS) { final String preferredUserName = claims.get(PREFERRED_USERNAME_CLAIM, String.class); - if (BatchJobContext.isBatchMachineUser(preferredUserName)) { + if (BatchJobContext.isBatchMachinePrincipal(preferredUserName, claims)) { throw new CwmsAuthException("Batch machine principal is not registered", HttpServletResponse.SC_UNAUTHORIZED); } final String givenName = claims.get(GIVEN_NAME_CLAIM, String.class); final String email = claims.get(EMAIL_CLAIM, String.class); DataApiPrincipal 
dataApiPrincipal = dao.createUser(preferredUserName, oidcPrincipal, givenName, email); - BatchJobContext.prepareContext(ctx, dataApiPrincipal); + BatchJobContext.prepareContext(ctx, dataApiPrincipal, claims); return dataApiPrincipal; } else { throw new CwmsAuthException("Not Authorized",HttpServletResponse.SC_UNAUTHORIZED); diff --git a/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java b/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java index e28cfd1ddf..a873c8a8a1 100644 --- a/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java +++ b/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java @@ -13,6 +13,7 @@ import cwms.cda.ApiServlet; import cwms.cda.datasource.ConnectionPreparingDataSource; import io.javalin.http.Context; +import io.jsonwebtoken.Claims; import io.jsonwebtoken.Jwts; import io.jsonwebtoken.SignatureAlgorithm; import io.jsonwebtoken.security.Keys; @@ -74,6 +75,56 @@ void prepareContextDoesNotRequireTokenForNormalUsers() throws CwmsAuthException assertNull(ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR)); } + @Test + void machineAuthClaimSetsRunContextWithoutSignedHeader() throws CwmsAuthException { + Context ctx = contextWithHeaders(Map.of()); + Claims claims = Jwts.claims(Map.of( + BatchJobContext.MACHINE_AUTH_CLAIM, true, + BatchJobContext.RUN_AS_OFFICE_CLAIM, "swt" + )); + + BatchJobContext.prepareContext(ctx, machinePrincipal(), claims); + + assertEquals("SWT", ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR)); + } + + @Test + void machineAuthStringClaimSetsRunContextWithoutSignedHeader() throws CwmsAuthException { + Context ctx = contextWithHeaders(Map.of()); + Claims claims = Jwts.claims(Map.of( + BatchJobContext.MACHINE_AUTH_CLAIM, "true", + BatchJobContext.RUN_AS_OFFICE_CLAIM, "spk" + )); + + BatchJobContext.prepareContext(ctx, machinePrincipal(), claims); + + assertEquals("SPK", ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR)); + } + + @Test + void 
machineAuthClaimRequiresRunAsOffice() { + Context ctx = contextWithHeaders(Map.of()); + Claims claims = Jwts.claims(Map.of(BatchJobContext.MACHINE_AUTH_CLAIM, true)); + + CwmsAuthException ex = assertThrows(CwmsAuthException.class, + () -> BatchJobContext.prepareContext(ctx, machinePrincipal(), claims)); + + assertEquals(HttpServletResponse.SC_UNAUTHORIZED, ex.getAuthFailCode()); + assertEquals("Batch job context missing run_as_office", ex.getMessage()); + } + + @Test + void batchMachinePrincipalUsesMachineAuthClaimOrConfiguredUser() { + Claims claims = Jwts.claims(Map.of(BatchJobContext.MACHINE_AUTH_CLAIM, true)); + + assertTrue(BatchJobContext.isBatchMachinePrincipal("normal-user", claims)); + assertFalse(BatchJobContext.isBatchMachinePrincipal("normal-user", Jwts.claims())); + + System.setProperty(BatchJobContext.MACHINE_USERS_PROPERTY, MACHINE_USER); + + assertTrue(BatchJobContext.isBatchMachinePrincipal(MACHINE_USER, Jwts.claims())); + } + @Test void validTokenSetsRunContextAttribute() throws CwmsAuthException { configureBatchContext(); diff --git a/docs/source/decisions/0009-batch-machine-run-context.md b/docs/source/decisions/0009-batch-machine-run-context.md index 707d1e05df..cbb64db3fb 100644 --- a/docs/source/decisions/0009-batch-machine-run-context.md +++ b/docs/source/decisions/0009-batch-machine-run-context.md @@ -1,4 +1,4 @@ -# Signed Batch Machine Run Context +# Batch Machine Run Context | Status | Proposed | | :------------- | :---------------------- | @@ -12,7 +12,7 @@ Provide CWMS Data API with a trusted batch run context for jobs that execute through a shared machine identity. -Batch runtimes will authenticate to CDA with a service account (via Keycloak). Each job will also provide trusted launch context, including the office for which the scheduler or API approved the run. The preferred long-term shape is for Keycloak to mint that context into the access token when it can safely receive per-job values. 
The signed dispatcher context described here is the fallback proposal when Keycloak cannot provide dynamic job context without a custom extension. +Batch runtimes will authenticate to CDA with a service account (via Keycloak). Each job will also provide trusted launch context, including the office for which the scheduler or API approved the run. CDA will use Keycloak-minted access-token claims when Keycloak can safely receive per-job values. CDA will also support a signed dispatcher context header as a fallback when Keycloak cannot provide dynamic job context without a custom extension. The signed context is **not** a replacement for normal CDA or database authorization. It establishes **who** launched the machine runtime and why. CDA and the CWMS database remain responsible for deciding whether the machine principal may read or write the requested resource office. The machine principal must already be registered in CDA and the CWMS database; CDA must not auto-create batch machine users. 
@@ -57,18 +57,29 @@ sequenceDiagram Caller->>Dispatcher: Request job for office and script Dispatcher->>Dispatcher: Authorize request and create job record - Dispatcher->>Dispatcher: Sign short-lived run context JWT + Dispatcher->>Dispatcher: Authorize run context Dispatcher->>Runner: Start runtime with job id and brokered env - Runner->>Keycloak: Request machine token with client_credentials - Keycloak-->>Runner: Machine bearer token - Runner->>CDA: Request with bearer token and X-CWMS-Job-Context - CDA->>CDA: Validate machine principal and signed run context + Runner->>Keycloak: Request machine token with run context, when supported + Keycloak-->>Runner: Machine bearer token with machine_auth and run_as_office + Runner->>CDA: Request with bearer token + CDA->>CDA: Validate machine principal and token run context CDA->>DB: Execute normal CDA/CWMS authorization DB-->>CDA: Authorized result or denial CDA-->>Runner: CDA response ``` -### Run Context Token +### Keycloak-Minted Run Context + +When available, Keycloak mints the batch run context into the normal access token. CDA validates that token through the existing OIDC flow and reads these claims: + +| Claim | Description | +| --------------- | -------------------------------------------- | +| `machine_auth` | Marks the access token as a batch machine run | +| `run_as_office` | Office context authorized for the job launch | + +The `run_as_office` claim represents the authorized launch context. It is not the same as a resource office on a CDA endpoint. + +### Signed Run Context Fallback The dispatcher signs a short-lived JWT from the authoritative job record. The runner sends the token to CDA in the `X-CWMS-Job-Context` header. @@ -86,17 +97,16 @@ The token contains: | `requested_by` | User or system that requested the job, when available | | `dispatch_source` | Source such as `airflow` or `api` | -The `run_as_office` claim represents the authorized launch context. 
It is not the same as a resource office on a CDA endpoint. - ### CDA Behavior -CDA validates signed run context only for configured batch machine users. +CDA validates batch run context for machine-authenticated requests. -For those users, CDA will: +For those requests, CDA will: -- Require `X-CWMS-Job-Context`. +- Prefer validated OIDC claims from the access token. +- Fall back to `X-CWMS-Job-Context` for configured batch machine users. - Validate signature, issuer, audience, and expiration. -- Read only the run office needed to establish session context from signed claims. +- Read only the run office needed to establish session context from trusted run context claims. - Reject missing, expired, forged, or wrong-audience tokens. - Make additional job context available to logging only when a logging-specific mechanism exists. @@ -104,7 +114,7 @@ CDA will not: - Treat request `office`, `office-id`, or body office fields as caller authority. - Reject a request solely because the target resource office differs from `run_as_office`. -- Use signed run context to bypass route roles or database office roles. +- Use batch run context to bypass route roles or database office roles. - Expose job identifiers or requester metadata as general request attributes for downstream controllers. Normal CDA route authorization and CWMS database permissions determine whether the machine user can act on the requested resource office. @@ -122,6 +132,8 @@ The Java API is configured with system properties or environment variables: The signing secret belongs in a managed secret store. A later hardening step should use asymmetric signing or KMS-backed verification so CDA can verify run context without sharing the signing key. +When Keycloak mints the batch run context directly, CDA does not need the signing secret for those requests. The machine principal must still be registered in CDA and the CWMS database. 
+ ## Alternatives Considered ### Per-Office Keycloak Service Accounts @@ -168,7 +180,7 @@ Have Keycloak receive trusted per-job context during token minting and include t Existing user API key and user OIDC flows are unchanged. -Non-machine users do not need `X-CWMS-Job-Context`. Configured batch machine users must provide a valid signed run context. +Non-machine users do not need batch run context. Configured batch machine users must provide either Keycloak-minted machine run claims or a valid signed run context. Endpoint resource-office semantics are unchanged. Controllers and DAOs may continue to use request office values to retrieve or store CWMS resources. The database remains the source of truth for whether the active CDA user has roles for those resources. @@ -176,19 +188,21 @@ Endpoint resource-office semantics are unchanged. Controllers and DAOs may conti ### Proposed +- Accept Keycloak-minted `machine_auth` and `run_as_office` claims from validated OIDC access tokens. - Add CDA validation for `X-CWMS-Job-Context` on configured batch machine users. -- Preserve signed run context separately from request resource office. +- Preserve batch run context separately from request resource office. - Use run context only for session behavior in CDA; reserve job/requester metadata for future logging. - Add dispatcher-side signing in the batch events service. - Add runner support for forwarding `X-CWMS-Job-Context` with CDA requests. -- Investigate Keycloak-minted dynamic job context claims before production adoption. +- Use the signed dispatcher context only when Keycloak cannot mint dynamic job context claims safely. ## Criteria ### Functional Requirements - A batch job launched through Airflow or the ad hoc API can call CDA using the shared machine Keycloak service account. -- CDA rejects configured machine-user requests that omit or forge signed run context. +- CDA accepts registered machine-user requests with valid Keycloak-minted machine run claims. 
+- CDA rejects configured machine-user requests that omit or forge run context. - CDA rejects configured batch machine principals that are not already registered in CDA/DB. - CDA records signed job/run context for audit. - Resource office access remains controlled by CDA route roles and CWMS database roles. @@ -196,10 +210,10 @@ Endpoint resource-office semantics are unchanged. Controllers and DAOs may conti ### Test Scenarios -- **Direct API path**: An authorized user submits an SWT job through the batch events API; CDA accepts the valid signed run context. -- **Airflow path**: Airflow triggers the same job path; CDA receives and validates the signed run context. -- **Cross-office allowed**: Signed run context is SWT, target resource office is MVS or SPK, and the request succeeds when the machine user has the needed DB roles. -- **Cross-office denied**: Signed run context is SWT, target resource office is unauthorized for the machine user, and CDA/DB returns `403`. +- **Direct API path**: An authorized user submits an SWT job through the batch events API; CDA accepts valid machine run context. +- **Airflow path**: Airflow triggers the same job path; CDA receives and validates machine run context. +- **Cross-office allowed**: Batch run context is SWT, target resource office is MVS or SPK, and the request succeeds when the machine user has the needed DB roles. +- **Cross-office denied**: Batch run context is SWT, target resource office is unauthorized for the machine user, and CDA/DB returns `403`. - **Forgery denied**: A script changes `OFFICE` or request `office` without a valid signed context; CDA rejects the request. 
## Conclusion From 6a758569b2faab0a37b586fc9a24354f65fc6e3f Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Thu, 25 Jun 2026 22:29:02 -0500 Subject: [PATCH 07/15] Add local Keycloak batch service accounts --- compose_files/keycloak/realm.json | 168 ++++++++++++++++++ compose_files/sql/users.sql | 34 ++++ .../cda/api/auth/OpenIdConnectTestIT.java | 103 ++++++++++- .../test/java/fixtures/KeyCloakExtension.java | 38 ++++ 4 files changed, 335 insertions(+), 8 deletions(-) diff --git a/compose_files/keycloak/realm.json b/compose_files/keycloak/realm.json index 9a0851a223..581600f863 100644 --- a/compose_files/keycloak/realm.json +++ b/compose_files/keycloak/realm.json @@ -728,6 +728,154 @@ "microprofile-jwt" ] }, + { + "id": "76dbdfad-5201-4a9d-bac4-ee9b89a794f1", + "clientId": "cwms-batch-runner-swt", + "name": "CWMS Batch Runner SWT", + "description": "Local test service account for SWT batch jobs", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": true, + "clientAuthenticatorType": "client-secret", + "secret": "local-cwms-batch-runner-swt-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": true, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "client_credentials.use_refresh_token": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "74f0d829-2921-4df2-93d4-468d6cd77d38", + "name": "machine_auth", + "protocol": "openid-connect", + "protocolMapper": "oidc-hardcoded-claim-mapper", + "consentRequired": false, + "config": { + "access.token.claim": "true", + "claim.name": "machine_auth", + "claim.value": "true", + "id.token.claim": "false", + "jsonType.label": "boolean", + 
"userinfo.token.claim": "false" + } + }, + { + "id": "422385db-9fd4-4c8c-b1c7-8d12ce41418c", + "name": "run_as_office", + "protocol": "openid-connect", + "protocolMapper": "oidc-hardcoded-claim-mapper", + "consentRequired": false, + "config": { + "access.token.claim": "true", + "claim.name": "run_as_office", + "claim.value": "SWT", + "id.token.claim": "false", + "jsonType.label": "String", + "userinfo.token.claim": "false" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "roles", + "profile", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "7bed3c57-6b95-45a7-b94f-1d85103776e2", + "clientId": "cwms-batch-runner-spk", + "name": "CWMS Batch Runner SPK", + "description": "Local test service account for SPK batch jobs", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": true, + "clientAuthenticatorType": "client-secret", + "secret": "local-cwms-batch-runner-spk-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": true, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "client_credentials.use_refresh_token": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "019637fa-9fe7-4ee1-b372-5d20cb8b4a05", + "name": "machine_auth", + "protocol": "openid-connect", + "protocolMapper": "oidc-hardcoded-claim-mapper", + "consentRequired": false, + "config": { + "access.token.claim": "true", + "claim.name": "machine_auth", + "claim.value": "true", + "id.token.claim": "false", + "jsonType.label": "boolean", + "userinfo.token.claim": "false" + } + }, + { + "id": "f6fde48e-c253-472d-8ed1-6267b4db1227", + 
"name": "run_as_office", + "protocol": "openid-connect", + "protocolMapper": "oidc-hardcoded-claim-mapper", + "consentRequired": false, + "config": { + "access.token.claim": "true", + "claim.name": "run_as_office", + "claim.value": "SPK", + "id.token.claim": "false", + "jsonType.label": "String", + "userinfo.token.claim": "false" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "roles", + "profile", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, { "id": "f09103f0-5563-4dc0-bc70-2ff0309edae7", "clientId": "realm-management", @@ -2321,6 +2469,26 @@ "cwms_user", "new_user" ] + }, + { + "id": "a4e88497-0ffc-41d5-b0fd-cc91760e366b", + "username": "service-account-cwms-batch-runner-swt", + "enabled": true, + "emailVerified": false, + "serviceAccountClientId": "cwms-batch-runner-swt", + "realmRoles": [ + "cwms_user" + ] + }, + { + "id": "d2d6f91b-a5dd-40c3-8ee6-49a52da9892e", + "username": "service-account-cwms-batch-runner-spk", + "enabled": true, + "emailVerified": false, + "serviceAccountClientId": "cwms-batch-runner-spk", + "realmRoles": [ + "cwms_user" + ] } ] } diff --git a/compose_files/sql/users.sql b/compose_files/sql/users.sql index 9eac95cc17..40c57cb879 100644 --- a/compose_files/sql/users.sql +++ b/compose_files/sql/users.sql @@ -50,6 +50,40 @@ begin cwms_sec.add_user_to_group('m5testadmin','CWMS User Admins', 'LRL'); cwms_sec.add_user_to_group('m5testadmin','SHOW STACK TRACE', 'LRL'); + begin + insert into at_sec_cwms_users(userid, createdby, principle_name) + values( + 'SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT', + 'CWMS_20', + 'http://localhost:8081/auth/realms/cwms::a4e88497-0ffc-41d5-b0fd-cc91760e366b' + ); + exception + when dup_val_on_index then + update at_sec_cwms_users + set principle_name = 'http://localhost:8081/auth/realms/cwms::a4e88497-0ffc-41d5-b0fd-cc91760e366b' + where userid = 'SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT'; + end; + 
cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT', 'All Users', 'SWT'); + cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT', 'CWMS Users', 'SWT'); + cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT', 'TS ID Creator', 'SWT'); + + begin + insert into at_sec_cwms_users(userid, createdby, principle_name) + values( + 'SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SPK', + 'CWMS_20', + 'http://localhost:8081/auth/realms/cwms::d2d6f91b-a5dd-40c3-8ee6-49a52da9892e' + ); + exception + when dup_val_on_index then + update at_sec_cwms_users + set principle_name = 'http://localhost:8081/auth/realms/cwms::d2d6f91b-a5dd-40c3-8ee6-49a52da9892e' + where userid = 'SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SPK'; + end; + cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SPK', 'All Users', 'SPK'); + cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SPK', 'CWMS Users', 'SPK'); + cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SPK', 'TS ID Creator', 'SPK'); + end; / quit; diff --git a/cwms-data-api/src/test/java/cwms/cda/api/auth/OpenIdConnectTestIT.java b/cwms-data-api/src/test/java/cwms/cda/api/auth/OpenIdConnectTestIT.java index ffa71eb81a..1c5993c4f3 100644 --- a/cwms-data-api/src/test/java/cwms/cda/api/auth/OpenIdConnectTestIT.java +++ b/cwms-data-api/src/test/java/cwms/cda/api/auth/OpenIdConnectTestIT.java @@ -2,24 +2,31 @@ import static io.restassured.RestAssured.given; import static org.hamcrest.Matchers.*; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; -import java.util.Optional; - - -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; - import cwms.cda.api.DataApiTestIT; +import fixtures.CwmsDataApiSetupCallback; import fixtures.KeyCloakExtension; import io.javalin.http.HttpCode; import io.restassured.filter.log.LogDetail; +import java.sql.PreparedStatement; +import 
java.sql.SQLException; +import java.util.Optional; +import mil.army.usace.hec.test.database.CwmsDatabaseContainer; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; @Tag("integration") @ExtendWith(KeyCloakExtension.class) public class OpenIdConnectTestIT extends DataApiTestIT { - + private static final String SWT_BATCH_CLIENT = "cwms-batch-runner-swt"; + private static final String SWT_BATCH_CLIENT_SECRET = "local-cwms-batch-runner-swt-secret"; + private static final String SWT_BATCH_USER = "SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT"; + private static final String SPK_BATCH_CLIENT = "cwms-batch-runner-spk"; + private static final String SPK_BATCH_CLIENT_SECRET = "local-cwms-batch-runner-spk-secret"; + private static final String SPK_BATCH_USER = "SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SPK"; @Test void test_keycloak_user_is_created() { @@ -55,4 +62,84 @@ void test_keycloak_user_can_operate() { .log().ifValidationFails(LogDetail.ALL,true) .statusCode(is(HttpCode.OK.getStatus())); } + + @Test + void test_keycloak_batch_service_account_claims_require_registered_machine_principal() throws Exception { + Optional spkToken = KeyCloakExtension.tokenForClientCredentials(SPK_BATCH_CLIENT, + SPK_BATCH_CLIENT_SECRET); + assertTrue(spkToken.isPresent()); + assertBatchClaims(spkToken.get(), "SPK", SPK_BATCH_CLIENT); + + given() + .log().ifValidationFails(LogDetail.ALL, true) + .header("Authorization", "Bearer " + spkToken.get()) + .when() + .get("/user/profile") + .then() + .log().ifValidationFails(LogDetail.ALL,true) + .statusCode(is(HttpCode.UNAUTHORIZED.getStatus())); + + registerBatchMachinePrincipal(SPK_BATCH_USER, spkToken.get(), "SPK"); + + given() + .log().ifValidationFails(LogDetail.ALL, true) + .header("Authorization", "Bearer " + spkToken.get()) + .when() + .get("/user/profile") + .then() + .log().ifValidationFails(LogDetail.ALL,true) + .statusCode(is(HttpCode.OK.getStatus())) + .body("user-name", 
equalTo(SPK_BATCH_USER)) + .body("roles.SPK", hasItems("All Users", "CWMS Users", "TS ID Creator")) + .body("roles.SWT", nullValue()); + + Optional swtToken = KeyCloakExtension.tokenForClientCredentials(SWT_BATCH_CLIENT, + SWT_BATCH_CLIENT_SECRET); + assertTrue(swtToken.isPresent()); + assertBatchClaims(swtToken.get(), "SWT", SWT_BATCH_CLIENT); + registerBatchMachinePrincipal(SWT_BATCH_USER, swtToken.get(), "SWT"); + + given() + .log().ifValidationFails(LogDetail.ALL, true) + .header("Authorization", "Bearer " + swtToken.get()) + .when() + .get("/user/profile") + .then() + .log().ifValidationFails(LogDetail.ALL,true) + .statusCode(is(HttpCode.OK.getStatus())) + .body("user-name", equalTo(SWT_BATCH_USER)) + .body("roles.SWT", hasItems("All Users", "CWMS Users", "TS ID Creator")) + .body("roles.SPK", nullValue()); + } + + private static void assertBatchClaims(String token, String office, String clientId) throws Exception { + assertEquals(true, KeyCloakExtension.claims(token).get("machine_auth").asBoolean()); + assertEquals(office, KeyCloakExtension.claims(token).get("run_as_office").asText()); + assertEquals("service-account-" + clientId, + KeyCloakExtension.claims(token).get("preferred_username").asText()); + } + + private static void registerBatchMachinePrincipal(String userName, String token, String office) throws Exception { + addNewUser(userName); + updateOidcPrincipal(userName, token); + addUserToGroup(userName, "CWMS Users", office); + addUserToGroup(userName, "All Users", office); + addUserToGroup(userName, "TS ID Creator", office); + } + + private static void updateOidcPrincipal(String userName, String token) throws Exception { + String oidcPrincipal = KeyCloakExtension.getIssuer() + "::" + + KeyCloakExtension.claims(token).get("sub").asText(); + CwmsDatabaseContainer db = CwmsDataApiSetupCallback.getDatabaseLink(); + db.connection((c) -> { + try (PreparedStatement stmt = c.prepareStatement( + "update AT_SEC_CWMS_USERS set principle_name = ? 
where userid = upper(?)")) { + stmt.setString(1, oidcPrincipal); + stmt.setString(2, userName); + stmt.executeUpdate(); + } catch (SQLException ex) { + throw new RuntimeException("Unable to update OIDC principal for: " + userName, ex); + } + }, "cwms_20"); + } } diff --git a/cwms-data-api/src/test/java/fixtures/KeyCloakExtension.java b/cwms-data-api/src/test/java/fixtures/KeyCloakExtension.java index 70a0bd2232..0bf09d2710 100644 --- a/cwms-data-api/src/test/java/fixtures/KeyCloakExtension.java +++ b/cwms-data-api/src/test/java/fixtures/KeyCloakExtension.java @@ -10,6 +10,7 @@ import java.net.URL; import java.nio.charset.Charset; import java.time.Duration; +import java.util.Base64; import java.util.HashMap; import java.util.Map; import java.util.Optional; @@ -162,4 +163,41 @@ public static Optional tokenForUser(String username, String password) { return Optional.empty(); } } + + /** + * Retrieve an access token for a confidential client service account. + * This is needed to verify CDA's machine-to-machine path with the same + * client_credentials flow a batch runner uses. Password-grant user tokens + * do not exercise Keycloak service-account subjects or built-in client + * mappers for machine_auth and run_as_office claims. 
+ * @param clientId Keycloak client ID + * @param clientSecret Keycloak client secret + * @return Access token only, or empty when the request fails or the response carries no textual access_token + */ + public static Optional tokenForClientCredentials(String clientId, String clientSecret) { + try { + Response response = + given() + .log().ifValidationFails(LogDetail.ALL,true) + .contentType(ContentType.URLENC) + .formParam("client_id", clientId) + .formParam("grant_type", "client_credentials") + .formParam("client_secret", clientSecret) + .formParam("scope", "openid profile email") + .when() + .post(new URL(getTokenUrl())); + + logger.atFine().log(response.asPrettyString()); + JsonNode tokenInfo = mapper.readTree(response.asString()); + return Optional.ofNullable(tokenInfo).map(info -> info.get("access_token")).filter(JsonNode::isTextual).map(JsonNode::asText); + } catch (JsonProcessingException | MalformedURLException ex) { + logger.atWarning().withCause(ex).log("Unable to retrieve token for client %s", clientId); + return Optional.empty(); + } + } + + public static JsonNode claims(String token) throws IOException { + String[] parts = token.split("\\."); + return mapper.readTree(Base64.getUrlDecoder().decode(parts[1])); + } } From f8a69f8705290d390e0152de9bb74e244e2d630a Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Thu, 25 Jun 2026 22:40:48 -0500 Subject: [PATCH 08/15] Add local cwms audience mapper --- compose_files/keycloak/realm.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/compose_files/keycloak/realm.json b/compose_files/keycloak/realm.json index 581600f863..4b87c4263e 100644 --- a/compose_files/keycloak/realm.json +++ b/compose_files/keycloak/realm.json @@ -713,6 +713,20 @@ "authenticationFlowBindingOverrides": {}, "fullScopeAllowed": true, "nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "5edb86b8-82c7-4d9b-9f58-7a8febf50a3d", + "name": "cwms audience", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-mapper", + "consentRequired": false, + "config": { + "included.client.audience": "cwms", + "id.token.claim": "false", + 
"access.token.claim": "true" + } + } + ], "defaultClientScopes": [ "web-origins", "acr", From 8a1fbf351def928caffad903bfa505a15880febb Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Fri, 26 Jun 2026 06:50:25 -0500 Subject: [PATCH 09/15] Add local SWT Airflow service account --- compose_files/keycloak/realm.json | 66 +++++++++++++++++++++++++++++++ compose_files/sql/users.sql | 16 ++++++++ 2 files changed, 82 insertions(+) diff --git a/compose_files/keycloak/realm.json b/compose_files/keycloak/realm.json index 4b87c4263e..d57cdb20ac 100644 --- a/compose_files/keycloak/realm.json +++ b/compose_files/keycloak/realm.json @@ -816,6 +816,62 @@ "microprofile-jwt" ] }, + { + "id": "9620b229-ea0e-48f9-9f2f-cb601f2d6ee1", + "clientId": "cwms-batch-airflow-swt", + "name": "CWMS Batch Airflow SWT", + "description": "Local test service account for SWT scheduled batch job triggers", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": true, + "clientAuthenticatorType": "client-secret", + "secret": "local-cwms-batch-airflow-swt-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": true, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "client_credentials.use_refresh_token": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "e8865c36-fb89-45cb-8b81-a20562c5a154", + "name": "cwms audience", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-mapper", + "consentRequired": false, + "config": { + "included.client.audience": "cwms", + "id.token.claim": "false", + "access.token.claim": "true" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "roles", + 
"profile", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, { "id": "7bed3c57-6b95-45a7-b94f-1d85103776e2", "clientId": "cwms-batch-runner-spk", @@ -2494,6 +2550,16 @@ "cwms_user" ] }, + { + "id": "b70c2e60-ce11-42c7-8271-10bb2b3fd4bd", + "username": "service-account-cwms-batch-airflow-swt", + "enabled": true, + "emailVerified": false, + "serviceAccountClientId": "cwms-batch-airflow-swt", + "realmRoles": [ + "cwms_user" + ] + }, { "id": "d2d6f91b-a5dd-40c3-8ee6-49a52da9892e", "username": "service-account-cwms-batch-runner-spk", diff --git a/compose_files/sql/users.sql b/compose_files/sql/users.sql index 40c57cb879..8bb9649fb2 100644 --- a/compose_files/sql/users.sql +++ b/compose_files/sql/users.sql @@ -67,6 +67,22 @@ begin cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT', 'CWMS Users', 'SWT'); cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT', 'TS ID Creator', 'SWT'); + begin + insert into at_sec_cwms_users(userid, createdby, principle_name) + values( + 'SERVICE-ACCOUNT-CWMS-BATCH-AIRFLOW-SWT', + 'CWMS_20', + 'http://localhost:8081/auth/realms/cwms::b70c2e60-ce11-42c7-8271-10bb2b3fd4bd' + ); + exception + when dup_val_on_index then + update at_sec_cwms_users + set principle_name = 'http://localhost:8081/auth/realms/cwms::b70c2e60-ce11-42c7-8271-10bb2b3fd4bd' + where userid = 'SERVICE-ACCOUNT-CWMS-BATCH-AIRFLOW-SWT'; + end; + cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-AIRFLOW-SWT', 'All Users', 'SWT'); + cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-AIRFLOW-SWT', 'CWMS Users', 'SWT'); + begin insert into at_sec_cwms_users(userid, createdby, principle_name) values( From 7b2ad6840e401735455bb00dbbc47bc260914d5b Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Fri, 26 Jun 2026 07:11:35 -0500 Subject: [PATCH 10/15] Add scheduled M2M flow diagram to ADR --- .../0010-batch-machine-run-context.md | 64 +++++++++++++++++++ 1 file changed, 64 
insertions(+) diff --git a/docs/source/decisions/0010-batch-machine-run-context.md b/docs/source/decisions/0010-batch-machine-run-context.md index eff35836c0..b51f015096 100644 --- a/docs/source/decisions/0010-batch-machine-run-context.md +++ b/docs/source/decisions/0010-batch-machine-run-context.md @@ -68,6 +68,70 @@ sequenceDiagram CDA-->>Runner: CDA response ``` +### Current Scheduled M2M Flow + +```mermaid +flowchart LR + subgraph Airflow["Airflow scheduler"] + DAG["SWT hourly DAG
cwms_batch_events_swt_hourly"] + SchedulerClient["Scheduler client
cwms-batch-airflow-swt"] + end + + subgraph Identity["Keycloak"] + Keycloak["Token endpoint"] + SchedulerToken["Scheduler access token
azp=cwms-batch-airflow-swt"] + RunnerToken["Runner access token
machine_auth=true
run_as_office=SWT"] + end + + subgraph BatchEvents["Batch Events"] + API["POST /jobs"] + RoleCheck["Validate JWT
allow listed azp
CDA profile roles"] + JobDB[("Jobs and scripts DB
SWT hourly
bin/hourly.sh
runtime=shell")] + Queue["SQS or ElasticMQ"] + Dispatcher["Dispatcher
Lambda or local loop"] + Broker["Runtime env broker
/internal/jobs/{job}/runtime-env"] + end + + subgraph Runtime["Shared runtime"] + Batch["AWS Batch or local Docker
shared shell runner job definition"] + Runner["Runner container
cwbi-wm-images"] + Script["SWT office script
bin/hourly.sh
uses CDA_BEARER_TOKEN"] + end + + subgraph Secrets["Office secret store"] + SecretStore[("cwms-batch-jobs-swd-secrets
SWT_CDA_CLIENT_ID
SWT_CDA_CLIENT_SECRET")] + end + + subgraph CDA["CWMS Data API"] + Profile["/user/profile
scheduler service account has SWT roles"] + CdaAuth["OIDC validation
registered machine principal
normal CDA and DB auth"] + DbAuth[("CWMS database roles")] + end + + DAG --> SchedulerClient + SchedulerClient -->|"client_credentials"| Keycloak + Keycloak --> SchedulerToken + SchedulerToken -->|"Bearer token"| API + API --> RoleCheck + RoleCheck -->|"resolve caller roles"| Profile + RoleCheck -->|"create authorized job"| JobDB + JobDB --> Queue + Queue --> Dispatcher + Dispatcher -->|"submit runtime job"| Batch + Batch --> Runner + Runner -->|"job id + runtime token"| Broker + Broker --> SecretStore + Broker -->|"CDA client creds + script env"| Runner + Runner -->|"client_credentials"| Keycloak + Keycloak --> RunnerToken + RunnerToken --> Runner + Runner --> Script + Script -->|"CDA_BEARER_TOKEN"| CdaAuth + CdaAuth --> DbAuth +``` + +The scheduler identity and runner identity are intentionally separate. Airflow's office-specific service account is authorized to request a job. The runner's office-specific service account is the machine principal used when the job calls CDA. + ### Keycloak-Minted Run Context When available, Keycloak mints the batch run context into the normal access token. 
CDA validates that token through the existing OIDC flow and reads these claims: From 5eb849efe5c8d7540b4090cd2138cf796f52c5b2 Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Fri, 26 Jun 2026 20:56:17 -0500 Subject: [PATCH 11/15] Add local Batch Events admin user --- compose_files/keycloak/realm.json | 16 ++++++++++++++++ compose_files/sql/users.sql | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/compose_files/keycloak/realm.json b/compose_files/keycloak/realm.json index d57cdb20ac..09fd8bcc85 100644 --- a/compose_files/keycloak/realm.json +++ b/compose_files/keycloak/realm.json @@ -2540,6 +2540,22 @@ "new_user" ] }, + { + "id": "7e6d24c7-3a13-4054-a4e4-d37f4f2ac850", + "username": "m5batcheventadmin", + "enabled": true, + "email": "noreply-batch-admin@data.test", + "emailVerified": true, + "credentials": [ + { + "type": "password", + "value": "m5batcheventadmin" + } + ], + "realmRoles": [ + "cwms_user" + ] + }, { "id": "a4e88497-0ffc-41d5-b0fd-cc91760e366b", "username": "service-account-cwms-batch-runner-swt", diff --git a/compose_files/sql/users.sql b/compose_files/sql/users.sql index 8bb9649fb2..2eab752638 100644 --- a/compose_files/sql/users.sql +++ b/compose_files/sql/users.sql @@ -50,6 +50,23 @@ begin cwms_sec.add_user_to_group('m5testadmin','CWMS User Admins', 'LRL'); cwms_sec.add_user_to_group('m5testadmin','SHOW STACK TRACE', 'LRL'); + begin + insert into at_sec_cwms_users(userid, createdby, principle_name) + values( + 'M5BATCHEVENTADMIN', + 'CWMS_20', + 'http://localhost:8081/auth/realms/cwms::7e6d24c7-3a13-4054-a4e4-d37f4f2ac850' + ); + exception + when dup_val_on_index then + update at_sec_cwms_users + set principle_name = 'http://localhost:8081/auth/realms/cwms::7e6d24c7-3a13-4054-a4e4-d37f4f2ac850' + where userid = 'M5BATCHEVENTADMIN'; + end; + cwms_sec.add_user_to_group('M5BATCHEVENTADMIN', 'All Users', 'SWT'); + cwms_sec.add_user_to_group('M5BATCHEVENTADMIN', 'CWMS Users', 'SWT'); + 
cwms_sec.add_user_to_group('M5BATCHEVENTADMIN', 'Data Acquisition Mgr', 'SWT'); + begin insert into at_sec_cwms_users(userid, createdby, principle_name) values( From 1fb7f6727243b07ec1c8df856e86871ce4a30589 Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Fri, 26 Jun 2026 21:32:18 -0500 Subject: [PATCH 12/15] Align local Batch Events admin principal --- compose_files/keycloak/realm.json | 2 +- compose_files/sql/users.sql | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/compose_files/keycloak/realm.json b/compose_files/keycloak/realm.json index 09fd8bcc85..6d774373f1 100644 --- a/compose_files/keycloak/realm.json +++ b/compose_files/keycloak/realm.json @@ -2541,7 +2541,7 @@ ] }, { - "id": "7e6d24c7-3a13-4054-a4e4-d37f4f2ac850", + "id": "6049479d-e783-4c7f-9024-57a7d29649f7", "username": "m5batcheventadmin", "enabled": true, "email": "noreply-batch-admin@data.test", diff --git a/compose_files/sql/users.sql b/compose_files/sql/users.sql index 2eab752638..4fa84cefd5 100644 --- a/compose_files/sql/users.sql +++ b/compose_files/sql/users.sql @@ -55,12 +55,12 @@ begin values( 'M5BATCHEVENTADMIN', 'CWMS_20', - 'http://localhost:8081/auth/realms/cwms::7e6d24c7-3a13-4054-a4e4-d37f4f2ac850' + 'http://localhost:8081/auth/realms/cwms::6049479d-e783-4c7f-9024-57a7d29649f7' ); exception when dup_val_on_index then update at_sec_cwms_users - set principle_name = 'http://localhost:8081/auth/realms/cwms::7e6d24c7-3a13-4054-a4e4-d37f4f2ac850' + set principle_name = 'http://localhost:8081/auth/realms/cwms::6049479d-e783-4c7f-9024-57a7d29649f7' where userid = 'M5BATCHEVENTADMIN'; end; cwms_sec.add_user_to_group('M5BATCHEVENTADMIN', 'All Users', 'SWT'); From 362e8bfde67de2b029bf367fb026ecd34432aa45 Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Fri, 26 Jun 2026 22:25:46 -0500 Subject: [PATCH 13/15] Align batch M2M ADR with office service accounts --- .../0010-batch-machine-run-context.md | 49 ++++++++++--------- 1 file changed, 27 
insertions(+), 22 deletions(-) diff --git a/docs/source/decisions/0010-batch-machine-run-context.md b/docs/source/decisions/0010-batch-machine-run-context.md index b51f015096..cd6e2b4de4 100644 --- a/docs/source/decisions/0010-batch-machine-run-context.md +++ b/docs/source/decisions/0010-batch-machine-run-context.md @@ -10,15 +10,15 @@ ## Objective -Provide CWMS Data API with a trusted batch run context for jobs that execute through a shared machine identity. +Provide CWMS Data API with a trusted batch run context for jobs that execute through shared Batch runtime infrastructure. -Batch runtimes will authenticate to CDA with a service account (via Keycloak). Each job will also provide trusted launch context, including the office for which the scheduler or API approved the run. CDA will use Keycloak-minted access-token claims when Keycloak can safely receive per-job values. CDA will also support a signed dispatcher context header as a fallback when Keycloak cannot provide dynamic job context without a custom extension. +Batch runtimes will authenticate to CDA with office-scoped service accounts through Keycloak. Each job will provide trusted launch context, including the office for which the scheduler or API approved the run. CDA will use Keycloak-minted access-token claims when Keycloak can mint that context into the normal access token. CDA will also support a signed dispatcher context header as a fallback when Keycloak cannot provide job context without a custom extension. The signed context is **not** a replacement for normal CDA or database authorization. It establishes **who** launched the machine runtime and why. CDA and the CWMS database remain responsible for deciding whether the machine principal may read or write the requested resource office. The machine principal must already be registered in CDA and the CWMS database; CDA must not auto-create batch machine users. 
## Motivation -A single machine service account reduces Keycloak and AWS Batch configuration requirements. It also removes the natural office-specific identity that existed when each office had its own API key or job role in AWS. CDA therefore needs trusted context from the dispatcher so scripts cannot choose their own run authority by changing an environment variable, URI parameter, or request body. +Shared AWS Batch job definitions reduce AWS Batch configuration requirements, but they must not remove the office-specific machine identity used by CDA and the CWMS database. The current preferred shape is one scheduler service account and one runner service account per office or trust boundary. CDA still needs trusted context from Keycloak or the dispatcher so scripts cannot choose their own run authority by changing an environment variable, URI parameter, or request body. This is needed because CDA request office fields describe resource ownership, not caller authority. For example, a job approved for SWT (Tulsa District) may write data owned by another office when the mapped machine user has the required database roles. The request office identifies the target data; it does not identify who the job is running as. i.e. `&office=SWT` in the URI. @@ -27,8 +27,8 @@ This is needed because CDA request office fields describe resource ownership, no ### For Batch Operators - Runtime job definitions can be managed by language or image instead of by office/image combinations. -- One machine service account can support scheduled and ad hoc batch execution. -- Office launch context is still available for audit and policy decisions. +- Office service accounts can be managed in Keycloak while AWS Batch job definitions remain shared. +- Office launch context is available for audit and policy decisions. 
### For Script Authors @@ -38,7 +38,7 @@ This is needed because CDA request office fields describe resource ownership, no ### For Security and Operations -- CDA rejects machine requests that lack a signed dispatcher issued run context. +- CDA rejects machine requests that lack trusted Keycloak-minted or dispatcher-issued run context. - Request parameters and payload fields are not trusted as caller authority. - CDA audit records can include both the machine principal and the signed job context. @@ -202,11 +202,11 @@ When Keycloak mints the batch run context directly, CDA does not need the signin ### Per-Office Keycloak Service Accounts -Create one service account per office or trust boundary. +Create one scheduler service account and one runner service account per office or trust boundary. - **Pros**: Office context is represented directly by the service account. - **Cons**: Recreates service-account and secret sprawl as offices and runtimes grow. -- **Rejected**: The design goal is one machine identity for batch runtimes. +- **Selected for current rollout**: This avoids Keycloak custom extensions while still allowing AWS Batch job definitions and runner images to be shared across offices. ### Per-Office Batch Job Definitions and API Keys @@ -214,7 +214,7 @@ Continue using separate Batch definitions and CDA API secrets per office/runtime - **Pros**: Uses the existing model. - **Cons**: Requires hard-coded expansion across offices, runtimes, images, and secrets. -- **Rejected**: The dynamic runtime model is intended to remove this duplication. +- **Rejected**: The dynamic runtime model is intended to remove this AWS Batch and API-key duplication. ### Trust Request or Environment Office @@ -226,19 +226,19 @@ Use `OFFICE`, URI parameters, query parameters, or request body fields to decide ### Signed Dispatcher-Issued Run Context -Use one machine identity and require a short-lived signed token from the trusted dispatcher. 
+Use a shared machine identity and require a short-lived signed token from the trusted dispatcher. - **Pros**: Reduces runtime duplication while preserving trusted job launch context and normal CDA/DB authorization. - **Cons**: Requires token validation and signing key management. -- **Fallback proposal**: Provides the required trust boundary without per-office machine identities if Keycloak cannot mint dynamic job context into the access token. +- **Fallback proposal**: Provides the required trust boundary if Keycloak cannot mint the required run context into the access token and per-office service accounts are not sufficient for a future use case. ### Keycloak-Minted Job Context Claims -Have Keycloak receive trusted per-job context during token minting and include that context in the normal access token. +Have Keycloak mint trusted machine context into the normal access token. - **Pros**: CDA validates one JWT from one issuer and does not need a second signing secret. -- **Cons**: Requires proof that Keycloak can safely receive dynamic per-job values such as `run_as_office` and `job_id` without a custom extension. -- **Preferred if feasible**: This will be investigated before the signed dispatcher context is adopted for production. +- **Cons**: Dynamic per-job values such as `job_id` would require more Keycloak customization than CWBI is likely to operate. +- **Preferred shape for current rollout**: Use office-scoped Keycloak service accounts that mint stable claims such as `machine_auth` and `run_as_office`; keep per-job metadata in Batch Events. ## Compatibility @@ -250,25 +250,30 @@ Endpoint resource-office semantics are unchanged. Controllers and DAOs may conti ## Implementation Status -### Proposed +### Current rollout + +- Use per-office Keycloak scheduler and runner service accounts. +- Have the runner token include `machine_auth` and `run_as_office`. +- Keep AWS Batch job definitions shared by runtime rather than by office. 
+- Keep job id, script, schedule, timeout, resource profile, env vars, and allowed secret names in the Batch Events registry. + +### CDA implementation - Accept Keycloak-minted `machine_auth` and `run_as_office` claims from validated OIDC access tokens. - Add CDA validation for `X-CWMS-Job-Context` on configured batch machine users. - Preserve batch run context separately from request resource office. - Use run context only for session behavior in CDA; reserve job/requester metadata for future logging. -- Add dispatcher-side signing in the batch events service. -- Add runner support for forwarding `X-CWMS-Job-Context` with CDA requests. -- Use the signed dispatcher context only when Keycloak cannot mint dynamic job context claims safely. +- Keep dispatcher-side signing and `X-CWMS-Job-Context` as a fallback path rather than the preferred production shape. ## Criteria ### Functional Requirements -- A batch job launched through Airflow or the ad hoc API can call CDA using the shared machine Keycloak service account. +- A batch job launched through Airflow or the ad hoc API can call CDA using the registered office runner Keycloak service account. - CDA accepts registered machine-user requests with valid Keycloak-minted machine run claims. - CDA rejects configured machine-user requests that omit or forge run context. - CDA rejects configured batch machine principals that are not already registered in CDA/DB. -- CDA records signed job/run context for audit. +- CDA can expose validated run context for audit without trusting script-controlled office values. - Resource office access remains controlled by CDA route roles and CWMS database roles. - A job with `run_as_office=SWT` can act on another office's resource data only when the mapped machine user has the required roles for that resource office. @@ -282,6 +287,6 @@ Endpoint resource-office semantics are unchanged. 
Controllers and DAOs may conti ## Conclusion -Signed batch run context allows CDA to support dynamic batch runtimes with one Keycloak service account while preserving the existing CWMS authorization model. +Trusted batch run context allows CDA to support dynamic shared Batch runtimes while preserving the existing CWMS authorization model. -The token represents trusted job launch context. It does not redefine resource office semantics and does not bypass normal CDA or database authorization for the data being read or written. +The Keycloak token claims or signed fallback token represent trusted job launch context. They do not redefine resource office semantics and do not bypass normal CDA or database authorization for the data being read or written. From a88f35eb39f315a13c694e5f1a1a9501b570f772 Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Sat, 27 Jun 2026 15:21:12 -0500 Subject: [PATCH 14/15] Update batch M2M ADR diagrams --- .../0010-batch-machine-run-context.md | 127 ++++++------------ .../batch-airflow-scheduler-flow.drawio | 29 ++++ .../diagrams/batch-airflow-scheduler-flow.svg | 86 ++++++++++++ .../diagrams/batch-m2m-overview.drawio | 23 ++++ .../decisions/diagrams/batch-m2m-overview.svg | 105 +++++++++++++++ .../diagrams/batch-ui-job-flow.drawio | 27 ++++ .../decisions/diagrams/batch-ui-job-flow.svg | 85 ++++++++++++ 7 files changed, 399 insertions(+), 83 deletions(-) create mode 100644 docs/source/decisions/diagrams/batch-airflow-scheduler-flow.drawio create mode 100644 docs/source/decisions/diagrams/batch-airflow-scheduler-flow.svg create mode 100644 docs/source/decisions/diagrams/batch-m2m-overview.drawio create mode 100644 docs/source/decisions/diagrams/batch-m2m-overview.svg create mode 100644 docs/source/decisions/diagrams/batch-ui-job-flow.drawio create mode 100644 docs/source/decisions/diagrams/batch-ui-job-flow.svg diff --git a/docs/source/decisions/0010-batch-machine-run-context.md 
b/docs/source/decisions/0010-batch-machine-run-context.md index cd6e2b4de4..7326a80efa 100644 --- a/docs/source/decisions/0010-batch-machine-run-context.md +++ b/docs/source/decisions/0010-batch-machine-run-context.md @@ -46,92 +46,48 @@ This is needed because CDA request office fields describe resource ownership, no ### Batch Run Flow -```mermaid -sequenceDiagram - participant Caller as Airflow or Batch Events API - participant Dispatcher as Batch Dispatcher - participant Runner as Batch Runtime - participant Keycloak as Keycloak - participant CDA as CWMS Data API - participant DB as CWMS Database - - Caller->>Dispatcher: Request job for office and script - Dispatcher->>Dispatcher: Authorize request and create job record - Dispatcher->>Dispatcher: Authorize run context - Dispatcher->>Runner: Start runtime with job id and brokered env - Runner->>Keycloak: Request machine token with run context, when supported - Keycloak-->>Runner: Machine bearer token with machine_auth and run_as_office - Runner->>CDA: Request with bearer token - CDA->>CDA: Validate machine principal and token run context - CDA->>DB: Execute normal CDA/CWMS authorization - DB-->>CDA: Authorized result or denial - CDA-->>Runner: CDA response -``` - -### Current Scheduled M2M Flow - -```mermaid -flowchart LR - subgraph Airflow["Airflow scheduler"] - DAG["SWT hourly DAG
cwms_batch_events_swt_hourly"] - SchedulerClient["Scheduler client
cwms-batch-airflow-swt"] - end - - subgraph Identity["Keycloak"] - Keycloak["Token endpoint"] - SchedulerToken["Scheduler access token
azp=cwms-batch-airflow-swt"] - RunnerToken["Runner access token
machine_auth=true
run_as_office=SWT"] - end - - subgraph BatchEvents["Batch Events"] - API["POST /jobs"] - RoleCheck["Validate JWT
allow listed azp
CDA profile roles"] - JobDB[("Jobs and scripts DB
SWT hourly
bin/hourly.sh
runtime=shell")] - Queue["SQS or ElasticMQ"] - Dispatcher["Dispatcher
Lambda or local loop"] - Broker["Runtime env broker
/internal/jobs/{job}/runtime-env"] - end - - subgraph Runtime["Shared runtime"] - Batch["AWS Batch or local Docker
shared shell runner job definition"] - Runner["Runner container
cwbi-wm-images"] - Script["SWT office script
bin/hourly.sh
uses CDA_BEARER_TOKEN"] - end - - subgraph Secrets["Office secret store"] - SecretStore[("cwms-batch-jobs-swd-secrets
SWT_CDA_CLIENT_ID
SWT_CDA_CLIENT_SECRET")] - end - - subgraph CDA["CWMS Data API"] - Profile["/user/profile
scheduler service account has SWT roles"] - CdaAuth["OIDC validation
registered machine principal
normal CDA and DB auth"] - DbAuth[("CWMS database roles")] - end - - DAG --> SchedulerClient - SchedulerClient -->|"client_credentials"| Keycloak - Keycloak --> SchedulerToken - SchedulerToken -->|"Bearer token"| API - API --> RoleCheck - RoleCheck -->|"resolve caller roles"| Profile - RoleCheck -->|"create authorized job"| JobDB - JobDB --> Queue - Queue --> Dispatcher - Dispatcher -->|"submit runtime job"| Batch - Batch --> Runner - Runner -->|"job id + runtime token"| Broker - Broker --> SecretStore - Broker -->|"CDA client creds + script env"| Runner - Runner -->|"client_credentials"| Keycloak - Keycloak --> RunnerToken - RunnerToken --> Runner - Runner --> Script - Script -->|"CDA_BEARER_TOKEN"| CdaAuth - CdaAuth --> DbAuth -``` +![CWMS Batch M2M overview](diagrams/batch-m2m-overview.svg) + +Editable source: [batch-m2m-overview.drawio](diagrams/batch-m2m-overview.drawio) + +The scheduler and runner identities are intentionally separate. A user or +office scheduler can request an authorized job, but the running script calls CDA +with an office-scoped runner service account. Batch Events remains the source +of truth for script registry rows, job records, runtime env brokering, status, +and log lookup. + +### End User UI Flow + +![End user Batch Events UI flow](diagrams/batch-ui-job-flow.svg) + +Editable source: [batch-ui-job-flow.drawio](diagrams/batch-ui-job-flow.drawio) + +The Batch Events UI is now registry-oriented. A script admin chooses an office, +creates or edits a script row, selects either a GitHub file path or an inline +command, configures runtime, resource profile, timeout, schedule, roles, env +vars, and secret names, then submits jobs from that registry row. GitHub file +paths can be browsed from the configured repository checkout and show +runtime-specific file type hints, while command rows intentionally allow trusted +users to run arbitrary commands in the trusted runtime image. 
The local executor +and AWS Batch both honor the configured timeout; local timeout handling was +verified with a command that sleeps longer than its one-minute timeout. + +### Airflow Scheduled M2M Flow + +![Airflow scheduled Batch Events flow](diagrams/batch-airflow-scheduler-flow.svg) + +Editable source: [batch-airflow-scheduler-flow.drawio](diagrams/batch-airflow-scheduler-flow.drawio) The scheduler identity and runner identity are intentionally separate. Airflow's office-specific service account is authorized to request a job. The runner's office-specific service account is the machine principal used when the job calls CDA. +Airflow does not submit AWS Batch jobs directly for the registry-driven path. +Instead, the scheduled DAG lists due scripts through Batch Events, evaluates +hourly or cron schedules using each script's `scheduleTimezone`, and posts a +Batch Events job for each due row. Daylight-saving time gaps are skipped and +repeated local occurrences run once. Airflow does not wait for AWS Batch +completion; Batch Events owns dispatch, status, log lookup, and runtime broker +behavior after the job is accepted. + ### Keycloak-Minted Run Context When available, Keycloak mints the batch run context into the normal access token. CDA validates that token through the existing OIDC flow and reads these claims: @@ -256,6 +212,10 @@ Endpoint resource-office semantics are unchanged. Controllers and DAOs may conti - Have the runner token include `machine_auth` and `run_as_office`. - Keep AWS Batch job definitions shared by runtime rather than by office. - Keep job id, script, schedule, timeout, resource profile, env vars, and allowed secret names in the Batch Events registry. +- Let script admins register either a GitHub file path or a trusted runtime command. +- Let script admins choose schedule timezone; cron and hourly schedules are evaluated in that timezone by Airflow before a job is posted. 
+- Let script admins choose small, medium, or large resource profiles; AWS Batch receives resource overrides at dispatch time. +- Keep local Docker execution aligned with AWS Batch timeout behavior so local E2E can prove long-running jobs fail when they exceed the configured timeout. ### CDA implementation @@ -264,6 +224,7 @@ Endpoint resource-office semantics are unchanged. Controllers and DAOs may conti - Preserve batch run context separately from request resource office. - Use run context only for session behavior in CDA; reserve job/requester metadata for future logging. - Keep dispatcher-side signing and `X-CWMS-Job-Context` as a fallback path rather than the preferred production shape. +- Reject unregistered machine principals rather than auto-creating users when a machine token appears. ## Criteria diff --git a/docs/source/decisions/diagrams/batch-airflow-scheduler-flow.drawio b/docs/source/decisions/diagrams/batch-airflow-scheduler-flow.drawio new file mode 100644 index 0000000000..73d24e5c8b --- /dev/null +++ b/docs/source/decisions/diagrams/batch-airflow-scheduler-flow.drawio @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/decisions/diagrams/batch-airflow-scheduler-flow.svg b/docs/source/decisions/diagrams/batch-airflow-scheduler-flow.svg new file mode 100644 index 0000000000..1c590da147 --- /dev/null +++ b/docs/source/decisions/diagrams/batch-airflow-scheduler-flow.svg @@ -0,0 +1,86 @@ + + Airflow scheduled Batch Events flow + Airflow polls Batch Events scheduled scripts per office, obtains scheduler service-account tokens, triggers due scripts, and lets Batch Events own dispatch, status, logs, and CDA runner authentication. + + + + + + Airflow Scheduled Flow + One registry-driven scheduler can serve many offices without one DAG per script or one AWS Batch job definition per office. + + + 1. Every-minute DAG + Registry scheduler runs + optionally loops configured offices + + + 2. 
Scheduler token + client_credentials + cwms-batch-airflow-<office> + + + 3. List due scripts + GET /scripts/scheduled + Batch Events filters by office roles + + + 4. Timezone due check + hourly minute or cron + scheduleTimezone controls local time; + DST gaps skipped, repeats run once + + + 5. Trigger due rows + POST /jobs per script + does not wait for AWS Batch + one office failure does not block others + + + 6. Registry job record + Jobs DB + queue + runtime, resources, timeout, + env vars copied from script row + + + 7. Shared runtime + Dispatcher submits job + runtime job definition + overrides + + + 8. Runner bootstraps + Fetches brokered env + office runner client credentials + + + 9. Runner token + client_credentials + machine_auth + run_as_office + + + 10. CDA call + CDA validates token + then normal DB roles + + + + + + + + + + + diff --git a/docs/source/decisions/diagrams/batch-m2m-overview.drawio b/docs/source/decisions/diagrams/batch-m2m-overview.drawio new file mode 100644 index 0000000000..016ea300b2 --- /dev/null +++ b/docs/source/decisions/diagrams/batch-m2m-overview.drawio @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/decisions/diagrams/batch-m2m-overview.svg b/docs/source/decisions/diagrams/batch-m2m-overview.svg new file mode 100644 index 0000000000..2f8fb3827f --- /dev/null +++ b/docs/source/decisions/diagrams/batch-m2m-overview.svg @@ -0,0 +1,105 @@ + + CWMS Batch M2M overview + Airflow and users submit jobs through Batch Events. Batch Events records the job, dispatches a shared runtime, brokers environment and secrets, and the runner uses an office-scoped Keycloak service account to call CDA. + + + + + + + + CWMS Batch M2M Auth and Shared Runtime + Scheduler identity requests jobs; runner identity calls CDA. Batch Events owns registry, dispatch, runtime env broker, logs, and office-scoped authorization. 
+ + + Requesters + + Web UI user + PKCE login + + Airflow + per-office + scheduler client + + + Keycloak + + User token + roles + offices + + Scheduler + cwms-batch- + airflow-swt + + Runner token + machine_auth + run_as_office + + + Batch Events + + Registry API + scripts, schedules, + timeouts, resources + + Jobs DB + queue + office-scoped jobs + status + log lookup + + Dispatcher + shared runtime map + + Runtime broker + job env + secrets + + + Runtime + + AWS Batch + one job def per + runtime + + Runner image + executes script + + + CDA + Data + + Office secrets + runner client creds + + CWMS Data API + validates OIDC + registered machine + + CWMS DB roles + + + + + + + + + + + + + + + + client_credentials + diff --git a/docs/source/decisions/diagrams/batch-ui-job-flow.drawio b/docs/source/decisions/diagrams/batch-ui-job-flow.drawio new file mode 100644 index 0000000000..e4a19ba96b --- /dev/null +++ b/docs/source/decisions/diagrams/batch-ui-job-flow.drawio @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/decisions/diagrams/batch-ui-job-flow.svg b/docs/source/decisions/diagrams/batch-ui-job-flow.svg new file mode 100644 index 0000000000..38f8353bcb --- /dev/null +++ b/docs/source/decisions/diagrams/batch-ui-job-flow.svg @@ -0,0 +1,85 @@ + + End user Batch Events UI job flow + A user signs into the Batch Events UI, creates or selects a script registry row, submits a job, and then views status and logs while the shared runner executes the job and calls CDA with machine credentials. + + + + + + End User UI Flow + Manual user flow through Scripts Manager and Submit Job using the Batch Events registry. + + + 1. Sign in + User authenticates + PKCE token carries user roles + + + 2. Pick office + Scripts Manager filters + admin offices from CDA profile + + + 3. Register script + Choose GitHub path or command + runtime, resource profile, timeout, + schedule timezone, roles, env vars + + + 4. 
Optional path browse + Browse repository directories + runtime type hints are case-insensitive; + browse can select any file + + + 5. Save registry row + Batch Events validates + reserved env names, schedule shape, + timezone, timeout, runtime + + + 6. Submit job + User selects script + Batch Events checks required roles + + + 7. Create job + Jobs DB row + queue + office, script, timeout, resources + + + 8. Dispatch runtime + Shared runner job definition + resource overrides at submit time + + + 9. Execute script + Runner fetches brokered env + gets CDA bearer with runner client + + + 10. Status and logs + UI shows job result + office-scoped lookup and logs + + + + + + + + + + + From dca48b1c4bcf06dc5c2a712fd12e30e8a83e2952 Mon Sep 17 00:00:00 2001 From: "Charles Graham, SWT" Date: Sat, 27 Jun 2026 16:52:44 -0500 Subject: [PATCH 15/15] Document batch Keycloak service accounts --- .../0010-batch-machine-run-context.md | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/docs/source/decisions/0010-batch-machine-run-context.md b/docs/source/decisions/0010-batch-machine-run-context.md index 7326a80efa..7c41102527 100644 --- a/docs/source/decisions/0010-batch-machine-run-context.md +++ b/docs/source/decisions/0010-batch-machine-run-context.md @@ -154,6 +154,29 @@ The signing secret belongs in a managed secret store. A later hardening step sho When Keycloak mints the batch run context directly, CDA does not need the signing secret for those requests. The machine principal must still be registered in CDA and the CWMS database. +### Keycloak Client and Service Account Shape + +The local Keycloak realm uses confidential OIDC clients with service accounts to model non-human batch actors. In CWBI/cloud Keycloak, the same shape needs to be recreated for each environment. These are Keycloak clients and service-account users, not AWS Batch job definitions. 
+ +Some configuration is common to all offices: + +| Item | Purpose | +| ---- | ------- | +| `machine_auth` access-token claim | Marks a service-account token as a CDA batch machine token. CDA uses this to distinguish machine run tokens from normal user OIDC tokens. | +| `run_as_office` access-token claim | Carries the office context authorized for the launch. For office-specific clients this value is stable, such as `SWT`, `SPK`, or another office code. | +| CDA/CWMS user registration for runner service accounts | CDA rejects unregistered machine principals. The corresponding service-account principal must exist in CDA and the CWMS database with the roles needed for the resource offices it will access. | +| Batch Events user/CAC roles | Human users keep using normal Keycloak/CAC sessions. Batch Events checks those user roles before allowing interactive job creation, editing, or submission. | + +Office-specific clients are replicated per office or per trust boundary. In the examples below, `swt` is the office suffix; a production rollout would create equivalent clients such as `cwms-batch-runner-spk` or `cwms-batch-airflow-mvp` where those offices are enabled. + +| Name pattern | Local example | Replicate per office? | Used by | Why it exists | +| ------------ | ------------- | --------------------- | ------- | ------------- | +| `cwms-batch-runner-<office>` | `cwms-batch-runner-swt` | Yes | The job runner container | This is the identity used by the running script when it calls CDA. Its token should include `machine_auth=true` and `run_as_office=<OFFICE>`. CDA maps the token subject to a registered machine principal and then normal CDA/database roles decide which resource offices it may access. | +| `cwms-batch-airflow-<office>` | `cwms-batch-airflow-swt` | Yes, when Airflow schedules jobs for that office | Airflow scheduled DAGs | This is the scheduler identity used to call Batch Events and request due jobs for an office.
It should be allowed to list and submit scheduled Batch Events jobs for that office, but it is not the CDA write identity used inside the running job. | +| Normal user/OIDC client and CAC user roles | Existing user login clients and users | No per-office service-account pattern, but user roles are office-scoped | Interactive Batch Events UI/API users | Human users authenticate with their CAC-backed Keycloak session. Batch Events authorizes whether the user may create scripts, manage schedules, or submit jobs for an office. The user's login does not become the CDA token used by the runner. | + +Keeping scheduler and runner identities separate prevents permission bleed. Airflow needs permission to trigger jobs for an office; the runner needs CDA/database roles to read or write data while the job executes. A user logged into the Batch Events UI needs permission to request a job, but the job itself still uses the office runner service account and trusted run context. + ## Alternatives Considered ### Per-Office Keycloak Service Accounts