diff --git a/compose_files/keycloak/realm.json b/compose_files/keycloak/realm.json index 9a0851a223..6d774373f1 100644 --- a/compose_files/keycloak/realm.json +++ b/compose_files/keycloak/realm.json @@ -713,6 +713,20 @@ "authenticationFlowBindingOverrides": {}, "fullScopeAllowed": true, "nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "5edb86b8-82c7-4d9b-9f58-7a8febf50a3d", + "name": "cwms audience", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-mapper", + "consentRequired": false, + "config": { + "included.client.audience": "cwms", + "id.token.claim": "false", + "access.token.claim": "true" + } + } + ], "defaultClientScopes": [ "web-origins", "acr", @@ -728,6 +742,210 @@ "microprofile-jwt" ] }, + { + "id": "76dbdfad-5201-4a9d-bac4-ee9b89a794f1", + "clientId": "cwms-batch-runner-swt", + "name": "CWMS Batch Runner SWT", + "description": "Local test service account for SWT batch jobs", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": true, + "clientAuthenticatorType": "client-secret", + "secret": "local-cwms-batch-runner-swt-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": true, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "client_credentials.use_refresh_token": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "74f0d829-2921-4df2-93d4-468d6cd77d38", + "name": "machine_auth", + "protocol": "openid-connect", + "protocolMapper": "oidc-hardcoded-claim-mapper", + "consentRequired": false, + "config": { + "access.token.claim": "true", + "claim.name": "machine_auth", + "claim.value": "true", + "id.token.claim": "false", + "jsonType.label": 
"boolean", + "userinfo.token.claim": "false" + } + }, + { + "id": "422385db-9fd4-4c8c-b1c7-8d12ce41418c", + "name": "run_as_office", + "protocol": "openid-connect", + "protocolMapper": "oidc-hardcoded-claim-mapper", + "consentRequired": false, + "config": { + "access.token.claim": "true", + "claim.name": "run_as_office", + "claim.value": "SWT", + "id.token.claim": "false", + "jsonType.label": "String", + "userinfo.token.claim": "false" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "roles", + "profile", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "9620b229-ea0e-48f9-9f2f-cb601f2d6ee1", + "clientId": "cwms-batch-airflow-swt", + "name": "CWMS Batch Airflow SWT", + "description": "Local test service account for SWT scheduled batch job triggers", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": true, + "clientAuthenticatorType": "client-secret", + "secret": "local-cwms-batch-airflow-swt-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": true, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "client_credentials.use_refresh_token": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "e8865c36-fb89-45cb-8b81-a20562c5a154", + "name": "cwms audience", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-mapper", + "consentRequired": false, + "config": { + "included.client.audience": "cwms", + "id.token.claim": "false", + "access.token.claim": "true" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "roles", + "profile", + "email" + ], + 
"optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + }, + { + "id": "7bed3c57-6b95-45a7-b94f-1d85103776e2", + "clientId": "cwms-batch-runner-spk", + "name": "CWMS Batch Runner SPK", + "description": "Local test service account for SPK batch jobs", + "surrogateAuthRequired": false, + "enabled": true, + "alwaysDisplayInConsole": true, + "clientAuthenticatorType": "client-secret", + "secret": "local-cwms-batch-runner-spk-secret", + "redirectUris": [], + "webOrigins": [], + "notBefore": 0, + "bearerOnly": false, + "consentRequired": false, + "standardFlowEnabled": false, + "implicitFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": true, + "publicClient": false, + "frontchannelLogout": false, + "protocol": "openid-connect", + "attributes": { + "client_credentials.use_refresh_token": "false" + }, + "authenticationFlowBindingOverrides": {}, + "fullScopeAllowed": true, + "nodeReRegistrationTimeout": -1, + "protocolMappers": [ + { + "id": "019637fa-9fe7-4ee1-b372-5d20cb8b4a05", + "name": "machine_auth", + "protocol": "openid-connect", + "protocolMapper": "oidc-hardcoded-claim-mapper", + "consentRequired": false, + "config": { + "access.token.claim": "true", + "claim.name": "machine_auth", + "claim.value": "true", + "id.token.claim": "false", + "jsonType.label": "boolean", + "userinfo.token.claim": "false" + } + }, + { + "id": "f6fde48e-c253-472d-8ed1-6267b4db1227", + "name": "run_as_office", + "protocol": "openid-connect", + "protocolMapper": "oidc-hardcoded-claim-mapper", + "consentRequired": false, + "config": { + "access.token.claim": "true", + "claim.name": "run_as_office", + "claim.value": "SPK", + "id.token.claim": "false", + "jsonType.label": "String", + "userinfo.token.claim": "false" + } + } + ], + "defaultClientScopes": [ + "web-origins", + "acr", + "roles", + "profile", + "email" + ], + "optionalClientScopes": [ + "address", + "phone", + "offline_access", + "microprofile-jwt" + ] + 
}, { "id": "f09103f0-5563-4dc0-bc70-2ff0309edae7", "clientId": "realm-management", @@ -2321,6 +2539,52 @@ "cwms_user", "new_user" ] + }, + { + "id": "6049479d-e783-4c7f-9024-57a7d29649f7", + "username": "m5batcheventadmin", + "enabled": true, + "email": "noreply-batch-admin@data.test", + "emailVerified": true, + "credentials": [ + { + "type": "password", + "value": "m5batcheventadmin" + } + ], + "realmRoles": [ + "cwms_user" + ] + }, + { + "id": "a4e88497-0ffc-41d5-b0fd-cc91760e366b", + "username": "service-account-cwms-batch-runner-swt", + "enabled": true, + "emailVerified": false, + "serviceAccountClientId": "cwms-batch-runner-swt", + "realmRoles": [ + "cwms_user" + ] + }, + { + "id": "b70c2e60-ce11-42c7-8271-10bb2b3fd4bd", + "username": "service-account-cwms-batch-airflow-swt", + "enabled": true, + "emailVerified": false, + "serviceAccountClientId": "cwms-batch-airflow-swt", + "realmRoles": [ + "cwms_user" + ] + }, + { + "id": "d2d6f91b-a5dd-40c3-8ee6-49a52da9892e", + "username": "service-account-cwms-batch-runner-spk", + "enabled": true, + "emailVerified": false, + "serviceAccountClientId": "cwms-batch-runner-spk", + "realmRoles": [ + "cwms_user" + ] } ] } diff --git a/compose_files/sql/users.sql b/compose_files/sql/users.sql index 9eac95cc17..4fa84cefd5 100644 --- a/compose_files/sql/users.sql +++ b/compose_files/sql/users.sql @@ -50,6 +50,73 @@ begin cwms_sec.add_user_to_group('m5testadmin','CWMS User Admins', 'LRL'); cwms_sec.add_user_to_group('m5testadmin','SHOW STACK TRACE', 'LRL'); + begin + insert into at_sec_cwms_users(userid, createdby, principle_name) + values( + 'M5BATCHEVENTADMIN', + 'CWMS_20', + 'http://localhost:8081/auth/realms/cwms::6049479d-e783-4c7f-9024-57a7d29649f7' + ); + exception + when dup_val_on_index then + update at_sec_cwms_users + set principle_name = 'http://localhost:8081/auth/realms/cwms::6049479d-e783-4c7f-9024-57a7d29649f7' + where userid = 'M5BATCHEVENTADMIN'; + end; + cwms_sec.add_user_to_group('M5BATCHEVENTADMIN', 'All 
Users', 'SWT'); + cwms_sec.add_user_to_group('M5BATCHEVENTADMIN', 'CWMS Users', 'SWT'); + cwms_sec.add_user_to_group('M5BATCHEVENTADMIN', 'Data Acquisition Mgr', 'SWT'); + + begin + insert into at_sec_cwms_users(userid, createdby, principle_name) + values( + 'SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT', + 'CWMS_20', + 'http://localhost:8081/auth/realms/cwms::a4e88497-0ffc-41d5-b0fd-cc91760e366b' + ); + exception + when dup_val_on_index then + update at_sec_cwms_users + set principle_name = 'http://localhost:8081/auth/realms/cwms::a4e88497-0ffc-41d5-b0fd-cc91760e366b' + where userid = 'SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT'; + end; + cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT', 'All Users', 'SWT'); + cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT', 'CWMS Users', 'SWT'); + cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT', 'TS ID Creator', 'SWT'); + + begin + insert into at_sec_cwms_users(userid, createdby, principle_name) + values( + 'SERVICE-ACCOUNT-CWMS-BATCH-AIRFLOW-SWT', + 'CWMS_20', + 'http://localhost:8081/auth/realms/cwms::b70c2e60-ce11-42c7-8271-10bb2b3fd4bd' + ); + exception + when dup_val_on_index then + update at_sec_cwms_users + set principle_name = 'http://localhost:8081/auth/realms/cwms::b70c2e60-ce11-42c7-8271-10bb2b3fd4bd' + where userid = 'SERVICE-ACCOUNT-CWMS-BATCH-AIRFLOW-SWT'; + end; + cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-AIRFLOW-SWT', 'All Users', 'SWT'); + cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-AIRFLOW-SWT', 'CWMS Users', 'SWT'); + + begin + insert into at_sec_cwms_users(userid, createdby, principle_name) + values( + 'SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SPK', + 'CWMS_20', + 'http://localhost:8081/auth/realms/cwms::d2d6f91b-a5dd-40c3-8ee6-49a52da9892e' + ); + exception + when dup_val_on_index then + update at_sec_cwms_users + set principle_name = 'http://localhost:8081/auth/realms/cwms::d2d6f91b-a5dd-40c3-8ee6-49a52da9892e' + where userid = 
'SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SPK';
+    end;
+    cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SPK', 'All Users', 'SPK');
+    cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SPK', 'CWMS Users', 'SPK');
+    cwms_sec.add_user_to_group('SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SPK', 'TS ID Creator', 'SPK');
+
 end;
 /
 quit;
diff --git a/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java b/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java
new file mode 100644
index 0000000000..b3a45c4207
--- /dev/null
+++ b/cwms-data-api/src/main/java/cwms/cda/security/BatchJobContext.java
@@ -0,0 +1,238 @@
+package cwms.cda.security;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.flogger.FluentLogger;
+import cwms.cda.ApiServlet;
+import cwms.cda.datasource.ConnectionPreparingDataSource;
+import cwms.cda.datasource.ConnectionPreparer;
+import cwms.cda.datasource.DelegatingConnectionPreparer;
+import cwms.cda.datasource.SessionOfficePreparer;
+import io.javalin.http.Context;
+import io.jsonwebtoken.Claims;
+import io.jsonwebtoken.ExpiredJwtException;
+import io.jsonwebtoken.JwtException;
+import io.jsonwebtoken.Jwts;
+import io.jsonwebtoken.security.Keys;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.security.Key;
+import java.util.Base64;
+import java.util.Locale;
+import java.util.Map;
+import javax.sql.DataSource;
+import javax.servlet.http.HttpServletResponse;
+
+/**
+ * Resolves the office a batch machine principal runs as, from either OIDC token claims or
+ * a signed job-context JWT in the {@value #HEADER} header, and applies it to the request
+ * data source.
+ */
+public final class BatchJobContext {
+    private static final FluentLogger logger = FluentLogger.forEnclosingClass();
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    public static final String HEADER = "X-CWMS-Job-Context";
+    public static final String RUN_AS_OFFICE_ATTR = "BatchRunAsOffice";
+    public static final String MACHINE_AUTH_CLAIM = "machine_auth";
+    public static final String RUN_AS_OFFICE_CLAIM = "run_as_office";
+    public static final String LEGACY_OFFICE_CLAIM = "office";
+
+    public static final String SECRET_PROPERTY = "cwms.dataapi.batch.jobContext.secret";
+    public static final String PREVIOUS_SECRET_PROPERTY = "cwms.dataapi.batch.jobContext.previousSecret";
+    public static final String KEY_ID_PROPERTY = "cwms.dataapi.batch.jobContext.keyId";
+    public static final String ISSUER_PROPERTY = "cwms.dataapi.batch.jobContext.issuer";
+    public static final String AUDIENCE_PROPERTY = "cwms.dataapi.batch.jobContext.audience";
+    public static final String MACHINE_USERS_PROPERTY = "cwms.dataapi.batch.machineUsers";
+
+    private static final String DEFAULT_ISSUER = "cwms-batch-events";
+    private static final String DEFAULT_AUDIENCE = "cwms-data-api";
+    private static final String DEFAULT_MACHINE_USERS = "";
+
+    private BatchJobContext() {
+    }
+
+    /**
+     * Returns true when {@code username} case-insensitively matches an entry of the
+     * comma-separated {@link #MACHINE_USERS_PROPERTY} setting; false when null or unset.
+     */
+    public static boolean isBatchMachineUser(String username) {
+        if (username == null) {
+            return false;
+        }
+        String machineUsers = readSetting(MACHINE_USERS_PROPERTY, DEFAULT_MACHINE_USERS);
+        if (machineUsers.isBlank()) {
+            return false;
+        }
+        for (String machineUser : machineUsers.split(",")) {
+            if (username.equalsIgnoreCase(machineUser.trim())) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Returns true when {@code claims} carry a truthy machine_auth claim or {@code username}
+     * is a configured machine user.
+     */
+    public static boolean isBatchMachinePrincipal(String username, Claims claims) {
+        return hasMachineAuthClaim(claims) || isBatchMachineUser(username);
+    }
+
+    /**
+     * Sets the run-as office from OIDC {@code claims} when they carry machine_auth; otherwise
+     * falls back to the signed job-context header handling of
+     * {@link #prepareContext(Context, DataApiPrincipal)}.
+     *
+     * @throws CwmsAuthException (401) when the office claim is missing, blank, or not a string
+     */
+    public static void prepareContext(Context ctx, DataApiPrincipal principal, Claims claims)
+            throws CwmsAuthException {
+        if (hasMachineAuthClaim(claims)) {
+            try {
+                setRunOfficeFromClaims(ctx, claims);
+            } catch (JwtException ex) {
+                // Claims.get(name, String.class) throws RequiredTypeException for a non-string
+                // claim; report it as an auth failure instead of letting it escape as a 500.
+                logger.atFine().withCause(ex).log("Batch machine token office claim is not a string.");
+                throw new CwmsAuthException("Batch job context token not valid", ex,
+                        HttpServletResponse.SC_UNAUTHORIZED);
+            }
+            return;
+        }
+        prepareContext(ctx, principal);
+    }
+
+    /**
+     * For configured machine users, requires a valid signed job-context token in the
+     * {@value #HEADER} header and stores its office in {@link #RUN_AS_OFFICE_ATTR}.
+     * Other principals are left untouched.
+     *
+     * @throws CwmsAuthException (401) when the token is missing, expired, or fails validation
+     */
+    public static void prepareContext(Context ctx, DataApiPrincipal principal) throws CwmsAuthException {
+        if (!isBatchMachineUser(principal.getName())) {
+            return;
+        }
+
+        String token = ctx.header(HEADER);
+        if (token == null || token.isBlank()) {
+            throw new CwmsAuthException("Batch machine request missing signed job context",
+                    HttpServletResponse.SC_UNAUTHORIZED);
+        }
+
+        try {
+            Claims claims = parse(token);
+            setRunOfficeFromClaims(ctx, claims);
+        } catch (ExpiredJwtException ex) {
+            logger.atFine().withCause(ex).log("Batch job context token expired.");
+            throw new CwmsAuthException("Batch job context token expired", ex,
+                    HttpServletResponse.SC_UNAUTHORIZED);
+        } catch (JwtException | IllegalArgumentException ex) {
+            logger.atFine().withCause(ex).log("Batch job context token validation failed.");
+            throw new CwmsAuthException("Batch job context token not valid", ex,
+                    HttpServletResponse.SC_UNAUTHORIZED);
+        }
+    }
+
+    /** Accepts the machine_auth claim as a Boolean or as a string parsed with Boolean.parseBoolean. */
+    private static boolean hasMachineAuthClaim(Claims claims) {
+        if (claims == null) {
+            return false;
+        }
+        Object value = claims.get(MACHINE_AUTH_CLAIM);
+        if (value instanceof Boolean) {
+            return (Boolean) value;
+        }
+        if (value instanceof String) {
+            return Boolean.parseBoolean((String) value);
+        }
+        return false;
+    }
+
+    /** Stores the upper-cased run_as_office claim (or legacy office claim) in the request attributes. */
+    private static void setRunOfficeFromClaims(Context ctx, Claims claims) throws CwmsAuthException {
+        String office = claims.get(RUN_AS_OFFICE_CLAIM, String.class);
+        if (office == null || office.isBlank()) {
+            office = claims.get(LEGACY_OFFICE_CLAIM, String.class);
+        }
+        if (office == null || office.isBlank()) {
+            throw new CwmsAuthException("Batch job context missing run_as_office",
+                    HttpServletResponse.SC_UNAUTHORIZED);
+        }
+        ctx.attribute(RUN_AS_OFFICE_ATTR, office.toUpperCase(Locale.ROOT));
+    }
+
+    /**
+     * Adds a {@link SessionOfficePreparer} for the run-as office to the request data source;
+     * does nothing when no run-as office attribute was set.
+     */
+    public static void applyRunContext(Context ctx) {
+        String runAsOffice = ctx.attribute(RUN_AS_OFFICE_ATTR);
+        if (runAsOffice == null || runAsOffice.isBlank()) {
+            return;
+        }
+
+        DataSource dataSource = ctx.attribute(ApiServlet.DATA_SOURCE);
+        ConnectionPreparer officePreparer = new SessionOfficePreparer(runAsOffice);
+        if (dataSource instanceof ConnectionPreparingDataSource) {
+            ConnectionPreparingDataSource preparingDataSource = (ConnectionPreparingDataSource) dataSource;
+            preparingDataSource.setPreparer(new DelegatingConnectionPreparer(
+                    preparingDataSource.getPreparer(), officePreparer));
+        } else {
+            ctx.attribute(ApiServlet.DATA_SOURCE,
+                    new ConnectionPreparingDataSource(officePreparer, dataSource));
+        }
+    }
+
+    /** Verifies signature, issuer, and audience of the job-context token and returns its claims. */
+    private static Claims parse(String token) {
+        String secret = secretForToken(token);
+        if (secret.length() < 32) {
+            throw new IllegalArgumentException("Batch job context secret must be at least 32 characters");
+        }
+        Key key = Keys.hmacShaKeyFor(secret.getBytes(StandardCharsets.UTF_8));
+        return Jwts.parserBuilder()
+                .requireIssuer(readSetting(ISSUER_PROPERTY, DEFAULT_ISSUER))
+                .requireAudience(readSetting(AUDIENCE_PROPERTY, DEFAULT_AUDIENCE))
+                .setSigningKey(key)
+                .build()
+                .parseClaimsJws(token)
+                .getBody();
+    }
+
+    /** Picks the current or previous secret based on the token's kid header. */
+    private static String secretForToken(String token) {
+        String expectedKeyId = readSetting(KEY_ID_PROPERTY, "current");
+        String keyId = keyIdForToken(token);
+        if (keyId == null || keyId.isBlank() || expectedKeyId.equals(keyId)) {
+            return readSetting(SECRET_PROPERTY, "");
+        }
+        if ("previous".equals(keyId)) {
+            return readSetting(PREVIOUS_SECRET_PROPERTY, "");
+        }
+        throw new IllegalArgumentException("Batch job context key id is not recognized");
+    }
+
+    /** Reads the kid value from the (not yet verified) JWT header; null when absent or not a string. */
+    private static String keyIdForToken(String token) {
+        String[] parts = token.split("\\.");
+        if (parts.length != 3) {
+            throw new IllegalArgumentException("Batch job context token is malformed");
+        }
+        try {
+            byte[] headerBytes = Base64.getUrlDecoder().decode(parts[0]);
+            Map<?, ?> header = OBJECT_MAPPER.readValue(headerBytes, Map.class);
+            Object keyId = header.get("kid");
+            return keyId instanceof String ? (String) keyId : null;
+        } catch (IllegalArgumentException | IOException e) {
+            throw new IllegalArgumentException("Batch job context token header is malformed", e);
+        }
+    }
+
+    /** Reads a setting from system properties, then the environment; blank values yield the default. */
+    private static String readSetting(String key, String defaultValue) {
+        String value = System.getProperty(key, System.getenv(key));
+        return value == null || value.isBlank() ? defaultValue : value;
+    }
+}
diff --git a/cwms-data-api/src/main/java/cwms/cda/security/CdaAccessManager.java b/cwms-data-api/src/main/java/cwms/cda/security/CdaAccessManager.java
index ffb5c82c16..533073a7cf 100644
--- a/cwms-data-api/src/main/java/cwms/cda/security/CdaAccessManager.java
+++ b/cwms-data-api/src/main/java/cwms/cda/security/CdaAccessManager.java
@@ -41,6 +41,7 @@ public void manage(Handler handler, Context ctx, Set routeRoles) thr
         }
         checkRateLimit(ctx);
         prepareContext(ctx, principal);
+        BatchJobContext.applyRunContext(ctx);
         handler.handle(ctx);
     }
 
diff --git a/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java b/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java
index 48d1c8940c..f636724712 100644
--- a/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java
+++ b/cwms-data-api/src/main/java/cwms/cda/security/OpenIdConnectIdentitityProvider.java
@@ -105,12 +105,20 @@ private DataApiPrincipal getUserFromToken(Context ctx) throws CwmsAuthException
             AuthDao dao = AuthDao.getInstance(JooqDao.getDslContext(ctx), ctx.attribute(ApiServlet.OFFICE_ID));
             Optional<DataApiPrincipal> principal = dao.getPrincipalFromPrincipal(oidcPrincipal);
             if (principal.isPresent()) {
-                return principal.get();
+                DataApiPrincipal dataApiPrincipal = principal.get();
+                BatchJobContext.prepareContext(ctx, dataApiPrincipal, claims);
+                return dataApiPrincipal;
             } else if (CREATE_USERS) {
                 final String preferredUserName = claims.get(PREFERRED_USERNAME_CLAIM, String.class);
+                if (BatchJobContext.isBatchMachinePrincipal(preferredUserName, claims)) {
+                    throw new CwmsAuthException("Batch machine principal is not registered",
+                            HttpServletResponse.SC_UNAUTHORIZED);
+                }
                 final String givenName = claims.get(GIVEN_NAME_CLAIM, String.class);
                 final String email = claims.get(EMAIL_CLAIM, String.class);
-                return dao.createUser(preferredUserName, oidcPrincipal, givenName, email);
+                DataApiPrincipal dataApiPrincipal =
dao.createUser(preferredUserName, oidcPrincipal, givenName, email);
+                BatchJobContext.prepareContext(ctx, dataApiPrincipal, claims);
+                return dataApiPrincipal;
             } else {
                 throw new CwmsAuthException("Not Authorized",HttpServletResponse.SC_UNAUTHORIZED);
             }
diff --git a/cwms-data-api/src/test/java/cwms/cda/api/auth/OpenIdConnectTestIT.java b/cwms-data-api/src/test/java/cwms/cda/api/auth/OpenIdConnectTestIT.java
index ffa71eb81a..1c5993c4f3 100644
--- a/cwms-data-api/src/test/java/cwms/cda/api/auth/OpenIdConnectTestIT.java
+++ b/cwms-data-api/src/test/java/cwms/cda/api/auth/OpenIdConnectTestIT.java
@@ -2,24 +2,31 @@
 
 import static io.restassured.RestAssured.given;
 import static org.hamcrest.Matchers.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
-import java.util.Optional;
-
-
-import org.junit.jupiter.api.Tag;
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.extension.ExtendWith;
-
 import cwms.cda.api.DataApiTestIT;
+import fixtures.CwmsDataApiSetupCallback;
 import fixtures.KeyCloakExtension;
 import io.javalin.http.HttpCode;
 import io.restassured.filter.log.LogDetail;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.util.Optional;
+import mil.army.usace.hec.test.database.CwmsDatabaseContainer;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
 
 @Tag("integration")
 @ExtendWith(KeyCloakExtension.class)
 public class OpenIdConnectTestIT extends DataApiTestIT {
-
+    private static final String SWT_BATCH_CLIENT = "cwms-batch-runner-swt";
+    private static final String SWT_BATCH_CLIENT_SECRET = "local-cwms-batch-runner-swt-secret";
+    private static final String SWT_BATCH_USER = "SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SWT";
+    private static final String SPK_BATCH_CLIENT = "cwms-batch-runner-spk";
+    private static final String SPK_BATCH_CLIENT_SECRET = "local-cwms-batch-runner-spk-secret";
+    private static final String SPK_BATCH_USER = "SERVICE-ACCOUNT-CWMS-BATCH-RUNNER-SPK";
 
     @Test
     void test_keycloak_user_is_created() {
@@ -55,4 +62,84 @@ void test_keycloak_user_can_operate() {
             .log().ifValidationFails(LogDetail.ALL,true)
             .statusCode(is(HttpCode.OK.getStatus()));
     }
+
+    @Test
+    void test_keycloak_batch_service_account_claims_require_registered_machine_principal() throws Exception {
+        Optional<String> spkToken = KeyCloakExtension.tokenForClientCredentials(SPK_BATCH_CLIENT,
+                SPK_BATCH_CLIENT_SECRET);
+        assertTrue(spkToken.isPresent());
+        assertBatchClaims(spkToken.get(), "SPK", SPK_BATCH_CLIENT);
+
+        given()
+            .log().ifValidationFails(LogDetail.ALL, true)
+            .header("Authorization", "Bearer " + spkToken.get())
+        .when()
+            .get("/user/profile")
+        .then()
+            .log().ifValidationFails(LogDetail.ALL,true)
+            .statusCode(is(HttpCode.UNAUTHORIZED.getStatus()));
+
+        registerBatchMachinePrincipal(SPK_BATCH_USER, spkToken.get(), "SPK");
+
+        given()
+            .log().ifValidationFails(LogDetail.ALL, true)
+            .header("Authorization", "Bearer " + spkToken.get())
+        .when()
+            .get("/user/profile")
+        .then()
+            .log().ifValidationFails(LogDetail.ALL,true)
+            .statusCode(is(HttpCode.OK.getStatus()))
+            .body("user-name", equalTo(SPK_BATCH_USER))
+            .body("roles.SPK", hasItems("All Users", "CWMS Users", "TS ID Creator"))
+            .body("roles.SWT", nullValue());
+
+        Optional<String> swtToken = KeyCloakExtension.tokenForClientCredentials(SWT_BATCH_CLIENT,
+                SWT_BATCH_CLIENT_SECRET);
+        assertTrue(swtToken.isPresent());
+        assertBatchClaims(swtToken.get(), "SWT", SWT_BATCH_CLIENT);
+        registerBatchMachinePrincipal(SWT_BATCH_USER, swtToken.get(), "SWT");
+
+        given()
+            .log().ifValidationFails(LogDetail.ALL, true)
+            .header("Authorization", "Bearer " + swtToken.get())
+        .when()
+            .get("/user/profile")
+        .then()
+            .log().ifValidationFails(LogDetail.ALL,true)
+            .statusCode(is(HttpCode.OK.getStatus()))
+            .body("user-name", equalTo(SWT_BATCH_USER))
+            .body("roles.SWT", hasItems("All Users", "CWMS Users", "TS ID Creator"))
+            .body("roles.SPK", nullValue());
+    }
+
+    private static void assertBatchClaims(String token, String office, String clientId) throws Exception {
+        assertTrue(KeyCloakExtension.claims(token).get("machine_auth").asBoolean());
+        assertEquals(office, KeyCloakExtension.claims(token).get("run_as_office").asText());
+        assertEquals("service-account-" + clientId,
+                KeyCloakExtension.claims(token).get("preferred_username").asText());
+    }
+
+    private static void registerBatchMachinePrincipal(String userName, String token, String office) throws Exception {
+        addNewUser(userName);
+        updateOidcPrincipal(userName, token);
+        addUserToGroup(userName, "CWMS Users", office);
+        addUserToGroup(userName, "All Users", office);
+        addUserToGroup(userName, "TS ID Creator", office);
+    }
+
+    private static void updateOidcPrincipal(String userName, String token) throws Exception {
+        String oidcPrincipal = KeyCloakExtension.getIssuer() + "::"
+                + KeyCloakExtension.claims(token).get("sub").asText();
+        CwmsDatabaseContainer db = CwmsDataApiSetupCallback.getDatabaseLink();
+        db.connection((c) -> {
+            try (PreparedStatement stmt = c.prepareStatement(
+                    "update AT_SEC_CWMS_USERS set principle_name = ? where userid = upper(?)")) {
+                stmt.setString(1, oidcPrincipal);
+                stmt.setString(2, userName);
+                stmt.executeUpdate();
+            } catch (SQLException ex) {
+                throw new RuntimeException("Unable to update OIDC principal for: " + userName, ex);
+            }
+        }, "cwms_20");
+    }
 }
diff --git a/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java b/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java
new file mode 100644
index 0000000000..a873c8a8a1
--- /dev/null
+++ b/cwms-data-api/src/test/java/cwms/cda/security/BatchJobContextTest.java
@@ -0,0 +1,331 @@
+package cwms.cda.security;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertSame;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.doAnswer;
+
+import cwms.cda.ApiServlet;
+import cwms.cda.datasource.ConnectionPreparingDataSource;
+import io.javalin.http.Context;
+import io.jsonwebtoken.Claims;
+import io.jsonwebtoken.Jwts;
+import io.jsonwebtoken.SignatureAlgorithm;
+import io.jsonwebtoken.security.Keys;
+import java.nio.charset.StandardCharsets;
+import java.io.PrintWriter;
+import java.security.Key;
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.time.Instant;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Logger;
+import javax.sql.DataSource;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Test;
+import org.mockito.stubbing.Answer;
+
+class BatchJobContextTest {
+    private static
final String SECRET = "test-batch-context-secret-32-characters"; + private static final String OTHER_SECRET = "other-batch-context-secret-32-chars"; + private static final String ISSUER = "cwms-batch-events"; + private static final String AUDIENCE = "cwms-data-api"; + private static final String MACHINE_USER = "SERVICE-ACCOUNT-CWMS-BATCH-RUNNER"; + + @AfterEach + void clearProperties() { + System.clearProperty(BatchJobContext.SECRET_PROPERTY); + System.clearProperty(BatchJobContext.PREVIOUS_SECRET_PROPERTY); + System.clearProperty(BatchJobContext.KEY_ID_PROPERTY); + System.clearProperty(BatchJobContext.ISSUER_PROPERTY); + System.clearProperty(BatchJobContext.AUDIENCE_PROPERTY); + System.clearProperty(BatchJobContext.MACHINE_USERS_PROPERTY); + } + + @Test + void machineUserDetectionReturnsFalseWhenUnset() { + assertFalse(BatchJobContext.isBatchMachineUser(MACHINE_USER)); + } + + @Test + void machineUserDetectionMatchesConfiguredUsersCaseInsensitively() { + System.setProperty(BatchJobContext.MACHINE_USERS_PROPERTY, + "some-user, service-account-cwms-batch-runner "); + + assertTrue(BatchJobContext.isBatchMachineUser(MACHINE_USER)); + } + + @Test + void prepareContextDoesNotRequireTokenForNormalUsers() throws CwmsAuthException { + Context ctx = contextWithHeaders(Map.of()); + DataApiPrincipal principal = new DataApiPrincipal("normal-user", Set.of()); + + BatchJobContext.prepareContext(ctx, principal); + + assertNull(ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR)); + } + + @Test + void machineAuthClaimSetsRunContextWithoutSignedHeader() throws CwmsAuthException { + Context ctx = contextWithHeaders(Map.of()); + Claims claims = Jwts.claims(Map.of( + BatchJobContext.MACHINE_AUTH_CLAIM, true, + BatchJobContext.RUN_AS_OFFICE_CLAIM, "swt" + )); + + BatchJobContext.prepareContext(ctx, machinePrincipal(), claims); + + assertEquals("SWT", ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR)); + } + + @Test + void machineAuthStringClaimSetsRunContextWithoutSignedHeader() throws 
CwmsAuthException { + Context ctx = contextWithHeaders(Map.of()); + Claims claims = Jwts.claims(Map.of( + BatchJobContext.MACHINE_AUTH_CLAIM, "true", + BatchJobContext.RUN_AS_OFFICE_CLAIM, "spk" + )); + + BatchJobContext.prepareContext(ctx, machinePrincipal(), claims); + + assertEquals("SPK", ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR)); + } + + @Test + void machineAuthClaimRequiresRunAsOffice() { + Context ctx = contextWithHeaders(Map.of()); + Claims claims = Jwts.claims(Map.of(BatchJobContext.MACHINE_AUTH_CLAIM, true)); + + CwmsAuthException ex = assertThrows(CwmsAuthException.class, + () -> BatchJobContext.prepareContext(ctx, machinePrincipal(), claims)); + + assertEquals(HttpServletResponse.SC_UNAUTHORIZED, ex.getAuthFailCode()); + assertEquals("Batch job context missing run_as_office", ex.getMessage()); + } + + @Test + void batchMachinePrincipalUsesMachineAuthClaimOrConfiguredUser() { + Claims claims = Jwts.claims(Map.of(BatchJobContext.MACHINE_AUTH_CLAIM, true)); + + assertTrue(BatchJobContext.isBatchMachinePrincipal("normal-user", claims)); + assertFalse(BatchJobContext.isBatchMachinePrincipal("normal-user", Jwts.claims())); + + System.setProperty(BatchJobContext.MACHINE_USERS_PROPERTY, MACHINE_USER); + + assertTrue(BatchJobContext.isBatchMachinePrincipal(MACHINE_USER, Jwts.claims())); + } + + @Test + void validTokenSetsRunContextAttribute() throws CwmsAuthException { + configureBatchContext(); + String token = token(Map.of( + "run_as_office", "swt", + "job_id", "job-123", + "requested_by", "m5hectest", + "dispatch_source", "api" + ), SECRET, ISSUER, AUDIENCE, Instant.now().plusSeconds(300)); + Context ctx = contextWithHeaders(Map.of(BatchJobContext.HEADER, token)); + + BatchJobContext.prepareContext(ctx, machinePrincipal()); + + assertEquals("SWT", ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR)); + } + + @Test + void missingTokenForMachineUserThrowsUnauthorized() { + configureBatchContext(); + Context ctx = contextWithHeaders(Map.of()); + + 
CwmsAuthException ex = assertThrows(CwmsAuthException.class, + () -> BatchJobContext.prepareContext(ctx, machinePrincipal())); + + assertEquals(HttpServletResponse.SC_UNAUTHORIZED, ex.getAuthFailCode()); + assertEquals("Batch machine request missing signed job context", ex.getMessage()); + } + + @Test + void expiredTokenThrowsSpecificUnauthorizedMessage() { + configureBatchContext(); + String token = token(Map.of("run_as_office", "SWT"), SECRET, ISSUER, AUDIENCE, + Instant.now().minusSeconds(60)); + Context ctx = contextWithHeaders(Map.of(BatchJobContext.HEADER, token)); + + CwmsAuthException ex = assertThrows(CwmsAuthException.class, + () -> BatchJobContext.prepareContext(ctx, machinePrincipal())); + + assertEquals(HttpServletResponse.SC_UNAUTHORIZED, ex.getAuthFailCode()); + assertEquals("Batch job context token expired", ex.getMessage()); + } + + @Test + void forgedTokenThrowsInvalidMessage() { + configureBatchContext(); + String token = token(Map.of("run_as_office", "SWT"), OTHER_SECRET, ISSUER, AUDIENCE, + Instant.now().plusSeconds(300)); + Context ctx = contextWithHeaders(Map.of(BatchJobContext.HEADER, token)); + + CwmsAuthException ex = assertThrows(CwmsAuthException.class, + () -> BatchJobContext.prepareContext(ctx, machinePrincipal())); + + assertEquals(HttpServletResponse.SC_UNAUTHORIZED, ex.getAuthFailCode()); + assertEquals("Batch job context token not valid", ex.getMessage()); + } + + @Test + void wrongIssuerOrAudienceIsRejected() { + configureBatchContext(); + String wrongIssuer = token(Map.of("run_as_office", "SWT"), SECRET, "other-issuer", + AUDIENCE, Instant.now().plusSeconds(300)); + String wrongAudience = token(Map.of("run_as_office", "SWT"), SECRET, ISSUER, + "other-audience", Instant.now().plusSeconds(300)); + + CwmsAuthException issuerEx = assertThrows(CwmsAuthException.class, + () -> BatchJobContext.prepareContext( + contextWithHeaders(Map.of(BatchJobContext.HEADER, wrongIssuer)), machinePrincipal())); + CwmsAuthException audienceEx = 
assertThrows(CwmsAuthException.class, + () -> BatchJobContext.prepareContext( + contextWithHeaders(Map.of(BatchJobContext.HEADER, wrongAudience)), machinePrincipal())); + + assertEquals("Batch job context token not valid", issuerEx.getMessage()); + assertEquals("Batch job context token not valid", audienceEx.getMessage()); + } + + @Test + void legacyOfficeClaimIsUsedWhenRunAsOfficeIsMissing() throws CwmsAuthException { + configureBatchContext(); + String token = token(Map.of("office", "spk"), SECRET, ISSUER, AUDIENCE, + Instant.now().plusSeconds(300)); + Context ctx = contextWithHeaders(Map.of(BatchJobContext.HEADER, token)); + + BatchJobContext.prepareContext(ctx, machinePrincipal()); + + assertEquals("SPK", ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR)); + } + + @Test + void applyRunContextWrapsDataSourceWhenOfficeIsPresent() { + Context ctx = contextWithAttributes(); + DataSource dataSource = new StubDataSource(); + ctx.attribute(ApiServlet.DATA_SOURCE, dataSource); + ctx.attribute(BatchJobContext.RUN_AS_OFFICE_ATTR, "SWT"); + + BatchJobContext.applyRunContext(ctx); + + assertInstanceOf(ConnectionPreparingDataSource.class, ctx.attribute(ApiServlet.DATA_SOURCE)); + } + + @Test + void applyRunContextLeavesDataSourceUnchangedWhenOfficeIsMissing() { + Context ctx = contextWithAttributes(); + DataSource dataSource = new StubDataSource(); + ctx.attribute(ApiServlet.DATA_SOURCE, dataSource); + + BatchJobContext.applyRunContext(ctx); + + assertSame(dataSource, ctx.attribute(ApiServlet.DATA_SOURCE)); + } + + private static void configureBatchContext() { + System.setProperty(BatchJobContext.SECRET_PROPERTY, SECRET); + System.setProperty(BatchJobContext.ISSUER_PROPERTY, ISSUER); + System.setProperty(BatchJobContext.AUDIENCE_PROPERTY, AUDIENCE); + System.setProperty(BatchJobContext.MACHINE_USERS_PROPERTY, MACHINE_USER); + } + + private static DataApiPrincipal machinePrincipal() { + return new DataApiPrincipal(MACHINE_USER, Set.of()); + } + + private static String 
token(Map claims, String secret, String issuer, + String audience, Instant expiration) { + Key key = Keys.hmacShaKeyFor(secret.getBytes(StandardCharsets.UTF_8)); + return Jwts.builder() + .setHeaderParam("kid", "current") + .setIssuer(issuer) + .setAudience(audience) + .setIssuedAt(Date.from(Instant.now())) + .setExpiration(Date.from(expiration)) + .addClaims(claims) + .signWith(key, SignatureAlgorithm.HS256) + .compact(); + } + + private static Context contextWithHeaders(Map headers) { + return context(headers); + } + + private static Context contextWithAttributes() { + return context(Map.of()); + } + + private static Context context(Map headers) { + HttpServletRequest request = mock(HttpServletRequest.class); + HttpServletResponse response = mock(HttpServletResponse.class); + Map attributes = new HashMap<>(); + org.mockito.Mockito.when(request.getHeader(org.mockito.ArgumentMatchers.anyString())) + .thenAnswer((Answer) invocation -> headers.get(invocation.getArgument(0))); + org.mockito.Mockito.when(request.getAttribute(org.mockito.ArgumentMatchers.anyString())) + .thenAnswer((Answer) invocation -> attributes.get(invocation.getArgument(0))); + doAnswer((Answer) invocation -> { + attributes.put(invocation.getArgument(0), invocation.getArgument(1)); + return null; + }) + .when(request).setAttribute(org.mockito.ArgumentMatchers.anyString(), org.mockito.ArgumentMatchers.any()); + return new Context(request, response, new HashMap<>()); + } + + private static final class StubDataSource implements DataSource { + @Override + public Connection getConnection() throws SQLException { + throw new SQLException("not used"); + } + + @Override + public Connection getConnection(String username, String password) throws SQLException { + throw new SQLException("not used"); + } + + @Override + public PrintWriter getLogWriter() { + return null; + } + + @Override + public void setLogWriter(PrintWriter out) { + } + + @Override + public void setLoginTimeout(int seconds) { + } + + @Override 
+ public int getLoginTimeout() { + return 0; + } + + @Override + public Logger getParentLogger() { + return Logger.getGlobal(); + } + + @Override + public T unwrap(Class iface) throws SQLException { + throw new SQLException("not used"); + } + + @Override + public boolean isWrapperFor(Class iface) { + return false; + } + } +} diff --git a/cwms-data-api/src/test/java/fixtures/KeyCloakExtension.java b/cwms-data-api/src/test/java/fixtures/KeyCloakExtension.java index 70a0bd2232..0bf09d2710 100644 --- a/cwms-data-api/src/test/java/fixtures/KeyCloakExtension.java +++ b/cwms-data-api/src/test/java/fixtures/KeyCloakExtension.java @@ -10,6 +10,7 @@ import java.net.URL; import java.nio.charset.Charset; import java.time.Duration; +import java.util.Base64; import java.util.HashMap; import java.util.Map; import java.util.Optional; @@ -162,4 +163,41 @@ public static Optional tokenForUser(String username, String password) { return Optional.empty(); } } + + /** + * Retrieve an access token for a confidential client service account. + * This is needed to verify CDA's machine-to-machine path with the same + * client_credentials flow a batch runner uses. Password-grant user tokens + * do not exercise Keycloak service-account subjects or built-in client + * mappers for machine_auth and run_as_office claims. 
+ * @param clientId Keycloak client ID + * @param clientSecret Keycloak client secret + * @return Access token only + */ + public static Optional tokenForClientCredentials(String clientId, String clientSecret) { + try { + Response response = + given() + .log().ifValidationFails(LogDetail.ALL,true) + .contentType(ContentType.URLENC) + .formParam("client_id", clientId) + .formParam("grant_type", "client_credentials") + .formParam("client_secret", clientSecret) + .formParam("scope", "openid profile email") + .when() + .post(new URL(getTokenUrl())); + + logger.atFine().log(response.asPrettyString()); + JsonNode tokenInfo = mapper.readTree(response.asString()); + return Optional.of(tokenInfo.get("access_token").asText()); + } catch (JsonProcessingException | MalformedURLException ex) { + logger.atWarning().withCause(ex).log("Unable to retrieve token for client %s", clientId); + return Optional.empty(); + } + } + + public static JsonNode claims(String token) throws IOException { + String[] parts = token.split("\\."); + return mapper.readTree(Base64.getUrlDecoder().decode(parts[1])); + } } diff --git a/docs/source/decisions/0010-batch-machine-run-context.md b/docs/source/decisions/0010-batch-machine-run-context.md new file mode 100644 index 0000000000..7c41102527 --- /dev/null +++ b/docs/source/decisions/0010-batch-machine-run-context.md @@ -0,0 +1,276 @@ +# Batch Machine Run Context + +| Status | Proposed | +| :------------- | :---------------------- | +| **ADR #** | 0010 | +| **Author(s)** | CWBI Batch Runtime Team | +| **Sponsor** | HEC/USACE | +| **Date** | 6/8/2026 | +| **Supersedes** | N/A | + +## Objective + +Provide CWMS Data API with a trusted batch run context for jobs that execute through shared Batch runtime infrastructure. + +Batch runtimes will authenticate to CDA with office-scoped service accounts through Keycloak. Each job will provide trusted launch context, including the office for which the scheduler or API approved the run. 
CDA will use Keycloak-minted access-token claims when Keycloak can mint that context into the normal access token. CDA will also support a signed dispatcher context header as a fallback when Keycloak cannot provide job context without a custom extension. + +The signed context is **not** a replacement for normal CDA or database authorization. It establishes **who** launched the machine runtime and why. CDA and the CWMS database remain responsible for deciding whether the machine principal may read or write the requested resource office. The machine principal must already be registered in CDA and the CWMS database; CDA must not auto-create batch machine users. + +## Motivation + +Shared AWS Batch job definitions reduce AWS Batch configuration requirements, but they must not remove the office-specific machine identity used by CDA and the CWMS database. The current preferred shape is one scheduler service account and one runner service account per office or trust boundary. CDA still needs trusted context from Keycloak or the dispatcher so scripts cannot choose their own run authority by changing an environment variable, URI parameter, or request body. + +This is needed because CDA request office fields describe resource ownership, not caller authority. For example, a job approved for SWT (Tulsa District) may write data owned by another office when the mapped machine user has the required database roles. The request office (for example, `&office=SWT` in the URI) identifies the target data; it does not identify who the job is running as. + +## User Benefit + +### For Batch Operators + +- Runtime job definitions can be managed by language or image instead of by office/image combinations. +- Office service accounts can be managed in Keycloak while AWS Batch job definitions remain shared. +- Office launch context is available for audit and policy decisions. + +### For Script Authors + +- Scripts call CDA with standard bearer-token authentication. 
+- Scripts do not need per-office CDA API keys. +- Scripts can still read and write resource offices allowed by the mapped CDA database user. + +### For Security and Operations + +- CDA rejects machine requests that lack trusted Keycloak-minted or dispatcher-issued run context. +- Request parameters and payload fields are not trusted as caller authority. +- CDA audit records can include both the machine principal and the signed job context. + +## Design Proposal + +### Batch Run Flow + +![CWMS Batch M2M overview](diagrams/batch-m2m-overview.svg) + +Editable source: [batch-m2m-overview.drawio](diagrams/batch-m2m-overview.drawio) + +The scheduler and runner identities are intentionally separate. A user or +office scheduler can request an authorized job, but the running script calls CDA +with an office-scoped runner service account. Batch Events remains the source +of truth for script registry rows, job records, runtime env brokering, status, +and log lookup. + +### End User UI Flow + +![End user Batch Events UI flow](diagrams/batch-ui-job-flow.svg) + +Editable source: [batch-ui-job-flow.drawio](diagrams/batch-ui-job-flow.drawio) + +The Batch Events UI is now registry-oriented. A script admin chooses an office, +creates or edits a script row, selects either a GitHub file path or an inline +command, configures runtime, resource profile, timeout, schedule, roles, env +vars, and secret names, then submits jobs from that registry row. GitHub file +paths can be browsed from the configured repository checkout and show +runtime-specific file type hints, while command rows intentionally allow trusted +users to run arbitrary commands in the trusted runtime image. The local executor +and AWS Batch both honor the configured timeout; local timeout handling was +verified with a command that sleeps longer than its one-minute timeout. 
+ +### Airflow Scheduled M2M Flow + +![Airflow scheduled Batch Events flow](diagrams/batch-airflow-scheduler-flow.svg) + +Editable source: [batch-airflow-scheduler-flow.drawio](diagrams/batch-airflow-scheduler-flow.drawio) + +The scheduler identity and runner identity are intentionally separate. Airflow's office-specific service account is authorized to request a job. The runner's office-specific service account is the machine principal used when the job calls CDA. + +Airflow does not submit AWS Batch jobs directly for the registry-driven path. +Instead, the scheduled DAG lists due scripts through Batch Events, evaluates +hourly or cron schedules using each script's `scheduleTimezone`, and posts a +Batch Events job for each due row. Daylight-saving time gaps are skipped and +repeated local occurrences run once. Airflow does not wait for AWS Batch +completion; Batch Events owns dispatch, status, log lookup, and runtime broker +behavior after the job is accepted. + +### Keycloak-Minted Run Context + +When available, Keycloak mints the batch run context into the normal access token. CDA validates that token through the existing OIDC flow and reads these claims: + +| Claim | Description | +| --------------- | -------------------------------------------- | +| `machine_auth` | Marks the access token as a batch machine run | +| `run_as_office` | Office context authorized for the job launch | + +The `run_as_office` claim represents the authorized launch context. It is not the same as a resource office on a CDA endpoint. + +### Signed Run Context Fallback + +The dispatcher signs a short-lived JWT from the authoritative job record. The runner sends the token to CDA in the `X-CWMS-Job-Context` header. 
+ +The token contains: + +| Claim | Description | +| ---------------------------- | ----------------------------------------------------- | +| `iss` | Trusted dispatcher issuer | +| `aud` | CDA audience | +| `iat` | Issued-at time | +| `exp` | Expiration time | +| `job_id` | Batch job identifier | +| `script_id` or `script_slug` | Script identity | +| `run_as_office` | Office context authorized for the job launch | +| `requested_by` | User or system that requested the job, when available | +| `dispatch_source` | Source such as `airflow` or `api` | + +### CDA Behavior + +CDA validates batch run context for machine-authenticated requests. + +For those requests, CDA will: + +- Prefer validated OIDC claims from the access token. +- Fall back to `X-CWMS-Job-Context` for configured batch machine users. +- Validate signature, issuer, audience, and expiration. +- Read only the run office needed to establish session context from trusted run context claims. +- Reject missing, expired, forged, or wrong-audience tokens. +- Make additional job context available to logging only when a logging-specific mechanism exists. + +CDA will not: + +- Treat request `office`, `office-id`, or body office fields as caller authority. +- Reject a request solely because the target resource office differs from `run_as_office`. +- Use batch run context to bypass route roles or database office roles. +- Expose job identifiers or requester metadata as general request attributes for downstream controllers. + +Normal CDA route authorization and CWMS database permissions determine whether the machine user can act on the requested resource office. 
+ +### Configuration + +The Java API is configured with system properties or environment variables: + +| Setting | Description | +| ---------------------------------------- | -------------------------------------------------------------- | +| `cwms.dataapi.batch.jobContext.secret` | Signing secret for validating HS256 job context tokens | +| `cwms.dataapi.batch.jobContext.issuer` | Expected dispatcher issuer | +| `cwms.dataapi.batch.jobContext.audience` | Expected CDA audience | +| `cwms.dataapi.batch.machineUsers` | Comma-separated CDA users allowed to present batch run context | + +The signing secret belongs in a managed secret store. A later hardening step should use asymmetric signing or KMS-backed verification so CDA can verify run context without sharing the signing key. + +When Keycloak mints the batch run context directly, CDA does not need the signing secret for those requests. The machine principal must still be registered in CDA and the CWMS database. + +### Keycloak Client and Service Account Shape + +The local Keycloak realm uses confidential OIDC clients with service accounts to model non-human batch actors. In CWBI/cloud Keycloak, the same shape needs to be recreated for each environment. These are Keycloak clients and service-account users, not AWS Batch job definitions. + +Some configuration is common to all offices: + +| Item | Purpose | +| ---- | ------- | +| `machine_auth` access-token claim | Marks a service-account token as a CDA batch machine token. CDA uses this to distinguish machine run tokens from normal user OIDC tokens. | +| `run_as_office` access-token claim | Carries the office context authorized for the launch. For office-specific clients this value is stable, such as `SWT`, `SPK`, or another office code. | +| CDA/CWMS user registration for runner service accounts | CDA rejects unregistered machine principals. 
The corresponding service-account principal must exist in CDA and the CWMS database with the roles needed for the resource offices it will access. | +| Batch Events user/CAC roles | Human users keep using normal Keycloak/CAC sessions. Batch Events checks those user roles before allowing interactive job creation, editing, or submission. | + +Office-specific clients are replicated per office or per trust boundary. In the examples below, `swt` is the office suffix; a production rollout would create equivalent clients such as `cwms-batch-runner-spk` or `cwms-batch-airflow-mvp` where those offices are enabled. + +| Name pattern | Local example | Replicate per office? | Used by | Why it exists | +| ------------ | ------------- | --------------------- | ------- | ------------- | +| `cwms-batch-runner-<office>` | `cwms-batch-runner-swt` | Yes | The job runner container | This is the identity used by the running script when it calls CDA. Its token should include `machine_auth=true` and `run_as_office=<office>`. CDA maps the token subject to a registered machine principal and then normal CDA/database roles decide which resource offices it may access. | +| `cwms-batch-airflow-<office>` | `cwms-batch-airflow-swt` | Yes, when Airflow schedules jobs for that office | Airflow scheduled DAGs | This is the scheduler identity used to call Batch Events and request due jobs for an office. It should be allowed to list and submit scheduled Batch Events jobs for that office, but it is not the CDA write identity used inside the running job. | +| Normal user/OIDC client and CAC user roles | Existing user login clients and users | No per-office service-account pattern, but user roles are office-scoped | Interactive Batch Events UI/API users | Human users authenticate with their CAC-backed Keycloak session. Batch Events authorizes whether the user may create scripts, manage schedules, or submit jobs for an office. The user's login does not become the CDA token used by the runner. 
| + +Keeping scheduler and runner identities separate prevents permission bleed. Airflow needs permission to trigger jobs for an office; the runner needs CDA/database roles to read or write data while the job executes. A user logged into the Batch Events UI needs permission to request a job, but the job itself still uses the office runner service account and trusted run context. + +## Alternatives Considered + +### Per-Office Keycloak Service Accounts + +Create one scheduler service account and one runner service account per office or trust boundary. + +- **Pros**: Office context is represented directly by the service account. +- **Cons**: Recreates service-account and secret sprawl as offices and runtimes grow. +- **Selected for current rollout**: This avoids Keycloak custom extensions while still allowing AWS Batch job definitions and runner images to be shared across offices. + +### Per-Office Batch Job Definitions and API Keys + +Continue using separate Batch definitions and CDA API secrets per office/runtime combination. + +- **Pros**: Uses the existing model. +- **Cons**: Requires hard-coded expansion across offices, runtimes, images, and secrets. +- **Rejected**: The dynamic runtime model is intended to remove this AWS Batch and API-key duplication. + +### Trust Request or Environment Office + +Use `OFFICE`, URI parameters, query parameters, or request body fields to decide who the job is running as. + +- **Pros**: Simple to pass through the runtime. +- **Cons**: These values are controlled by scripts and often identify target data ownership rather than caller authority. +- **Rejected**: Request office is resource context, not trusted run context. + +### Signed Dispatcher-Issued Run Context + +Use a shared machine identity and require a short-lived signed token from the trusted dispatcher. + +- **Pros**: Reduces runtime duplication while preserving trusted job launch context and normal CDA/DB authorization. 
+- **Cons**: Requires token validation and signing key management. +- **Fallback proposal**: Provides the required trust boundary if Keycloak cannot mint the required run context into the access token and per-office service accounts are not sufficient for a future use case. + +### Keycloak-Minted Job Context Claims + +Have Keycloak mint trusted machine context into the normal access token. + +- **Pros**: CDA validates one JWT from one issuer and does not need a second signing secret. +- **Cons**: Dynamic per-job values such as `job_id` would require more Keycloak customization than CWBI is likely to operate. +- **Preferred shape for current rollout**: Use office-scoped Keycloak service accounts that mint stable claims such as `machine_auth` and `run_as_office`; keep per-job metadata in Batch Events. + +## Compatibility + +Existing user API key and user OIDC flows are unchanged. + +Non-machine users do not need batch run context. Configured batch machine users must provide either Keycloak-minted machine run claims or a valid signed run context. + +Endpoint resource-office semantics are unchanged. Controllers and DAOs may continue to use request office values to retrieve or store CWMS resources. The database remains the source of truth for whether the active CDA user has roles for those resources. + +## Implementation Status + +### Current rollout + +- Use per-office Keycloak scheduler and runner service accounts. +- Have the runner token include `machine_auth` and `run_as_office`. +- Keep AWS Batch job definitions shared by runtime rather than by office. +- Keep job id, script, schedule, timeout, resource profile, env vars, and allowed secret names in the Batch Events registry. +- Let script admins register either a GitHub file path or a trusted runtime command. +- Let script admins choose schedule timezone; cron and hourly schedules are evaluated in that timezone by Airflow before a job is posted. 
+- Let script admins choose small, medium, or large resource profiles; AWS Batch receives resource overrides at dispatch time. +- Keep local Docker execution aligned with AWS Batch timeout behavior so local E2E can prove long-running jobs fail when they exceed the configured timeout. + +### CDA implementation + +- Accept Keycloak-minted `machine_auth` and `run_as_office` claims from validated OIDC access tokens. +- Add CDA validation for `X-CWMS-Job-Context` on configured batch machine users. +- Preserve batch run context separately from request resource office. +- Use run context only for session behavior in CDA; reserve job/requester metadata for future logging. +- Keep dispatcher-side signing and `X-CWMS-Job-Context` as a fallback path rather than the preferred production shape. +- Reject unregistered machine principals rather than auto-creating users when a machine token appears. + +## Criteria + +### Functional Requirements + +- A batch job launched through Airflow or the ad hoc API can call CDA using the registered office runner Keycloak service account. +- CDA accepts registered machine-user requests with valid Keycloak-minted machine run claims. +- CDA rejects configured machine-user requests that omit or forge run context. +- CDA rejects configured batch machine principals that are not already registered in CDA/DB. +- CDA can expose validated run context for audit without trusting script-controlled office values. +- Resource office access remains controlled by CDA route roles and CWMS database roles. +- A job with `run_as_office=SWT` can act on another office's resource data only when the mapped machine user has the required roles for that resource office. + +### Test Scenarios + +- **Direct API path**: An authorized user submits an SWT job through the batch events API; CDA accepts valid machine run context. +- **Airflow path**: Airflow triggers the same job path; CDA receives and validates machine run context. 
+- **Cross-office allowed**: Batch run context is SWT, target resource office is MVS or SPK, and the request succeeds when the machine user has the needed DB roles. +- **Cross-office denied**: Batch run context is SWT, target resource office is unauthorized for the machine user, and CDA/DB returns `403`. +- **Forgery denied**: A script changes `OFFICE` or request `office` without a valid signed context; CDA rejects the request. + +## Conclusion + +Trusted batch run context allows CDA to support dynamic shared Batch runtimes while preserving the existing CWMS authorization model. + +The Keycloak token claims or signed fallback token represent trusted job launch context. They do not redefine resource office semantics and do not bypass normal CDA or database authorization for the data being read or written. diff --git a/docs/source/decisions/diagrams/batch-airflow-scheduler-flow.drawio b/docs/source/decisions/diagrams/batch-airflow-scheduler-flow.drawio new file mode 100644 index 0000000000..73d24e5c8b --- /dev/null +++ b/docs/source/decisions/diagrams/batch-airflow-scheduler-flow.drawio @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/decisions/diagrams/batch-airflow-scheduler-flow.svg b/docs/source/decisions/diagrams/batch-airflow-scheduler-flow.svg new file mode 100644 index 0000000000..1c590da147 --- /dev/null +++ b/docs/source/decisions/diagrams/batch-airflow-scheduler-flow.svg @@ -0,0 +1,86 @@ + + Airflow scheduled Batch Events flow + Airflow polls Batch Events scheduled scripts per office, obtains scheduler service-account tokens, triggers due scripts, and lets Batch Events own dispatch, status, logs, and CDA runner authentication. + + + + + + Airflow Scheduled Flow + One registry-driven scheduler can serve many offices without one DAG per script or one AWS Batch job definition per office. + + + 1. Every-minute DAG + Registry scheduler runs + optionally loops configured offices + + + 2. 
Scheduler token + client_credentials + cwms-batch-airflow-<office> + + + 3. List due scripts + GET /scripts/scheduled + Batch Events filters by office roles + + + 4. Timezone due check + hourly minute or cron + scheduleTimezone controls local time; + DST gaps skipped, repeats run once + + + 5. Trigger due rows + POST /jobs per script + does not wait for AWS Batch + one office failure does not block others + + + 6. Registry job record + Jobs DB + queue + runtime, resources, timeout, + env vars copied from script row + + + 7. Shared runtime + Dispatcher submits job + runtime job definition + overrides + + + 8. Runner bootstraps + Fetches brokered env + office runner client credentials + + + 9. Runner token + client_credentials + machine_auth + run_as_office + + + 10. CDA call + CDA validates token + then normal DB roles + + + + + + + + + + + diff --git a/docs/source/decisions/diagrams/batch-m2m-overview.drawio b/docs/source/decisions/diagrams/batch-m2m-overview.drawio new file mode 100644 index 0000000000..016ea300b2 --- /dev/null +++ b/docs/source/decisions/diagrams/batch-m2m-overview.drawio @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/decisions/diagrams/batch-m2m-overview.svg b/docs/source/decisions/diagrams/batch-m2m-overview.svg new file mode 100644 index 0000000000..2f8fb3827f --- /dev/null +++ b/docs/source/decisions/diagrams/batch-m2m-overview.svg @@ -0,0 +1,105 @@ + + CWMS Batch M2M overview + Airflow and users submit jobs through Batch Events. Batch Events records the job, dispatches a shared runtime, brokers environment and secrets, and the runner uses an office-scoped Keycloak service account to call CDA. + + + + + + + + CWMS Batch M2M Auth and Shared Runtime + Scheduler identity requests jobs; runner identity calls CDA. Batch Events owns registry, dispatch, runtime env broker, logs, and office-scoped authorization. 
+ + + Requesters + + Web UI user + PKCE login + + Airflow + per-office + scheduler client + + + Keycloak + + User token + roles + offices + + Scheduler + cwms-batch- + airflow-swt + + Runner token + machine_auth + run_as_office + + + Batch Events + + Registry API + scripts, schedules, + timeouts, resources + + Jobs DB + queue + office-scoped jobs + status + log lookup + + Dispatcher + shared runtime map + + Runtime broker + job env + secrets + + + Runtime + + AWS Batch + one job def per + runtime + + Runner image + executes script + + + CDA + Data + + Office secrets + runner client creds + + CWMS Data API + validates OIDC + registered machine + + CWMS DB roles + + + + + + + + + + + + + + + + client_credentials + diff --git a/docs/source/decisions/diagrams/batch-ui-job-flow.drawio b/docs/source/decisions/diagrams/batch-ui-job-flow.drawio new file mode 100644 index 0000000000..e4a19ba96b --- /dev/null +++ b/docs/source/decisions/diagrams/batch-ui-job-flow.drawio @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/decisions/diagrams/batch-ui-job-flow.svg b/docs/source/decisions/diagrams/batch-ui-job-flow.svg new file mode 100644 index 0000000000..38f8353bcb --- /dev/null +++ b/docs/source/decisions/diagrams/batch-ui-job-flow.svg @@ -0,0 +1,85 @@ + + End user Batch Events UI job flow + A user signs into the Batch Events UI, creates or selects a script registry row, submits a job, and then views status and logs while the shared runner executes the job and calls CDA with machine credentials. + + + + + + End User UI Flow + Manual user flow through Scripts Manager and Submit Job using the Batch Events registry. + + + 1. Sign in + User authenticates + PKCE token carries user roles + + + 2. Pick office + Scripts Manager filters + admin offices from CDA profile + + + 3. Register script + Choose GitHub path or command + runtime, resource profile, timeout, + schedule timezone, roles, env vars + + + 4. 
Optional path browse + Browse repository directories + runtime type hints are case-insensitive; + browse can select any file + + + 5. Save registry row + Batch Events validates + reserved env names, schedule shape, + timezone, timeout, runtime + + + 6. Submit job + User selects script + Batch Events checks required roles + + + 7. Create job + Jobs DB row + queue + office, script, timeout, resources + + + 8. Dispatch runtime + Shared runner job definition + resource overrides at submit time + + + 9. Execute script + Runner fetches brokered env + gets CDA bearer with runner client + + + 10. Status and logs + UI shows job result + office-scoped lookup and logs + + + + + + + + + + + diff --git a/docs/source/decisions/index.rst b/docs/source/decisions/index.rst index 539f13e895..08d179a8f2 100644 --- a/docs/source/decisions/index.rst +++ b/docs/source/decisions/index.rst @@ -26,3 +26,4 @@ Some decisions may also be a proposal and marked appropriately. Access Management Clients <./0007-access-management-clients.md> Timeseries CSV Format <./0008-timeseries-csv-format.rst> Handling Releases <./0009-code-changes-and-releases.rst> + Batch Machine Run Context <./0010-batch-machine-run-context.md>