diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 0000000..2a98ab2 --- /dev/null +++ b/.drone.yml @@ -0,0 +1,73 @@ +kind: pipeline +type: kubernetes +name: default + +platform: + arch: amd64 + +trigger: + event: + - push + - tag + ref: + - refs/heads/main + - refs/tags/v* + +steps: + - name: publish + image: plugins/kaniko-ecr + settings: + create_repository: true + registry: 795250896452.dkr.ecr.us-east-1.amazonaws.com + repo: docs/${DRONE_REPO_NAME} + context: . + dockerfile: Dockerfile + tags: + - git-${DRONE_COMMIT_SHA:0:7} + - latest + access_key: + from_secret: ecr_access_key + secret_key: + from_secret: ecr_secret_key + when: + event: + - push + - tag + + - name: deploy-staging + image: public.ecr.aws/kanopy/drone-helm:v3 + settings: + chart: mongodb/web-app + chart_version: 4.30.0 + atomic: true + add_repos: [mongodb=https://10gen.github.io/helm-charts] + namespace: docs + release: ${DRONE_REPO_NAME} + values: image.tag=git-${DRONE_COMMIT_SHA:0:7},image.repository=795250896452.dkr.ecr.us-east-1.amazonaws.com/docs/${DRONE_REPO_NAME} + values_files: ["kanopy/values.yaml", "kanopy/values.staging.yaml"] + api_server: https://api.staging.corp.mongodb.com + kubernetes_token: + from_secret: staging_kubernetes_token + when: + event: + - push + + - name: deploy-prod + image: public.ecr.aws/kanopy/drone-helm:v3 + settings: + chart: mongodb/web-app + chart_version: 4.30.0 + atomic: true + add_repos: [mongodb=https://10gen.github.io/helm-charts] + namespace: docs + release: ${DRONE_REPO_NAME} + values: image.tag=git-${DRONE_COMMIT_SHA:0:7},image.repository=795250896452.dkr.ecr.us-east-1.amazonaws.com/docs/${DRONE_REPO_NAME} + values_files: ["kanopy/values.yaml", "kanopy/values.prod.yaml"] + api_server: https://api.prod.corp.mongodb.com + kubernetes_token: + from_secret: prod_kubernetes_token + when: + event: + - tag + ref: + - refs/tags/v* diff --git a/CHANGELOG.md b/CHANGELOG.md index 97a3cb3..f67db1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 
+6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ## [Unreleased] +### Added + +- **Kanopy CorpSecure auth mode** — new `OPERATOR_AUTH_MODE=kanopy` option for the operator UI. When deployed behind Kanopy's CorpSecure proxy, authentication is handled automatically via Okta; the app verifies the forwarded `X-Kanopy-Internal-Authorization` JWT and maps Okta group membership to roles (`OPERATOR_AUTH_KANOPY_GROUP` → operator, all other authenticated employees → writer). GitHub PAT auth (`OPERATOR_AUTH_MODE=github`) remains the default and is unchanged. +- **Kanopy deployment** — Drone CI/CD pipeline (`.drone.yml`) and Helm values (`kanopy/`) to deploy the copier as a Kanopy app. Staging deploys on push to `main`; production deploys on `v*` tag. Secrets provisioned via `helm ksec`; GCP Secret Manager bypassed with `SKIP_SECRET_MANAGER=true`. +- **`OPERATOR_AUTH_KANOPY_JWKS_URL`** — optional override for the CorpSecure JWKS endpoint, used to point staging deployments at `login.staging.corp.mongodb.com`. + ## [v0.4.3] - 2026-05-02 ### Fixed diff --git a/configs/environment.go b/configs/environment.go index 9223fad..aa31c88 100644 --- a/configs/environment.go +++ b/configs/environment.go @@ -71,11 +71,19 @@ type Config struct { WebhookRetryInitialDelay int // initial delay between retries in seconds (doubles each attempt) // Operator web UI — off unless OPERATOR_UI_ENABLED=true. Works with any HTTP - // origin (local dev, Cloud Run, etc.). Access is gated by GitHub PATs: - // each user authenticates with their personal token, and the role - // (operator or writer) is determined by their permission on OPERATOR_AUTH_REPO. + // origin (local dev, Cloud Run, etc.). + // + // Auth mode "github" (default): each user authenticates with their GitHub PAT; + // their permission on OPERATOR_AUTH_REPO determines role. 
+ // + // Auth mode "kanopy": Kanopy's CorpSecure proxy handles authentication; + // the app reads the X-Kanopy-Internal-Authorization JWT and maps Okta group + // membership to roles. OPERATOR_AUTH_REPO is not required in this mode. OperatorUIEnabled bool - OperatorAuthRepo string // "owner/repo" — user permissions here determine role (required when UI is enabled) + OperatorAuthMode string // "github" (default) or "kanopy" + OperatorAuthRepo string // "owner/repo" — required in github mode + OperatorAuthKanopyGroup string // Okta group for RoleOperator in kanopy mode (e.g. "10gen-github-copier-operators") + OperatorAuthKanopyJWKSURL string // override JWKS endpoint (default: prod login.corp.mongodb.com) OperatorRepoSlug string // "owner/repo" for GitHub links in audit/trace rows (optional) OperatorReleaseGitHubToken string // PAT with contents:write to create a version tag (optional) OperatorReleaseTargetBranch string // branch SHA used when creating a tag (default main) @@ -141,7 +149,10 @@ const ( WebhookMaxRetries = "WEBHOOK_MAX_RETRIES" WebhookRetryInitialDelay = "WEBHOOK_RETRY_INITIAL_DELAY" //nolint:gosec // env var name, not a credential OperatorUIEnabled = "OPERATOR_UI_ENABLED" - OperatorAuthRepo = "OPERATOR_AUTH_REPO" // repo for GitHub PAT permission check + OperatorAuthMode = "OPERATOR_AUTH_MODE" + OperatorAuthRepo = "OPERATOR_AUTH_REPO" // repo for GitHub PAT permission check (github mode) + OperatorAuthKanopyGroup = "OPERATOR_AUTH_KANOPY_GROUP" + OperatorAuthKanopyJWKSURL = "OPERATOR_AUTH_KANOPY_JWKS_URL" OperatorRepoSlug = "OPERATOR_REPO_SLUG" OperatorReleaseGitHubToken = "OPERATOR_RELEASE_GITHUB_TOKEN" // #nosec G101 -- env var name OperatorReleaseTargetBranch = "OPERATOR_RELEASE_TARGET_BRANCH" @@ -269,7 +280,10 @@ func LoadEnvironment(envFile string) (*Config, error) { config.WebhookRetryInitialDelay = getIntEnvWithDefault(WebhookRetryInitialDelay, config.WebhookRetryInitialDelay) config.OperatorUIEnabled = getBoolEnvWithDefault(OperatorUIEnabled, 
false) + config.OperatorAuthMode = strings.ToLower(getEnvWithDefault(OperatorAuthMode, "github")) config.OperatorAuthRepo = os.Getenv(OperatorAuthRepo) + config.OperatorAuthKanopyGroup = os.Getenv(OperatorAuthKanopyGroup) + config.OperatorAuthKanopyJWKSURL = os.Getenv(OperatorAuthKanopyJWKSURL) config.OperatorRepoSlug = os.Getenv(OperatorRepoSlug) config.OperatorReleaseGitHubToken = os.Getenv(OperatorReleaseGitHubToken) config.OperatorReleaseTargetBranch = getEnvWithDefault(OperatorReleaseTargetBranch, "main") @@ -385,18 +399,26 @@ func validateConfig(config *Config) error { return nil } -// validateOperatorAuth enforces that OPERATOR_AUTH_REPO is set when the UI is -// enabled. Without it, any valid GitHub user could authenticate with full -// operator access since there would be no per-repo permission gate. +// validateOperatorAuth enforces that auth-mode-specific required fields are set +// when the operator UI is enabled. func validateOperatorAuth(config *Config) error { if !config.OperatorUIEnabled { return nil } - if strings.TrimSpace(config.OperatorAuthRepo) == "" { - return fmt.Errorf("OPERATOR_UI_ENABLED=true requires OPERATOR_AUTH_REPO (owner/repo) to gate access — each user authenticates with their GitHub PAT and their permission on that repo determines their role") - } - if !strings.Contains(config.OperatorAuthRepo, "/") { - return fmt.Errorf("OPERATOR_AUTH_REPO must be in owner/repo format (got %q)", config.OperatorAuthRepo) + switch config.OperatorAuthMode { + case "kanopy": + if strings.TrimSpace(config.OperatorAuthKanopyGroup) == "" { + return fmt.Errorf("OPERATOR_AUTH_MODE=kanopy requires OPERATOR_AUTH_KANOPY_GROUP (the Okta group whose members get operator role, e.g. 
\"10gen-github-copier-operators\")") + } + case "github", "": + if strings.TrimSpace(config.OperatorAuthRepo) == "" { + return fmt.Errorf("OPERATOR_UI_ENABLED=true requires OPERATOR_AUTH_REPO (owner/repo) to gate access — each user authenticates with their GitHub PAT and their permission on that repo determines their role") + } + if !strings.Contains(config.OperatorAuthRepo, "/") { + return fmt.Errorf("OPERATOR_AUTH_REPO must be in owner/repo format (got %q)", config.OperatorAuthRepo) + } + default: + return fmt.Errorf("OPERATOR_AUTH_MODE must be \"github\" or \"kanopy\" (got %q)", config.OperatorAuthMode) } return nil } diff --git a/kanopy/values.prod.yaml b/kanopy/values.prod.yaml new file mode 100644 index 0000000..3964e77 --- /dev/null +++ b/kanopy/values.prod.yaml @@ -0,0 +1,55 @@ +# Prod overlay for github-copier. Merged on top of values.yaml at deploy time. + +ingress: + enabled: true + hosts: + - github-copier.docs.corp.mongodb.com + +replicaCount: 2 + +env: + GITHUB_APP_ID: "1166559" + INSTALLATION_ID: "91420665" + + CONFIG_REPO_OWNER: "grove-platform" + CONFIG_REPO_NAME: "github-copier" + CONFIG_REPO_BRANCH: "main" + MAIN_CONFIG_FILE: ".copier/main.yaml" + USE_MAIN_CONFIG: "true" + DEPRECATION_FILE: "deprecated_examples.json" + + WEBSERVER_PATH: "/events" + COMMITTER_NAME: "GitHub Copier App" + COMMITTER_EMAIL: "bot@mongodb.com" + + GOOGLE_CLOUD_PROJECT_ID: "github-copy-code-examples" + COPIER_LOG_NAME: "code-copier-log" + + AUDIT_ENABLED: "true" + METRICS_ENABLED: "true" + + OPERATOR_UI_ENABLED: "true" + OPERATOR_AUTH_MODE: "kanopy" + OPERATOR_AUTH_KANOPY_GROUP: "10gen-github-copier-operators" + # Prod uses the default JWKS endpoint (login.corp.mongodb.com); no override needed. 
+ OPERATOR_REPO_SLUG: "grove-platform/github-copier" + + SKIP_SECRET_MANAGER: "true" + + LLM_PROVIDER: "anthropic" + LLM_BASE_URL: "https://grove-gateway-prod.azure-api.net/grove-foundry-prod/anthropic" + LLM_MODEL: "claude-haiku-4-5" + +# Provision before first prod deploy (use the prod cluster context): +# +# helm ksec set github-copier-secrets \ +# GITHUB_APP_PRIVATE_KEY_B64="$(base64 < your-app.private-key.pem)" \ +# WEBHOOK_SECRET="your-webhook-secret" \ +# MONGO_URI="mongodb+srv://..." \ +# ANTHROPIC_API_KEY="sk-ant-..." +# +envSecrets: + GITHUB_APP_PRIVATE_KEY_B64: github-copier-secrets + WEBHOOK_SECRET: github-copier-secrets + MONGO_URI: github-copier-secrets + ANTHROPIC_API_KEY: github-copier-secrets diff --git a/kanopy/values.staging.yaml b/kanopy/values.staging.yaml new file mode 100644 index 0000000..2ce8745 --- /dev/null +++ b/kanopy/values.staging.yaml @@ -0,0 +1,66 @@ +# Staging overlay for github-copier. Merged on top of values.yaml at deploy time. + +ingress: + enabled: true + hosts: + - github-copier.docs.staging.corp.mongodb.com + +replicaCount: 1 + +# Non-secret environment variables. 
+env: + # GitHub App — org-specific IDs (not sensitive, but vary by deployment) + GITHUB_APP_ID: "1166559" + INSTALLATION_ID: "91420665" + + # Config repository + CONFIG_REPO_OWNER: "grove-platform" + CONFIG_REPO_NAME: "github-copier" + CONFIG_REPO_BRANCH: "main" + MAIN_CONFIG_FILE: ".copier/main.yaml" + USE_MAIN_CONFIG: "true" + DEPRECATION_FILE: "deprecated_examples.json" + + # Webhook + committer + WEBSERVER_PATH: "/events" + COMMITTER_NAME: "GitHub Copier App" + COMMITTER_EMAIL: "bot@mongodb.com" + + # GCP logging (still used even on Kanopy for structured logs) + GOOGLE_CLOUD_PROJECT_ID: "github-copy-code-examples" + COPIER_LOG_NAME: "code-copier-log" + + # Features + AUDIT_ENABLED: "true" + METRICS_ENABLED: "true" + + # Operator UI — Kanopy CorpSecure auth mode + OPERATOR_UI_ENABLED: "true" + OPERATOR_AUTH_MODE: "kanopy" + OPERATOR_AUTH_KANOPY_GROUP: "10gen-github-copier-operators" + # Staging uses the staging JWKS endpoint; issuer remains login.corp.mongodb.com. + OPERATOR_AUTH_KANOPY_JWKS_URL: "https://login.staging.corp.mongodb.com/.well-known/jwks.json" + OPERATOR_REPO_SLUG: "grove-platform/github-copier" + + # Secrets are read from env vars directly (no GCP Secret Manager on Kanopy). + SKIP_SECRET_MANAGER: "true" + + # LLM rule suggester + LLM_PROVIDER: "anthropic" + LLM_BASE_URL: "https://grove-gateway-prod.azure-api.net/grove-foundry-prod/anthropic" + LLM_MODEL: "claude-haiku-4-5" + +# envSecrets maps ENV_VAR_NAME → Kubernetes Secret name. +# Provision all entries before first deploy: +# +# helm ksec set github-copier-secrets \ +# GITHUB_APP_PRIVATE_KEY_B64="$(base64 < your-app.private-key.pem)" \ +# WEBHOOK_SECRET="your-webhook-secret" \ +# MONGO_URI="mongodb+srv://..." \ +# ANTHROPIC_API_KEY="sk-ant-..." 
+# +envSecrets: + GITHUB_APP_PRIVATE_KEY_B64: github-copier-secrets + WEBHOOK_SECRET: github-copier-secrets + MONGO_URI: github-copier-secrets + ANTHROPIC_API_KEY: github-copier-secrets diff --git a/kanopy/values.yaml b/kanopy/values.yaml new file mode 100644 index 0000000..ad6963e --- /dev/null +++ b/kanopy/values.yaml @@ -0,0 +1,51 @@ +# Kanopy web-app chart base values for github-copier. +# Per-environment overlays in values.{staging,prod}.yaml are merged on top at deploy time. +# Field names follow the mongodb/web-app chart schema. + +image: + repository: 795250896452.dkr.ecr.us-east-1.amazonaws.com/docs/github-copier + pullPolicy: IfNotPresent + # tag is not set here — the Drone deploy step passes image.tag=git-${DRONE_COMMIT_SHA:0:7} at deploy time. + +# The app listens on PORT 8080. The chart routes Ingress → port 80 → targetPort 8080. +services: + - name: http + ingress: true + port: 80 + probes: true + protocol: TCP + targetPort: 8080 + type: ClusterIP + +# Health and readiness checks against /health and /ready. +probes: + enabled: true + path: /health + liveness: + initialDelaySeconds: 15 + periodSeconds: 15 + timeoutSeconds: 5 + failureThreshold: 3 + readiness: + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + +resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + +# Join the Istio service mesh so that inbound requests from CorpSecure arrive +# with the X-Kanopy-Internal-Authorization JWT injected by the proxy. +mesh: + enabled: true + +# Dedicated ServiceAccount for a distinct SPIFFE identity on outbound requests. 
+serviceAccount: + enabled: true + name: sa diff --git a/services/operator_auth_kanopy.go b/services/operator_auth_kanopy.go new file mode 100644 index 0000000..b096734 --- /dev/null +++ b/services/operator_auth_kanopy.go @@ -0,0 +1,291 @@ +package services + +import ( + "context" + "crypto/rsa" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "log" + "math/big" + "net/http" + "os" + "slices" + "strings" + "sync" + "time" + + "github.com/golang-jwt/jwt/v5" + "golang.org/x/sync/singleflight" +) + +// corpSecureClaims represents the JWT claims forwarded by Kanopy's CorpSecure +// proxy on the X-Kanopy-Internal-Authorization header. +// +// No `aud` claim is present — per Kanopy's CorpSecure docs, audience checks +// must be disabled. See agent-skill-dashboard lib/auth.ts for precedent. +type corpSecureClaims struct { + jwt.RegisteredClaims + Email string `json:"email"` + Groups []string `json:"groups"` + // Scp carries scopes for service-principal and mesh-internal tokens. + // Human user tokens do not set this field. + Scp []string `json:"scp"` +} + +// kanopyJWKSURLDefault is the prod JWKS endpoint. Override with +// OPERATOR_AUTH_KANOPY_JWKS_URL for the staging endpoint +// (https://login.staging.corp.mongodb.com/.well-known/jwks.json). +// The issuer claim is login.corp.mongodb.com in both environments. +const kanopyJWKSURLDefault = "https://login.corp.mongodb.com/.well-known/jwks.json" +const kanopyIssuer = "login.corp.mongodb.com" + +const jwksTTL = 10 * time.Minute +const jwksFailureCooldown = 30 * time.Second + +// kanopyJWKSCache fetches and caches RSA public keys from the CorpSecure JWKS +// endpoint. Construct with newKanopyJWKSCache; the zero value is not valid. +// +// Concurrency: singleflight deduplicates concurrent fetches so a slow JWKS +// endpoint degrades individual request latency (singleflight wait) rather +// than serialising all auth requests behind a write lock for the full HTTP +// timeout. 
The write lock is held only for the in-memory cache update, not +// across the network call. +type kanopyJWKSCache struct { + url string + mu sync.RWMutex + keys map[string]*rsa.PublicKey // kid → public key + fetched time.Time + failed time.Time + sf singleflight.Group +} + +func newKanopyJWKSCache(url string) *kanopyJWKSCache { + if url == "" { + url = kanopyJWKSURLDefault + } + return &kanopyJWKSCache{url: url} +} + +func (c *kanopyJWKSCache) getKeys() (map[string]*rsa.PublicKey, error) { + // Fast path: cache is fresh. + c.mu.RLock() + if !c.fetched.IsZero() && time.Since(c.fetched) < jwksTTL { + keys := c.keys + c.mu.RUnlock() + return keys, nil + } + // Failure cooldown: serve stale keys rather than hammering a downed endpoint. + if !c.failed.IsZero() && time.Since(c.failed) < jwksFailureCooldown { + keys := c.keys + c.mu.RUnlock() + if keys != nil { + return keys, nil + } + return nil, fmt.Errorf("JWKS unavailable: in backoff after failed fetch") + } + c.mu.RUnlock() + + // Slow path: deduplicate concurrent fetches with singleflight. + type result struct { + keys map[string]*rsa.PublicKey + err error + } + v, _, _ := c.sf.Do("fetch", func() (any, error) { + // Re-check after winning the singleflight slot — another goroutine may + // have refreshed the cache while we were waiting. + c.mu.RLock() + if !c.fetched.IsZero() && time.Since(c.fetched) < jwksTTL { + k := c.keys + c.mu.RUnlock() + return result{keys: k}, nil + } + c.mu.RUnlock() + + // Fetch with background context — JWKS is a shared resource not tied to + // any individual request. Using the caller's context would cancel the + // fetch (and invalidate it for all singleflight waiters) if that + // specific request times out first. 
+ keys, fetchErr := fetchAndParseJWKS(context.Background(), c.url) + + c.mu.Lock() + defer c.mu.Unlock() + if fetchErr != nil { + c.failed = time.Now() + if c.keys != nil { + return result{keys: c.keys}, nil // serve stale on transient failure + } + return result{err: fmt.Errorf("JWKS fetch failed: %w", fetchErr)}, nil + } + c.keys = keys + c.fetched = time.Now() + c.failed = time.Time{} + return result{keys: keys}, nil + }) + r := v.(result) + return r.keys, r.err +} + +// jwkEntry is the minimal subset of a JWK we need. +type jwkEntry struct { + Kty string `json:"kty"` + Kid string `json:"kid"` + N string `json:"n"` + E string `json:"e"` +} + +type jwkSet struct { + Keys []jwkEntry `json:"keys"` +} + +func fetchAndParseJWKS(ctx context.Context, url string) (map[string]*rsa.PublicKey, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) // #nosec G107 -- URL is from config, not user input + if err != nil { + return nil, err + } + resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req) + if err != nil { + return nil, err + } + defer func() { _ = resp.Body.Close() }() + body, readErr := io.ReadAll(io.LimitReader(resp.Body, 1<<17)) // 128 KB cap + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("JWKS HTTP %d", resp.StatusCode) + } + if readErr != nil { + return nil, fmt.Errorf("read JWKS response: %w", readErr) + } + + var set jwkSet + if err := json.Unmarshal(body, &set); err != nil { + return nil, fmt.Errorf("parse JWKS: %w", err) + } + + keys := make(map[string]*rsa.PublicKey, len(set.Keys)) + for _, k := range set.Keys { + if k.Kty != "RSA" || k.Kid == "" || k.N == "" || k.E == "" { + continue + } + pub, err := jwkToRSA(k) + if err != nil { + continue + } + keys[k.Kid] = pub + } + if len(keys) == 0 { + return nil, fmt.Errorf("no valid RSA keys in JWKS response") + } + return keys, nil +} + +func jwkToRSA(k jwkEntry) (*rsa.PublicKey, error) { + nBytes, err := base64.RawURLEncoding.DecodeString(k.N) + if err != nil 
{ + return nil, fmt.Errorf("decode n: %w", err) + } + eBytes, err := base64.RawURLEncoding.DecodeString(k.E) + if err != nil { + return nil, fmt.Errorf("decode e: %w", err) + } + e := int(new(big.Int).SetBytes(eBytes).Int64()) + if e == 0 { + return nil, fmt.Errorf("invalid exponent") + } + return &rsa.PublicKey{N: new(big.Int).SetBytes(nBytes), E: e}, nil +} + +// validateKanopyJWT verifies a CorpSecure JWT and returns an OperatorUser. +// operatorGroup is the Okta group whose members receive RoleOperator; all other +// authenticated human users receive RoleWriter. +// +// cache is the caller-owned JWKS cache — injected rather than global so tests +// can point at an httptest server without process-wide side effects. +func validateKanopyJWT(rawToken string, operatorGroup string, cache *kanopyJWKSCache) (*OperatorUser, error) { + if rawToken == "" { + return nil, fmt.Errorf("empty token") + } + + var claims corpSecureClaims + _, err := jwt.ParseWithClaims(rawToken, &claims, func(t *jwt.Token) (any, error) { + if _, ok := t.Method.(*jwt.SigningMethodRSA); !ok { + return nil, fmt.Errorf("unexpected signing method: %v", t.Header["alg"]) + } + keys, err := cache.getKeys() + if err != nil { + return nil, err + } + kid, _ := t.Header["kid"].(string) + key, ok := keys[kid] + if !ok { + // Do not fall back to an arbitrary key — random map iteration would + // pick the wrong key during rotation and produce spurious rejections. + return nil, fmt.Errorf("no key for kid %q in JWKS (%d keys cached)", kid, len(keys)) + } + return key, nil + }, + jwt.WithIssuer(kanopyIssuer), + jwt.WithExpirationRequired(), + jwt.WithLeeway(30*time.Second), + // No WithAudience — CorpSecure JWTs do not carry an aud claim by design. + ) + if err != nil { + return nil, fmt.Errorf("verify token: %w", err) + } + + // Reject service-mesh principals. 
Per corpsecure/mesh.md and the + // 2025-01-31_mesh_service_principal_identity advisory, mesh-internal tokens + // are minted by CorpSecure for all pod-to-pod requests when mesh.enabled=true. + // They carry scp=["mesh-internal"] and a spiffe:// subject instead of an + // email + groups. They pass signature and issuer checks identically to user + // tokens — the operator UI must explicitly reject them to prevent any + // mesh-resident workload from reading audit events, delivery logs, and + // workflow config. + if claims.Email == "" || + slices.Contains(claims.Scp, "mesh-internal") || + strings.HasPrefix(claims.Subject, "spiffe://") { + return nil, fmt.Errorf("non-user principal rejected (service-mesh identity)") + } + + role := RoleWriter + if slices.Contains(claims.Groups, operatorGroup) { + role = RoleOperator + } + + return &OperatorUser{ + Login: claims.Subject, + Role: role, + }, nil +} + +// init runs the dev-bypass safety check once at process startup — not on the +// first request — so a misconfigured pod is refused before it ever serves +// traffic. Mirrors agent-skill-dashboard lib/auth.ts (assertDevBypassIsLocalDev). +func init() { + if os.Getenv("DEV_BYPASS_AUTH") != "1" { + return + } + inCluster := os.Getenv("KUBERNETES_SERVICE_HOST") != "" || os.Getenv("KANOPY_NAMESPACE") != "" + if inCluster { + log.Fatal("[kanopy auth] DEV_BYPASS_AUTH=1 is set inside a Kubernetes pod — refusing to start. Unset it from the deployment config.") + } +} + +// devBypassUser returns a synthetic OperatorUser when DEV_BYPASS_AUTH=1. +// The Kubernetes safety check is enforced at startup by init(); this function +// is a non-panicking helper that callers invoke per-request. +// Returns nil when the bypass is not active. 
+func devBypassUser() *OperatorUser { + if os.Getenv("DEV_BYPASS_AUTH") != "1" { + return nil + } + email := os.Getenv("DEV_BYPASS_AUTH_EMAIL") + if email == "" { + email = "dev-user@local.mongodb.com" + } + login := email + if idx := len(email) - len("@local.mongodb.com"); idx > 0 && email[idx:] == "@local.mongodb.com" { + login = email[:idx] + } + return &OperatorUser{Login: login, Role: RoleOperator} +} diff --git a/services/operator_auth_kanopy_test.go b/services/operator_auth_kanopy_test.go new file mode 100644 index 0000000..a4b7219 --- /dev/null +++ b/services/operator_auth_kanopy_test.go @@ -0,0 +1,251 @@ +package services + +import ( + "crypto/rand" + "crypto/rsa" + "encoding/base64" + "encoding/json" + "math/big" + "net/http" + "net/http/httptest" + "sync" + "testing" + "time" + + "github.com/golang-jwt/jwt/v5" + "github.com/stretchr/testify/require" +) + +const testKid = "test-key-1" +const testOperatorGroup = "10gen-test-operators" + +// testRSAKey is generated once per test run — 2048-bit RSA is slow to generate. +var ( + testKeyOnce sync.Once + testRSAKey *rsa.PrivateKey +) + +func getTestKey(t *testing.T) *rsa.PrivateKey { + t.Helper() + testKeyOnce.Do(func() { + var err error + testRSAKey, err = rsa.GenerateKey(rand.Reader, 2048) + if err != nil { + panic("test RSA key generation: " + err.Error()) + } + }) + return testRSAKey +} + +// makeJWKSServer starts an httptest.Server that serves the given public key as +// a minimal JWKS document. Cleans up automatically via t.Cleanup. 
+func makeJWKSServer(t *testing.T, pub *rsa.PublicKey, kid string) *httptest.Server { + t.Helper() + nBytes := pub.N.Bytes() + eBytes := big.NewInt(int64(pub.E)).Bytes() + body, err := json.Marshal(map[string]any{ + "keys": []map[string]any{{ + "kty": "RSA", + "kid": kid, + "n": base64.RawURLEncoding.EncodeToString(nBytes), + "e": base64.RawURLEncoding.EncodeToString(eBytes), + }}, + }) + require.NoError(t, err) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write(body) + })) + t.Cleanup(srv.Close) + return srv +} + +// makeTestCache returns a kanopyJWKSCache pointed at the given httptest server. +func makeTestCache(srv *httptest.Server) *kanopyJWKSCache { + return newKanopyJWKSCache(srv.URL) +} + +// signToken creates a signed RS256 JWT using the provided claims and key. +func signToken(t *testing.T, key *rsa.PrivateKey, kid string, claims corpSecureClaims) string { + t.Helper() + token := jwt.NewWithClaims(jwt.SigningMethodRS256, claims) + token.Header["kid"] = kid + signed, err := token.SignedString(key) + require.NoError(t, err) + return signed +} + +// validUserClaims returns a baseline set of claims for a human user token. 
+func validUserClaims(groups ...string) corpSecureClaims { + return corpSecureClaims{ + RegisteredClaims: jwt.RegisteredClaims{ + Issuer: kanopyIssuer, + Subject: "jdoe", + ExpiresAt: jwt.NewNumericDate(time.Now().Add(time.Hour)), + IssuedAt: jwt.NewNumericDate(time.Now()), + }, + Email: "jdoe@mongodb.com", + Groups: groups, + } +} + +func TestValidateKanopyJWT(t *testing.T) { + key := getTestKey(t) + srv := makeJWKSServer(t, &key.PublicKey, testKid) + cache := makeTestCache(srv) + + tests := []struct { + name string + token func() string + wantRole OperatorRole + wantErrFrag string + }{ + { + name: "operator group member gets RoleOperator", + token: func() string { + return signToken(t, key, testKid, validUserClaims(testOperatorGroup, "10gen-everyone")) + }, + wantRole: RoleOperator, + }, + { + name: "user not in operator group gets RoleWriter", + token: func() string { + return signToken(t, key, testKid, validUserClaims("10gen-everyone")) + }, + wantRole: RoleWriter, + }, + { + name: "empty token rejected", + token: func() string { + return "" + }, + wantErrFrag: "empty token", + }, + { + name: "wrong issuer rejected", + token: func() string { + c := validUserClaims() + c.Issuer = "evil.example.com" + return signToken(t, key, testKid, c) + }, + wantErrFrag: "verify token", + }, + { + name: "expired token rejected", + token: func() string { + c := validUserClaims() + c.ExpiresAt = jwt.NewNumericDate(time.Now().Add(-2 * time.Hour)) + c.IssuedAt = jwt.NewNumericDate(time.Now().Add(-3 * time.Hour)) + return signToken(t, key, testKid, c) + }, + wantErrFrag: "verify token", + }, + { + name: "HS256 alg rejected (alg confusion)", + token: func() string { + // Sign with the HMAC method using the public key bytes as the + // secret — this is the classic alg-confusion attack vector. 
+ hmacToken := jwt.NewWithClaims(jwt.SigningMethodHS256, validUserClaims()) + hmacToken.Header["kid"] = testKid + signed, err := hmacToken.SignedString(key.N.Bytes()) + require.NoError(t, err) + return signed + }, + wantErrFrag: "verify token", + }, + { + name: "unknown kid rejected (no key fallback)", + token: func() string { + return signToken(t, key, "unknown-kid", validUserClaims()) + }, + wantErrFrag: "verify token", + }, + { + name: "mesh-internal scp claim rejected", + token: func() string { + c := validUserClaims() + c.Scp = []string{"mesh-internal"} + return signToken(t, key, testKid, c) + }, + wantErrFrag: "non-user principal rejected", + }, + { + name: "spiffe:// subject rejected", + token: func() string { + c := validUserClaims() + c.Subject = "spiffe://cluster.local/ns/docs/sa/some-service" + return signToken(t, key, testKid, c) + }, + wantErrFrag: "non-user principal rejected", + }, + { + name: "missing email rejected", + token: func() string { + c := validUserClaims() + c.Email = "" + return signToken(t, key, testKid, c) + }, + wantErrFrag: "non-user principal rejected", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + user, err := validateKanopyJWT(tc.token(), testOperatorGroup, cache) + if tc.wantErrFrag != "" { + require.Error(t, err) + require.Contains(t, err.Error(), tc.wantErrFrag) + require.Nil(t, user) + return + } + require.NoError(t, err) + require.NotNil(t, user) + require.Equal(t, tc.wantRole, user.Role) + }) + } +} + +func TestValidateKanopyJWT_JWKSServerDown(t *testing.T) { + // Start a server, immediately close it, then verify we get a fetch error + // rather than a panic or successful auth. 
+ key := getTestKey(t) + srv := makeJWKSServer(t, &key.PublicKey, testKid) + cache := makeTestCache(srv) + srv.Close() // close before the request + + token := signToken(t, key, testKid, validUserClaims()) + _, err := validateKanopyJWT(token, testOperatorGroup, cache) + require.Error(t, err) +} + +func TestJwkToRSA(t *testing.T) { + key := getTestKey(t) + pub := &key.PublicKey + + entry := jwkEntry{ + Kty: "RSA", + Kid: "k1", + N: base64.RawURLEncoding.EncodeToString(pub.N.Bytes()), + E: base64.RawURLEncoding.EncodeToString(big.NewInt(int64(pub.E)).Bytes()), + } + got, err := jwkToRSA(entry) + require.NoError(t, err) + require.Equal(t, pub.N, got.N) + require.Equal(t, pub.E, got.E) +} + +func TestDevBypassUser(t *testing.T) { + t.Run("inactive when env var not set", func(t *testing.T) { + t.Setenv("DEV_BYPASS_AUTH", "0") + require.Nil(t, devBypassUser()) + }) + + t.Run("returns operator user when active", func(t *testing.T) { + t.Setenv("DEV_BYPASS_AUTH", "1") + t.Setenv("DEV_BYPASS_AUTH_EMAIL", "testuser@local.mongodb.com") + u := devBypassUser() + require.NotNil(t, u) + require.Equal(t, RoleOperator, u.Role) + require.Equal(t, "testuser", u.Login) + }) +} diff --git a/services/operator_repo_filter.go b/services/operator_repo_filter.go index 0b8ccb0..04fe147 100644 --- a/services/operator_repo_filter.go +++ b/services/operator_repo_filter.go @@ -30,8 +30,10 @@ func newRepoFilter(ctx context.Context, cache *ghAuthCache, pat string, user *Op // bypass reports whether this filter should let every row through unmodified. // Operators see everything; writers go through per-repo checks. +// When cache is nil (kanopy auth mode has no GitHub PAT), bypass so writers +// see all rows — there is no PAT available to call the GitHub permission API. 
func (f *repoFilter) bypass() bool { - return f == nil || f.user == nil || f.user.Role == RoleOperator + return f == nil || f.user == nil || f.user.Role == RoleOperator || f.cache == nil } // canRead returns true if the caller has read access to repo. Empty repo diff --git a/services/operator_suggest_rule.go b/services/operator_suggest_rule.go index dc41361..579dcc5 100644 --- a/services/operator_suggest_rule.go +++ b/services/operator_suggest_rule.go @@ -66,11 +66,20 @@ func (o *operatorUI) handleSuggestRule(w http.ResponseWriter, r *http.Request) { return } - // Per-PAT rate limit caps Anthropic token spend per operator. Keyed by - // hashed PAT so the bucket survives across cache evictions of the full - // user record and can't be leaked by a memory dump. - if pat := bearerToken(r); pat != "" && o.suggestLimiter != nil { - allowed, resetAt := o.suggestLimiter.Allow(hashToken(pat)) + // Per-user rate limit caps Anthropic token spend. In github mode keyed by + // hashed PAT; in kanopy mode keyed by hashed login (no PAT available). + // Either way the raw credential never sits in the bucket map. + // The Authorization header is trusted as a key only outside kanopy mode: + // wrapAPIKanopy never validates it, so keying on it there would let a + // caller mint a fresh bucket per request just by rotating the header. + var rlKey string + if pat := bearerToken(r); pat != "" && o.authMode != "kanopy" { + rlKey = hashToken(pat) + } else if u := operatorUserFromCtx(r); u != nil { + rlKey = hashToken(u.Login) + } + if rlKey != "" && o.suggestLimiter != nil { + allowed, resetAt := o.suggestLimiter.Allow(rlKey) if !allowed { retry := time.Until(resetAt).Round(time.Second) w.Header().Set("Retry-After", fmt.Sprintf("%d", int(retry.Seconds()))) diff --git a/services/operator_ui.go b/services/operator_ui.go index c65f693..3db18a9 100644 --- a/services/operator_ui.go +++ b/services/operator_ui.go @@ -37,11 +37,16 @@ func RegisterOperatorRoutes(mux *http.ServeMux, cfg *configs.Config, container * cfg: cfg, container: container, version: version, - ghCache: newGHAuthCache(5 * time.Minute), - // 30 suggestions/hour/PAT caps Anthropic spend per operator.
Normal + authMode: cfg.OperatorAuthMode, + // 30 suggestions/hour/user caps Anthropic spend per operator. Normal // usage is well under this; a misbehaving client can't rack up a bill. suggestLimiter: newTokenBucket(30, time.Hour), } + if o.authMode == "kanopy" { + o.kanopyJWKS = newKanopyJWKSCache(cfg.OperatorAuthKanopyJWKSURL) + } else { + o.ghCache = newGHAuthCache(5 * time.Minute) + } // Always create the LLM client; availability is checked dynamically via Ping. // Operators can change the active model and base URL from the UI without restart. if client, err := NewLLMClient(LLMClientOptions{ @@ -77,18 +82,24 @@ func RegisterOperatorRoutes(mux *http.ServeMux, cfg *configs.Config, container * mux.HandleFunc("/operator", func(w http.ResponseWriter, r *http.Request) { http.Redirect(w, r, "/operator/", http.StatusFound) }) - LogInfo("Operator UI: /operator/ with GitHub PAT authentication", "auth_repo", cfg.OperatorAuthRepo) + if o.authMode == "kanopy" { + LogInfo("Operator UI: /operator/ with Kanopy CorpSecure authentication", "operator_group", cfg.OperatorAuthKanopyGroup) + } else { + LogInfo("Operator UI: /operator/ with GitHub PAT authentication", "auth_repo", cfg.OperatorAuthRepo) + } } type operatorUI struct { cfg *configs.Config container *ServiceContainer version string - replayInFlight sync.Map // key: "owner/repo#pr" → prevents concurrent replays - ghCache *ghAuthCache // GitHub PAT validation + per-repo permission cache - llm LLMClient // optional: enabled when cfg.LLMEnabled is true - suggestLimiter *tokenBucket // per-PAT rate limit for /api/suggest-rule (LLM cost cap) - llmPing llmPingCache // cached Ping() result so /llm/status doesn't burn tokens on every refresh + replayInFlight sync.Map // key: "owner/repo#pr" → prevents concurrent replays + ghCache *ghAuthCache // GitHub PAT validation + per-repo permission cache (github mode only) + kanopyJWKS *kanopyJWKSCache // Kanopy JWKS cache (kanopy mode only) + authMode string // "github" or "kanopy" + llm 
LLMClient // optional: enabled when cfg.LLMEnabled is true + suggestLimiter *tokenBucket // per-user rate limit for /api/suggest-rule (LLM cost cap) + llmPing llmPingCache // cached Ping() result so /llm/status doesn't burn tokens on every refresh } // llmPingCache memoises the most recent LLMClient.Ping() outcome. Status-tab @@ -134,8 +145,18 @@ func operatorUserFromCtx(r *http.Request) *OperatorUser { return u } -// wrapAPI validates the incoming request's GitHub PAT and attaches the user to the context. +// wrapAPI validates the incoming request and attaches the authenticated user to +// the context. Dispatches to the github or kanopy implementation based on +// the configured auth mode. func (o *operatorUI) wrapAPI(next http.HandlerFunc) http.HandlerFunc { + if o.authMode == "kanopy" { + return o.wrapAPIKanopy(next) + } + return o.wrapAPIGitHub(next) +} + +// wrapAPIGitHub validates a GitHub PAT from the Authorization: Bearer header. +func (o *operatorUI) wrapAPIGitHub(next http.HandlerFunc) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") token := bearerToken(r) @@ -155,6 +176,39 @@ func (o *operatorUI) wrapAPI(next http.HandlerFunc) http.HandlerFunc { } } +// wrapAPIKanopy validates the CorpSecure JWT from X-Kanopy-Internal-Authorization. +// CorpSecure injects this header after authenticating the user against Okta; +// the app never sees unauthenticated requests when deployed behind the proxy. +func (o *operatorUI) wrapAPIKanopy(next http.HandlerFunc) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + + // Dev escape hatch: DEV_BYPASS_AUTH=1 in local development only. 
+ if u := devBypassUser(); u != nil { + ctx := context.WithValue(r.Context(), operatorUserCtxKey{}, u) + next(w, r.WithContext(ctx)) + return + } + + raw := strings.TrimPrefix(r.Header.Get("X-Kanopy-Internal-Authorization"), "Bearer ") + if raw == "" { + w.WriteHeader(http.StatusUnauthorized) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "missing Kanopy authorization header"}) + return + } + + user, err := validateKanopyJWT(raw, o.cfg.OperatorAuthKanopyGroup, o.kanopyJWKS) + if err != nil { + LogWarning("Kanopy JWT verification failed", "error", err.Error()) + w.WriteHeader(http.StatusUnauthorized) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "authentication failed"}) + return + } + ctx := context.WithValue(r.Context(), operatorUserCtxKey{}, user) + next(w, r.WithContext(ctx)) + } +} + // wrapOperatorOnly wraps a handler that requires the "operator" role (replay, release). func (o *operatorUI) wrapOperatorOnly(next http.HandlerFunc) http.HandlerFunc { return o.wrapAPI(func(w http.ResponseWriter, r *http.Request) { @@ -200,12 +254,15 @@ func (o *operatorUI) handleOperatorStatus(w http.ResponseWriter, r *http.Request w.Header().Set("Content-Type", "application/json") out := map[string]any{ "operator_apis_enabled": true, - "auth_repo": o.cfg.OperatorAuthRepo, + "auth_mode": o.authMode, "llm_available": o.llm != nil, // client exists; reachability checked via /operator/api/llm/status "metrics_enabled": o.cfg.MetricsEnabled, "audit_enabled": o.cfg.AuditEnabled, "version": o.version, } + if o.authMode != "kanopy" { + out["auth_repo"] = o.cfg.OperatorAuthRepo + } if o.container != nil && o.container.DeliveryTracker != nil { out["webhook_dedupe_entries"] = o.container.DeliveryTracker.Len() out["webhook_recent_observations"] = o.container.DeliveryTracker.HistoryLen() @@ -249,16 +306,25 @@ func (o *operatorUI) handleRepoPermission(w http.ResponseWriter, r *http.Request } repos := strings.Split(reposParam, ",") - // Per-repo result: Allowed + 
optional Error. Surfacing the error lets - // the frontend distinguish "user genuinely can't read this repo" from - // "GitHub rate limited us" so disabled replay buttons can carry an - // actionable tooltip instead of an opaque gray state. + // Per-repo result: Allowed + optional Error. type repoPerm struct { Allowed bool `json:"allowed"` Error string `json:"error,omitempty"` } result := make(map[string]repoPerm, len(repos)) + // In kanopy mode there is no GitHub PAT to call the permissions API with. + // Writers see all repos (CorpSecure already guarantees a valid MongoDB employee). + if o.authMode == "kanopy" { + for _, repo := range repos { + if repo = strings.TrimSpace(repo); repo != "" { + result[repo] = repoPerm{Allowed: true} + } + } + _ = json.NewEncoder(w).Encode(map[string]any{"permissions": result}) + return + } + user := operatorUserFromCtx(r) userPAT := bearerToken(r) if user == nil || userPAT == "" { @@ -674,13 +740,15 @@ func (o *operatorUI) handleDeliveryLogs(w http.ResponseWriter, r *http.Request) _ = json.NewEncoder(w).Encode(map[string]string{"error": "delivery not visible"}) return } - ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second) - defer cancel() - canRead, _ := o.ghCache.CanUserReadRepo(ctx, bearerToken(r), user.Login, repo) - if !canRead { - w.WriteHeader(http.StatusForbidden) - _ = json.NewEncoder(w).Encode(map[string]string{"error": "delivery not visible"}) - return + if o.ghCache != nil { + ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second) + defer cancel() + canRead, _ := o.ghCache.CanUserReadRepo(ctx, bearerToken(r), user.Login, repo) + if !canRead { + w.WriteHeader(http.StatusForbidden) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "delivery not visible"}) + return + } } } logs := o.container.DeliveryLogs.Get(deliveryID) @@ -803,9 +871,10 @@ func (o *operatorUI) handleReplay(w http.ResponseWriter, r *http.Request) { return } - // Source-repo permission check: the user's PAT must have at least 
read - // access to the source repo being replayed. - { + // Source-repo permission check (github mode only): the user's PAT must have + // at least read access to the source repo being replayed. Skipped in kanopy + // mode — operators are already trusted members of the operator Okta group. + if o.ghCache != nil { user := operatorUserFromCtx(r) userPAT := bearerToken(r) if user == nil || userPAT == "" { diff --git a/services/web/operator/index.html b/services/web/operator/index.html index e952928..2c18708 100644 --- a/services/web/operator/index.html +++ b/services/web/operator/index.html @@ -328,7 +328,7 @@
-
+
@@ -714,9 +714,13 @@

Keyboard shortcuts

/* ── Utilities ── */ var TKEY='github-copier-operator-token', SKEY='operator-sections', DKEY='operator-dark', TMKEY='operator-time-mode'; var COP_ORIGIN=(location.protocol==='file:'||!location.origin)?'':location.origin; +var _authMode='github'; // updated from /operator/api/status on load function appURL(p){return COP_ORIGIN?COP_ORIGIN+(p[0]==='/'?p:'/'+p):p;} function authHeaders(){var t=sessionStorage.getItem(TKEY)||'',h={Accept:'application/json'};if(t)h['Authorization']='Bearer '+t;return h;} function hasToken(){return!!sessionStorage.getItem(TKEY);} +// hasAuth: true when the user has a stored PAT (github mode) or auth is +// handled transparently by the CorpSecure proxy (kanopy mode). +function hasAuth(){return _authMode==='kanopy'||hasToken();} function escapeHtml(s){return String(s).replace(/[&<>"']/g,function(c){return{'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c];});} function showErr(id,msg){var el=$(id);if(!msg){el.hidden=true;el.textContent='';return;}el.hidden=false;el.textContent=msg;} function $(id){return document.getElementById(id);} @@ -886,7 +890,7 @@

Keyboard shortcuts

/* ── Authenticated user ── */ var _currentUser=null; function fetchMe(){ - if(!hasToken())return; + if(!hasAuth())return; fetch(appURL('/operator/api/me'),{headers:authHeaders()}).then(function(r){ if(!r.ok)return null; return r.json(); @@ -933,6 +937,12 @@

Keyboard shortcuts

if(lbl){lbl.textContent=d.auth_repo;} } setLLMAvailable(!!d.llm_available); + if(d.auth_mode==='kanopy'){ + _authMode='kanopy'; + var ts=$('tokenSection');if(ts)ts.hidden=true; + fetchMe(); + loadAllSecured(); + } }).catch(function(){}); } @@ -1902,7 +1912,7 @@

Keyboard shortcuts

}; /* ── Global refresh ── */ -function loadAllSecured(){if(!hasToken())return;loadAudit();loadWebhookTraces();loadDeliveries();loadOverview();loadWorkflows();loadAISettingsStatus();$('loadDeploy').click();} +function loadAllSecured(){if(!hasAuth())return;loadAudit();loadWebhookTraces();loadDeliveries();loadOverview();loadWorkflows();loadAISettingsStatus();$('loadDeploy').click();} function refreshAll(){if(COP_ORIGIN){loadMetricsCards();loadAllProbes();}loadAllSecured();} $('refreshAll').onclick=refreshAll;