From 20afa408695713db4c01758274099a27e024fb25 Mon Sep 17 00:00:00 2001 From: Chong Yang Date: Tue, 2 Jun 2026 16:14:09 +0930 Subject: [PATCH] feat(metrics): add keeperhub-metrics-collector service + image (TECH-6484 A) Standalone single-replica service that serves the DB-sourced Prometheus gauges (the former /api/metrics/db scrape) off the request-serving pods. Executor-style: build-context = repo root, reuses lib/metrics + lib/db verbatim via tsx, so the exposed gauge families are identical. - keeperhub-metrics-collector/: node:http server (PORT 9090) serving GET /metrics (updateDbMetrics -> getDbMetrics) and GET /health, with a test exercising the HTTP wiring on an ephemeral port. - Dockerfile metrics-collector stage (source-only dep, no Next builder) + source-stage wiring. Validated by a real image build: imports resolve, /health 200, /metrics runs the real queries. - docker-bake.hcl target/group + METRICS_COLLECTOR_ECR_REPO var. Inert until a deploy workflow + ECR repo wire it (TECH-6484 B). 
--- Dockerfile | 34 ++++++++++ docker-bake.hcl | 23 ++++++- keeperhub-metrics-collector/index.ts | 34 ++++++++++ keeperhub-metrics-collector/package.json | 18 ++++++ keeperhub-metrics-collector/server.test.ts | 75 ++++++++++++++++++++++ keeperhub-metrics-collector/server.ts | 70 ++++++++++++++++++++ 6 files changed, 253 insertions(+), 1 deletion(-) create mode 100644 keeperhub-metrics-collector/index.ts create mode 100644 keeperhub-metrics-collector/package.json create mode 100644 keeperhub-metrics-collector/server.test.ts create mode 100644 keeperhub-metrics-collector/server.ts diff --git a/Dockerfile b/Dockerfile index d141cc9be..e7accb793 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,6 +50,7 @@ COPY drizzle/ ./drizzle/ COPY hooks/ ./hooks/ COPY keeperhub-events/ ./keeperhub-events/ COPY keeperhub-executor/ ./keeperhub-executor/ +COPY keeperhub-metrics-collector/ ./keeperhub-metrics-collector/ COPY keeperhub-scheduler/ ./keeperhub-scheduler/ COPY lib/ ./lib/ COPY plugins/ ./plugins/ @@ -310,3 +311,36 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ # Start the application CMD ["node", "server.js"] + +# ============================================================================== +# Stage: metrics-collector (TECH-6484) +# Single-replica service that serves the DB-sourced Prometheus gauges (the +# former /api/metrics/db scrape) off the request-serving pods. Executor-style: +# reuses lib/metrics + lib/db verbatim via tsx. Copies the full root +# node_modules (the imported lib code has transitive deps beyond the +# collector's own package.json). The metrics import graph touches no +# builder-generated file at runtime -- lib/db/schema only references the +# generated lib/types/integration via `import type`, which tsx erases -- so +# this stage depends on source only, not the (expensive) Next builder stage. 
+# ============================================================================== +FROM node:24-alpine AS metrics-collector +WORKDIR /app +RUN npm install -g pnpm@9 tsx@4 +COPY --link --from=deps /etc/ssl/certs/rds-combined-ca-bundle.pem /etc/ssl/certs/rds-combined-ca-bundle.pem + +COPY --link --from=deps /app/node_modules ./node_modules +COPY --link --from=source /app/keeperhub-metrics-collector ./keeperhub-metrics-collector +COPY --link --from=source /app/lib ./lib +COPY --link --from=source /app/package.json ./package.json +COPY --link --from=source /app/tsconfig.json ./tsconfig.json + +# Shim server-only (runs outside Next.js) +SHELL ["/bin/ash", "-o", "pipefail", "-c"] +RUN find /app/node_modules -path "*server-only*/index.js" | while read -r f; do echo 'module.exports = {};' > "$f"; done + +ENV NODE_ENV=production + +EXPOSE 9090 + +# Build with: docker build --target metrics-collector -t keeperhub-metrics-collector . +CMD ["tsx", "keeperhub-metrics-collector/index.ts"] diff --git a/docker-bake.hcl b/docker-bake.hcl index 83ce89fed..788b45a00 100644 --- a/docker-bake.hcl +++ b/docker-bake.hcl @@ -26,6 +26,7 @@ variable "EVENTS_ECR_TRACKER_REPO" { default = "" } variable "SCHEDULER_ECR_REPO" { default = "" } variable "EXECUTOR_ECR_REPO" { default = "" } variable "SANDBOX_ECR_REPO" { default = "" } +variable "METRICS_COLLECTOR_ECR_REPO" { default = "" } group "default" { targets = ["app", "migrator", "workflow-runner"] @@ -43,8 +44,12 @@ group "sandbox" { targets = ["sandbox"] } +group "metrics-collector" { + targets = ["metrics-collector"] +} + group "all" { - targets = ["app", "migrator", "workflow-runner", "event-tracker", "schedule-dispatcher", "block-dispatcher", "executor", "sandbox"] + targets = ["app", "migrator", "workflow-runner", "event-tracker", "schedule-dispatcher", "block-dispatcher", "executor", "sandbox", "metrics-collector"] } target "app" { @@ -216,3 +221,19 @@ target "sandbox" { cache-to = 
["type=registry,ref=${ECR_REGISTRY}/${SANDBOX_ECR_REPO}:cache,mode=max"] attest = [] } + +# Metrics collector (TECH-6484). Context is repo root because the stage reuses +# lib/ and the root node_modules. Tag prefix `collector-`. +target "metrics-collector" { + context = "." + dockerfile = "Dockerfile" + target = "metrics-collector" + tags = compact([ + "${ECR_REGISTRY}/${METRICS_COLLECTOR_ECR_REPO}:collector-${IMAGE_TAG}", + "${ECR_REGISTRY}/${METRICS_COLLECTOR_ECR_REPO}:collector-latest", + ENVIRONMENT_TAG != "" ? "${ECR_REGISTRY}/${METRICS_COLLECTOR_ECR_REPO}:${ENVIRONMENT_TAG}" : "", + ]) + cache-from = ["type=registry,ref=${ECR_REGISTRY}/${METRICS_COLLECTOR_ECR_REPO}:cache"] + cache-to = ["type=registry,ref=${ECR_REGISTRY}/${METRICS_COLLECTOR_ECR_REPO}:cache,mode=max"] + attest = [] +} diff --git a/keeperhub-metrics-collector/index.ts b/keeperhub-metrics-collector/index.ts new file mode 100644 index 000000000..71c82436e --- /dev/null +++ b/keeperhub-metrics-collector/index.ts @@ -0,0 +1,34 @@ +/** + * keeperhub-metrics-collector + * + * Single-replica service that serves the DB-sourced Prometheus gauges that used + * to live on /api/metrics/db on the common app pods. It reuses the app's + * metrics code verbatim (updateDbMetrics refreshes the dbRegistry from Postgres, + * getDbMetrics serializes it), so the exposed gauge families are identical. + * Running it as its own single-replica deployment keeps the heavy aggregate + * scan off the request-serving pods and makes the scrape deterministic. + * + * Env: + * PORT HTTP port for /metrics and /health (default 9090) + * DATABASE_URL Postgres connection (consumed by lib/db) + */ +import type { Server } from "node:http"; +import { buildServer } from "./server"; + +const PORT = Number.parseInt(process.env.PORT ?? 
"9090", 10) || 9090; + +const server: Server = buildServer(); + +server.listen(PORT, () => { + console.log( + `[MetricsCollector] serving /metrics and /health on port ${PORT}` + ); +}); + +function shutdown(signal: string): void { + console.log(`[MetricsCollector] received ${signal}, shutting down`); + server.close(() => process.exit(0)); +} + +process.on("SIGTERM", () => shutdown("SIGTERM")); +process.on("SIGINT", () => shutdown("SIGINT")); diff --git a/keeperhub-metrics-collector/package.json b/keeperhub-metrics-collector/package.json new file mode 100644 index 000000000..1677e887c --- /dev/null +++ b/keeperhub-metrics-collector/package.json @@ -0,0 +1,18 @@ +{ + "name": "@keeperhub/metrics-collector", + "version": "1.0.0", + "private": true, + "description": "Single-replica service that serves the DB-sourced Prometheus gauges (the former /api/metrics/db scrape) off the request-serving pods", + "scripts": { + "collector": "tsx keeperhub-metrics-collector/index.ts", + "typecheck": "tsc --noEmit" + }, + "dependencies": { + "drizzle-orm": "^0.45.2", + "postgres": "^3.4.0", + "prom-client": "^15.1.3" + }, + "devDependencies": { + "tsx": "^4.0.0" + } +} diff --git a/keeperhub-metrics-collector/server.test.ts b/keeperhub-metrics-collector/server.test.ts new file mode 100644 index 000000000..305d23031 --- /dev/null +++ b/keeperhub-metrics-collector/server.test.ts @@ -0,0 +1,75 @@ +import type { AddressInfo, Server } from "node:net"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +const updateDbMetrics = vi.fn(); +const getDbMetrics = vi.fn(); +const getPrometheusContentType = vi.fn(); + +// Mock the app's metrics module so the HTTP wiring is tested without a DB. 
+vi.mock("../lib/metrics/prometheus-api", () => ({
+  updateDbMetrics: () => updateDbMetrics(),
+  getDbMetrics: () => getDbMetrics(),
+  getPrometheusContentType: () => getPrometheusContentType(),
+}));
+
+const { buildServer } = await import("./server");
+
+describe("metrics-collector server (TECH-6484)", () => {
+  let server: Server;
+  let baseUrl: string;
+
+  beforeEach(async () => {
+    updateDbMetrics.mockReset().mockResolvedValue(undefined);
+    getDbMetrics.mockReset().mockResolvedValue("keeperhub_user_total 0\n");
+    getPrometheusContentType
+      .mockReset()
+      .mockReturnValue("text/plain; version=0.0.4; charset=utf-8");
+
+    server = buildServer();
+    await new Promise<void>((resolve) => {
+      server.listen(0, () => resolve());
+    });
+    const port = (server.address() as AddressInfo).port;
+    baseUrl = `http://127.0.0.1:${port}`;
+  });
+
+  afterEach(async () => {
+    await new Promise<void>((resolve) => {
+      server.close(() => resolve());
+    });
+  });
+
+  it("GET /metrics refreshes then serves the DB registry", async () => {
+    const res = await fetch(`${baseUrl}/metrics`);
+
+    expect(res.status).toBe(200);
+    expect(res.headers.get("content-type")).toContain("text/plain");
+    expect(await res.text()).toContain("keeperhub_user_total");
+    expect(updateDbMetrics).toHaveBeenCalledTimes(1);
+    expect(getDbMetrics).toHaveBeenCalledTimes(1);
+  });
+
+  it("GET /health returns ok without touching the DB", async () => {
+    const res = await fetch(`${baseUrl}/health`);
+
+    expect(res.status).toBe(200);
+    const body = (await res.json()) as { status: string; service: string };
+    expect(body.status).toBe("ok");
+    expect(body.service).toBe("keeperhub-metrics-collector");
+    expect(updateDbMetrics).not.toHaveBeenCalled();
+  });
+
+  it("returns 500 when the refresh throws", async () => {
+    updateDbMetrics.mockRejectedValueOnce(new Error("db down"));
+
+    const res = await fetch(`${baseUrl}/metrics`);
+
+    expect(res.status).toBe(500);
+  });
+
+  it("404s unknown paths", async () => {
+    const res = await fetch(`${baseUrl}/nope`);
+
+    expect(res.status).toBe(404);
+  });
+});
diff --git a/keeperhub-metrics-collector/server.ts b/keeperhub-metrics-collector/server.ts
new file mode 100644
index 000000000..b364bdee5
--- /dev/null
+++ b/keeperhub-metrics-collector/server.ts
@@ -0,0 +1,98 @@
+import {
+  createServer,
+  type IncomingMessage,
+  type Server,
+  type ServerResponse,
+} from "node:http";
+// server-only is a no-op under Node/tsx; this reuses the app's metrics code
+// verbatim so the exposed gauges are byte-for-byte identical to the former
+// /api/metrics/db endpoint.
+import {
+  getDbMetrics,
+  getPrometheusContentType,
+  updateDbMetrics,
+} from "../lib/metrics/prometheus-api";
+
+/**
+ * Build the metrics-collector HTTP server.
+ *
+ * GET /metrics  refreshes the DB-sourced gauges (updateDbMetrics, which is
+ *               TTL- and pool-gated in lib/) and serves the dbRegistry.
+ * GET /health   liveness/readiness probe.
+ *
+ * Routing matches on the path only, so a query string does not cause a 404.
+ * Any method other than GET is answered with 405 and an Allow header.
+ *
+ * Returned without listening so tests can bind an ephemeral port.
+ */
+export function buildServer(): Server {
+  return createServer((req: IncomingMessage, res: ServerResponse): void => {
+    if (req.method !== "GET") {
+      res.writeHead(405, { Allow: "GET" });
+      res.end();
+      return;
+    }
+
+    const path = pathnameOf(req.url);
+
+    if (path === "/health") {
+      res.writeHead(200, { "Content-Type": "application/json" });
+      res.end(
+        JSON.stringify({
+          status: "ok",
+          service: "keeperhub-metrics-collector",
+          timestamp: new Date().toISOString(),
+        })
+      );
+      return;
+    }
+
+    if (path === "/metrics") {
+      // serveMetrics owns all its error handling and never rejects.
+      serveMetrics(res);
+      return;
+    }
+
+    res.writeHead(404);
+    res.end();
+  });
+}
+
+/**
+ * Strip the query string from a request target so routes can be compared
+ * against a bare path. A missing URL is treated as "/".
+ */
+function pathnameOf(url: string | undefined): string {
+  const target = url ?? "/";
+  const queryStart = target.indexOf("?");
+  return queryStart === -1 ? target : target.slice(0, queryStart);
+}
+
+/**
+ * Refresh the DB gauges and write the serialized registry to `res`.
+ *
+ * Every failure is logged and turned into a response here, so the returned
+ * promise never rejects and callers may leave it un-awaited.
+ */
+async function serveMetrics(res: ServerResponse): Promise<void> {
+  try {
+    await updateDbMetrics();
+    const metrics = await getDbMetrics();
+    res.writeHead(200, {
+      "Content-Type": getPrometheusContentType(),
+      "Cache-Control": "no-store, no-cache, must-revalidate",
+    });
+    res.end(metrics);
+  } catch (error) {
+    console.error("[MetricsCollector] Failed to serve metrics:", error);
+    if (res.headersSent) {
+      // Headers were already written, so a 500 can no longer be sent
+      // (writeHead would throw). Drop the connection so the scrape fails
+      // instead of being read as a complete response.
+      res.destroy();
+      return;
+    }
+    res.writeHead(500);
+    res.end("Failed to collect metrics");
+  }
+}