Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ COPY drizzle/ ./drizzle/
COPY hooks/ ./hooks/
COPY keeperhub-events/ ./keeperhub-events/
COPY keeperhub-executor/ ./keeperhub-executor/
COPY keeperhub-metrics-collector/ ./keeperhub-metrics-collector/
COPY keeperhub-scheduler/ ./keeperhub-scheduler/
COPY lib/ ./lib/
COPY plugins/ ./plugins/
Expand Down Expand Up @@ -310,3 +311,36 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \

# Start the application
CMD ["node", "server.js"]

# ==============================================================================
# Stage: metrics-collector (TECH-6484)
# Single-replica service that serves the DB-sourced Prometheus gauges (the
# former /api/metrics/db scrape) off the request-serving pods. Executor-style:
# reuses lib/metrics + lib/db verbatim via tsx. Copies the full root
# node_modules (the imported lib code has transitive deps beyond the
# collector's own package.json). The metrics import graph touches no
# builder-generated file at runtime -- lib/db/schema only references the
# generated lib/types/integration via `import type`, which tsx erases -- so
# this stage depends on source only, not the (expensive) Next builder stage.
# ==============================================================================
# NOTE(review): no USER instruction appears in this stage, so the process runs
# as whatever user the base image defaults to -- confirm that is intended.
FROM node:24-alpine AS metrics-collector
WORKDIR /app
# tsx is the runtime entry point used by CMD below.
# NOTE(review): pnpm is not invoked anywhere in this stage as shown -- confirm
# it is still needed before keeping it in the image.
RUN npm install -g pnpm@9 tsx@4
# CA bundle taken from the deps stage (presumably for TLS to RDS Postgres -- confirm).
COPY --link --from=deps /etc/ssl/certs/rds-combined-ca-bundle.pem /etc/ssl/certs/rds-combined-ca-bundle.pem

# Runtime inputs: the full root node_modules from the deps stage, plus only the
# source the collector runs (its own directory, lib/, and the root
# package.json / tsconfig.json) from the source stage.
COPY --link --from=deps /app/node_modules ./node_modules
COPY --link --from=source /app/keeperhub-metrics-collector ./keeperhub-metrics-collector
COPY --link --from=source /app/lib ./lib
COPY --link --from=source /app/package.json ./package.json
COPY --link --from=source /app/tsconfig.json ./tsconfig.json

# Shim server-only (runs outside Next.js): every index.js whose path under
# node_modules contains "server-only" is overwritten with an empty CommonJS
# module. The SHELL line enables pipefail for the find | while pipeline.
SHELL ["/bin/ash", "-o", "pipefail", "-c"]
RUN find /app/node_modules -path "*server-only*/index.js" | while read -r f; do echo 'module.exports = {};' > "$f"; done

ENV NODE_ENV=production

# Matches the default PORT in keeperhub-metrics-collector/index.ts.
EXPOSE 9090

# Build with: docker build --target metrics-collector -t keeperhub-metrics-collector .
CMD ["tsx", "keeperhub-metrics-collector/index.ts"]
23 changes: 22 additions & 1 deletion docker-bake.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ variable "EVENTS_ECR_TRACKER_REPO" { default = "" }
variable "SCHEDULER_ECR_REPO" { default = "" }
variable "EXECUTOR_ECR_REPO" { default = "" }
variable "SANDBOX_ECR_REPO" { default = "" }
variable "METRICS_COLLECTOR_ECR_REPO" { default = "" }

group "default" {
targets = ["app", "migrator", "workflow-runner"]
Expand All @@ -43,8 +44,12 @@ group "sandbox" {
targets = ["sandbox"]
}

# Single-target group, following the same shape as the `sandbox` group above,
# so the collector image can be selected on its own by group name.
group "metrics-collector" {
targets = ["metrics-collector"]
}

group "all" {
targets = ["app", "migrator", "workflow-runner", "event-tracker", "schedule-dispatcher", "block-dispatcher", "executor", "sandbox"]
targets = ["app", "migrator", "workflow-runner", "event-tracker", "schedule-dispatcher", "block-dispatcher", "executor", "sandbox", "metrics-collector"]
}

target "app" {
Expand Down Expand Up @@ -216,3 +221,19 @@ target "sandbox" {
cache-to = ["type=registry,ref=${ECR_REGISTRY}/${SANDBOX_ECR_REPO}:cache,mode=max"]
attest = []
}

# Metrics collector (TECH-6484). Context is repo root because the stage reuses
# lib/ and the root node_modules. Tag prefix `collector-`.
target "metrics-collector" {
# Repo root as context; builds the `metrics-collector` stage of the root Dockerfile.
context = "."
dockerfile = "Dockerfile"
target = "metrics-collector"
# The third entry evaluates to "" when ENVIRONMENT_TAG is unset; compact()
# is relied on to drop that empty entry from the tag list.
# NOTE(review): the environment tag is NOT prefixed with `collector-` like the
# two tags above it -- confirm ENVIRONMENT_TAG cannot collide with tags pushed
# by other targets, or add the prefix.
tags = compact([
"${ECR_REGISTRY}/${METRICS_COLLECTOR_ECR_REPO}:collector-${IMAGE_TAG}",
"${ECR_REGISTRY}/${METRICS_COLLECTOR_ECR_REPO}:collector-latest",
ENVIRONMENT_TAG != "" ? "${ECR_REGISTRY}/${METRICS_COLLECTOR_ECR_REPO}:${ENVIRONMENT_TAG}" : "",
])
# Registry-backed build cache, read from and written to the repo's `cache` tag.
cache-from = ["type=registry,ref=${ECR_REGISTRY}/${METRICS_COLLECTOR_ECR_REPO}:cache"]
cache-to = ["type=registry,ref=${ECR_REGISTRY}/${METRICS_COLLECTOR_ECR_REPO}:cache,mode=max"]
# Empty attestation list (presumably to suppress default provenance
# attestations, as the sandbox target above also sets it -- confirm).
attest = []
}
34 changes: 34 additions & 0 deletions keeperhub-metrics-collector/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/**
* keeperhub-metrics-collector
*
* Single-replica service that serves the DB-sourced Prometheus gauges that used
* to live on /api/metrics/db on the common app pods. It reuses the app's
* metrics code verbatim (updateDbMetrics refreshes the dbRegistry from Postgres,
* getDbMetrics serializes it), so the exposed gauge families are identical.
* Running it as its own single-replica deployment keeps the heavy aggregate
* scan off the request-serving pods and makes the scrape deterministic.
*
* Env:
* PORT HTTP port for /metrics and /health (default 9090)
* DATABASE_URL Postgres connection (consumed by lib/db)
*/
import type { Server } from "node:http";
import { buildServer } from "./server";

// Listen port: PORT from the environment when it parses to a non-zero
// integer, otherwise 9090.
const requestedPort = Number.parseInt(process.env.PORT ?? "9090", 10);
const PORT = requestedPort || 9090;

const server: Server = buildServer();

/**
 * Log the received signal, stop accepting connections, and exit with status 0
 * once the server has finished closing.
 */
function shutdown(signal: string): void {
  console.log(`[MetricsCollector] received ${signal}, shutting down`);
  server.close(() => {
    process.exit(0);
  });
}

// Both termination signals share the same shutdown path.
for (const signal of ["SIGTERM", "SIGINT"] as const) {
  process.on(signal, () => shutdown(signal));
}

server.listen(PORT, () => {
  const banner = `[MetricsCollector] serving /metrics and /health on port ${PORT}`;
  console.log(banner);
});
18 changes: 18 additions & 0 deletions keeperhub-metrics-collector/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"name": "@keeperhub/metrics-collector",
"version": "1.0.0",
"private": true,
"description": "Single-replica service that serves the DB-sourced Prometheus gauges (the former /api/metrics/db scrape) off the request-serving pods",
"scripts": {
"collector": "tsx keeperhub-metrics-collector/index.ts",
"typecheck": "tsc --noEmit"
},
"dependencies": {
"drizzle-orm": "^0.45.2",
"postgres": "^3.4.0",
"prom-client": "^15.1.3"
},
"devDependencies": {
"tsx": "^4.0.0"
}
}
75 changes: 75 additions & 0 deletions keeperhub-metrics-collector/server.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import type { AddressInfo, Server } from "node:net";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";

// Stand-ins for the three functions server.ts imports from
// ../lib/metrics/prometheus-api. Their behavior is (re)programmed per test.
const updateDbMetrics = vi.fn();
const getDbMetrics = vi.fn();
const getPrometheusContentType = vi.fn();

// Mock the app's metrics module so the HTTP wiring is tested without a DB.
// NOTE(review): the factory returns thin arrow wrappers instead of the vi.fn
// instances themselves -- presumably because Vitest hoists vi.mock() above the
// const declarations, so the mocks must be looked up lazily at call time.
// Confirm before "simplifying" this to pass the fns directly.
vi.mock("../lib/metrics/prometheus-api", () => ({
updateDbMetrics: () => updateDbMetrics(),
getDbMetrics: () => getDbMetrics(),
getPrometheusContentType: () => getPrometheusContentType(),
}));

// Loaded dynamically after the mock registration above (presumably so that
// ./server resolves prometheus-api to the mocked module -- confirm).
const { buildServer } = await import("./server");

// HTTP-level tests for buildServer(): each test gets a fresh server bound to
// an ephemeral port, with the prometheus-api functions replaced by mocks.
describe("metrics-collector server (TECH-6484)", () => {
  let server: Server;
  let baseUrl: string;

  beforeEach(async () => {
    // Happy-path defaults; individual tests override what they need.
    updateDbMetrics.mockReset().mockResolvedValue(undefined);
    getDbMetrics.mockReset().mockResolvedValue("keeperhub_user_total 0\n");
    getPrometheusContentType
      .mockReset()
      .mockReturnValue("text/plain; version=0.0.4; charset=utf-8");

    server = buildServer();
    // Port 0 asks the OS for a free port, so runs cannot collide.
    await new Promise<void>((resolve) => {
      server.listen(0, () => resolve());
    });
    const port = (server.address() as AddressInfo).port;
    baseUrl = `http://127.0.0.1:${port}`;
  });

  afterEach(async () => {
    await new Promise<void>((resolve) => {
      server.close(() => resolve());
    });
  });

  it("GET /metrics refreshes then serves the DB registry", async () => {
    const res = await fetch(`${baseUrl}/metrics`);

    expect(res.status).toBe(200);
    expect(res.headers.get("content-type")).toContain("text/plain");
    expect(await res.text()).toContain("keeperhub_user_total");
    expect(updateDbMetrics).toHaveBeenCalledTimes(1);
    expect(getDbMetrics).toHaveBeenCalledTimes(1);
  });

  it("GET /health returns ok without touching the DB", async () => {
    const res = await fetch(`${baseUrl}/health`);

    expect(res.status).toBe(200);
    const body = (await res.json()) as { status: string; service: string };
    expect(body.status).toBe("ok");
    expect(body.service).toBe("keeperhub-metrics-collector");
    expect(updateDbMetrics).not.toHaveBeenCalled();
  });

  it("returns 500 when the refresh throws", async () => {
    updateDbMetrics.mockRejectedValueOnce(new Error("db down"));

    const res = await fetch(`${baseUrl}/metrics`);

    expect(res.status).toBe(500);
    // A failed refresh must short-circuit before serialization.
    expect(getDbMetrics).not.toHaveBeenCalled();
  });

  it("rejects non-GET methods with 405 without touching the DB", async () => {
    const res = await fetch(`${baseUrl}/metrics`, { method: "POST" });

    expect(res.status).toBe(405);
    expect(updateDbMetrics).not.toHaveBeenCalled();
    expect(getDbMetrics).not.toHaveBeenCalled();
  });

  it("404s unknown paths", async () => {
    const res = await fetch(`${baseUrl}/nope`);

    expect(res.status).toBe(404);
  });
});
70 changes: 70 additions & 0 deletions keeperhub-metrics-collector/server.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import {
createServer,
type IncomingMessage,
type Server,
type ServerResponse,
} from "node:http";
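// The collector image shims `server-only` to an empty module (see the
// metrics-collector stage in the Dockerfile), so the app's metrics code can be
// imported outside Next.js. It is reused verbatim so the exposed gauges are
// byte-for-byte identical to the former /api/metrics/db endpoint.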
import {
getDbMetrics,
getPrometheusContentType,
updateDbMetrics,
} from "../lib/metrics/prometheus-api";

/**
 * Build the metrics-collector HTTP server.
 *
 * Routes are matched on the request path only; any query string is ignored.
 *
 * GET /metrics refreshes the DB-sourced gauges (updateDbMetrics, which is
 * TTL- and pool-gated in lib/) and serves the dbRegistry.
 * GET /health liveness/readiness probe; static JSON, never calls the metrics
 * code.
 *
 * Any non-GET method is answered with 405 and an `Allow: GET` header; any
 * other path with 404.
 *
 * Returned without listening so tests can bind an ephemeral port.
 *
 * @returns An unbound HTTP server; the caller is responsible for `listen()`.
 */
export function buildServer(): Server {
  return createServer((req: IncomingMessage, res: ServerResponse): void => {
    if (req.method !== "GET") {
      res.writeHead(405, { Allow: "GET" });
      res.end();
      return;
    }

    // Route on the path alone: scrapers and probes may append a query string,
    // which an exact comparison against req.url would turn into a 404.
    const path = (req.url ?? "/").split("?", 1)[0];

    if (path === "/health") {
      res.writeHead(200, { "Content-Type": "application/json" });
      res.end(
        JSON.stringify({
          status: "ok",
          service: "keeperhub-metrics-collector",
          timestamp: new Date().toISOString(),
        })
      );
      return;
    }

    if (path === "/metrics") {
      // serveMetrics owns all its error handling and never rejects, so the
      // promise is intentionally not awaited.
      void serveMetrics(res);
      return;
    }

    res.writeHead(404);
    res.end();
  });
}

/**
 * Refresh the DB-sourced gauges and write them to `res`.
 *
 * On success responds 200 with the body from getDbMetrics and the content type
 * from getPrometheusContentType. On failure the error is logged and a 500 is
 * sent, or, if the response head has already been written, the response is
 * torn down instead. The returned promise is meant never to reject, so the
 * request handler can call this without awaiting it.
 *
 * @param res Response object for the in-flight /metrics request.
 */
async function serveMetrics(res: ServerResponse): Promise<void> {
  try {
    await updateDbMetrics();
    const metrics = await getDbMetrics();
    res.writeHead(200, {
      "Content-Type": getPrometheusContentType(),
      "Cache-Control": "no-store, no-cache, must-revalidate",
    });
    res.end(metrics);
  } catch (error) {
    console.error("[MetricsCollector] Failed to serve metrics:", error);
    if (res.headersSent) {
      // The 200 head is already out (e.g. res.end rejected the body). A second
      // writeHead would throw and turn into an unhandled rejection, so abort
      // the response instead of trying to send a 500.
      res.destroy();
      return;
    }
    res.writeHead(500, { "Content-Type": "text/plain; charset=utf-8" });
    res.end("Failed to collect metrics");
  }
}
Loading