diff --git a/.env.example b/.env.example index 6b96d32..790ad4c 100644 --- a/.env.example +++ b/.env.example @@ -54,6 +54,12 @@ THROTTLE_LIMIT=60 SENTRY_DSN=your_sentry_dsn_here GIT_COMMIT_SHA= +# Observability (OpenTelemetry + Loki) +# Set OTEL_EXPORTER_OTLP_ENDPOINT to enable distributed tracing (e.g. http://localhost:4318/v1/traces) +OTEL_EXPORTER_OTLP_ENDPOINT= +# Set LOKI_URL to enable log aggregation (e.g. http://localhost:3100) +LOKI_URL= + # Frontend NEXT_PUBLIC_API_URL=http://localhost:3000 NEXT_PUBLIC_STELLAR_NETWORK=testnet diff --git a/apps/backend/package.json b/apps/backend/package.json index ffd78ca..6c7b385 100644 --- a/apps/backend/package.json +++ b/apps/backend/package.json @@ -45,7 +45,20 @@ "prom-client": "^15.1.0", "typeorm": "^0.3.0", "class-sanitizer": "^1.0.1", - "sanitize-html": "^2.13.0" + "sanitize-html": "^2.13.0", + "@opentelemetry/api": "^1.9.0", + "@opentelemetry/auto-instrumentations-node": "^0.51.0", + "@opentelemetry/exporter-prometheus": "^0.55.0", + "@opentelemetry/exporter-trace-otlp-http": "^0.55.0", + "@opentelemetry/resources": "^1.28.0", + "@opentelemetry/sdk-node": "^0.55.0", + "@opentelemetry/semantic-conventions": "^1.28.0", + "winston-loki": "^6.1.3", + "nest-winston": "^1.10.0", + "winston": "^3.14.0", + "@nestjs/terminus": "^10.2.3", + "@nestjs/axios": "^3.0.3", + "axios": "^1.7.9" }, "devDependencies": { "@eslint/js": "^8.56.0", diff --git a/apps/backend/src/app.module.ts b/apps/backend/src/app.module.ts index bf61a06..d47b216 100644 --- a/apps/backend/src/app.module.ts +++ b/apps/backend/src/app.module.ts @@ -14,6 +14,7 @@ import { NotificationsModule } from './notifications/notifications.module'; import { LoggerModule } from './common/logger'; import { HealthModule } from './health/health.module'; import { MetricsModule } from './metrics/metrics.module'; +import { TracingModule } from './tracing'; import * as redisStore from 'cache-manager-redis-store'; import { ThrottlerStorageRedisService } from 
'@nest-lab/throttler-storage-redis'; import configuration from './config/configuration'; @@ -75,6 +76,7 @@ import { validationSchema } from './config/validation.schema'; NotificationsModule, HealthModule, MetricsModule, + TracingModule, ], providers: [{ provide: APP_GUARD, useClass: ThrottlerGuard }], }) diff --git a/apps/backend/src/common/logger/logger.module.ts b/apps/backend/src/common/logger/logger.module.ts index 80442c6..d259d5b 100644 --- a/apps/backend/src/common/logger/logger.module.ts +++ b/apps/backend/src/common/logger/logger.module.ts @@ -3,6 +3,24 @@ import { WinstonModule } from 'nest-winston'; import { ConfigService } from '@nestjs/config'; import * as winston from 'winston'; +// Builds the optional Loki transport; returns null (after logging a warning) when winston-loki cannot be loaded or constructed. +function createLokiTransport(lokiUrl: string, nodeEnv: string) { + try { + const LokiTransport = require('winston-loki'); // eslint-disable-line @typescript-eslint/no-var-requires + return new LokiTransport({ + host: lokiUrl, + labels: { app: 'brain-storm-backend', env: nodeEnv }, + json: true, + batching: true, + interval: 5, + onConnectionError: (err: Error) => console.error('Loki connection error:', err.message), + }); + } catch (err: unknown) { + console.warn('Loki transport could not be initialised; logs will not be shipped to Loki:', err); + return null; + } +} + @Module({ imports: [ WinstonModule.forRootAsync({ @@ -10,35 +28,42 @@ import * as winston from 'winston'; useFactory: (configService: ConfigService) => { const logLevel = configService.get('LOG_LEVEL', 'info'); const nodeEnv = configService.get('NODE_ENV', 'development'); - - // Define log format based on environment - const logFormat = nodeEnv === 'production' - ? winston.format.combine( - winston.format.timestamp(), - winston.format.errors({ stack: true }), - winston.format.json() - ) - : winston.format.combine( - winston.format.timestamp(), - winston.format.errors({ stack: true }), - winston.format.colorize(), - winston.format.printf(({ timestamp, level, message, context, ...meta }) => { - const contextStr = context ? 
`[${context}] ` : ''; - const metaStr = Object.keys(meta).length ? ` ${JSON.stringify(meta)}` : ''; - return `${timestamp} ${level}: ${contextStr}${message}${metaStr}`; - }) - ); + const lokiUrl = configService.get('LOKI_URL'); + + const logFormat = + nodeEnv === 'production' + ? winston.format.combine( + winston.format.timestamp(), + winston.format.errors({ stack: true }), + winston.format.json(), + ) + : winston.format.combine( + winston.format.timestamp(), + winston.format.errors({ stack: true }), + winston.format.colorize(), + winston.format.printf(({ timestamp, level, message, context, ...meta }) => { + const contextStr = context ? `[${context}] ` : ''; + const metaStr = Object.keys(meta).length ? ` ${JSON.stringify(meta)}` : ''; + return `${timestamp} ${level}: ${contextStr}${message}${metaStr}`; + }), + ); + + const transports: winston.transport[] = [ + new winston.transports.Console({ + handleExceptions: true, + handleRejections: true, + }), + ]; + + if (lokiUrl) { + const lokiTransport = createLokiTransport(lokiUrl, nodeEnv); + if (lokiTransport) transports.push(lokiTransport); + } return { level: logLevel, format: logFormat, - transports: [ - // Console transport - logs to stdout for container orchestrators - new winston.transports.Console({ - handleExceptions: true, - handleRejections: true, - }), - ], + transports, exitOnError: false, }; }, @@ -46,4 +71,4 @@ import * as winston from 'winston'; ], exports: [WinstonModule], }) -export class LoggerModule {} \ No newline at end of file +export class LoggerModule {} diff --git a/apps/backend/src/main.ts b/apps/backend/src/main.ts index 5725178..c620067 100644 --- a/apps/backend/src/main.ts +++ b/apps/backend/src/main.ts @@ -1,3 +1,4 @@ +import './tracing/otel'; import './instrument'; import { NestFactory } from '@nestjs/core'; import { AppModule } from './app.module'; diff --git a/apps/backend/src/metrics/metrics.interceptor.ts b/apps/backend/src/metrics/metrics.interceptor.ts index 022819f..d8c7e07 100644 
--- a/apps/backend/src/metrics/metrics.interceptor.ts +++ b/apps/backend/src/metrics/metrics.interceptor.ts @@ -15,14 +15,25 @@ export class MetricsInterceptor implements NestInterceptor { + /** + * Records a request count and a latency observation for every HTTP request, + * including requests whose handler throws (the error itself is left untouched). + */ intercept(context: ExecutionContext, next: CallHandler): Observable { const request = context.switchToHttp().getRequest(); const response = context.switchToHttp().getResponse(); + const startTime = Date.now(); + // Shared by the success and error paths so both label the metrics identically. + const record = (statusCode: number) => { + const route = request.route?.path || request.url; + const durationSeconds = (Date.now() - startTime) / 1000; + this.metricsService.incrementHttpRequests(request.method, route, statusCode); + this.metricsService.observeHttpDuration(request.method, route, statusCode, durationSeconds); + }; return next.handle().pipe( - tap(() => { - this.metricsService.incrementHttpRequests( - request.method, - request.route?.path || request.url, - response.statusCode, - ); - }), + tap({ + next: () => record(response.statusCode), + // On failure response.statusCode may not reflect the final status yet (NestJS exception filters run after interceptors), so take it from the error and default to 500. + error: (err: { getStatus?: () => number } | undefined) => + record(typeof err?.getStatus === 'function' ? err.getStatus() : 500), + }), ); } diff --git a/apps/backend/src/metrics/metrics.service.ts b/apps/backend/src/metrics/metrics.service.ts index d2fdb2f..1ecfed0 100644 --- a/apps/backend/src/metrics/metrics.service.ts +++ b/apps/backend/src/metrics/metrics.service.ts @@ -1,12 +1,17 @@ -import { Injectable } from '@nestjs/common'; -import { Counter, Histogram, register } from 'prom-client'; +import { Injectable, OnModuleInit } from '@nestjs/common'; +import { Counter, Histogram, Gauge, register } from 'prom-client'; @Injectable() -export class MetricsService { +export class MetricsService implements OnModuleInit { private readonly httpRequestsTotal: Counter; + private readonly httpRequestDuration: Histogram; private readonly credentialIssuedTotal: Counter; private readonly bstMintedTotal: Counter; private readonly stellarRpcLatency: Histogram; + private readonly activeConnections: Gauge; + private readonly enrollmentsTotal: Counter; + private readonly courseCompletionsTotal: Counter; + private readonly authAttemptsTotal: Counter; constructor() { this.httpRequestsTotal = new Counter({ @@ -16,6 +21,14 @@ export class MetricsService { 
registers: [register], }); + this.httpRequestDuration = new Histogram({ + name: 'http_request_duration_seconds', + help: 'HTTP request duration in seconds', + labelNames: ['method', 'route', 'status_code'], + buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5], + registers: [register], + }); + this.credentialIssuedTotal = new Counter({ name: 'credential_issued_total', help: 'Total number of credentials issued', @@ -37,14 +50,45 @@ export class MetricsService { buckets: [0.1, 0.5, 1, 2, 5], registers: [register], }); + + this.activeConnections = new Gauge({ + name: 'active_connections', + help: 'Number of active HTTP connections', + registers: [register], + }); + + this.enrollmentsTotal = new Counter({ + name: 'enrollments_total', + help: 'Total number of course enrollments', + labelNames: ['course_id'], + registers: [register], + }); + + this.courseCompletionsTotal = new Counter({ + name: 'course_completions_total', + help: 'Total number of course completions', + labelNames: ['course_id'], + registers: [register], + }); + + this.authAttemptsTotal = new Counter({ + name: 'auth_attempts_total', + help: 'Total number of authentication attempts', + labelNames: ['type', 'status'], + registers: [register], + }); + } + + onModuleInit() { + // Metrics are registered in constructor; nothing extra needed } incrementHttpRequests(method: string, route: string, statusCode: number) { - this.httpRequestsTotal.inc({ - method, - route, - status_code: statusCode.toString(), - }); + this.httpRequestsTotal.inc({ method, route, status_code: statusCode.toString() }); + } + + observeHttpDuration(method: string, route: string, statusCode: number, durationSeconds: number) { + this.httpRequestDuration.observe({ method, route, status_code: statusCode.toString() }, durationSeconds); } incrementCredentialIssued(credentialType: string) { @@ -58,4 +102,20 @@ export class MetricsService { observeStellarRpcLatency(method: string, status: string, durationSeconds: number) { 
this.stellarRpcLatency.observe({ method, status }, durationSeconds); } + + setActiveConnections(count: number) { + this.activeConnections.set(count); + } + + incrementEnrollments(courseId: string) { + this.enrollmentsTotal.inc({ course_id: courseId }); + } + + incrementCourseCompletions(courseId: string) { + this.courseCompletionsTotal.inc({ course_id: courseId }); + } + + incrementAuthAttempts(type: 'login' | 'register' | 'refresh', status: 'success' | 'failure') { + this.authAttemptsTotal.inc({ type, status }); + } } diff --git a/apps/backend/src/tracing/index.ts b/apps/backend/src/tracing/index.ts new file mode 100644 index 0000000..3f22bcd --- /dev/null +++ b/apps/backend/src/tracing/index.ts @@ -0,0 +1,2 @@ +export { TracingModule } from './tracing.module'; +export { TracingService } from './tracing.service'; diff --git a/apps/backend/src/tracing/otel.ts b/apps/backend/src/tracing/otel.ts new file mode 100644 index 0000000..c71a926 --- /dev/null +++ b/apps/backend/src/tracing/otel.ts @@ -0,0 +1,34 @@ +import { NodeSDK } from '@opentelemetry/sdk-node'; +import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node'; +import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; +import { PrometheusExporter } from '@opentelemetry/exporter-prometheus'; +import { Resource } from '@opentelemetry/resources'; +import { SEMRESATTRS_SERVICE_NAME, SEMRESATTRS_SERVICE_VERSION, SEMRESATTRS_DEPLOYMENT_ENVIRONMENT } from '@opentelemetry/semantic-conventions'; + +const traceExporter = process.env.OTEL_EXPORTER_OTLP_ENDPOINT + ? 
new OTLPTraceExporter({ url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT }) + : undefined; +// Tracing is opt-in: without a configured endpoint the SDK and its auto-instrumentations are never created or started. +if (traceExporter) { + const sdk = new NodeSDK({ + resource: new Resource({ + [SEMRESATTRS_SERVICE_NAME]: 'brain-storm-backend', + [SEMRESATTRS_SERVICE_VERSION]: process.env.npm_package_version || '1.0.0', + [SEMRESATTRS_DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV || 'development', + }), + traceExporter, + instrumentations: [ + getNodeAutoInstrumentations({ + '@opentelemetry/instrumentation-http': { enabled: true }, + '@opentelemetry/instrumentation-express': { enabled: true }, + '@opentelemetry/instrumentation-pg': { enabled: true }, + '@opentelemetry/instrumentation-redis': { enabled: true }, + '@opentelemetry/instrumentation-fs': { enabled: false }, + }), + ], + }); + sdk.start(); + process.on('SIGTERM', () => { + sdk.shutdown().finally(() => process.exit(0)); + }); +} diff --git a/apps/backend/src/tracing/tracing.module.ts b/apps/backend/src/tracing/tracing.module.ts new file mode 100644 index 0000000..0c6f1e7 --- /dev/null +++ b/apps/backend/src/tracing/tracing.module.ts @@ -0,0 +1,9 @@ +import { Module, Global } from '@nestjs/common'; +import { TracingService } from './tracing.service'; + +@Global() +@Module({ + providers: [TracingService], + exports: [TracingService], +}) +export class TracingModule {} diff --git a/apps/backend/src/tracing/tracing.service.ts b/apps/backend/src/tracing/tracing.service.ts new file mode 100644 index 0000000..4bfaab3 --- /dev/null +++ b/apps/backend/src/tracing/tracing.service.ts @@ -0,0 +1,50 @@ +import { Injectable } from '@nestjs/common'; +import { trace, context, SpanStatusCode, Span, Tracer } from '@opentelemetry/api'; + +@Injectable() +export class TracingService { + private readonly tracer: Tracer; + + constructor() { + this.tracer = trace.getTracer('brain-storm-backend', '1.0.0'); + } + + startSpan(name: string, attributes?: Record<string, string | number | boolean>): Span { + const span = this.tracer.startSpan(name); + if (attributes) { + span.setAttributes(attributes); + } + return span; 
+ } + /** Runs fn inside an active span named `name`: marks the span OK on success, records the failure and rethrows on error, and always ends the span. */ + async withSpan<T>( + name: string, + fn: (span: Span) => Promise<T>, + attributes?: Record<string, string | number | boolean>, + ): Promise<T> { + return this.tracer.startActiveSpan(name, { attributes }, async (span) => { + try { + const result = await fn(span); + span.setStatus({ code: SpanStatusCode.OK }); + return result; + } catch (error: unknown) { + span.setStatus({ code: SpanStatusCode.ERROR, message: error instanceof Error ? error.message : String(error) }); + span.recordException(error instanceof Error ? error : String(error)); + throw error; + } finally { + span.end(); + } + }); + } + + getCurrentSpan(): Span | undefined { + return trace.getActiveSpan(); + } + + addSpanAttributes(attributes: Record<string, string | number | boolean>): void { + const span = this.getCurrentSpan(); + if (span) { + span.setAttributes(attributes); + } + } +} diff --git a/docker-compose.monitoring.yml b/docker-compose.monitoring.yml index c63d31b..c078a21 100644 --- a/docker-compose.monitoring.yml +++ b/docker-compose.monitoring.yml @@ -8,12 +8,37 @@ services: - "9090:9090" volumes: - ./infra/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml + - ./infra/monitoring/alerts.yml:/etc/prometheus/alerts.yml - prometheus_data:/prometheus command: - '--config.file=/etc/prometheus/prometheus.yml' - '--storage.tsdb.path=/prometheus' - '--web.console.libraries=/usr/share/prometheus/console_libraries' - '--web.console.templates=/usr/share/prometheus/consoles' + - '--web.enable-lifecycle' + restart: unless-stopped + networks: + - monitoring + + alertmanager: + image: prom/alertmanager:latest + container_name: alertmanager + ports: + - "9093:9093" + volumes: + - ./infra/monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + restart: unless-stopped + networks: + - monitoring + + loki: + image: grafana/loki:latest + container_name: loki + ports: + - "3100:3100" + command: -config.file=/etc/loki/local-config.yaml restart: unless-stopped networks: - monitoring @@ -36,6 +61,7 @@ services: - monitoring depends_on: - prometheus + - loki volumes: prometheus_data: diff --git 
a/docs/monitoring.md b/docs/monitoring.md index a381c75..be2d678 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -1,79 +1,15 @@ -# Monitoring Setup +# Monitoring -Brain Storm uses Prometheus for metrics collection and Grafana for visualization. +> **See the full observability guide: [docs/observability.md](./observability.md)** + +This document is superseded by the comprehensive observability documentation which covers distributed tracing (OpenTelemetry), metrics (Prometheus), log aggregation (Winston + Loki), alerting (Alertmanager), and Grafana dashboards. ## Quick Start -1. Start the monitoring stack: ```bash -docker-compose -f docker-compose.monitoring.yml up -d +docker compose -f docker-compose.monitoring.yml up -d ``` -2. Access the dashboards: -- Grafana: http://localhost:3002 (admin/admin) -- Prometheus: http://localhost:9090 - -## Available Metrics - -### HTTP Metrics -- `http_requests_total` - Total HTTP requests by method, route, and status code - -### Business Metrics -- `credential_issued_total` - Credentials issued by type -- `bst_minted_total` - BST tokens minted by user - -### Performance Metrics -- `stellar_rpc_latency_seconds` - Stellar RPC call latency histogram - -### System Metrics (default) -- `process_cpu_user_seconds_total` - CPU usage -- `process_resident_memory_bytes` - Memory usage -- `nodejs_eventloop_lag_seconds` - Event loop lag -- `nodejs_heap_size_total_bytes` - Heap size - -## Grafana Dashboard - -A pre-built dashboard is automatically provisioned at startup showing: -- HTTP request rates -- Credential issuance stats -- BST token minting stats -- Stellar RPC latency percentiles -- Node.js memory usage - -## Custom Metrics - -To add custom metrics, use the `MetricsService`: - -```typescript -import { MetricsService } from './metrics/metrics.service'; - -constructor(private metricsService: MetricsService) {} - -// Increment counters -this.metricsService.incrementCredentialIssued('course-completion'); 
-this.metricsService.incrementBstMinted(userId); - -// Observe latency -const start = Date.now(); -// ... perform operation -const duration = (Date.now() - start) / 1000; -this.metricsService.observeStellarRpcLatency('submitTransaction', 'success', duration); -``` - -## Production Deployment - -In production, configure Prometheus to scrape the `/metrics` endpoint: - -```yaml -scrape_configs: - - job_name: 'brain-storm-backend' - static_configs: - - targets: ['backend:3000'] -``` - -## Alerting - -Configure Prometheus alerting rules in `infra/monitoring/prometheus.yml` for: -- High error rates (5xx responses) -- Slow Stellar RPC calls (p95 > 2s) -- Memory leaks (continuous memory growth) +- Grafana: http://localhost:3002 (admin/admin) +- Prometheus: http://localhost:9090 +- Alertmanager: http://localhost:9093 diff --git a/docs/observability.md b/docs/observability.md new file mode 100644 index 0000000..35b8373 --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,166 @@ +# Observability + +Brain-Storm uses a full observability stack: **distributed tracing** (OpenTelemetry), **metrics** (Prometheus), **log aggregation** (Winston + Loki), and **alerting** (Prometheus Alertmanager) — all visualized in **Grafana**. + +--- + +## Quick Start + +```bash +# Start the full monitoring stack +docker compose -f docker-compose.monitoring.yml up -d + +# Access dashboards +# Grafana: http://localhost:3002 (admin / admin) +# Prometheus: http://localhost:9090 +# Alertmanager: http://localhost:9093 +# Loki: http://localhost:3100 +``` + +--- + +## Distributed Tracing (OpenTelemetry) + +The backend auto-instruments HTTP, PostgreSQL, and Redis using the OpenTelemetry Node.js SDK. + +### Configuration + +| Env Var | Description | Default | +|---|---|---| +| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP trace collector URL | _(disabled)_ | + +When `OTEL_EXPORTER_OTLP_ENDPOINT` is not set, tracing runs in no-op mode (zero overhead). 
Set it to send traces to Jaeger, Tempo, or any OTLP-compatible backend. + +### Custom Spans + +Inject `TracingService` to create custom spans: + +```typescript +import { TracingService } from './tracing'; + +constructor(private tracing: TracingService) {} + +async issueCredential(userId: string) { + return this.tracing.withSpan('credential.issue', async (span) => { + span.setAttribute('user.id', userId); + // ... your logic + }); +} +``` + +--- + +## Metrics (Prometheus) + +Metrics are exposed at `GET /metrics` (Prometheus scrape format). + +### Available Metrics + +| Metric | Type | Description | +|---|---|---| +| `http_requests_total` | Counter | HTTP requests by method, route, status_code | +| `http_request_duration_seconds` | Histogram | HTTP request duration | +| `credential_issued_total` | Counter | Credentials issued by type | +| `bst_minted_total` | Counter | BST tokens minted by user | +| `stellar_rpc_latency_seconds` | Histogram | Stellar RPC call latency | +| `enrollments_total` | Counter | Course enrollments by course_id | +| `course_completions_total` | Counter | Course completions by course_id | +| `auth_attempts_total` | Counter | Auth attempts by type and status | +| `active_connections` | Gauge | Active HTTP connections | +| `nodejs_heap_size_used_bytes` | Gauge | Node.js heap memory used | +| `nodejs_eventloop_lag_seconds` | Gauge | Event loop lag | + +### Recording Business Metrics + +```typescript +import { MetricsService } from './metrics/metrics.service'; + +// In your service +this.metricsService.incrementCredentialIssued('course-completion'); +this.metricsService.incrementEnrollments(courseId); +this.metricsService.incrementAuthAttempts('login', 'success'); +this.metricsService.observeStellarRpcLatency('submitTransaction', 'success', durationSeconds); +``` + +--- + +## Log Aggregation (Winston + Loki) + +Structured JSON logs are written to stdout in production. Optionally shipped to Grafana Loki. 
+ +### Configuration + +| Env Var | Description | Default | +|---|---|---| +| `LOG_LEVEL` | Minimum log level | `info` | +| `LOKI_URL` | Loki push URL | _(disabled)_ | + +Set `LOKI_URL=http://loki:3100` to enable log shipping. Logs are batched and sent every 5 seconds with labels `app=brain-storm-backend` and `env=<NODE_ENV>`. + +### Log Format + +In production (`NODE_ENV=production`), logs are emitted as JSON: + +```json +{ + "timestamp": "2026-05-29T18:00:00.000Z", + "level": "info", + "message": "Health check completed", + "context": "HealthController", + "status": "ok" +} +``` + +In development, logs use a human-readable colored format. + +--- + +## Alerting (Prometheus + Alertmanager) + +Alert rules are defined in `infra/monitoring/alerts.yml`. + +### Alert Rules + +| Alert | Severity | Condition | +|---|---|---| +| `HighErrorRate` | critical | 5xx rate > 5% for 2m | +| `SlowHttpResponses` | warning | p95 latency > 1s for 5m | +| `SlowStellarRpc` | warning | Stellar RPC p95 > 2s for 5m | +| `HighMemoryUsage` | warning | Heap > 400MB for 5m | +| `HighEventLoopLag` | warning | Event loop lag > 100ms for 2m | +| `ServiceDown` | critical | Backend unreachable for 1m | +| `HighAuthFailureRate` | warning | Auth failure rate > 30% for 5m | + +### Alertmanager Configuration + +Set the webhook `url` of each receiver in `infra/monitoring/alertmanager.yml` to route alerts to Slack, PagerDuty, or any webhook receiver. + +--- + +## Grafana Dashboards + +Two dashboards are auto-provisioned at startup: + +| Dashboard | Description | +|---|---| +| **Brain-Storm Overview** | Request rate, error rate, latency, business metrics, memory, event loop | +| **NestJS Metrics** | HTTP requests, credentials, BST tokens, Stellar RPC latency | + +Access at `http://localhost:3002` (credentials: `admin` / `admin`). 
+ +--- + +## Infrastructure + +``` +docker-compose.monitoring.yml +├── prometheus :9090 — metrics scraping + alerting rules +├── alertmanager :9093 — alert routing and notification +├── loki :3100 — log aggregation +└── grafana :3002 — dashboards (Prometheus + Loki datasources) +``` + +Alert rules: `infra/monitoring/alerts.yml` +Alertmanager config: `infra/monitoring/alertmanager.yml` +Prometheus config: `infra/monitoring/prometheus.yml` +Grafana dashboards: `infra/monitoring/grafana/dashboards/` diff --git a/infra/monitoring/alertmanager.yml b/infra/monitoring/alertmanager.yml new file mode 100644 index 0000000..102da84 --- /dev/null +++ b/infra/monitoring/alertmanager.yml @@ -0,0 +1,33 @@ +global: + resolve_timeout: 5m + +route: + group_by: ['alertname', 'severity'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: 'default' + routes: + - match: + severity: critical + receiver: 'critical' + repeat_interval: 1h + +# Alertmanager does not expand ${VAR} placeholders in this file, so the webhook URLs must be literal; change them here or render the file (e.g. with envsubst) before starting the container. +receivers: + - name: 'default' + webhook_configs: + - url: 'http://localhost:9000/webhook' + send_resolved: true + + - name: 'critical' + webhook_configs: + - url: 'http://localhost:9000/webhook' + send_resolved: true + +inhibit_rules: + - source_match: + severity: 'critical' + target_match: + severity: 'warning' + equal: ['alertname'] diff --git a/infra/monitoring/alerts.yml b/infra/monitoring/alerts.yml new file mode 100644 index 0000000..39c1d3a --- /dev/null +++ b/infra/monitoring/alerts.yml @@ -0,0 +1,83 @@ +groups: + - name: brain-storm-backend + interval: 30s + rules: + # High HTTP error rate (5xx) + - alert: HighErrorRate + expr: | + sum(rate(http_requests_total{status_code=~"5.."}[5m])) + / + sum(rate(http_requests_total[5m])) > 0.05 + for: 2m + labels: + severity: critical + annotations: + summary: "High HTTP 5xx error rate" + description: "Error rate is {{ $value | humanizePercentage }} over the last 5 minutes." 
+ + # Slow HTTP responses (p95 > 1s) + - alert: SlowHttpResponses + expr: | + histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route)) + > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Slow HTTP responses on {{ $labels.route }}" + description: "p95 latency is {{ $value }}s on route {{ $labels.route }}." + + # Slow Stellar RPC calls (p95 > 2s) + - alert: SlowStellarRpc + expr: | + histogram_quantile(0.95, sum(rate(stellar_rpc_latency_seconds_bucket[5m])) by (le, method)) + > 2 + for: 5m + labels: + severity: warning + annotations: + summary: "Slow Stellar RPC: {{ $labels.method }}" + description: "Stellar RPC p95 latency is {{ $value }}s for method {{ $labels.method }}." + + # High memory usage (heap > 400MB) + - alert: HighMemoryUsage + expr: nodejs_heap_size_used_bytes > 400 * 1024 * 1024 + for: 5m + labels: + severity: warning + annotations: + summary: "High Node.js heap memory usage" + description: "Heap usage is {{ $value | humanize1024 }}B." + + # Event loop lag > 100ms + - alert: HighEventLoopLag + expr: nodejs_eventloop_lag_seconds > 0.1 + for: 2m + labels: + severity: warning + annotations: + summary: "High Node.js event loop lag" + description: "Event loop lag is {{ $value }}s." + + # Service down (no scrape data) + - alert: ServiceDown + expr: up{job="brain-storm-backend"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Brain-Storm backend is down" + description: "The backend service has been unreachable for more than 1 minute." + + # High authentication failure rate + - alert: HighAuthFailureRate + expr: | + sum(rate(auth_attempts_total{status="failure"}[5m])) + / + sum(rate(auth_attempts_total[5m])) > 0.3 + for: 5m + labels: + severity: warning + annotations: + summary: "High authentication failure rate" + description: "Auth failure rate is {{ $value | humanizePercentage }}." 
diff --git a/infra/monitoring/grafana/dashboards/brain-storm-overview.json b/infra/monitoring/grafana/dashboards/brain-storm-overview.json new file mode 100644 index 0000000..450cb9a --- /dev/null +++ b/infra/monitoring/grafana/dashboards/brain-storm-overview.json @@ -0,0 +1,184 @@ +{ + "uid": "brain-storm-overview", + "title": "Brain-Storm - Overview", + "tags": ["brain-storm", "overview"], + "timezone": "browser", + "refresh": "30s", + "schemaVersion": 38, + "version": 1, + "panels": [ + { + "id": 1, + "title": "Request Rate (req/s)", + "type": "timeseries", + "gridPos": { "x": 0, "y": 0, "w": 8, "h": 8 }, + "targets": [ + { + "expr": "sum(rate(http_requests_total[5m])) by (method)", + "legendFormat": "{{ method }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "reqps" } + } + }, + { + "id": 2, + "title": "Error Rate (%)", + "type": "timeseries", + "gridPos": { "x": 8, "y": 0, "w": 8, "h": 8 }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100", + "legendFormat": "5xx Error Rate" + }, + { + "expr": "sum(rate(http_requests_total{status_code=~\"4..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100", + "legendFormat": "4xx Error Rate" + } + ], + "fieldConfig": { + "defaults": { "unit": "percent" } + } + }, + { + "id": 3, + "title": "HTTP p95 Latency (s)", + "type": "timeseries", + "gridPos": { "x": 16, "y": 0, "w": 8, "h": 8 }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p95" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p50" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "id": 4, + "title": "Credentials Issued", + "type": "stat", + "gridPos": { "x": 0, "y": 8, "w": 6, "h": 4 }, + "targets": [ + { + "expr": "sum(credential_issued_total)", + "legendFormat": "Total" + } + ] + }, + { + 
"id": 5, + "title": "BST Tokens Minted", + "type": "stat", + "gridPos": { "x": 6, "y": 8, "w": 6, "h": 4 }, + "targets": [ + { + "expr": "sum(bst_minted_total)", + "legendFormat": "Total" + } + ] + }, + { + "id": 6, + "title": "Course Enrollments", + "type": "stat", + "gridPos": { "x": 12, "y": 8, "w": 6, "h": 4 }, + "targets": [ + { + "expr": "sum(enrollments_total)", + "legendFormat": "Total" + } + ] + }, + { + "id": 7, + "title": "Course Completions", + "type": "stat", + "gridPos": { "x": 18, "y": 8, "w": 6, "h": 4 }, + "targets": [ + { + "expr": "sum(course_completions_total)", + "legendFormat": "Total" + } + ] + }, + { + "id": 8, + "title": "Stellar RPC Latency p95 (s)", + "type": "timeseries", + "gridPos": { "x": 0, "y": 12, "w": 12, "h": 8 }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(stellar_rpc_latency_seconds_bucket[5m])) by (le, method))", + "legendFormat": "{{ method }} p95" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "id": 9, + "title": "Node.js Memory (bytes)", + "type": "timeseries", + "gridPos": { "x": 12, "y": 12, "w": 12, "h": 8 }, + "targets": [ + { + "expr": "nodejs_heap_size_used_bytes", + "legendFormat": "Heap Used" + }, + { + "expr": "nodejs_heap_size_total_bytes", + "legendFormat": "Heap Total" + }, + { + "expr": "process_resident_memory_bytes", + "legendFormat": "RSS" + } + ], + "fieldConfig": { + "defaults": { "unit": "bytes" } + } + }, + { + "id": 10, + "title": "Event Loop Lag (s)", + "type": "timeseries", + "gridPos": { "x": 0, "y": 20, "w": 12, "h": 8 }, + "targets": [ + { + "expr": "nodejs_eventloop_lag_seconds", + "legendFormat": "Event Loop Lag" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "id": 11, + "title": "Auth Attempts", + "type": "timeseries", + "gridPos": { "x": 12, "y": 20, "w": 12, "h": 8 }, + "targets": [ + { + "expr": "sum(rate(auth_attempts_total{status=\"success\"}[5m])) by (type)", + "legendFormat": "{{ type }} success" + }, + { + "expr": 
"sum(rate(auth_attempts_total{status=\"failure\"}[5m])) by (type)", + "legendFormat": "{{ type }} failure" + } + ], + "fieldConfig": { + "defaults": { "unit": "reqps" } + } + } + ] +} diff --git a/infra/monitoring/grafana/provisioning/datasources/prometheus.yml b/infra/monitoring/grafana/provisioning/datasources/prometheus.yml index 1a57b69..e90e251 100644 --- a/infra/monitoring/grafana/provisioning/datasources/prometheus.yml +++ b/infra/monitoring/grafana/provisioning/datasources/prometheus.yml @@ -7,3 +7,12 @@ datasources: url: http://prometheus:9090 isDefault: true editable: true + + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: false + editable: true + jsonData: + maxLines: 1000 diff --git a/infra/monitoring/prometheus.yml b/infra/monitoring/prometheus.yml index 1ba9d99..61c1b8b 100644 --- a/infra/monitoring/prometheus.yml +++ b/infra/monitoring/prometheus.yml @@ -2,6 +2,15 @@ global: scrape_interval: 15s evaluation_interval: 15s +rule_files: + - /etc/prometheus/alerts.yml + +alerting: + alertmanagers: + - static_configs: + - targets: + - alertmanager:9093 + scrape_configs: - job_name: 'brain-storm-backend' static_configs: