diff --git a/.env.example b/.env.example
index 6b96d32..790ad4c 100644
--- a/.env.example
+++ b/.env.example
@@ -54,6 +54,12 @@ THROTTLE_LIMIT=60
 SENTRY_DSN=your_sentry_dsn_here
 GIT_COMMIT_SHA=
 
+# Observability (OpenTelemetry + Loki)
+# Set OTEL_EXPORTER_OTLP_ENDPOINT to enable distributed tracing. The value is used as the full trace export URL, so include the /v1/traces path (e.g. http://localhost:4318/v1/traces)
+OTEL_EXPORTER_OTLP_ENDPOINT=
+# Set LOKI_URL to enable log aggregation (e.g. http://localhost:3100)
+LOKI_URL=
+
 # Frontend
 NEXT_PUBLIC_API_URL=http://localhost:3000
 NEXT_PUBLIC_STELLAR_NETWORK=testnet
diff --git a/apps/backend/package.json b/apps/backend/package.json
index ffd78ca..6c7b385 100644
--- a/apps/backend/package.json
+++ b/apps/backend/package.json
@@ -45,7 +45,20 @@
     "prom-client": "^15.1.0",
     "typeorm": "^0.3.0",
     "class-sanitizer": "^1.0.1",
-    "sanitize-html": "^2.13.0"
+    "sanitize-html": "^2.13.0",
+    "@opentelemetry/api": "^1.9.0",
+    "@opentelemetry/auto-instrumentations-node": "^0.51.0",
+    "@opentelemetry/exporter-prometheus": "^0.55.0",
+    "@opentelemetry/exporter-trace-otlp-http": "^0.55.0",
+    "@opentelemetry/resources": "^1.28.0",
+    "@opentelemetry/sdk-node": "^0.55.0",
+    "@opentelemetry/semantic-conventions": "^1.28.0",
+    "winston-loki": "^6.1.3",
+    "nest-winston": "^1.10.0",
+    "winston": "^3.14.0",
+    "@nestjs/terminus": "^10.2.3",
+    "@nestjs/axios": "^3.0.3",
+    "axios": "^1.7.9"
   },
   "devDependencies": {
     "@eslint/js": "^8.56.0",
diff --git a/apps/backend/src/app.module.ts b/apps/backend/src/app.module.ts
index bf61a06..d47b216 100644
--- a/apps/backend/src/app.module.ts
+++ b/apps/backend/src/app.module.ts
@@ -14,6 +14,7 @@ import { NotificationsModule } from './notifications/notifications.module';
 import { LoggerModule } from './common/logger';
 import { HealthModule } from './health/health.module';
 import { MetricsModule } from './metrics/metrics.module';
+import { TracingModule } from './tracing';
 import * as redisStore from 'cache-manager-redis-store';
 import { ThrottlerStorageRedisService } from '@nest-lab/throttler-storage-redis';
 import configuration from './config/configuration';
@@ -75,6 +76,7 @@ import { validationSchema } from './config/validation.schema';
     NotificationsModule,
     HealthModule,
     MetricsModule,
+    TracingModule,
   ],
   providers: [{ provide: APP_GUARD, useClass: ThrottlerGuard }],
 })
diff --git a/apps/backend/src/common/logger/logger.module.ts b/apps/backend/src/common/logger/logger.module.ts
index 80442c6..d259d5b 100644
--- a/apps/backend/src/common/logger/logger.module.ts
+++ b/apps/backend/src/common/logger/logger.module.ts
@@ -3,6 +3,24 @@ import { WinstonModule } from 'nest-winston';
 import { ConfigService } from '@nestjs/config';
 import * as winston from 'winston';
 
+// Optional Loki transport (winston-loki, required lazily); logs a warning and returns null if it cannot be created.
+function createLokiTransport(lokiUrl: string, nodeEnv: string) {
+  try {
+    // eslint-disable-next-line @typescript-eslint/no-var-requires
+    const LokiTransport = require('winston-loki');
+    return new LokiTransport({
+      host: lokiUrl,
+      labels: { app: 'brain-storm-backend', env: nodeEnv },
+      json: true,
+      batching: true, interval: 5, // batched entries are flushed every 5 seconds
+      onConnectionError: (err: Error) => console.error('Loki connection error:', err.message),
+    });
+  } catch (err) {
+    console.warn('Loki transport disabled:', err instanceof Error ? err.message : err);
+    return null;
+  }
+}
+
 @Module({
   imports: [
     WinstonModule.forRootAsync({
@@ -10,35 +28,42 @@ import * as winston from 'winston';
       useFactory: (configService: ConfigService) => {
         const logLevel = configService.get<string>('LOG_LEVEL', 'info');
         const nodeEnv = configService.get<string>('NODE_ENV', 'development');
-        
-        // Define log format based on environment
-        const logFormat = nodeEnv === 'production' 
-          ? winston.format.combine(
-              winston.format.timestamp(),
-              winston.format.errors({ stack: true }),
-              winston.format.json()
-            )
-          : winston.format.combine(
-              winston.format.timestamp(),
-              winston.format.errors({ stack: true }),
-              winston.format.colorize(),
-              winston.format.printf(({ timestamp, level, message, context, ...meta }) => {
-                const contextStr = context ? `[${context}] ` : '';
-                const metaStr = Object.keys(meta).length ? ` ${JSON.stringify(meta)}` : '';
-                return `${timestamp} ${level}: ${contextStr}${message}${metaStr}`;
-              })
-            );
+        const lokiUrl = configService.get<string>('LOKI_URL');
+
+        const logFormat =
+          nodeEnv === 'production'
+            ? winston.format.combine(
+                winston.format.timestamp(),
+                winston.format.errors({ stack: true }),
+                winston.format.json(),
+              )
+            : winston.format.combine(
+                winston.format.timestamp(),
+                winston.format.errors({ stack: true }),
+                winston.format.colorize(),
+                winston.format.printf(({ timestamp, level, message, context, ...meta }) => {
+                  const contextStr = context ? `[${context}] ` : '';
+                  const metaStr = Object.keys(meta).length ? ` ${JSON.stringify(meta)}` : '';
+                  return `${timestamp} ${level}: ${contextStr}${message}${metaStr}`;
+                }),
+              );
+
+        const transports: winston.transport[] = [
+          new winston.transports.Console({
+            handleExceptions: true,
+            handleRejections: true,
+          }),
+        ];
+
+        if (lokiUrl) {
+          const lokiTransport = createLokiTransport(lokiUrl, nodeEnv);
+          if (lokiTransport) transports.push(lokiTransport);
+        }
 
         return {
           level: logLevel,
           format: logFormat,
-          transports: [
-            // Console transport - logs to stdout for container orchestrators
-            new winston.transports.Console({
-              handleExceptions: true,
-              handleRejections: true,
-            }),
-          ],
+          transports,
           exitOnError: false,
         };
       },
@@ -46,4 +71,4 @@ import * as winston from 'winston';
   ],
   exports: [WinstonModule],
 })
-export class LoggerModule {}
\ No newline at end of file
+export class LoggerModule {}
diff --git a/apps/backend/src/main.ts b/apps/backend/src/main.ts
index 5725178..c620067 100644
--- a/apps/backend/src/main.ts
+++ b/apps/backend/src/main.ts
@@ -1,3 +1,4 @@
+import './tracing/otel';
 import './instrument';
 import { NestFactory } from '@nestjs/core';
 import { AppModule } from './app.module';
diff --git a/apps/backend/src/metrics/metrics.interceptor.ts b/apps/backend/src/metrics/metrics.interceptor.ts
index 022819f..d8c7e07 100644
--- a/apps/backend/src/metrics/metrics.interceptor.ts
+++ b/apps/backend/src/metrics/metrics.interceptor.ts
@@ -15,14 +15,24 @@ export class MetricsInterceptor implements NestInterceptor {
   intercept(context: ExecutionContext, next: CallHandler): Observable<any> {
     const request = context.switchToHttp().getRequest();
     const response = context.switchToHttp().getResponse();
+    const startTime = Date.now();
+
+    // Records the request counter and the duration histogram with identical labels.
+    const record = (statusCode: number) => {
+      const route = request.route?.path || request.url;
+      const durationSeconds = (Date.now() - startTime) / 1000;
+      this.metricsService.incrementHttpRequests(request.method, route, statusCode);
+      this.metricsService.observeHttpDuration(request.method, route, statusCode, durationSeconds);
+    };
 
     return next.handle().pipe(
-      tap(() => {
-        this.metricsService.incrementHttpRequests(
-          request.method,
-          request.route?.path || request.url,
-          response.statusCode,
-        );
-      }),
+      tap({
+        next: () => record(response.statusCode),
+        // A handler that throws never reaches `next`, and the response status has not been
+        // written yet at this point, so derive it from the exception (HttpException exposes
+        // getStatus(); anything else is counted as 500) to keep failed requests in the metrics.
+        error: (err: { getStatus?: () => number } | null | undefined) =>
+          record(err?.getStatus?.() ?? 500),
+      }),
     );
   }
diff --git a/apps/backend/src/metrics/metrics.service.ts b/apps/backend/src/metrics/metrics.service.ts
index d2fdb2f..1ecfed0 100644
--- a/apps/backend/src/metrics/metrics.service.ts
+++ b/apps/backend/src/metrics/metrics.service.ts
@@ -1,12 +1,17 @@
-import { Injectable } from '@nestjs/common';
-import { Counter, Histogram, register } from 'prom-client';
+import { Injectable, OnModuleInit } from '@nestjs/common';
+import { Counter, Histogram, Gauge, register } from 'prom-client';
 
 @Injectable()
-export class MetricsService {
+export class MetricsService implements OnModuleInit {
   private readonly httpRequestsTotal: Counter;
+  private readonly httpRequestDuration: Histogram;
   private readonly credentialIssuedTotal: Counter;
   private readonly bstMintedTotal: Counter;
   private readonly stellarRpcLatency: Histogram;
+  private readonly activeConnections: Gauge;
+  private readonly enrollmentsTotal: Counter;
+  private readonly courseCompletionsTotal: Counter;
+  private readonly authAttemptsTotal: Counter;
 
   constructor() {
     this.httpRequestsTotal = new Counter({
@@ -16,6 +21,14 @@ export class MetricsService {
       registers: [register],
     });
 
+    this.httpRequestDuration = new Histogram({
+      name: 'http_request_duration_seconds',
+      help: 'HTTP request duration in seconds',
+      labelNames: ['method', 'route', 'status_code'],
+      buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5],
+      registers: [register],
+    });
+
     this.credentialIssuedTotal = new Counter({
       name: 'credential_issued_total',
       help: 'Total number of credentials issued',
@@ -37,14 +50,45 @@ export class MetricsService {
       buckets: [0.1, 0.5, 1, 2, 5],
       registers: [register],
     });
+
+    this.activeConnections = new Gauge({
+      name: 'active_connections',
+      help: 'Number of active HTTP connections',
+      registers: [register],
+    });
+
+    this.enrollmentsTotal = new Counter({
+      name: 'enrollments_total',
+      help: 'Total number of course enrollments',
+      labelNames: ['course_id'],
+      registers: [register],
+    });
+
+    this.courseCompletionsTotal = new Counter({
+      name: 'course_completions_total',
+      help: 'Total number of course completions',
+      labelNames: ['course_id'],
+      registers: [register],
+    });
+
+    this.authAttemptsTotal = new Counter({
+      name: 'auth_attempts_total',
+      help: 'Total number of authentication attempts',
+      labelNames: ['type', 'status'],
+      registers: [register],
+    });
+  }
+
+  onModuleInit() {
+    // Metrics are registered in constructor; nothing extra needed
   }
 
   incrementHttpRequests(method: string, route: string, statusCode: number) {
-    this.httpRequestsTotal.inc({
-      method,
-      route,
-      status_code: statusCode.toString(),
-    });
+    this.httpRequestsTotal.inc({ method, route, status_code: statusCode.toString() });
+  }
+
+  observeHttpDuration(method: string, route: string, statusCode: number, durationSeconds: number) {
+    this.httpRequestDuration.observe({ method, route, status_code: statusCode.toString() }, durationSeconds);
   }
 
   incrementCredentialIssued(credentialType: string) {
@@ -58,4 +102,20 @@ export class MetricsService {
   observeStellarRpcLatency(method: string, status: string, durationSeconds: number) {
     this.stellarRpcLatency.observe({ method, status }, durationSeconds);
   }
+
+  setActiveConnections(count: number) {
+    this.activeConnections.set(count);
+  }
+
+  incrementEnrollments(courseId: string) {
+    this.enrollmentsTotal.inc({ course_id: courseId });
+  }
+
+  incrementCourseCompletions(courseId: string) {
+    this.courseCompletionsTotal.inc({ course_id: courseId });
+  }
+
+  incrementAuthAttempts(type: 'login' | 'register' | 'refresh', status: 'success' | 'failure') {
+    this.authAttemptsTotal.inc({ type, status });
+  }
 }
diff --git a/apps/backend/src/tracing/index.ts b/apps/backend/src/tracing/index.ts
new file mode 100644
index 0000000..3f22bcd
--- /dev/null
+++ b/apps/backend/src/tracing/index.ts
@@ -0,0 +1,2 @@
+export { TracingModule } from './tracing.module';
+export { TracingService } from './tracing.service';
diff --git a/apps/backend/src/tracing/otel.ts b/apps/backend/src/tracing/otel.ts
new file mode 100644
index 0000000..c71a926
--- /dev/null
+++ b/apps/backend/src/tracing/otel.ts
@@ -0,0 +1,50 @@
+import { NodeSDK } from '@opentelemetry/sdk-node';
+import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
+import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
+import { Resource } from '@opentelemetry/resources';
+import {
+  SEMRESATTRS_DEPLOYMENT_ENVIRONMENT,
+  SEMRESATTRS_SERVICE_NAME,
+  SEMRESATTRS_SERVICE_VERSION,
+} from '@opentelemetry/semantic-conventions';
+
+/**
+ * OpenTelemetry bootstrap, imported for its side effects as the first import of main.ts so
+ * that the SDK is started before the instrumented libraries are loaded.
+ *
+ * Tracing is opt-in: nothing is set up (no SDK, no instrumentation, no SIGTERM hook) unless
+ * OTEL_EXPORTER_OTLP_ENDPOINT is set. Starting NodeSDK without an explicit exporter would not
+ * disable tracing, because the SDK then falls back to its default OTLP exporter.
+ */
+const otlpEndpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
+
+if (otlpEndpoint) {
+  const sdk = new NodeSDK({
+    resource: new Resource({
+      [SEMRESATTRS_SERVICE_NAME]: 'brain-storm-backend',
+      [SEMRESATTRS_SERVICE_VERSION]: process.env.npm_package_version || '1.0.0',
+      [SEMRESATTRS_DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV || 'development',
+    }),
+    // The endpoint is used verbatim as the export URL, so it must include the /v1/traces path.
+    traceExporter: new OTLPTraceExporter({ url: otlpEndpoint }),
+    instrumentations: [
+      getNodeAutoInstrumentations({
+        '@opentelemetry/instrumentation-http': { enabled: true },
+        '@opentelemetry/instrumentation-express': { enabled: true },
+        '@opentelemetry/instrumentation-pg': { enabled: true },
+        '@opentelemetry/instrumentation-redis': { enabled: true },
+        '@opentelemetry/instrumentation-fs': { enabled: false },
+      }),
+    ],
+  });
+
+  sdk.start();
+
+  // Flush pending spans on SIGTERM; a failed flush is logged but never blocks the exit.
+  process.on('SIGTERM', () => {
+    sdk
+      .shutdown()
+      .catch((err: unknown) => console.error('OpenTelemetry shutdown failed:', err))
+      .finally(() => process.exit(0));
+  });
+}
diff --git a/apps/backend/src/tracing/tracing.module.ts b/apps/backend/src/tracing/tracing.module.ts
new file mode 100644
index 0000000..0c6f1e7
--- /dev/null
+++ b/apps/backend/src/tracing/tracing.module.ts
@@ -0,0 +1,9 @@
+import { Module, Global } from '@nestjs/common';
+import { TracingService } from './tracing.service';
+
+@Global() // Global scope: TracingService can be injected anywhere without importing TracingModule again.
+@Module({
+  providers: [TracingService],
+  exports: [TracingService],
+})
+export class TracingModule {}
diff --git a/apps/backend/src/tracing/tracing.service.ts b/apps/backend/src/tracing/tracing.service.ts
new file mode 100644
index 0000000..4bfaab3
--- /dev/null
+++ b/apps/backend/src/tracing/tracing.service.ts
@@ -0,0 +1,69 @@
+import { Injectable } from '@nestjs/common';
+import { trace, SpanStatusCode, Span, Tracer } from '@opentelemetry/api';
+
+/**
+ * Convenience wrapper around the OpenTelemetry API for creating and annotating custom spans.
+ * All spans are created through a single tracer named 'brain-storm-backend'.
+ */
+@Injectable()
+export class TracingService {
+  private readonly tracer: Tracer;
+
+  constructor() {
+    this.tracer = trace.getTracer('brain-storm-backend', '1.0.0');
+  }
+
+  /**
+   * Starts a span without making it the active span.
+   * The caller owns the returned span and must call `span.end()` on it.
+   *
+   * @param name - Span name.
+   * @param attributes - Optional attributes set when the span is created.
+   * @returns The started span.
+   */
+  startSpan(name: string, attributes?: Record<string, string | number | boolean>): Span {
+    return this.tracer.startSpan(name, { attributes });
+  }
+
+  /**
+   * Runs `fn` inside a new active span and ends the span when `fn` settles.
+   * On success the span status is set to OK; on failure the error is recorded on the span,
+   * the status is set to ERROR, and the original error is rethrown unchanged.
+   *
+   * @param name - Span name.
+   * @param fn - Async work to trace; receives the span so it can add attributes or events.
+   * @param attributes - Optional attributes set when the span is created.
+   * @returns Whatever `fn` resolves to.
+   */
+  async withSpan<T>(
+    name: string,
+    fn: (span: Span) => Promise<T>,
+    attributes?: Record<string, string | number | boolean>,
+  ): Promise<T> {
+    return this.tracer.startActiveSpan(name, { attributes }, async (span) => {
+      try {
+        const result = await fn(span);
+        span.setStatus({ code: SpanStatusCode.OK });
+        return result;
+      } catch (error: unknown) {
+        // Anything can be thrown, so normalise to an Error before reading `.message`.
+        const exception = error instanceof Error ? error : new Error(String(error));
+        span.setStatus({ code: SpanStatusCode.ERROR, message: exception.message });
+        span.recordException(exception);
+        throw error;
+      } finally {
+        span.end();
+      }
+    });
+  }
+
+  /** Returns the span active in the current context, or `undefined` when there is none. */
+  getCurrentSpan(): Span | undefined {
+    return trace.getActiveSpan();
+  }
+
+  /** Adds attributes to the currently active span; does nothing when no span is active. */
+  addSpanAttributes(attributes: Record<string, string | number | boolean>): void {
+    this.getCurrentSpan()?.setAttributes(attributes);
+  }
+}
diff --git a/docker-compose.monitoring.yml b/docker-compose.monitoring.yml
index c63d31b..c078a21 100644
--- a/docker-compose.monitoring.yml
+++ b/docker-compose.monitoring.yml
@@ -8,12 +8,37 @@ services:
       - "9090:9090"
     volumes:
       - ./infra/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./infra/monitoring/alerts.yml:/etc/prometheus/alerts.yml
       - prometheus_data:/prometheus
     command:
       - '--config.file=/etc/prometheus/prometheus.yml'
       - '--storage.tsdb.path=/prometheus'
       - '--web.console.libraries=/usr/share/prometheus/console_libraries'
       - '--web.console.templates=/usr/share/prometheus/consoles'
+      - '--web.enable-lifecycle'
+    restart: unless-stopped
+    networks:
+      - monitoring
+
+  alertmanager:
+    image: prom/alertmanager:latest
+    container_name: alertmanager
+    ports:
+      - "9093:9093"
+    volumes:
+      - ./infra/monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+    restart: unless-stopped
+    networks:
+      - monitoring
+
+  loki:
+    image: grafana/loki:latest
+    container_name: loki
+    ports:
+      - "3100:3100"
+    command: -config.file=/etc/loki/local-config.yaml
     restart: unless-stopped
     networks:
       - monitoring
@@ -36,6 +61,7 @@ services:
       - monitoring
     depends_on:
       - prometheus
+      - loki
 
 volumes:
   prometheus_data:
diff --git a/docs/monitoring.md b/docs/monitoring.md
index a381c75..be2d678 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -1,79 +1,15 @@
-# Monitoring Setup
+# Monitoring
 
-Brain Storm uses Prometheus for metrics collection and Grafana for visualization.
+> **See the full observability guide: [docs/observability.md](./observability.md)**
+
+This document has been superseded by the observability guide, which covers distributed tracing (OpenTelemetry), metrics (Prometheus), log aggregation (Winston + Loki), alerting (Alertmanager), and Grafana dashboards.
 
 ## Quick Start
 
-1. Start the monitoring stack:
 ```bash
-docker-compose -f docker-compose.monitoring.yml up -d
+docker compose -f docker-compose.monitoring.yml up -d
 ```
 
-2. Access the dashboards:
-- Grafana: http://localhost:3002 (admin/admin)
-- Prometheus: http://localhost:9090
-
-## Available Metrics
-
-### HTTP Metrics
-- `http_requests_total` - Total HTTP requests by method, route, and status code
-
-### Business Metrics
-- `credential_issued_total` - Credentials issued by type
-- `bst_minted_total` - BST tokens minted by user
-
-### Performance Metrics
-- `stellar_rpc_latency_seconds` - Stellar RPC call latency histogram
-
-### System Metrics (default)
-- `process_cpu_user_seconds_total` - CPU usage
-- `process_resident_memory_bytes` - Memory usage
-- `nodejs_eventloop_lag_seconds` - Event loop lag
-- `nodejs_heap_size_total_bytes` - Heap size
-
-## Grafana Dashboard
-
-A pre-built dashboard is automatically provisioned at startup showing:
-- HTTP request rates
-- Credential issuance stats
-- BST token minting stats
-- Stellar RPC latency percentiles
-- Node.js memory usage
-
-## Custom Metrics
-
-To add custom metrics, use the `MetricsService`:
-
-```typescript
-import { MetricsService } from './metrics/metrics.service';
-
-constructor(private metricsService: MetricsService) {}
-
-// Increment counters
-this.metricsService.incrementCredentialIssued('course-completion');
-this.metricsService.incrementBstMinted(userId);
-
-// Observe latency
-const start = Date.now();
-// ... perform operation
-const duration = (Date.now() - start) / 1000;
-this.metricsService.observeStellarRpcLatency('submitTransaction', 'success', duration);
-```
-
-## Production Deployment
-
-In production, configure Prometheus to scrape the `/metrics` endpoint:
-
-```yaml
-scrape_configs:
-  - job_name: 'brain-storm-backend'
-    static_configs:
-      - targets: ['backend:3000']
-```
-
-## Alerting
-
-Configure Prometheus alerting rules in `infra/monitoring/prometheus.yml` for:
-- High error rates (5xx responses)
-- Slow Stellar RPC calls (p95 > 2s)
-- Memory leaks (continuous memory growth)
+- Grafana:      http://localhost:3002 (admin/admin)
+- Prometheus:   http://localhost:9090
+- Alertmanager: http://localhost:9093
diff --git a/docs/observability.md b/docs/observability.md
new file mode 100644
index 0000000..35b8373
--- /dev/null
+++ b/docs/observability.md
@@ -0,0 +1,166 @@
+# Observability
+
+Brain-Storm uses a full observability stack: **distributed tracing** (OpenTelemetry), **metrics** (Prometheus), **log aggregation** (Winston + Loki), and **alerting** (Prometheus Alertmanager) — all visualized in **Grafana**.
+
+---
+
+## Quick Start
+
+```bash
+# Start the full monitoring stack
+docker compose -f docker-compose.monitoring.yml up -d
+
+# Access dashboards
+# Grafana:      http://localhost:3002  (admin / admin)
+# Prometheus:   http://localhost:9090
+# Alertmanager: http://localhost:9093
+# Loki:         http://localhost:3100
+```
+
+---
+
+## Distributed Tracing (OpenTelemetry)
+
+The backend auto-instruments HTTP, Express, PostgreSQL, and Redis using the OpenTelemetry Node.js SDK.
+
+### Configuration
+
+| Env Var | Description | Default |
+|---|---|---|
+| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP trace collector URL | _(disabled)_ |
+
+When `OTEL_EXPORTER_OTLP_ENDPOINT` is not set, tracing runs in no-op mode (zero overhead). Set it to send traces to Jaeger, Tempo, or any OTLP-compatible backend.
+
+### Custom Spans
+
+Inject `TracingService` to create custom spans:
+
+```typescript
+import { TracingService } from './tracing';
+
+constructor(private tracing: TracingService) {}
+
+async issueCredential(userId: string) {
+  return this.tracing.withSpan('credential.issue', async (span) => {
+    span.setAttribute('user.id', userId);
+    // ... your logic
+  });
+}
+```
+
+---
+
+## Metrics (Prometheus)
+
+Metrics are exposed at `GET /metrics` (Prometheus scrape format).
+
+### Available Metrics
+
+| Metric | Type | Description |
+|---|---|---|
+| `http_requests_total` | Counter | HTTP requests by method, route, status_code |
+| `http_request_duration_seconds` | Histogram | HTTP request duration |
+| `credential_issued_total` | Counter | Credentials issued by type |
+| `bst_minted_total` | Counter | BST tokens minted by user |
+| `stellar_rpc_latency_seconds` | Histogram | Stellar RPC call latency |
+| `enrollments_total` | Counter | Course enrollments by course_id |
+| `course_completions_total` | Counter | Course completions by course_id |
+| `auth_attempts_total` | Counter | Auth attempts by type and status |
+| `active_connections` | Gauge | Active HTTP connections |
+| `nodejs_heap_size_used_bytes` | Gauge | Node.js heap memory used |
+| `nodejs_eventloop_lag_seconds` | Gauge | Event loop lag |
+
+### Recording Business Metrics
+
+```typescript
+import { MetricsService } from './metrics/metrics.service';
+
+// In your service
+this.metricsService.incrementCredentialIssued('course-completion');
+this.metricsService.incrementEnrollments(courseId);
+this.metricsService.incrementAuthAttempts('login', 'success');
+this.metricsService.observeStellarRpcLatency('submitTransaction', 'success', durationSeconds);
+```
+
+---
+
+## Log Aggregation (Winston + Loki)
+
+Structured JSON logs are written to stdout in production. They can optionally be shipped to Grafana Loki as well.
+
+### Configuration
+
+| Env Var | Description | Default |
+|---|---|---|
+| `LOG_LEVEL` | Minimum log level | `info` |
+| `LOKI_URL` | Loki base URL (e.g. `http://loki:3100`) | _(disabled)_ |
+
+Set `LOKI_URL=http://loki:3100` to enable log shipping. Logs are batched and sent every 5 seconds with labels `app=brain-storm-backend` and `env=<NODE_ENV>`.
+
+### Log Format
+
+In production (`NODE_ENV=production`), logs are emitted as JSON:
+
+```json
+{
+  "timestamp": "2026-05-29T18:00:00.000Z",
+  "level": "info",
+  "message": "Health check completed",
+  "context": "HealthController",
+  "status": "ok"
+}
+```
+
+In development, logs use a human-readable colored format.
+
+---
+
+## Alerting (Prometheus + Alertmanager)
+
+Alert rules are defined in `infra/monitoring/alerts.yml`.
+
+### Alert Rules
+
+| Alert | Severity | Condition |
+|---|---|---|
+| `HighErrorRate` | critical | 5xx rate > 5% for 2m |
+| `SlowHttpResponses` | warning | p95 latency > 1s for 5m |
+| `SlowStellarRpc` | warning | Stellar RPC p95 > 2s for 5m |
+| `HighMemoryUsage` | warning | Heap > 400MB for 5m |
+| `HighEventLoopLag` | warning | Event loop lag > 100ms for 2m |
+| `ServiceDown` | critical | Backend unreachable for 1m |
+| `HighAuthFailureRate` | warning | Auth failure rate > 30% for 5m |
+
+### Alertmanager Configuration
+
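+Set the webhook `url` of each receiver in `infra/monitoring/alertmanager.yml` to route alerts to Slack, PagerDuty, or any webhook receiver. Alertmanager does not expand environment variables in its configuration file, so the URL must be written there literally (or rendered by a templating step before deployment).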
+
+---
+
+## Grafana Dashboards
+
+Two dashboards are auto-provisioned at startup:
+
+| Dashboard | Description |
+|---|---|
+| **Brain-Storm Overview** | Request rate, error rate, latency, business metrics, memory, event loop |
+| **NestJS Metrics** | HTTP requests, credentials, BST tokens, Stellar RPC latency |
+
+Access at `http://localhost:3002` (credentials: `admin` / `admin`).
+
+---
+
+## Infrastructure
+
+```
+docker-compose.monitoring.yml
+├── prometheus    :9090  — metrics scraping + alerting rules
+├── alertmanager  :9093  — alert routing and notification
+├── loki          :3100  — log aggregation
+└── grafana       :3002  — dashboards (Prometheus + Loki datasources)
+```
+
+Alert rules: `infra/monitoring/alerts.yml`  
+Alertmanager config: `infra/monitoring/alertmanager.yml`  
+Prometheus config: `infra/monitoring/prometheus.yml`  
+Grafana dashboards: `infra/monitoring/grafana/dashboards/`
diff --git a/infra/monitoring/alertmanager.yml b/infra/monitoring/alertmanager.yml
new file mode 100644
index 0000000..102da84
--- /dev/null
+++ b/infra/monitoring/alertmanager.yml
@@ -0,0 +1,39 @@
+# Alertmanager routing for the Brain-Storm monitoring stack.
+# Alertmanager does not expand environment variables or shell-style defaults in this file,
+# so every URL below must be a literal value (or be rendered by a templating step at deploy time).
+global:
+  resolve_timeout: 5m
+
+route:
+  group_by: ['alertname', 'severity']
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 12h
+  receiver: 'default'
+  routes:
+    # Critical alerts use a dedicated receiver and are re-sent every hour instead of every 12h.
+    - match:
+        severity: critical
+      receiver: 'critical'
+      repeat_interval: 1h
+
+receivers:
+  - name: 'default'
+    webhook_configs:
+      # Placeholder: replace with the URL of the real webhook receiver.
+      - url: 'http://localhost:9000/webhook'
+        send_resolved: true
+
+  - name: 'critical'
+    webhook_configs:
+      # Placeholder: replace with the URL of the real webhook receiver.
+      - url: 'http://localhost:9000/webhook'
+        send_resolved: true
+
+inhibit_rules:
+  # Suppress a warning while a critical alert with the same alertname is firing.
+  - source_match:
+      severity: 'critical'
+    target_match:
+      severity: 'warning'
+    equal: ['alertname']
diff --git a/infra/monitoring/alerts.yml b/infra/monitoring/alerts.yml
new file mode 100644
index 0000000..39c1d3a
--- /dev/null
+++ b/infra/monitoring/alerts.yml
@@ -0,0 +1,83 @@
+groups:
+  - name: brain-storm-backend
+    interval: 30s
+    rules:
+      # High HTTP error rate (5xx)
+      - alert: HighErrorRate
+        expr: |
+          sum(rate(http_requests_total{status_code=~"5.."}[5m]))
+          /
+          sum(rate(http_requests_total[5m])) > 0.05
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High HTTP 5xx error rate"
+          description: "Error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
+
+      # Slow HTTP responses (p95 > 1s)
+      - alert: SlowHttpResponses
+        expr: |
+          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))
+          > 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Slow HTTP responses on {{ $labels.route }}"
+          description: "p95 latency is {{ $value }}s on route {{ $labels.route }}."
+
+      # Slow Stellar RPC calls (p95 > 2s)
+      - alert: SlowStellarRpc
+        expr: |
+          histogram_quantile(0.95, sum(rate(stellar_rpc_latency_seconds_bucket[5m])) by (le, method))
+          > 2
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Slow Stellar RPC: {{ $labels.method }}"
+          description: "Stellar RPC p95 latency is {{ $value }}s for method {{ $labels.method }}."
+
+      # High memory usage (heap > 400MB)
+      - alert: HighMemoryUsage
+        expr: nodejs_heap_size_used_bytes > 400 * 1024 * 1024
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High Node.js heap memory usage"
+          description: "Heap usage is {{ $value | humanize1024 }}B."
+
+      # Event loop lag > 100ms
+      - alert: HighEventLoopLag
+        expr: nodejs_eventloop_lag_seconds > 0.1
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High Node.js event loop lag"
+          description: "Event loop lag is {{ $value }}s."
+
+      # Service down (no scrape data)
+      - alert: ServiceDown
+        expr: up{job="brain-storm-backend"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Brain-Storm backend is down"
+          description: "The backend service has been unreachable for more than 1 minute."
+
+      # High authentication failure rate
+      - alert: HighAuthFailureRate
+        expr: |
+          sum(rate(auth_attempts_total{status="failure"}[5m]))
+          /
+          sum(rate(auth_attempts_total[5m])) > 0.3
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High authentication failure rate"
+          description: "Auth failure rate is {{ $value | humanizePercentage }}."
diff --git a/infra/monitoring/grafana/dashboards/brain-storm-overview.json b/infra/monitoring/grafana/dashboards/brain-storm-overview.json
new file mode 100644
index 0000000..450cb9a
--- /dev/null
+++ b/infra/monitoring/grafana/dashboards/brain-storm-overview.json
@@ -0,0 +1,184 @@
+{
+  "uid": "brain-storm-overview",
+  "title": "Brain-Storm - Overview",
+  "tags": ["brain-storm", "overview"],
+  "timezone": "browser",
+  "refresh": "30s",
+  "schemaVersion": 38,
+  "version": 1,
+  "panels": [
+    {
+      "id": 1,
+      "title": "Request Rate (req/s)",
+      "type": "timeseries",
+      "gridPos": { "x": 0, "y": 0, "w": 8, "h": 8 },
+      "targets": [
+        {
+          "expr": "sum(rate(http_requests_total[5m])) by (method)",
+          "legendFormat": "{{ method }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "reqps" }
+      }
+    },
+    {
+      "id": 2,
+      "title": "Error Rate (%)",
+      "type": "timeseries",
+      "gridPos": { "x": 8, "y": 0, "w": 8, "h": 8 },
+      "targets": [
+        {
+          "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
+          "legendFormat": "5xx Error Rate"
+        },
+        {
+          "expr": "sum(rate(http_requests_total{status_code=~\"4..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
+          "legendFormat": "4xx Error Rate"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "percent" }
+      }
+    },
+    {
+      "id": 3,
+      "title": "HTTP p95 Latency (s)",
+      "type": "timeseries",
+      "gridPos": { "x": 16, "y": 0, "w": 8, "h": 8 },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
+          "legendFormat": "p95"
+        },
+        {
+          "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
+          "legendFormat": "p50"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "s" }
+      }
+    },
+    {
+      "id": 4,
+      "title": "Credentials Issued",
+      "type": "stat",
+      "gridPos": { "x": 0, "y": 8, "w": 6, "h": 4 },
+      "targets": [
+        {
+          "expr": "sum(credential_issued_total)",
+          "legendFormat": "Total"
+        }
+      ]
+    },
+    {
+      "id": 5,
+      "title": "BST Tokens Minted",
+      "type": "stat",
+      "gridPos": { "x": 6, "y": 8, "w": 6, "h": 4 },
+      "targets": [
+        {
+          "expr": "sum(bst_minted_total)",
+          "legendFormat": "Total"
+        }
+      ]
+    },
+    {
+      "id": 6,
+      "title": "Course Enrollments",
+      "type": "stat",
+      "gridPos": { "x": 12, "y": 8, "w": 6, "h": 4 },
+      "targets": [
+        {
+          "expr": "sum(enrollments_total)",
+          "legendFormat": "Total"
+        }
+      ]
+    },
+    {
+      "id": 7,
+      "title": "Course Completions",
+      "type": "stat",
+      "gridPos": { "x": 18, "y": 8, "w": 6, "h": 4 },
+      "targets": [
+        {
+          "expr": "sum(course_completions_total)",
+          "legendFormat": "Total"
+        }
+      ]
+    },
+    {
+      "id": 8,
+      "title": "Stellar RPC Latency p95 (s)",
+      "type": "timeseries",
+      "gridPos": { "x": 0, "y": 12, "w": 12, "h": 8 },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(stellar_rpc_latency_seconds_bucket[5m])) by (le, method))",
+          "legendFormat": "{{ method }} p95"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "s" }
+      }
+    },
+    {
+      "id": 9,
+      "title": "Node.js Memory (bytes)",
+      "type": "timeseries",
+      "gridPos": { "x": 12, "y": 12, "w": 12, "h": 8 },
+      "targets": [
+        {
+          "expr": "nodejs_heap_size_used_bytes",
+          "legendFormat": "Heap Used"
+        },
+        {
+          "expr": "nodejs_heap_size_total_bytes",
+          "legendFormat": "Heap Total"
+        },
+        {
+          "expr": "process_resident_memory_bytes",
+          "legendFormat": "RSS"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "bytes" }
+      }
+    },
+    {
+      "id": 10,
+      "title": "Event Loop Lag (s)",
+      "type": "timeseries",
+      "gridPos": { "x": 0, "y": 20, "w": 12, "h": 8 },
+      "targets": [
+        {
+          "expr": "nodejs_eventloop_lag_seconds",
+          "legendFormat": "Event Loop Lag"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "s" }
+      }
+    },
+    {
+      "id": 11,
+      "title": "Auth Attempts",
+      "type": "timeseries",
+      "gridPos": { "x": 12, "y": 20, "w": 12, "h": 8 },
+      "targets": [
+        {
+          "expr": "sum(rate(auth_attempts_total{status=\"success\"}[5m])) by (type)",
+          "legendFormat": "{{ type }} success"
+        },
+        {
+          "expr": "sum(rate(auth_attempts_total{status=\"failure\"}[5m])) by (type)",
+          "legendFormat": "{{ type }} failure"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "reqps" }
+      }
+    }
+  ]
+}
diff --git a/infra/monitoring/grafana/provisioning/datasources/prometheus.yml b/infra/monitoring/grafana/provisioning/datasources/prometheus.yml
index 1a57b69..e90e251 100644
--- a/infra/monitoring/grafana/provisioning/datasources/prometheus.yml
+++ b/infra/monitoring/grafana/provisioning/datasources/prometheus.yml
@@ -7,3 +7,12 @@ datasources:
     url: http://prometheus:9090
     isDefault: true
     editable: true
+
+  - name: Loki
+    type: loki
+    access: proxy
+    url: http://loki:3100
+    isDefault: false
+    editable: true
+    jsonData:
+      maxLines: 1000
diff --git a/infra/monitoring/prometheus.yml b/infra/monitoring/prometheus.yml
index 1ba9d99..61c1b8b 100644
--- a/infra/monitoring/prometheus.yml
+++ b/infra/monitoring/prometheus.yml
@@ -2,6 +2,15 @@ global:
   scrape_interval: 15s
   evaluation_interval: 15s
 
+rule_files:
+  - /etc/prometheus/alerts.yml
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - alertmanager:9093
+
 scrape_configs:
   - job_name: 'brain-storm-backend'
     static_configs: