diff --git a/.env b/.env index 1f9841eb..9e143150 100644 --- a/.env +++ b/.env @@ -22,6 +22,15 @@ OPENSEARCH_HOST=opensearch OPENSEARCH_PORT=9200 OPENSEARCH_PROTOCOL=https OPENSEARCH_JAVA_OPTS=-Xms1g -Xmx1g +# Endpoint written into the `local_cluster` data-source saved object that the +# init container seeds into OpenSearch. Point it at the host-reachable port +# (`https://localhost:9200`, published by the compose file) when running +# OpenSearch Dashboards on the host — the host-side OSD process cannot +# resolve the docker-compose service name `opensearch`, so any MDS-scoped +# OSD feature that dials this SO's endpoint would fail with +# `getaddrinfo ENOTFOUND opensearch`. Leave blank/commented when OSD itself +# runs inside the compose network. +OSD_DATASOURCE_ENDPOINT=https://localhost:9200 # OpenSearch Dashboards Configuration OPENSEARCH_DASHBOARDS_VERSION=3.7.0 @@ -49,11 +58,20 @@ DATA_PREPPER_HTTP_PORT=21892 ISM_RETENTION_DAYS=7 # Prometheus Configuration +# The "prometheus" service now runs Cortex under the hood (see docker-compose.yml), +# which is wire-compatible for remote-write/query/ruler/alertmanager APIs. +# PROMETHEUS_VERSION is retained for legacy references; the actual image tag +# comes from CORTEX_VERSION below. PROMETHEUS_VERSION=v3.8.1 +CORTEX_VERSION=v1.18.1 PROMETHEUS_HOST=prometheus.observability-stack-network PROMETHEUS_PORT=9090 PROMETHEUS_RETENTION=15d +# Alertmanager Configuration +ALERTMANAGER_VERSION=v0.27.0 +ALERTMANAGER_PORT=9093 + # Resource Limits OPENSEARCH_MEMORY_LIMIT=2G PROMETHEUS_MEMORY_LIMIT=500M @@ -62,6 +80,7 @@ DATA_PREPPER_MEMORY_LIMIT=1G DASHBOARDS_MEMORY_LIMIT=2G WEATHER_AGENT_MEMORY_LIMIT=200M CANARY_MEMORY_LIMIT=100M +ALERTMANAGER_MEMORY_LIMIT=128M # Network Configuration NETWORK_NAME=observability-stack-network @@ -110,6 +129,15 @@ OTEL_RESOURCE_ATTRIBUTES=service.namespace=opentelemetry-demo,service.version=${ # Metrics Temporality OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=cumulative +# Enable metrics + logs export on every OTel-instrumented service (Node.js, +# Python, Go, .NET, Java, Rust). Without this, Node.js SDKs in particular +# default to NOT exporting metrics even when traces are being emitted — so +# the frontend container would only show nodejs_* runtime metrics in Cortex +# and no http_server_duration_* counters. "otlp" matches the existing trace +# exporter so all three signals go to the same collector pipeline. 
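As a quick sanity check once these exporters are enabled, the new per-service counters can be read straight from Cortex's Prometheus-compatible API. A minimal sketch, assuming the host port mapping from this compose file (Cortex published on localhost:9090) and the `/prometheus` API prefix described later in this patch; the metric-name regex is illustrative:

```python
import requests

# Cortex serves the Prometheus query API under the /prometheus prefix
# (the ruler and alertmanager admin APIs live at the unprefixed root).
CORTEX_QUERY_URL = "http://localhost:9090/prometheus/api/v1/query"

# Instant query for any series whose name starts with http_server_duration;
# adjust the regex if your SDKs emit differently named HTTP metrics.
resp = requests.get(
    CORTEX_QUERY_URL,
    params={"query": '{__name__=~"http_server_duration.*"}'},
    timeout=10,
)
resp.raise_for_status()
series = resp.json()["data"]["result"]
print(f"found {len(series)} http_server_duration series")
```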
+OTEL_METRICS_EXPORTER=otlp +OTEL_LOGS_EXPORTER=otlp + # OTLP Endpoints OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_GRPC} PUBLIC_OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:8080/otlp-http/v1/traces diff --git a/README.md b/README.md index a234363b..6455523f 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,9 @@ Observability Stack is an open-source stack designed for modern distributed syst - **OpenTelemetry Collector**: Receives OTLP data and routes it to appropriate backends - **Data Prepper**: Transforms and enriches logs and traces before storage - **OpenSearch**: Stores and indexes logs and traces for search and analysis -- **Prometheus**: Stores time-series metrics data -- **OpenSearch Dashboards**: Provides web-based visualization and exploration +- **Prometheus**: Stores time-series metrics data — runs the Cortex engine under the service name `prometheus` (same API surface, plus Ruler and Alertmanager endpoints) +- **Alertmanager**: Routes alerts from Cortex-side PromQL rules to notification channels +- **OpenSearch Dashboards**: Provides web-based visualization and exploration — includes the Alert Manager UI for viewing both OpenSearch monitors and Cortex alerts in one place - **PPL (Piped Processing Language)**: Native query language for logs and traces — pipe-based, human-readable, 50+ commands ## See it in action @@ -148,6 +149,20 @@ To stop the stack and remove all data volumes: docker compose down -v ``` +## Upgrading from Previous Releases + +This release swaps vanilla Prometheus for Cortex (kept under the same `prometheus` service name) and adds an always-on Alertmanager. Existing deployments can upgrade in place, with two caveats worth calling out: + +- **Historical metrics do not carry over.** Cortex writes to a different on-disk layout (`/data/tsdb`, `/data/ruler-storage`) than vanilla Prometheus (`/prometheus/chunks_head`, `/prometheus/wal`). Cortex does not read the old TSDB blocks, so any metrics stored in the `prometheus-data` volume before the upgrade are unreadable after it. New OTLP writes work immediately. +- **The in-place upgrade migrates OSD state automatically**, but if you prefer a clean slate, wipe volumes before bringing the new stack up: + ```bash + docker compose down -v + docker compose up -d + ``` + The `docker compose down -v` path is the safest if you're on an older build. The automatic migration reconciles the `ObservabilityStack_Prometheus` datasource to add the new `prometheus.ruler.uri` / `alertmanager.uri` properties, cleans up the old saved-object wrapper, and removes stale vanilla-Prometheus directories from the data volume on first Cortex boot. + +See [Alerting](docs/starlight-docs/src/content/docs/alerting/index.md) for a tour of the new Cortex rules, Alertmanager routing, and the Alert Manager UI in OpenSearch Dashboards. + ## Instrumenting Your Agent Observability Stack accepts telemetry data via the OpenTelemetry Protocol (OTLP) and follows the [OpenTelemetry Gen-AI Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/) for standardized attribute naming and structure for AI agents. 
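For example, a minimal Python service or agent can hand traces to the stack's OTLP gRPC receiver on port 4317 (see the port table below). A sketch assuming the `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-grpc` packages are installed; the service name `my-agent` is a placeholder:

```python
from opentelemetry import trace
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

# Point the SDK at the stack's OTLP gRPC receiver (plaintext, no TLS).
provider = TracerProvider(resource=Resource.create({"service.name": "my-agent"}))
provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317", insecure=True))
)
trace.set_tracer_provider(provider)

tracer = trace.get_tracer(__name__)
with tracer.start_as_current_span("handle-request"):
    pass  # agent logic goes here; spans reach OpenSearch via the collector and Data Prepper
```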
@@ -264,8 +279,9 @@ docker compose ps |------|---------|----------|-------------| | **4317** | OTel Collector | gRPC | OTLP gRPC receiver — used by most OpenTelemetry SDKs | | **4318** | OTel Collector | HTTP | OTLP HTTP receiver — used by Strands SDK, browser-based exporters | -| **5601** | OpenSearch Dashboards | HTTP | Web UI for logs, traces, and dashboards | -| **9090** | Prometheus | HTTP | Prometheus Web UI and API | +| **5601** | OpenSearch Dashboards | HTTP | Web UI for logs, traces, dashboards, and Alert Manager | +| **9090** | Prometheus (Cortex) | HTTP | PromQL query API (`/prometheus/...`) and Ruler admin API (`/api/v1/rules/...`) | +| **9093** | Alertmanager | HTTP | Alert routing UI and API for Cortex-side PromQL alerts | | **9200** | OpenSearch | HTTPS | REST API (self-signed cert, use `curl -k`) | | **21890** | Data Prepper | gRPC | Internal OTLP receiver (from OTel Collector) | diff --git a/docker-compose.otel-demo.yml b/docker-compose.otel-demo.yml index 9c74c898..ae7c6c13 100644 --- a/docker-compose.otel-demo.yml +++ b/docker-compose.otel-demo.yml @@ -39,6 +39,8 @@ services: environment: - KAFKA_ADDR - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=accounting @@ -68,6 +70,8 @@ services: - FLAGD_HOST - FLAGD_PORT - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_LOGS_EXPORTER=otlp @@ -99,6 +103,8 @@ services: - FLAGD_PORT - VALKEY_ADDR - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=cart @@ -137,6 +143,8 @@ services: - KAFKA_ADDR - GOMEMLIMIT=16MiB - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=checkout @@ -178,6 +186,8 @@ services: - IPV6_ENABLED - VERSION=${IMAGE_VERSION} - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=currency @@ -204,6 +214,8 @@ services: - FLAGD_HOST - FLAGD_PORT - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=email @@ -227,6 +239,8 @@ services: - FLAGD_PORT - KAFKA_ADDR - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_INSTRUMENTATION_KAFKA_EXPERIMENTAL_SPAN_ATTRIBUTES=true - OTEL_INSTRUMENTATION_MESSAGING_EXPERIMENTAL_RECEIVE_TELEMETRY_ENABLED=true @@ -263,6 +277,8 @@ services: - RECOMMENDATION_ADDR - SHIPPING_ADDR - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_RESOURCE_ATTRIBUTES - ENV_PLATFORM - OTEL_SERVICE_NAME=frontend @@ -387,6 +403,8 @@ services: - LOCUST_AUTOSTART - LOCUST_BROWSER_TRAFFIC_ENABLED=false - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - 
OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=load-generator @@ -420,6 +438,8 @@ services: - FLAGD_HOST - FLAGD_PORT - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=payment @@ -448,6 +468,8 @@ services: - FLAGD_PORT - GOMEMLIMIT=16MiB - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=product-catalog @@ -478,6 +500,8 @@ services: - PRODUCT_REVIEWS_PORT - OTEL_PYTHON_LOG_CORRELATION=true - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=product-reviews @@ -518,6 +542,8 @@ services: environment: - IPV6_ENABLED - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_PHP_AUTOLOAD_ENABLED=true - QUOTE_PORT @@ -548,6 +574,8 @@ services: - FLAGD_PORT - OTEL_PYTHON_LOG_CORRELATION=true - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=recommendation @@ -578,6 +606,8 @@ services: - SHIPPING_PORT - QUOTE_ADDR - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=shipping - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE @@ -631,6 +661,8 @@ services: environment: - FLAGD_UI_PORT - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=flagd-ui @@ -661,6 +693,8 @@ services: - KAFKA_LISTENERS=PLAINTEXT://${KAFKA_HOST}:9092,CONTROLLER://${KAFKA_HOST}:9093 - KAFKA_CONTROLLER_QUORUM_VOTERS=1@${KAFKA_HOST}:9093 - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=kafka @@ -731,4 +765,54 @@ services: <<: *network logging: *logging + # ****************** + # Demo-only Alerting Extensions + # ****************** + # Demo-only rule loader. Runs alongside the base cortex-rules-init container + # (defined in docker-compose.yml) and loads the `otel_demo` Cortex namespace + # only. Named separately rather than overlaying the base service because + # Docker Compose >= v2.38 rejects service-name overlays on `include:`-imported + # resources. Both containers hit the same idempotent Ruler upsert API. + cortex-rules-init-otel-demo: + image: python:3.11-alpine + container_name: cortex-rules-init-otel-demo + # `sleep infinity` after success so `docker compose up --wait` is happy. 
+ command: sh -c "pip install requests pyyaml && python /init.py && exec sleep infinity" + depends_on: + prometheus: + condition: service_healthy + volumes: + - ./docker-compose/cortex/init-cortex-rules.py:/init.py + - ./docker-compose/prometheus/rules-otel-demo:/rules/otel_demo:ro + <<: *network + restart: "no" + # Mirror the base cortex-rules-init healthcheck: the script touches + # /tmp/rules-loaded on a clean load, and `--wait` blocks on it so callers + # that query /api/v1/rules/otel_demo after --wait see the rules already in + # Cortex. 40×3s=120s covers pip install + load time. + healthcheck: + test: ["CMD", "test", "-f", "/tmp/rules-loaded"] + interval: 3s + timeout: 2s + retries: 40 + start_period: 10s + logging: *logging + + # OTel Demo Monitors Init - Creates OpenSearch alerting monitors for demo + # traces/logs (checkout, payment, cart, frontend). Idempotent. + otel-demo-monitors-init: + image: python:3.11-alpine + container_name: otel-demo-monitors-init + command: sh -c "pip install requests && python /init.py" + depends_on: + opensearch: + condition: service_healthy + environment: + - OPENSEARCH_USER=${OPENSEARCH_USER} + - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} + volumes: + - ./docker-compose/opentelemetry-demo/init-otel-demo-monitors.py:/init.py + <<: *network + restart: "no" + logging: *logging diff --git a/docker-compose.yml b/docker-compose.yml index 98c4d685..b51b7794 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,6 +26,8 @@ networks: volumes: prometheus-data: driver: local + alertmanager-data: + driver: local services: # OpenTelemetry Collector - Receives telemetry data via OTLP protocol @@ -110,30 +112,38 @@ services: memory: ${DATA_PREPPER_MEMORY_LIMIT} logging: *logging - # Prometheus - Time-series database for metrics storage + # Cortex - Prometheus-compatible metrics backend with Ruler + Alertmanager APIs + # Replaces vanilla Prometheus to expose the full Prometheus HTTP API surface + # (query, ruler, alertmanager) at a single endpoint for OpenSearch Dashboards. + # The service name is kept as "prometheus" so PROMETHEUS_HOST/PORT in .env + # continue to work everywhere without changes. prometheus: - image: prom/prometheus:${PROMETHEUS_VERSION} + image: cortexproject/cortex:${CORTEX_VERSION} container_name: prometheus - pull_policy: always + # One-time cleanup shim: if this is the first boot of Cortex on a volume + # that still has vanilla-Prometheus artifacts (/data/chunks_head with no + # /data/tsdb), remove them before starting Cortex so the volume isn't + # polluted with dormant dirs Cortex never reads. Skips on fresh deploys + # (chunks_head absent) and on subsequent restarts (tsdb present). + entrypoint: + - /bin/sh + - -c + - | + if [ ! 
-d /data/tsdb ] && [ -d /data/chunks_head ]; then + echo "First boot after upgrade from vanilla Prometheus — removing stale TSDB artifacts" + rm -rf /data/chunks_head /data/wal /data/wbl /data/lock /data/queries.active + fi + exec /bin/cortex "$$@" + - -- command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - # Retention period from environment variable - - '--storage.tsdb.retention.time=${PROMETHEUS_RETENTION}' - - '--web.console.libraries=/usr/share/prometheus/console_libraries' - - '--web.console.templates=/usr/share/prometheus/consoles' - # Enable remote write receiver for OpenTelemetry Collector - - '--web.enable-remote-write-receiver' - - '--web.enable-lifecycle' - - '--web.route-prefix=/' - - '--enable-feature=exemplar-storage' - - '--web.enable-otlp-receiver' + - '-config.file=/etc/cortex/cortex.yaml' + # Cortex retention mirrors PROMETHEUS_RETENTION. Compactor deletes + # blocks whose max-time is older than this; set to 0 to disable. + - '-compactor.blocks-retention-period=${PROMETHEUS_RETENTION}' volumes: - - ./docker-compose/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml - # Persist metrics data across container restarts - - prometheus-data:/prometheus + - ./docker-compose/cortex/cortex.yaml:/etc/cortex/cortex.yaml:ro + - prometheus-data:/data ports: - # Web UI and API endpoint - "${PROMETHEUS_PORT}:9090" networks: - observability-stack-network @@ -142,6 +152,109 @@ services: resources: limits: memory: ${PROMETHEUS_MEMORY_LIMIT} + healthcheck: + test: ["CMD", "wget", "--tries=1", "--spider", "-q", "http://localhost:9090/ready"] + start_period: 30s + interval: 5s + timeout: 5s + retries: 20 + logging: *logging + + # Prometheus Alertmanager - Alert routing, grouping, deduplication, and silencing. + # Runs whether or not the otel-demo is enabled: the base stack rules (collector + # health, scrape-target health) alert into it, and demo rules alert in when the + # demo overlay is enabled too. The OSD Prometheus datasource's alertmanager.uri + # points at this service's HTTP API. + alertmanager: + image: prom/alertmanager:${ALERTMANAGER_VERSION} + container_name: alertmanager + pull_policy: always + entrypoint: /bin/sh + command: + - -c + - | + cp /tmp/alertmanager.template.yml /tmp/alertmanager.yml && + sed -i 's|OPENSEARCH_USER|'$$OPENSEARCH_USER'|g' /tmp/alertmanager.yml && + sed -i 's|OPENSEARCH_PASSWORD|'$$OPENSEARCH_PASSWORD'|g' /tmp/alertmanager.yml && + exec /bin/alertmanager \ + --config.file=/tmp/alertmanager.yml \ + --storage.path=/alertmanager \ + --web.listen-address=:9093 + volumes: + - ./docker-compose/alertmanager/alertmanager.template.yml:/tmp/alertmanager.template.yml:ro + - alertmanager-data:/alertmanager + ports: + - "${ALERTMANAGER_PORT}:9093" + environment: + - OPENSEARCH_USER=${OPENSEARCH_USER} + - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} + networks: + - observability-stack-network + restart: unless-stopped + deploy: + resources: + limits: + memory: ${ALERTMANAGER_MEMORY_LIMIT} + healthcheck: + test: ["CMD", "wget", "--tries=1", "--spider", "-q", "http://localhost:9093/-/healthy"] + interval: 10s + timeout: 5s + retries: 10 + logging: *logging + + # Cortex Rules Initialization - Loads alerting rules via the Cortex Ruler API. + # Scans /rules//*.yml and POSTs each group. Idempotent. + # Base stack rules (/rules/stack) are always loaded. The otel-demo overlay + # extends this container to also mount /rules/otel_demo. 
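After `docker compose up --wait` returns, the namespaces and groups the init container loaded can be read back from the Ruler admin API. A minimal sketch, assuming the host port mapping from this file and the `auth_enabled: false` setting in `docker-compose/cortex/cortex.yaml` below:

```python
import requests
import yaml

# The ruler's rule CRUD API lives at the unprefixed root of the Cortex server.
resp = requests.get("http://localhost:9090/api/v1/rules", timeout=10)

if resp.status_code == 404:
    # Cortex answers 404 when no rule groups have been loaded yet.
    print("no rule groups loaded")
else:
    resp.raise_for_status()
    # The response is YAML keyed by namespace, each value a list of rule groups.
    namespaces = yaml.safe_load(resp.text) or {}
    for namespace, groups in namespaces.items():
        print(namespace, [g.get("name") for g in groups])
```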
+ cortex-rules-init: + image: python:3.11-alpine + container_name: cortex-rules-init + # `sleep infinity` after the script succeeds so `docker compose up --wait` + # doesn't trip over a clean exit — `--wait` treats any exited container as + # a failure unless a dependent uses `service_completed_successfully`. + command: sh -c "pip install requests pyyaml && python /init.py && exec sleep infinity" + depends_on: + prometheus: + condition: service_healthy + volumes: + - ./docker-compose/cortex/init-cortex-rules.py:/init.py + - ./docker-compose/prometheus/rules-stack:/rules/stack:ro + networks: + - observability-stack-network + restart: "no" + # The init script touches /tmp/rules-loaded after a clean load. Without + # this check `--wait` returns the moment pip starts (container "running"), + # well before rules are actually in Cortex — so any caller that queries + # /api/v1/rules immediately after --wait sees an empty list. 40×3s=120s + # covers pip install + load. + healthcheck: + test: ["CMD", "test", "-f", "/tmp/rules-loaded"] + interval: 3s + timeout: 2s + retries: 40 + start_period: 10s + logging: *logging + + # OpenSearch Stack Monitors Init - Creates alerting monitors that watch the + # health of the observability stack itself (cluster health, etc). Idempotent + # by monitor name. The init script hardcodes https://opensearch:9200, so it + # hard-depends on local OpenSearch — without it there's nothing to target. + opensearch-stack-monitors-init: + image: python:3.11-alpine + container_name: opensearch-stack-monitors-init + command: sh -c "pip install requests && python /init.py" + depends_on: + opensearch: + condition: service_healthy + required: true + environment: + - OPENSEARCH_USER=${OPENSEARCH_USER} + - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} + volumes: + - ./docker-compose/opensearch-dashboards/init/init-stack-monitors.py:/init.py + networks: + - observability-stack-network + restart: "no" logging: *logging # OpenSearch Prometheus Exporter - Exposes OpenSearch metrics for Prometheus scraping @@ -183,11 +296,24 @@ services: - OPENSEARCH_HOST=${OPENSEARCH_HOST} - OPENSEARCH_PORT=${OPENSEARCH_PORT} - OPENSEARCH_PROTOCOL=${OPENSEARCH_PROTOCOL} + # Overrides the endpoint stored on the seeded `local_cluster` data-source + # saved object. Blank default uses the intra-network hostname, which is + # correct when OpenSearch Dashboards runs inside the compose network. + # Set in `.env` (e.g. `https://localhost:9200`) when running OSD on the + # host, since the host process cannot resolve the `opensearch` service + # name — any MDS-scoped OSD feature dialing this SO's endpoint would + # otherwise fail with `getaddrinfo ENOTFOUND opensearch`. + - OSD_DATASOURCE_ENDPOINT=${OSD_DATASOURCE_ENDPOINT:-} - OPENSEARCH_DASHBOARDS_HOST=${OPENSEARCH_DASHBOARDS_HOST} - OPENSEARCH_DASHBOARDS_PORT=${OPENSEARCH_DASHBOARDS_PORT} - OPENSEARCH_DASHBOARDS_PROTOCOL=${OPENSEARCH_DASHBOARDS_PROTOCOL} - PROMETHEUS_HOST=${PROMETHEUS_HOST} - PROMETHEUS_PORT=${PROMETHEUS_PORT} + # alertmanager.uri is set on the Prometheus datasource unconditionally. + # Alertmanager now runs always (defined in docker-compose.yml, not the + # otel-demo overlay), so this URI is always valid. 
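To confirm the wiring end to end, alerts the ruler has handed off can be listed through Alertmanager's v2 API. A small sketch assuming the `ALERTMANAGER_PORT=9093` host mapping from `.env`:

```python
import requests

# Alertmanager's v2 API returns a JSON array of the alerts it currently holds.
resp = requests.get(
    "http://localhost:9093/api/v2/alerts",
    params={"active": "true", "silenced": "false", "inhibited": "false"},
    timeout=10,
)
resp.raise_for_status()
for alert in resp.json():
    labels = alert.get("labels", {})
    print(labels.get("alertname"), labels.get("severity"), alert.get("status", {}).get("state"))
```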
+ - ALERTMANAGER_HOST=alertmanager + - ALERTMANAGER_PORT=${ALERTMANAGER_PORT} - ISM_RETENTION_DAYS=${ISM_RETENTION_DAYS:-7} volumes: - ./docker-compose/opensearch-dashboards/init/init-opensearch-dashboards.py:/init.py diff --git a/docker-compose/alertmanager/alertmanager.template.yml b/docker-compose/alertmanager/alertmanager.template.yml new file mode 100644 index 00000000..eb8dbcdb --- /dev/null +++ b/docker-compose/alertmanager/alertmanager.template.yml @@ -0,0 +1,135 @@ +# Prometheus Alertmanager Configuration +# Alert routing, grouping, and notification management +# +# Alerts from Cortex ruler are routed here for grouping, deduplication, and delivery. +# All receivers index alerts into OpenSearch for persistent history and search. +# +# The OpenSearch index "alertmanager-alerts" is created automatically on first write. +# Browse alerts in OpenSearch Dashboards or query via: +# curl -sk -u admin:$OPENSEARCH_PASSWORD https://localhost:9200/alertmanager-alerts/_search?pretty +# +# To add Slack, PagerDuty, email, or other integrations see: +# https://prometheus.io/docs/alerting/latest/configuration/#receiver + +# Global configuration for all routes +global: + # Time to wait before declaring an alert resolved if not updated + resolve_timeout: 5m + +# Alert routing tree — determines which receiver handles each alert +# Routes are evaluated top-down; the first match wins. +route: + # Group alerts by name and service to reduce notification noise + group_by: ['alertname', 'service_name'] + # Wait before sending initial notification for a new group + group_wait: 30s + # Wait before sending updates to an existing group + group_interval: 5m + # Wait before re-sending a notification for an already-firing alert + repeat_interval: 4h + # Default receiver for all alerts + receiver: 'opensearch-webhook' + + routes: + # ── OTel Demo routes (most specific first) ────────────────────────── + # Critical demo alerts: checkout pipeline failures, payment errors, frontend 5xx + - match: + component: otel-demo + severity: critical + receiver: 'otel-demo-critical' + # Fast notification for user-facing breakage + group_by: ['alertname', 'service'] + group_wait: 10s + group_interval: 1m + repeat_interval: 30m + + # Warning demo alerts: latency degradation, cart errors, high CPU + - match: + component: otel-demo + severity: warning + receiver: 'otel-demo-warning' + group_by: ['alertname', 'service'] + group_wait: 30s + group_interval: 5m + repeat_interval: 2h + +# Notification receivers +# Each receiver indexes into the same OpenSearch index; the alert payload +# carries all labels so you can filter by component/service/severity there. 
+# Credentials must match OPENSEARCH_USER / OPENSEARCH_PASSWORD in .env +receivers: + # Default — catch-all for any unmatched alerts + - name: 'opensearch-webhook' + webhook_configs: + - &opensearch-webhook + url: 'https://opensearch:9200/alertmanager-alerts/_doc' + http_config: + basic_auth: + username: OPENSEARCH_USER + password: OPENSEARCH_PASSWORD + tls_config: + insecure_skip_verify: true + send_resolved: true + + # OTel Demo critical — checkout/payment/frontend failures + - name: 'otel-demo-critical' + webhook_configs: + - *opensearch-webhook + + # OTel Demo warning — latency, cart, ad-service issues + - name: 'otel-demo-warning' + webhook_configs: + - *opensearch-webhook + + # ── Dummy receiver definitions (placeholders — not routed to by default) ── + # These exist so the `amtool check-config` output and any UI showing + # configured receivers demonstrate the shape of a real integration without + # actually calling out to a third party. Drop real credentials in here and + # add a `match:` route above when you want alerts to reach them. + - name: 'dummy-slack' + slack_configs: + # Replace with a real Slack incoming webhook URL: + # https://hooks.slack.com/services/// + - api_url: 'https://example.invalid/slack-webhook-placeholder' + channel: '#alerts-placeholder' + send_resolved: true + title: '[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}' + text: | + {{ range .Alerts }} + *Severity:* {{ .Labels.severity }} + *Service:* {{ .Labels.service_name }} + *Summary:* {{ .Annotations.summary }} + *Description:* {{ .Annotations.description }} + {{ end }} + + - name: 'dummy-email' + email_configs: + - to: 'alerts@example.com' + from: 'alertmanager@observability-stack.local' + smarthost: 'smtp.example.com:587' + auth_username: 'alertmanager@example.com' + auth_password: 'CHANGE_ME' + require_tls: true + send_resolved: true + + - name: 'dummy-pagerduty' + pagerduty_configs: + - routing_key: '00000000000000000000000000000000' + send_resolved: true + severity: '{{ .CommonLabels.severity }}' + description: '{{ .CommonAnnotations.summary }}' + + # No-op receiver — drops the alert on the floor. Useful as a deliberate + # silence destination for noisy alerts you don't want to mute entirely. + - name: 'null' + +# Inhibition rules — suppress lower-severity alerts when higher ones fire +inhibit_rules: + # Suppress otel-demo warnings when a critical alert fires for the same service + - source_match: + component: otel-demo + severity: critical + target_match: + component: otel-demo + severity: warning + equal: ['service'] diff --git a/docker-compose/cortex/cortex.yaml b/docker-compose/cortex/cortex.yaml new file mode 100644 index 00000000..960af7de --- /dev/null +++ b/docker-compose/cortex/cortex.yaml @@ -0,0 +1,88 @@ +# Cortex - Single-process mode for local development +# Provides Prometheus-compatible query API + Ruler API for rule CRUD + +target: all + +# Disable multi-tenancy for local dev (no X-Scope-OrgID header required) +auth_enabled: false + +server: + # Listen on 9090 to match the standard Prometheus port convention + # This ensures PROMETHEUS_PORT in .env works for both host and internal Docker networking + http_listen_port: 9090 + grpc_listen_port: 9095 + +distributor: + # shard_by_all_labels is required when using max_global_series_per_user / + # per_metric limits (see limits block below). Safe to enable in single- + # binary mode. 
+ shard_by_all_labels: true + ring: + instance_addr: 127.0.0.1 + kvstore: + store: inmemory + +ingester: + lifecycler: + ring: + kvstore: + store: inmemory + replication_factor: 1 + address: 127.0.0.1 + final_sleep: 0s + +storage: + engine: blocks + +blocks_storage: + backend: filesystem + filesystem: + dir: /data/blocks + tsdb: + dir: /data/tsdb + bucket_store: + sync_dir: /data/tsdb-sync + +ruler: + enable_api: true + # Route firing alerts to Alertmanager for routing, grouping, and notification + alertmanager_url: http://alertmanager:9093 + ring: + kvstore: + store: inmemory + instance_addr: 127.0.0.1 + +# Use S3-compatible storage via filesystem backend for ruler CRUD support +# The 'local' backend is read-only; 'filesystem' supports full CRUD +ruler_storage: + backend: filesystem + filesystem: + dir: /data/ruler-storage + +compactor: + data_dir: /data/compactor + sharding_ring: + kvstore: + store: inmemory + +store_gateway: + sharding_ring: + replication_factor: 1 + kvstore: + store: inmemory + +# Per-tenant ingestion limits. Defaults cap a single metric at 50k series, +# which span-derived RED metrics can blow through quickly when a noisy +# label is in play. Data-prepper already strips the `randomKey` UUID +# before remote-write, but keep the ceiling generous so experimental +# instrumentation doesn't silently start dropping samples. +limits: + max_global_series_per_metric: 500000 + max_global_series_per_user: 5000000 + ingestion_rate: 100000 + ingestion_burst_size: 200000 + # Cortex's default per-series label cap is 30. Full OTel resource sets for + # JVM/.NET/Node.js exceed that once the collector promotes resource attrs + # to labels (resource_to_telemetry_conversion: true), and the ruler's + # ALERTS series inherits the same labels, so keep the ceiling generous. + max_label_names_per_series: 50 diff --git a/docker-compose/cortex/init-cortex-rules.py b/docker-compose/cortex/init-cortex-rules.py new file mode 100644 index 00000000..52bb84d0 --- /dev/null +++ b/docker-compose/cortex/init-cortex-rules.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +"""Load Prometheus alerting/recording rules into Cortex via the Ruler API. + +This script runs as an init container. It scans /rules/ for subdirectories, +treating each subdirectory name as a Cortex ruler namespace. Every *.yml file +in the subdirectory is parsed and each rule group is POSTed individually. + +Directory layout expected: + /rules/ + stack/ ← namespace "stack" + alerts.yml ← contains groups: stack_health, otel_collector_health, … + otel_demo/ ← namespace "otel_demo" (mounted by otel-demo compose) + otel-demo-alerts.yml ← contains groups: otel_demo_frontend, otel_demo_checkout, … + +The main docker-compose.yml mounts only /rules/stack/. +The otel-demo compose override adds /rules/otel_demo/. +""" + +import glob +import os +import sys +import time + +import requests +import yaml + +CORTEX_URL = os.getenv("CORTEX_URL", "http://prometheus:9090") + +# Fixed 2s poll × 60 attempts = ~2 min to come up. Cortex's single-binary +# startup is consistent, so exponential backoff just slows recovery. 
+READY_POLL_INTERVAL_SECONDS = 2 +READY_POLL_MAX_ATTEMPTS = 60 + + +def wait_for_cortex(): + """Wait for Cortex to report ready, or exit non-zero on timeout.""" + print("⏳ Waiting for Cortex...") + for _ in range(READY_POLL_MAX_ATTEMPTS): + try: + r = requests.get(f"{CORTEX_URL}/ready", timeout=5) + if r.status_code == 200: + print("✅ Cortex is ready") + return + except requests.exceptions.RequestException: + pass + time.sleep(READY_POLL_INTERVAL_SECONDS) + print( + f"❌ Cortex did not become ready at {CORTEX_URL} within " + f"{READY_POLL_INTERVAL_SECONDS * READY_POLL_MAX_ATTEMPTS}s" + ) + sys.exit(1) + + +def load_rules_file(filepath, namespace): + """Upsert every rule group from a YAML file into Cortex. + + Cortex's POST /api/v1/rules/{namespace} is an upsert (HTTP 202 on both + create and replace), so always POST — no existence check, no skip path. + Returns (loaded, failed) counts for this file. + """ + print(f"\n📂 {filepath} → namespace '{namespace}'") + + with open(filepath) as f: + data = yaml.safe_load(f) + + if not data or "groups" not in data: + print(" (no groups found — skipping)") + return 0, 0 + + loaded = 0 + failed = 0 + + for group in data["groups"]: + group_name = group.get("name", "unknown") + rule_count = len(group.get("rules", [])) + + group_yaml = yaml.dump(group, default_flow_style=False) + + try: + r = requests.post( + f"{CORTEX_URL}/api/v1/rules/{namespace}", + headers={"Content-Type": "application/yaml"}, + data=group_yaml, + timeout=10, + ) + if r.status_code == 202: + print(f" ✅ {group_name} ({rule_count} rules) — loaded") + loaded += 1 + else: + print(f" ⚠️ {group_name}: HTTP {r.status_code} — {r.text[:200]}") + failed += 1 + except requests.exceptions.RequestException as e: + print(f" ❌ {group_name}: {e}") + failed += 1 + + return loaded, failed + + +def main(): + wait_for_cortex() + + rules_root = "/rules" + if not os.path.isdir(rules_root): + print(f"No rules directory at {rules_root}") + sys.exit(0) + + total_loaded = 0 + total_failed = 0 + for namespace_dir in sorted(glob.glob(f"{rules_root}/*")): + if not os.path.isdir(namespace_dir): + continue + namespace = os.path.basename(namespace_dir) + + for rules_file in sorted(glob.glob(f"{namespace_dir}/*.yml")): + loaded, failed = load_rules_file(rules_file, namespace) + total_loaded += loaded + total_failed += failed + + print( + f"\n📊 Summary — loaded: {total_loaded}, failed: {total_failed}" + ) + + if total_failed > 0: + sys.exit(1) + + if total_loaded == 0: + print("⚠️ No rule groups loaded") + + # Sentinel file consumed by the compose healthcheck so `docker compose + # up --wait` blocks until rules are actually loaded. Only written when + # no group failed so a partial load doesn't mark the container healthy. + try: + with open("/tmp/rules-loaded", "w") as f: + f.write("ok\n") + except OSError as e: + print(f"⚠️ Could not write /tmp/rules-loaded sentinel: {e}") + + +if __name__ == "__main__": + main() diff --git a/docker-compose/data-prepper/pipelines.template.yaml b/docker-compose/data-prepper/pipelines.template.yaml index 421b16ee..0b489e91 100644 --- a/docker-compose/data-prepper/pipelines.template.yaml +++ b/docker-compose/data-prepper/pipelines.template.yaml @@ -85,7 +85,9 @@ traces-raw-pipeline: index_type: trace-analytics-plain-raw # Service map generation pipeline (APM) -# Builds service dependency maps and RED metrics from trace relationships +# Builds service dependency maps and RED metrics from trace relationships. 
+# Splits into two sub-pipelines so we can strip high-cardinality `randomKey` +# labels from the Cortex branch without polluting the service-map branch. service-map-pipeline: delay: 100 source: @@ -107,10 +109,39 @@ service-map-pipeline: index_type: otel-v2-apm-service-map routes: [otel_apm_service_map_route] insecure: true - # Route RED metrics to local Prometheus via remote write + # Fan out service-derived RED metrics to a dedicated pipeline that strips + # the per-event randomKey UUID before Cortex rejects it for cardinality. + - pipeline: + name: "service-metrics-cortex-pipeline" + routes: [service_processed_metrics] + +# Strips the per-event `randomKey` UUID from span-derived RED metrics +# before the Cortex remote-write sink. Without this, data-prepper tags +# every latency bucket with a fresh UUID, blowing past Cortex's default +# 50k series-per-metric limit in minutes. +service-metrics-cortex-pipeline: + delay: 100 + source: + pipeline: + name: "service-map-pipeline" + processor: + # Drop only the per-event randomKey. Do NOT strip telemetry.sdk.language: + # otel_apm_service_map is configured to group_by it and emits one sample + # per (service, operation, remoteService, sdk.language) per window, so + # removing the label collapses multi-language services onto the same + # series+timestamp and Cortex rejects them as duplicate samples. + # Note: data-prepper event keys use JSON-pointer-style paths; labels + # set by otel_apm_service_map land under /attributes/, so both + # the top-level and attributes-scoped paths are listed to be safe. + - delete_entries: + with_keys: + - "/attributes/randomKey" + - "randomKey" + sink: + # Route RED metrics to local Cortex via remote write. + # Cortex's distributor push endpoint is /api/v1/push (not Prometheus's /api/v1/write). - prometheus: - url: "http://PROMETHEUS_HOST:PROMETHEUS_PORT/api/v1/write" + url: "http://PROMETHEUS_HOST:PROMETHEUS_PORT/api/v1/push" threshold: max_events: 500 - flush_interval: 5s - routes: [service_processed_metrics] \ No newline at end of file + flush_interval: 5s \ No newline at end of file diff --git a/docker-compose/opensearch-dashboards/init/init-opensearch-dashboards.py b/docker-compose/opensearch-dashboards/init/init-opensearch-dashboards.py index bab1d558..69ae18a8 100644 --- a/docker-compose/opensearch-dashboards/init/init-opensearch-dashboards.py +++ b/docker-compose/opensearch-dashboards/init/init-opensearch-dashboards.py @@ -13,6 +13,8 @@ PASSWORD = os.getenv("OPENSEARCH_PASSWORD", "My_password_123!@#") PROMETHEUS_HOST = os.getenv("PROMETHEUS_HOST", "prometheus.observability-stack-network") PROMETHEUS_PORT = os.getenv("PROMETHEUS_PORT", "9090") +ALERTMANAGER_HOST = os.getenv("ALERTMANAGER_HOST", "alertmanager") +ALERTMANAGER_PORT = os.getenv("ALERTMANAGER_PORT", "9093") _opensearch_protocol = os.getenv("OPENSEARCH_PROTOCOL", "https") OPENSEARCH_ENDPOINT = f"{_opensearch_protocol}://{os.getenv('OPENSEARCH_HOST', 'opensearch')}:{os.getenv('OPENSEARCH_PORT', '9200')}" ISM_RETENTION_DAYS = int(os.getenv("ISM_RETENTION_DAYS", "7")) @@ -290,33 +292,77 @@ def get_existing_prometheus_datasource(datasource_name): return None +def get_prometheus_datasource_properties(datasource_name): + """Fetch the full properties map for a Prometheus dataconnection. + + The /api/saved_objects/_find?type=data-connection endpoint only exposes + connectionId + type; the authoritative read for `properties` is the SQL + plugin's /api/dataconnections endpoint. 
+ """ + try: + response = requests.get( + f"{BASE_URL}/api/dataconnections", + auth=(USERNAME, PASSWORD), + headers={"Content-Type": "application/json", "osd-xsrf": "true"}, + verify=False, + timeout=10, + ) + if response.status_code != 200: + print(f"⚠️ GET /api/dataconnections returned {response.status_code}: {response.text[:200]}") + return None + for entry in response.json() or []: + if entry.get("name") == datasource_name: + return entry.get("properties") or {} + return None + except requests.exceptions.RequestException as e: + print(f"⚠️ Error reading dataconnections: {e}") + return None + + def create_prometheus_datasource(workspace_id): """Create Prometheus datasource using direct query API""" datasource_name = "ObservabilityStack_Prometheus" + # Cortex exposes the Prometheus-compatible query API under /prometheus + # (e.g. /prometheus/api/v1/query_range) while the Ruler admin API lives + # at the unprefixed root (/api/v1/rules/{namespace}). The SQL plugin's + # PrometheusClient exposes `prometheus.uri` and `prometheus.ruler.uri` + # exactly for this split — both must be set for query + rule management + # to work against Cortex. + prometheus_endpoint = f"http://{PROMETHEUS_HOST}:{PROMETHEUS_PORT}/prometheus" + ruler_endpoint = f"http://{PROMETHEUS_HOST}:{PROMETHEUS_PORT}" + alertmanager_endpoint = f"http://{ALERTMANAGER_HOST}:{ALERTMANAGER_PORT}" + + desired_properties = { + "prometheus.uri": prometheus_endpoint, + "prometheus.ruler.uri": ruler_endpoint, + "alertmanager.uri": alertmanager_endpoint, + } + # Check if datasource already exists existing_id = get_existing_prometheus_datasource(datasource_name) if existing_id: print(f"✅ Prometheus datasource already exists: {existing_id}") + reconciled = reconcile_prometheus_datasource_properties( + datasource_name, desired_properties + ) + # Reconciliation goes through DELETE + POST, so the saved-object + # id may have changed — re-read before associating. + datasource_id = existing_id + if reconciled: + datasource_id = get_existing_prometheus_datasource(datasource_name) or existing_id # Associate with workspace if provided if workspace_id and workspace_id != "default": - associate_prometheus_with_workspace(workspace_id, existing_id) - return existing_id + associate_prometheus_with_workspace(workspace_id, datasource_id) + return datasource_id print("🔧 Creating Prometheus datasource...") - prometheus_endpoint = f"http://{PROMETHEUS_HOST}:{PROMETHEUS_PORT}" - payload = { "name": datasource_name, "allowedRoles": [], "connector": "prometheus", - "properties": { - "prometheus.uri": prometheus_endpoint, - "prometheus.auth.type": "basicauth", - "prometheus.auth.username": "", - "prometheus.auth.password": "", - }, + "properties": desired_properties, } try: @@ -345,6 +391,9 @@ def create_prometheus_datasource(workspace_id): error_text = response.text if "already exists with name" in error_text: print(f"✅ Prometheus datasource already exists: {datasource_name}") + reconcile_prometheus_datasource_properties( + datasource_name, desired_properties + ) # Fetch the datasource ID and associate datasource_id = get_existing_prometheus_datasource(datasource_name) if datasource_id and workspace_id and workspace_id != "default": @@ -361,6 +410,203 @@ def create_prometheus_datasource(workspace_id): return None +def _delete_stale_data_connection_saved_object(saved_object_id): + """Delete the orphaned data-connection saved-object left behind by the + SQL plugin's DELETE /api/dataconnections/{name} path. 
+ + The SQL plugin removes its own dataconnection record but not the wrapper + OSD saved-object, so without this call an in-place upgrade ends up with + two data-connection saved-objects sharing the same connectionId — one + orphaned (no SQL backing), one live. The orphan pollutes workspace + listings and re-breaks every time the migration runs. + """ + url = f"{BASE_URL}/api/saved_objects/data-connection/{saved_object_id}?force=true" + try: + resp = requests.delete( + url, + auth=(USERNAME, PASSWORD), + headers={"osd-xsrf": "true"}, + verify=False, + timeout=10, + ) + if resp.status_code in (200, 204, 404): + print( + f"🧹 Deleted stale data-connection saved-object {saved_object_id}" + ) + return True + print( + f"⚠️ Failed to delete stale data-connection saved-object " + f"({resp.status_code}): {resp.text[:200]}" + ) + except requests.exceptions.RequestException as e: + print(f"⚠️ Error deleting stale data-connection saved-object: {e}") + return False + + +def _delete_correlations_referencing_data_connection(data_connection_id): + """Remove any correlation saved-objects whose references point at + `data_connection_id`. They'll be re-created idempotently later in the + init flow with the new id, so the net effect is "migrate reference, + not break it". Without this, the APM-config correlation from the pre-PR + install dangles: it still exists, but its references[2].dataConnection.id + points at a saved-object whose SQL-plugin backing is gone. + """ + try: + resp = requests.get( + f"{BASE_URL}/api/saved_objects/_find?type=correlations&per_page=1000", + auth=(USERNAME, PASSWORD), + headers={"osd-xsrf": "true"}, + verify=False, + timeout=10, + ) + if resp.status_code != 200: + print( + f"⚠️ Could not list correlations for dangling-reference scan " + f"({resp.status_code})" + ) + return + + for obj in resp.json().get("saved_objects", []): + refs = obj.get("references") or [] + if not any( + r.get("type") == "data-connection" and r.get("id") == data_connection_id + for r in refs + ): + continue + obj_id = obj.get("id") + workspaces = obj.get("workspaces") or [] + if workspaces and workspaces[0] != "default": + url = f"{BASE_URL}/w/{workspaces[0]}/api/saved_objects/correlations/{obj_id}" + else: + url = f"{BASE_URL}/api/saved_objects/correlations/{obj_id}" + del_resp = requests.delete( + url + "?force=true", + auth=(USERNAME, PASSWORD), + headers={"osd-xsrf": "true"}, + verify=False, + timeout=10, + ) + if del_resp.status_code in (200, 204, 404): + print( + f"🧹 Deleted stale correlation {obj_id} " + f"(referenced pre-migration dataconnection)" + ) + else: + print( + f"⚠️ Failed to delete stale correlation {obj_id} " + f"({del_resp.status_code}): {del_resp.text[:200]}" + ) + except requests.exceptions.RequestException as e: + print(f"⚠️ Error scanning correlations for stale references: {e}") + + +def reconcile_prometheus_datasource_properties(datasource_name, desired_properties): + """Ensure an existing Prometheus datasource carries all desired properties. + + Returns True when the datasource was rewritten (so callers can re-fetch + the saved-object id, which changes across DELETE+POST), False when it + was already in the desired state or when reconciliation could not run. + + Why: in-place upgrades keep the pre-PR datasource (prometheus.uri only) + so the OSD Alert Manager UI silently shows zero alerts because the + alertmanager.uri / prometheus.ruler.uri it needs were never added. 
This + reads the authoritative properties via /api/dataconnections, diffs them + against the desired set, and rewrites only on a mismatch to keep re-runs + idempotent. + + Update strategy: DELETE + POST. The SQL plugin does not expose a PUT/PATCH + update endpoint for Prometheus dataconnections — POST rejects with 400 + "already exists" and PUT/PATCH return 404. DELETE on + /api/dataconnections/{name} succeeds, after which a fresh POST recreates + the dataconnection with the full property set. + + The SQL plugin's DELETE does not remove the wrapping OSD saved-object or + update any correlation that references it, so this function also cleans + up the stale saved-object and any dangling correlation references before + re-POSTing. The correlations are re-created idempotently later in the + init flow against the new id. + """ + current = get_prometheus_datasource_properties(datasource_name) + if current is None: + print( + "⚠️ Could not read Prometheus datasource properties " + f"for '{datasource_name}' — skipping reconciliation" + ) + return False + + missing = [k for k in desired_properties if k not in current] + mismatched = [ + k for k in desired_properties + if k in current and current.get(k) != desired_properties[k] + ] + if not missing and not mismatched: + print("✅ Prometheus datasource properties already up to date") + return False + + if missing: + print(f"🔧 Prometheus datasource missing properties: {missing}") + if mismatched: + print(f"🔧 Prometheus datasource properties changed: {mismatched}") + + # Capture the pre-existing saved-object id BEFORE the SQL-plugin DELETE + # so we can clean up the orphaned saved-object and any correlations that + # still point at it. + stale_saved_object_id = get_existing_prometheus_datasource(datasource_name) + + delete_url = f"{BASE_URL}/api/dataconnections/{datasource_name}" + try: + delete_resp = requests.delete( + delete_url, + auth=(USERNAME, PASSWORD), + headers={"osd-xsrf": "true"}, + verify=False, + timeout=10, + ) + if delete_resp.status_code not in (200, 204, 404): + print( + f"⚠️ Prometheus datasource DELETE failed " + f"({delete_resp.status_code}): {delete_resp.text[:200]}" + ) + return False + except requests.exceptions.RequestException as e: + print(f"⚠️ Error deleting Prometheus datasource: {e}") + return False + + # Remove the now-orphaned saved-object wrapper + any correlations that + # reference it. Best-effort — failures here are logged but don't abort + # the migration, since the subsequent POST still restores a working + # datasource even if cleanup is incomplete. 
+ if stale_saved_object_id: + _delete_correlations_referencing_data_connection(stale_saved_object_id) + _delete_stale_data_connection_saved_object(stale_saved_object_id) + + payload = { + "name": datasource_name, + "allowedRoles": [], + "connector": "prometheus", + "properties": desired_properties, + } + try: + response = requests.post( + f"{BASE_URL}/api/directquery/dataconnections", + auth=(USERNAME, PASSWORD), + headers={"Content-Type": "application/json", "osd-xsrf": "true"}, + json=payload, + verify=False, + timeout=10, + ) + if response.status_code == 200: + print("✅ Recreated Prometheus datasource with updated properties") + return True + print( + f"⚠️ Prometheus datasource recreate after delete failed " + f"({response.status_code}): {response.text[:200]}" + ) + except requests.exceptions.RequestException as e: + print(f"⚠️ Error recreating Prometheus datasource: {e}") + return False + + def associate_prometheus_with_workspace(workspace_id, datasource_id): """Associate Prometheus datasource with workspace""" print(f"🔗 Associating Prometheus datasource with workspace {workspace_id}...") @@ -458,7 +704,11 @@ def create_opensearch_datasource(workspace_id): print("🔧 Creating OpenSearch datasource...") - opensearch_endpoint = OPENSEARCH_ENDPOINT + # OSD_DATASOURCE_ENDPOINT lets operators override the endpoint written + # onto the saved object — useful when OSD runs outside the compose + # network and cannot resolve the `opensearch` service name. Falls back + # to the intra-network hostname when unset. + opensearch_endpoint = os.getenv("OSD_DATASOURCE_ENDPOINT", OPENSEARCH_ENDPOINT) payload = { "attributes": { diff --git a/docker-compose/opensearch-dashboards/init/init-stack-monitors.py b/docker-compose/opensearch-dashboards/init/init-stack-monitors.py new file mode 100644 index 00000000..7d3cce2b --- /dev/null +++ b/docker-compose/opensearch-dashboards/init/init-stack-monitors.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +"""Create OpenSearch alerting monitors that watch the observability stack itself. + +Runs whether or not the otel-demo overlay is enabled. Monitors are idempotent +by name — existing monitors with the same `name` are skipped on re-run. 
+""" + +import os +import time +import requests + +OPENSEARCH_URL = "https://opensearch:9200" +USERNAME = os.getenv("OPENSEARCH_USER", "admin") +PASSWORD = os.getenv("OPENSEARCH_PASSWORD", "My_password_123!@#") + + +def wait_for_opensearch(): + print("Waiting for OpenSearch...") + while True: + try: + response = requests.get( + f"{OPENSEARCH_URL}/_cluster/health", + auth=(USERNAME, PASSWORD), + verify=False, + timeout=5, + ) + if response.status_code == 200: + break + except requests.exceptions.RequestException: + pass + time.sleep(5) + print("OpenSearch is ready") + + +def get_existing_monitor(monitor_name): + try: + response = requests.post( + f"{OPENSEARCH_URL}/_plugins/_alerting/monitors/_search", + auth=(USERNAME, PASSWORD), + headers={"Content-Type": "application/json"}, + json={ + "size": 1, + "query": {"term": {"monitor.name.keyword": monitor_name}} + }, + verify=False, + timeout=10, + ) + if response.status_code == 200: + hits = response.json().get("hits", {}).get("hits", []) + if hits: + return hits[0].get("_id") + return None + except requests.exceptions.RequestException as e: + print(f" Error checking monitor '{monitor_name}': {e}") + return None + + +# Cluster health GREEN is necessary but not sufficient: the alerting plugin's +# internal indices (.opendistro-alerting-*, .opensearch-alerting-*) finish +# allocating ~30-60s later. Until they do, POST /_plugins/_alerting/monitors +# returns 500 with "all shards failed"/"alerting_exception". Retry on those. +MONITOR_CREATE_MAX_ATTEMPTS = 12 +MONITOR_CREATE_RETRY_SLEEP_SECONDS = 5 + + +def create_monitor(monitor_payload): + monitor_name = monitor_payload.get("name", "unknown") + existing_id = get_existing_monitor(monitor_name) + if existing_id: + print(f" Monitor already exists: {monitor_name}") + return existing_id + + last_detail = "" + for attempt in range(1, MONITOR_CREATE_MAX_ATTEMPTS + 1): + try: + response = requests.post( + f"{OPENSEARCH_URL}/_plugins/_alerting/monitors", + auth=(USERNAME, PASSWORD), + headers={"Content-Type": "application/json"}, + json=monitor_payload, + verify=False, + timeout=10, + ) + if response.status_code in (200, 201): + monitor_id = response.json().get("_id") + print(f" Created monitor: {monitor_name}") + return monitor_id + + body = response.text or "" + last_detail = f"HTTP {response.status_code}: {body[:200]}" + transient = ( + 500 <= response.status_code < 600 + or "all shards failed" in body + or "alerting_exception" in body + ) + if transient and attempt < MONITOR_CREATE_MAX_ATTEMPTS: + print( + f" Monitor create attempt {attempt}/{MONITOR_CREATE_MAX_ATTEMPTS} " + f"for '{monitor_name}' got {last_detail} — retrying in " + f"{MONITOR_CREATE_RETRY_SLEEP_SECONDS}s" + ) + time.sleep(MONITOR_CREATE_RETRY_SLEEP_SECONDS) + continue + print(f" Monitor creation failed ({response.status_code}): {body[:200]}") + return None + except requests.exceptions.RequestException as e: + last_detail = f"RequestException: {e}" + if attempt < MONITOR_CREATE_MAX_ATTEMPTS: + print( + f" Monitor create attempt {attempt}/{MONITOR_CREATE_MAX_ATTEMPTS} " + f"for '{monitor_name}' hit {last_detail} — retrying in " + f"{MONITOR_CREATE_RETRY_SLEEP_SECONDS}s" + ) + time.sleep(MONITOR_CREATE_RETRY_SLEEP_SECONDS) + continue + print(f" Error creating monitor '{monitor_name}': {e}") + return None + + print( + f" Monitor creation for '{monitor_name}' exhausted " + f"{MONITOR_CREATE_MAX_ATTEMPTS} attempts; last detail: {last_detail}" + ) + return None + + +def create_stack_monitors(): + """Create alerting monitors for the 
observability stack itself. + + Targets the local OpenSearch cluster (the stack's own trace/log/metric + store). Lives here instead of in the otel-demo overlay so that stack + health is watched whether or not demo workloads are running. + """ + print("Creating Observability Stack health monitors...") + + monitors = [ + # Fires when the OpenSearch cluster health transitions to red, which + # means at least one primary shard is unassigned — traces/logs writes + # for that index will fail until the shard recovers. + # Only red is checked (not yellow): single-node dev clusters are + # always yellow because replicas can't be assigned, so triggering on + # yellow would be a permanent false positive. + { + "type": "monitor", + "name": "Observability Stack - Cluster Health Red", + "monitor_type": "cluster_metrics_monitor", + "enabled": True, + "schedule": {"period": {"interval": 1, "unit": "MINUTES"}}, + "inputs": [{ + "uri": { + "api_type": "CLUSTER_HEALTH", + "path": "/_cluster/health", + "path_params": "", + "url": "" + } + }], + "triggers": [{ + "query_level_trigger": { + "name": "Cluster health is red", + "severity": "1", + "condition": { + "script": { + "source": "ctx.results != null && ctx.results.length > 0 && ctx.results[0].status == 'red'", + "lang": "painless" + } + }, + "actions": [] + } + }] + }, + ] + + created = 0 + for monitor_payload in monitors: + result = create_monitor(monitor_payload) + if result: + created += 1 + + print(f"Processed {created}/{len(monitors)} stack monitors") + return created + + +def main(): + wait_for_opensearch() + create_stack_monitors() + print("Stack monitors initialization complete") + + +if __name__ == "__main__": + main() diff --git a/docker-compose/opensearch-dashboards/opensearch_dashboards.template.yml b/docker-compose/opensearch-dashboards/opensearch_dashboards.template.yml index 990d2304..aa640720 100644 --- a/docker-compose/opensearch-dashboards/opensearch_dashboards.template.yml +++ b/docker-compose/opensearch-dashboards/opensearch_dashboards.template.yml @@ -78,6 +78,9 @@ explore.enabled: true explore.discoverTraces.enabled: true explore.discoverMetrics.enabled: true explore.agentTraces.enabled: true +# Surfaces the Alert Manager UI in the Observability plugin, backed by the +# alertmanager.uri configured on the Prometheus datasource. +observability.alertManager.enabled: true workspace.enabled: true data_source.enabled: true data_source.ssl.verificationMode: none diff --git a/docker-compose/opentelemetry-demo/init-otel-demo-monitors.py b/docker-compose/opentelemetry-demo/init-otel-demo-monitors.py new file mode 100644 index 00000000..215e64df --- /dev/null +++ b/docker-compose/opentelemetry-demo/init-otel-demo-monitors.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 +"""Create OpenSearch alerting monitors for the OpenTelemetry Demo application. + +This script runs as an init container when the otel-demo compose file is enabled. +It creates monitors targeting demo service traces and logs in OpenSearch. + +Monitors are idempotent — existing monitors are skipped on re-run. 
+""" + +import os +import time +import requests + +OPENSEARCH_URL = "https://opensearch:9200" +USERNAME = os.getenv("OPENSEARCH_USER", "admin") +PASSWORD = os.getenv("OPENSEARCH_PASSWORD", "My_password_123!@#") + + +def wait_for_opensearch(): + """Wait for OpenSearch to be ready""" + print("Waiting for OpenSearch...") + while True: + try: + response = requests.get( + f"{OPENSEARCH_URL}/_cluster/health", + auth=(USERNAME, PASSWORD), + verify=False, + timeout=5, + ) + if response.status_code == 200: + break + except requests.exceptions.RequestException: + pass + time.sleep(5) + print("OpenSearch is ready") + + +def get_existing_monitor(monitor_name): + """Check if an alerting monitor with the given name already exists""" + try: + response = requests.post( + f"{OPENSEARCH_URL}/_plugins/_alerting/monitors/_search", + auth=(USERNAME, PASSWORD), + headers={"Content-Type": "application/json"}, + json={ + "size": 1, + "query": {"term": {"monitor.name.keyword": monitor_name}} + }, + verify=False, + timeout=10, + ) + if response.status_code == 200: + hits = response.json().get("hits", {}).get("hits", []) + if hits: + return hits[0].get("_id") + return None + except requests.exceptions.RequestException as e: + print(f" Error checking monitor '{monitor_name}': {e}") + return None + + +# Cluster health GREEN is necessary but not sufficient: the alerting plugin's +# internal indices (.opendistro-alerting-*, .opensearch-alerting-*) finish +# allocating ~30-60s later. Until they do, POST /_plugins/_alerting/monitors +# returns 500 with "all shards failed"/"alerting_exception". Retry on those. +MONITOR_CREATE_MAX_ATTEMPTS = 12 +MONITOR_CREATE_RETRY_SLEEP_SECONDS = 5 + + +def create_monitor(monitor_payload): + """Create an alerting monitor in OpenSearch (idempotent)""" + monitor_name = monitor_payload.get("name", "unknown") + + existing_id = get_existing_monitor(monitor_name) + if existing_id: + print(f" Monitor already exists: {monitor_name}") + return existing_id + + last_detail = "" + for attempt in range(1, MONITOR_CREATE_MAX_ATTEMPTS + 1): + try: + response = requests.post( + f"{OPENSEARCH_URL}/_plugins/_alerting/monitors", + auth=(USERNAME, PASSWORD), + headers={"Content-Type": "application/json"}, + json=monitor_payload, + verify=False, + timeout=10, + ) + if response.status_code in (200, 201): + monitor_id = response.json().get("_id") + print(f" Created monitor: {monitor_name}") + return monitor_id + + body = response.text or "" + last_detail = f"HTTP {response.status_code}: {body[:200]}" + transient = ( + 500 <= response.status_code < 600 + or "all shards failed" in body + or "alerting_exception" in body + ) + if transient and attempt < MONITOR_CREATE_MAX_ATTEMPTS: + print( + f" Monitor create attempt {attempt}/{MONITOR_CREATE_MAX_ATTEMPTS} " + f"for '{monitor_name}' got {last_detail} — retrying in " + f"{MONITOR_CREATE_RETRY_SLEEP_SECONDS}s" + ) + time.sleep(MONITOR_CREATE_RETRY_SLEEP_SECONDS) + continue + print(f" Monitor creation failed ({response.status_code}): {body[:200]}") + return None + except requests.exceptions.RequestException as e: + last_detail = f"RequestException: {e}" + if attempt < MONITOR_CREATE_MAX_ATTEMPTS: + print( + f" Monitor create attempt {attempt}/{MONITOR_CREATE_MAX_ATTEMPTS} " + f"for '{monitor_name}' hit {last_detail} — retrying in " + f"{MONITOR_CREATE_RETRY_SLEEP_SECONDS}s" + ) + time.sleep(MONITOR_CREATE_RETRY_SLEEP_SECONDS) + continue + print(f" Error creating monitor '{monitor_name}': {e}") + return None + + print( + f" Monitor creation for '{monitor_name}' 
exhausted " + f"{MONITOR_CREATE_MAX_ATTEMPTS} attempts; last detail: {last_detail}" + ) + return None + + +def create_otel_demo_monitors(): + """Create alerting monitors for the OpenTelemetry Demo services. + + These monitors target traces and logs produced by the demo's microservices. + They detect issues in the checkout flow, payment processing, and general + service health. All monitors are safe to keep even if demo services restart. + """ + print("Creating OTel Demo alerting monitors...") + + monitors = [ + # Checkout flow — fires when ANY checkout spans exist in the last 10 min + # The load generator continuously drives purchases, so this always fires. + { + "type": "monitor", + "name": "OTel Demo - Checkout Errors", + "monitor_type": "query_level_monitor", + "enabled": True, + "schedule": {"period": {"interval": 1, "unit": "MINUTES"}}, + "inputs": [{ + "search": { + "indices": ["otel-v1-apm-span*"], + "query": { + "size": 0, + "query": { + "bool": { + "filter": [ + {"range": {"endTime": {"gte": "now-10m"}}}, + {"term": {"serviceName": "checkout"}} + ] + } + } + } + } + }], + "triggers": [{ + "query_level_trigger": { + "name": "Checkout traces detected", + "severity": "1", + "condition": { + "script": { + "source": "ctx.results[0].hits.total.value > 0", + "lang": "painless" + } + }, + "actions": [] + } + }] + }, + # Payment service — fires when ANY payment spans exist (always true under load) + { + "type": "monitor", + "name": "OTel Demo - Payment Failures", + "monitor_type": "query_level_monitor", + "enabled": True, + "schedule": {"period": {"interval": 1, "unit": "MINUTES"}}, + "inputs": [{ + "search": { + "indices": ["otel-v1-apm-span*"], + "query": { + "size": 0, + "query": { + "bool": { + "filter": [ + {"range": {"endTime": {"gte": "now-10m"}}}, + {"term": {"serviceName": "payment"}} + ] + } + } + } + } + }], + "triggers": [{ + "query_level_trigger": { + "name": "Payment traces detected", + "severity": "1", + "condition": { + "script": { + "source": "ctx.results[0].hits.total.value > 0", + "lang": "painless" + } + }, + "actions": [] + } + }] + }, + # Frontend logs — fires when ANY logs exist from frontend services (always true) + { + "type": "monitor", + "name": "OTel Demo - Frontend Error Logs", + "monitor_type": "query_level_monitor", + "enabled": True, + "schedule": {"period": {"interval": 1, "unit": "MINUTES"}}, + "inputs": [{ + "search": { + "indices": ["logs-otel-v1*"], + "query": { + "size": 0, + "query": { + "bool": { + "filter": [ + {"range": {"time": {"gte": "now-10m"}}} + ] + } + } + } + } + }], + "triggers": [{ + "query_level_trigger": { + "name": "Log volume exceeds threshold", + "severity": "2", + "condition": { + "script": { + "source": "ctx.results[0].hits.total.value > 0", + "lang": "painless" + } + }, + "actions": [] + } + }] + }, + # Slow API responses — fires when ANY frontend spans exist (always true under load) + { + "type": "monitor", + "name": "OTel Demo - Slow Frontend Responses", + "monitor_type": "query_level_monitor", + "enabled": True, + "schedule": {"period": {"interval": 1, "unit": "MINUTES"}}, + "inputs": [{ + "search": { + "indices": ["otel-v1-apm-span*"], + "query": { + "size": 0, + "query": { + "bool": { + "filter": [ + {"range": {"endTime": {"gte": "now-10m"}}}, + {"term": {"serviceName": "frontend"}} + ] + } + } + } + } + }], + "triggers": [{ + "query_level_trigger": { + "name": "Frontend request volume detected", + "severity": "3", + "condition": { + "script": { + "source": "ctx.results[0].hits.total.value > 0", + "lang": "painless" + } + }, 
+ "actions": [] + } + }] + }, + # Cart service — fires when ANY cart spans exist (always true under load) + { + "type": "monitor", + "name": "OTel Demo - Cart Service Errors", + "monitor_type": "query_level_monitor", + "enabled": True, + "schedule": {"period": {"interval": 1, "unit": "MINUTES"}}, + "inputs": [{ + "search": { + "indices": ["otel-v1-apm-span*"], + "query": { + "size": 0, + "query": { + "bool": { + "filter": [ + {"range": {"endTime": {"gte": "now-10m"}}}, + {"term": {"serviceName": "cart"}} + ] + } + } + } + } + }], + "triggers": [{ + "query_level_trigger": { + "name": "Cart traces detected", + "severity": "2", + "condition": { + "script": { + "source": "ctx.results[0].hits.total.value > 0", + "lang": "painless" + } + }, + "actions": [] + } + }] + }, + ] + + created = 0 + for monitor_payload in monitors: + result = create_monitor(monitor_payload) + if result: + created += 1 + + print(f"Processed {created}/{len(monitors)} OTel Demo monitors") + return created + + +def main(): + wait_for_opensearch() + create_otel_demo_monitors() + print("OTel Demo monitors initialization complete") + + +if __name__ == "__main__": + main() diff --git a/docker-compose/otel-collector/config.yaml b/docker-compose/otel-collector/config.yaml index ec1501a4..16a0f429 100644 --- a/docker-compose/otel-collector/config.yaml +++ b/docker-compose/otel-collector/config.yaml @@ -17,6 +17,45 @@ receivers: - "http://*" - "https://*" + # Self-scrape: pulls the collector's own process and pipeline metrics + # from the Prometheus endpoint exposed on :8888 (see service.telemetry + # section below) so otelcol_* series land in Cortex. The base-stack + # alerts (OtelCollectorExportFailures, OtelCollectorHighMemory, + # OtelCollectorQueueNearCapacity, PrometheusTargetDown) depend on this. + prometheus/self: + config: + scrape_configs: + - job_name: otel-collector + scrape_interval: 15s + static_configs: + - targets: ["localhost:8888"] + relabel_configs: + - target_label: service.name + replacement: otel-collector + + # Scrape envoy's /stats/prometheus so ingress-level HTTP RPS/latency is + # visible to Cortex. Envoy translates downstream gRPC failures (cart, + # payment, ad, product-catalog — driven by flagd feature flags) into + # HTTP 5xx at the customer-facing boundary, so a single scrape unlocks + # full RED visibility from the edge. The scrape is a no-op when the + # otel-demo compose file isn't enabled (no DNS → drop). + prometheus/envoy: + config: + scrape_configs: + - job_name: envoy-frontend-proxy + scrape_interval: 15s + metrics_path: /stats/prometheus + # Use relabel_configs to set service.name (dotted key that the + # prometheusremotewrite exporter will demote to service_name). + # Setting a bare `labels: {service_name: ...}` collides with the + # collector's resourcedetection/target_info enrichment — both + # get emitted and Cortex joins them with a semicolon. + static_configs: + - targets: ["frontend-proxy:10000"] + relabel_configs: + - target_label: service.name + replacement: frontend-proxy + processors: # Memory limiter prevents OOM by dropping data when memory usage is high # Critical for stability under load @@ -100,11 +139,18 @@ exporters: insecure: true insecure_skip_verify: true - # Prometheus OTLP HTTP exporter sends metrics to Prometheus - otlphttp/prometheus: - endpoint: "http://prometheus:9090/api/v1/otlp" + # Prometheus remote-write exporter sends metrics to Cortex. 
+ # resource_to_telemetry_conversion promotes OTel resource attributes + # (service.name, service.version, deployment.environment, …) onto every + # exported sample, so every metric lands in Cortex with a `service_name` + # label. Without this, only flagd carried service_name and every other + # service was only addressable via `job="opentelemetry-demo/"`. + prometheusremotewrite/cortex: + endpoint: "http://prometheus:9090/api/v1/push" tls: insecure: true + resource_to_telemetry_conversion: + enabled: true service: # Pipelines define the flow: receivers -> processors -> exporters @@ -115,11 +161,11 @@ service: processors: [resourcedetection, memory_limiter, transform, batch] exporters: [otlp/opensearch, debug] - # Metrics pipeline: OTLP -> processing -> Prometheus + # Metrics pipeline: OTLP + self-scrape + envoy scrape -> processing -> Cortex metrics: - receivers: [otlp] + receivers: [otlp, prometheus/self, prometheus/envoy] processors: [resourcedetection, memory_limiter, batch] - exporters: [otlphttp/prometheus, debug] + exporters: [prometheusremotewrite/cortex, debug] # Logs pipeline: OTLP -> processing -> Data Prepper logs: diff --git a/docker-compose/prometheus/rules-otel-demo/otel-demo-alerts.yml b/docker-compose/prometheus/rules-otel-demo/otel-demo-alerts.yml new file mode 100644 index 00000000..7dc4a2cb --- /dev/null +++ b/docker-compose/prometheus/rules-otel-demo/otel-demo-alerts.yml @@ -0,0 +1,156 @@ +# Prometheus Alerting Rules for the OpenTelemetry Demo Application +# Only active when the otel-demo compose file is enabled. +# Mounted into Prometheus via the prometheus service override in docker-compose.otel-demo.yml. +# +# These rules target span-derived RED metrics emitted by Data Prepper's +# otel_apm_service_map processor (namespace="span_derived"). Every traced +# service produces latency_seconds_* regardless of whether it speaks gRPC +# or HTTP, so one expression shape covers the whole demo. +# The label is `service` (not `service_name`) for span-derived metrics. +# +# Rule groups: +# - otel_demo_frontend: Frontend and proxy latency/errors (span-derived) +# - otel_demo_checkout: Checkout pipeline health — checkout, payment, cart +# - otel_demo_services: General microservice health across all demo services +# +# NOTE: Thresholds are tuned to fire under normal OTel Demo load-generator traffic +# so the alerting pipeline can be validated end-to-end. + +groups: + # Frontend alerts — monitors user-facing latency and error rates + - name: otel_demo_frontend + interval: 30s + rules: + # Fires when the frontend is handling any traced requests + - alert: OtelDemoFrontendHighErrorRate + expr: > + sum(rate(latency_seconds_count{namespace="span_derived", service="frontend"}[5m])) > 0 + for: 1m + labels: + severity: critical + component: otel-demo + service: frontend + annotations: + summary: "OTel Demo frontend receiving traced traffic" + description: "Frontend is actively serving requests. In a real setup this would trigger on error rate > 5%." + + # Fires when frontend P99 latency exceeds 5ms (always true under load) + - alert: OtelDemoFrontendHighLatency + expr: > + histogram_quantile(0.99, + sum by (le) (rate(latency_seconds_bucket{namespace="span_derived", service="frontend"}[5m])) + ) > 0.005 + for: 1m + labels: + severity: warning + component: otel-demo + service: frontend + annotations: + summary: "OTel Demo frontend P99 latency above 5ms" + description: "Frontend tail latency elevated. Expected under load-generator traffic." 
+ + # Fires when frontend-proxy is routing any traced traffic + - alert: OtelDemoFrontendProxyErrors + expr: > + sum(rate(latency_seconds_count{namespace="span_derived", service="frontend-proxy"}[5m])) > 0 + for: 1m + labels: + severity: warning + component: otel-demo + service: frontend-proxy + annotations: + summary: "OTel Demo frontend proxy active traffic" + description: "Envoy proxy is actively routing requests. Monitor for elevated 5xx rates." + + # Checkout pipeline alerts — monitors the critical purchase flow + - name: otel_demo_checkout + interval: 30s + rules: + # Fires when the checkout service has active traced traffic + - alert: OtelDemoCheckoutErrors + expr: > + sum(rate(latency_seconds_count{namespace="span_derived", service="checkout"}[5m])) > 0 + for: 1m + labels: + severity: critical + component: otel-demo + service: checkout + annotations: + summary: "OTel Demo checkout service processing requests" + description: "Checkout flow is active. Monitor for error rate spikes." + + # Fires when the payment service has active traced traffic + - alert: OtelDemoPaymentFailures + expr: > + sum(rate(latency_seconds_count{namespace="span_derived", service="payment"}[5m])) > 0 + for: 1m + labels: + severity: critical + component: otel-demo + service: payment + annotations: + summary: "OTel Demo payment service processing requests" + description: "Payment pipeline is active. Check for paymentFailure feature flag if error rate increases." + + # Fires when the cart service has active traced traffic + - alert: OtelDemoCartErrors + expr: > + sum(rate(latency_seconds_count{namespace="span_derived", service="cart"}[5m])) > 0 + for: 1m + labels: + severity: warning + component: otel-demo + service: cart + annotations: + summary: "OTel Demo cart service processing requests" + description: "Cart operations active. Check for cartFailure feature flag if error rate increases." + + # General microservice health — monitors all demo services via span-derived metrics + - name: otel_demo_services + interval: 60s + rules: + # Fires when any demo service has active traced traffic + - alert: OtelDemoServiceHighErrorRate + expr: > + sum by (service) (rate(latency_seconds_count{ + namespace="span_derived", + service=~"ad|cart|checkout|currency|email|payment|product-catalog|product-reviews|recommendation|shipping|quote" + }[5m])) > 0 + for: 1m + labels: + severity: warning + component: otel-demo + annotations: + summary: "OTel Demo service {{ $labels.service }} handling traced traffic" + description: "Service is actively processing requests. Monitor for elevated error rates." + + # Fires when any demo service P99 latency exceeds 10ms (always true under load) + - alert: OtelDemoServiceHighLatency + expr: > + histogram_quantile(0.99, + sum by (le, service) (rate(latency_seconds_bucket{ + namespace="span_derived", + service=~"ad|cart|checkout|currency|email|payment|product-catalog|product-reviews|recommendation|shipping|quote" + }[5m])) + ) > 0.01 + for: 1m + labels: + severity: warning + component: otel-demo + annotations: + summary: "OTel Demo service {{ $labels.service }} P99 latency above 10ms" + description: "Service tail latency elevated. Check for resource constraints or downstream issues." + + # Fires when any demo service is using non-trivial memory (always true for running services). + # Uses an OTLP runtime metric — kept as a canary that the JVM/.NET/Node.js + # resource-heavy label sets still land in Cortex (exercises C2b's label cap). 
+ - alert: OtelDemoAdServiceHighCpu + expr: > + process_memory_usage_bytes{service_name=~"ad|cart|checkout|accounting|recommendation|product-reviews"} / 1024 / 1024 > 20 + for: 1m + labels: + severity: warning + component: otel-demo + annotations: + summary: "OTel Demo service {{ $labels.service_name }} memory above 20MB" + description: "Service memory usage elevated. The accounting service typically uses ~150MB." diff --git a/docker-compose/prometheus/rules-stack/stack-alerts.yml b/docker-compose/prometheus/rules-stack/stack-alerts.yml new file mode 100644 index 00000000..1f816ecb --- /dev/null +++ b/docker-compose/prometheus/rules-stack/stack-alerts.yml @@ -0,0 +1,85 @@ +# Observability Stack health alerting rules +# Loaded into Cortex's `stack` ruler namespace by cortex-rules-init (base compose). +# These rules run whether or not the otel-demo overlay is enabled. +# +# Every rule here targets metrics emitted or scraped by the stack itself +# (Cortex and the OTel collector) — not by applications — so they stay +# meaningful even with no workload pushing telemetry. +# +# Rule group: +# - stack_health: Cortex scrape targets + OTel collector pipeline health + +groups: + - name: stack_health + interval: 30s + rules: + # Fires when the OTel collector stops reporting to Cortex's self-scrape. + # Scoped to job="otel-collector" so the envoy scrape (which fails with + # DNS NXDOMAIN whenever the otel-demo overlay is off) can't flap this. + - alert: PrometheusTargetDown + expr: up{job="otel-collector"} == 0 + for: 2m + labels: + severity: critical + component: observability-stack + annotations: + summary: "OTel collector scrape target is down" + description: >- + Cortex has been unable to scrape the OTel collector's self-metrics + endpoint (localhost:8888) for 2 minutes. No telemetry is being + ingested until this is resolved. + + # Fires when the collector is failing to export any signal type to + # a downstream (Data Prepper for traces/logs, Cortex for metrics). + # This is the single most direct indicator of active data loss. + - alert: OtelCollectorExportFailures + expr: >- + rate(otelcol_exporter_send_failed_spans_total[5m]) > 0 + or rate(otelcol_exporter_send_failed_metric_points_total[5m]) > 0 + or rate(otelcol_exporter_send_failed_log_records_total[5m]) > 0 + for: 5m + labels: + severity: critical + component: observability-stack + annotations: + summary: "OTel collector failing to export to {{ $labels.exporter }}" + description: >- + The collector cannot deliver telemetry to a downstream backend. + Check that Data Prepper and Cortex (`prometheus`) are healthy and + accepting writes. + + # Fires when the collector process RSS approaches its 500MB memory + # limit configured in docker-compose.yml. Leading indicator of OOM + # kill + a gap in ingested telemetry. + - alert: OtelCollectorHighMemory + expr: otelcol_process_memory_rss{job="otel-collector"} / 1024 / 1024 > 400 + for: 5m + labels: + severity: warning + component: observability-stack + annotations: + summary: "OTel collector RSS at {{ $value | printf \"%.0f\" }}MB (limit: 500MB)" + description: >- + The collector is within 100MB of its memory limit. Raise + OTEL_COLLECTOR_MEMORY_LIMIT in .env or reduce throughput before + OOM kills the container. + + # Fires when the outbound exporter queue is >80% full. Usually means + # the downstream (Data Prepper or Cortex) is accepting writes too + # slowly — a leading indicator before OtelCollectorExportFailures. 
+ - alert: OtelCollectorQueueNearCapacity + expr: >- + (otelcol_exporter_queue_size / otelcol_exporter_queue_capacity) > 0.8 + and otelcol_exporter_queue_capacity > 0 + for: 5m + labels: + severity: warning + component: observability-stack + annotations: + summary: >- + OTel collector {{ $labels.exporter }} queue at + {{ $value | humanizePercentage }} + description: >- + The outbound queue is filling up. Check the downstream backend + for slow writes; sustained backpressure will eventually cause + OtelCollectorExportFailures. diff --git a/docs/starlight-docs/src/content/docs/alerting/index.md b/docs/starlight-docs/src/content/docs/alerting/index.md index bd986885..6c5a80ae 100644 --- a/docs/starlight-docs/src/content/docs/alerting/index.md +++ b/docs/starlight-docs/src/content/docs/alerting/index.md @@ -51,3 +51,61 @@ Set the trigger to fire when the document count exceeds your threshold, and conf ## Learn more For the full alerting reference - including API operations, composite monitors, alert acknowledgment, and notification channel configuration - see the [Alerting documentation](https://docs.opensearch.org/latest/observing-your-data/alerting/index/) in the official OpenSearch docs. + +## Prometheus/Cortex alerting + +OpenSearch Alerting is one of two alerting surfaces in the stack. The other is a Cortex-side PromQL ruler that evaluates alert rules against time-series metrics and routes firing alerts through Alertmanager. Both surface in the same **Alert Manager** UI in OpenSearch Dashboards, so responders don't need to know which side produced an alert. + +**When to use which:** + +| Signal | Use | +|---|---| +| Log-volume thresholds, trace counts, OpenSearch cluster state | OpenSearch Alerting monitors | +| Metric thresholds, rate-based SLO burn, RED-method alerts | Cortex PromQL rules | + +### Rule file locations + +Cortex rules are shipped as YAML files mounted into the `cortex-rules-init` container on startup. Two namespaces are loaded: + +- **`stack`** — watches the observability stack itself. Loaded always. + - File: `docker-compose/prometheus/rules-stack/stack-alerts.yml` + - Alerts: `PrometheusTargetDown`, `OtelCollectorExportFailures`, `OtelCollectorHighMemory`, `OtelCollectorQueueNearCapacity` +- **`otel_demo`** — RED-method alerts against the OpenTelemetry demo services. Loaded only when `INCLUDE_COMPOSE_OTEL_DEMO` is enabled in `.env`. + - File: `docker-compose/prometheus/rules-otel-demo/otel-demo-alerts.yml` + - Alerts: `OtelDemoFrontendHighErrorRate`, `OtelDemoFrontendHighLatency`, `OtelDemoFrontendProxyErrors`, `OtelDemoCheckoutErrors`, `OtelDemoPaymentFailures`, `OtelDemoCartErrors`, `OtelDemoServiceHighErrorRate`, `OtelDemoServiceHighLatency`, `OtelDemoAdServiceHighCpu` + +To add or edit rules, change the YAML file and re-run the loader: + +```bash +docker compose up -d --force-recreate cortex-rules-init +``` + +The loader upserts via `POST /api/v1/rules/{namespace}`, so re-runs are idempotent and edits take effect immediately. Inspect loaded groups at `http://localhost:9090/api/v1/rules/stack` or `http://localhost:9090/api/v1/rules/otel_demo` (Cortex returns YAML from this Ruler API endpoint). + +### Alertmanager routing + +Alertmanager runs on `localhost:9093` and is configured via `docker-compose/alertmanager/alertmanager.template.yml` (credentials are injected at container start). 
The default routing tree sends: + +- `component=observability-stack` alerts → `opensearch-webhook` receiver (posts to the stack's own OpenSearch indices for correlation with traces/logs). +- otel-demo critical alerts → `otel-demo-critical` receiver. +- otel-demo warnings → `otel-demo-warning` receiver. +- Everything else → `null` receiver (dropped). + +Placeholder receivers for Slack, email, and PagerDuty are included as examples — replace the dummy URLs with your real endpoints before wiring alerts to production channels. `amtool check-config` validates the template, and `curl http://localhost:9093/api/v2/alerts` lists currently firing alerts. + +### The Alert Manager UI + +In OpenSearch Dashboards, **Alert Manager** (under the main menu) renders both OpenSearch monitors and Cortex alerts in one list. It reads from two datasources: + +- **Local cluster** — OpenSearch Alerting monitors (the ones described earlier on this page). +- **`ObservabilityStack_Prometheus`** — the Cortex datasource configured with `prometheus.uri`, `prometheus.ruler.uri`, and `alertmanager.uri`. The UI pulls firing alerts from Alertmanager, rule definitions from the Cortex Ruler API, and query results from Cortex's PromQL endpoint. + +Filter by datasource in the UI's top-right to scope to just one source when investigating. + +If the UI shows zero Cortex alerts even though they are firing in Cortex (check `curl http://localhost:9090/prometheus/api/v1/alerts`), confirm the datasource has all three URI properties set: + +```bash +curl -u admin:PASSWORD http://localhost:5601/api/dataconnections | jq '.[] | select(.name=="ObservabilityStack_Prometheus") | .properties' +``` + +The stack's init container reconciles these properties automatically on every run; if they are still missing after a rerun, re-create the datasource with `docker compose down -v && docker compose up -d`. diff --git a/test/checks.sh b/test/checks.sh index 6143d0ed..834f0ddb 100755 --- a/test/checks.sh +++ b/test/checks.sh @@ -39,8 +39,10 @@ run_checks() { echo " OTel Collector OTLP HTTP: OK" echo "==> Checking Prometheus is up..." + # Cortex runs under the "prometheus" service name and exposes /ready + # (not the vanilla Prometheus /-/healthy endpoint). retry_check "Prometheus" "$HEALTH_CHECK_RETRIES" "200" \ - "http://localhost:${PROMETHEUS_PORT}/-/healthy" + "http://localhost:${PROMETHEUS_PORT}/ready" echo " Prometheus: OK" echo "==> Checking OpenSearch Dashboards is up..."
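
The two init scripts in this change create monitors idempotently but never remove them, so disabling the otel-demo overlay leaves its five monitors behind. The sketch below is one way to clean them up, assuming the same credentials and the Alerting API endpoints the scripts already use (`_search` on `monitor.name.keyword`, then `DELETE /_plugins/_alerting/monitors/{id}`); the script itself (`find_monitor_id`, `MONITOR_NAMES`) is illustrative and not part of this diff.

```python
#!/usr/bin/env python3
"""Delete the OTel Demo alerting monitors created by init-otel-demo-monitors.py.

Hedged sketch: reuses the _plugins/_alerting endpoints the init scripts call.
"""
import requests

OPENSEARCH_URL = "https://opensearch:9200"
AUTH = ("admin", "My_password_123!@#")

# Names must match the "name" fields in init-otel-demo-monitors.py exactly.
MONITOR_NAMES = [
    "OTel Demo - Checkout Errors",
    "OTel Demo - Payment Failures",
    "OTel Demo - Frontend Error Logs",
    "OTel Demo - Slow Frontend Responses",
    "OTel Demo - Cart Service Errors",
]


def find_monitor_id(name):
    """Look up a monitor id by exact name, mirroring get_existing_monitor()."""
    resp = requests.post(
        f"{OPENSEARCH_URL}/_plugins/_alerting/monitors/_search",
        auth=AUTH, verify=False, timeout=10,
        json={"size": 1, "query": {"term": {"monitor.name.keyword": name}}},
    )
    hits = resp.json().get("hits", {}).get("hits", [])
    return hits[0]["_id"] if hits else None


for name in MONITOR_NAMES:
    monitor_id = find_monitor_id(name)
    if monitor_id:
        # DELETE removes the monitor definition (and stops its schedule).
        requests.delete(
            f"{OPENSEARCH_URL}/_plugins/_alerting/monitors/{monitor_id}",
            auth=AUTH, verify=False, timeout=10,
        )
        print(f"Deleted monitor: {name}")
    else:
        print(f"Monitor not found (already removed?): {name}")
```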
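
The alerting docs above say the loader upserts rule groups via `POST /api/v1/rules/{namespace}` and that the Ruler API returns YAML. A minimal read-only check of what actually got loaded, assuming the host-published port 9090 and that Cortex multi-tenancy is off (so no `X-Scope-OrgID` header is needed); the `/prometheus/api/v1/alerts` path is the same one the docs suggest curl-ing, everything else is illustrative.

```python
#!/usr/bin/env python3
"""Print the rule groups loaded into each Cortex ruler namespace and any
currently firing alerts, using the endpoints referenced in the alerting docs."""
import requests

CORTEX = "http://localhost:9090"

# Ruler config API — returns YAML (as noted in the docs). A 404 simply means
# the namespace was never loaded, e.g. otel_demo with the overlay disabled.
for namespace in ("stack", "otel_demo"):
    resp = requests.get(f"{CORTEX}/api/v1/rules/{namespace}", timeout=10)
    print(f"--- ruler namespace: {namespace} (HTTP {resp.status_code}) ---")
    if resp.ok:
        print(resp.text)

# Prometheus-compatible alerts endpoint — lists alerts in pending/firing state.
alerts = requests.get(f"{CORTEX}/prometheus/api/v1/alerts", timeout=10).json()
for alert in alerts.get("data", {}).get("alerts", []):
    print(alert["labels"].get("alertname"), alert.get("state"))
```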
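
To exercise the Alertmanager routing tree without waiting for a Cortex rule to fire, alerts can be pushed directly through Alertmanager's v2 API. A sketch under the assumptions that port 9093 is published as documented and that the `component=observability-stack` label still routes to the `opensearch-webhook` receiver; the alert name `SyntheticRoutingCheck` is made up for the test and is not part of this diff.

```python
#!/usr/bin/env python3
"""Push a short-lived synthetic alert into Alertmanager to exercise routing."""
from datetime import datetime, timedelta, timezone
import requests

ALERTMANAGER = "http://localhost:9093"
now = datetime.now(timezone.utc)

synthetic_alert = [{
    # component=observability-stack should hit the opensearch-webhook receiver
    # per the default routing tree; the alertname is purely illustrative.
    "labels": {
        "alertname": "SyntheticRoutingCheck",
        "severity": "warning",
        "component": "observability-stack",
    },
    "annotations": {"summary": "Manual routing test - safe to ignore"},
    "startsAt": now.isoformat(),
    "endsAt": (now + timedelta(minutes=5)).isoformat(),
}]

resp = requests.post(f"{ALERTMANAGER}/api/v2/alerts", json=synthetic_alert, timeout=10)
resp.raise_for_status()

# The synthetic alert should now appear alongside anything Cortex is firing.
for alert in requests.get(f"{ALERTMANAGER}/api/v2/alerts", timeout=10).json():
    print(alert["labels"].get("alertname"), alert["status"]["state"])
```

The alert expires on its own at `endsAt`, so no cleanup call is needed.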
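
Finally, the `PrometheusTargetDown` and `OtelCollectorExportFailures` rules in `stack-alerts.yml` only mean anything if the collector's `prometheus/self` scrape is actually reaching Cortex over remote write. A quick instant-query check, assuming the query API lives under the same `/prometheus` prefix the docs use for the alerts endpoint; the metric names are taken from the rule expressions above.

```python
#!/usr/bin/env python3
"""Confirm the OTel collector self-scrape metrics are landing in Cortex."""
import requests

QUERY_URL = "http://localhost:9090/prometheus/api/v1/query"


def instant_query(expr):
    """Run a PromQL instant query and return the result vector."""
    resp = requests.get(QUERY_URL, params={"query": expr}, timeout=10)
    resp.raise_for_status()
    return resp.json()["data"]["result"]


# A value of 1 means the self-scrape target is up; an empty result means the
# prometheus/self receiver or the remote-write exporter is not working.
for sample in instant_query('up{job="otel-collector"}'):
    print(sample["metric"].get("instance"), "up =", sample["value"][1])

# Non-zero rates here are exactly what OtelCollectorExportFailures alerts on.
failures = instant_query(
    'sum by (exporter) (rate(otelcol_exporter_send_failed_spans_total[5m]))'
)
print("export-failure series returned:", len(failures))
```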