diff --git a/.env b/.env index 1f9841eb..9e143150 100644 --- a/.env +++ b/.env @@ -22,6 +22,15 @@ OPENSEARCH_HOST=opensearch OPENSEARCH_PORT=9200 OPENSEARCH_PROTOCOL=https OPENSEARCH_JAVA_OPTS=-Xms1g -Xmx1g +# Endpoint written into the `local_cluster` data-source saved object that the +# init container seeds into OpenSearch. Point it at the host-reachable port +# (`https://localhost:9200`, published by the compose file) when running +# OpenSearch Dashboards on the host — the host-side OSD process cannot +# resolve the docker-compose service name `opensearch`, so any MDS-scoped +# OSD feature that dials this SO's endpoint would fail with +# `getaddrinfo ENOTFOUND opensearch`. Leave blank/commented when OSD itself +# runs inside the compose network. +OSD_DATASOURCE_ENDPOINT=https://localhost:9200 # OpenSearch Dashboards Configuration OPENSEARCH_DASHBOARDS_VERSION=3.7.0 @@ -49,11 +58,20 @@ DATA_PREPPER_HTTP_PORT=21892 ISM_RETENTION_DAYS=7 # Prometheus Configuration +# The "prometheus" service now runs Cortex under the hood (see docker-compose.yml), +# which is wire-compatible for remote-write/query/ruler/alertmanager APIs. +# PROMETHEUS_VERSION is retained for legacy references; the actual image tag +# comes from CORTEX_VERSION below. PROMETHEUS_VERSION=v3.8.1 +CORTEX_VERSION=v1.18.1 PROMETHEUS_HOST=prometheus.observability-stack-network PROMETHEUS_PORT=9090 PROMETHEUS_RETENTION=15d +# Alertmanager Configuration +ALERTMANAGER_VERSION=v0.27.0 +ALERTMANAGER_PORT=9093 + # Resource Limits OPENSEARCH_MEMORY_LIMIT=2G PROMETHEUS_MEMORY_LIMIT=500M @@ -62,6 +80,7 @@ DATA_PREPPER_MEMORY_LIMIT=1G DASHBOARDS_MEMORY_LIMIT=2G WEATHER_AGENT_MEMORY_LIMIT=200M CANARY_MEMORY_LIMIT=100M +ALERTMANAGER_MEMORY_LIMIT=128M # Network Configuration NETWORK_NAME=observability-stack-network @@ -110,6 +129,15 @@ OTEL_RESOURCE_ATTRIBUTES=service.namespace=opentelemetry-demo,service.version=${ # Metrics Temporality OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=cumulative +# Enable metrics + logs export on every OTel-instrumented service (Node.js, +# Python, Go, .NET, Java, Rust). Without this, Node.js SDKs in particular +# default to NOT exporting metrics even when traces are being emitted — so +# the frontend container would only show nodejs_* runtime metrics in Cortex +# and no http_server_duration_* counters. "otlp" matches the existing trace +# exporter so all three signals go to the same collector pipeline. 
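As a quick sanity check once these exporters are enabled, the new per-service counters can be read straight from Cortex's Prometheus-compatible API. A minimal sketch, assuming the host port mapping from this compose file (Cortex published on localhost:9090) and the `/prometheus` API prefix described later in this patch; the metric-name regex is illustrative:

```python
import requests

# Cortex serves the Prometheus query API under the /prometheus prefix
# (the ruler and alertmanager admin APIs live at the unprefixed root).
CORTEX_QUERY_URL = "http://localhost:9090/prometheus/api/v1/query"

# Instant query for any series whose name starts with http_server_duration;
# adjust the regex if your SDKs emit differently named HTTP metrics.
resp = requests.get(
    CORTEX_QUERY_URL,
    params={"query": '{__name__=~"http_server_duration.*"}'},
    timeout=10,
)
resp.raise_for_status()
series = resp.json()["data"]["result"]
print(f"found {len(series)} http_server_duration series")
```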
+OTEL_METRICS_EXPORTER=otlp +OTEL_LOGS_EXPORTER=otlp + # OTLP Endpoints OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_GRPC} PUBLIC_OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:8080/otlp-http/v1/traces diff --git a/README.md b/README.md index a234363b..6455523f 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,9 @@ Observability Stack is an open-source stack designed for modern distributed syst - **OpenTelemetry Collector**: Receives OTLP data and routes it to appropriate backends - **Data Prepper**: Transforms and enriches logs and traces before storage - **OpenSearch**: Stores and indexes logs and traces for search and analysis -- **Prometheus**: Stores time-series metrics data -- **OpenSearch Dashboards**: Provides web-based visualization and exploration +- **Prometheus**: Stores time-series metrics data — runs the Cortex engine under the service name `prometheus` (same API surface, plus Ruler and Alertmanager endpoints) +- **Alertmanager**: Routes alerts from Cortex-side PromQL rules to notification channels +- **OpenSearch Dashboards**: Provides web-based visualization and exploration — includes the Alert Manager UI for viewing both OpenSearch monitors and Cortex alerts in one place - **PPL (Piped Processing Language)**: Native query language for logs and traces — pipe-based, human-readable, 50+ commands ## See it in action @@ -148,6 +149,20 @@ To stop the stack and remove all data volumes: docker compose down -v ``` +## Upgrading from Previous Releases + +This release swaps vanilla Prometheus for Cortex (kept under the same `prometheus` service name) and adds an always-on Alertmanager. Existing deployments can upgrade in place, with two caveats worth calling out: + +- **Historical metrics do not carry over.** Cortex writes to a different on-disk layout (`/data/tsdb`, `/data/ruler-storage`) than vanilla Prometheus (`/prometheus/chunks_head`, `/prometheus/wal`). Cortex does not read the old TSDB blocks, so any metrics stored in the `prometheus-data` volume before the upgrade are unreadable after it. New OTLP writes work immediately. +- **The in-place upgrade migrates OSD state automatically**, but if you prefer a clean slate, wipe volumes before bringing the new stack up: + ```bash + docker compose down -v + docker compose up -d + ``` + The `docker compose down -v` path is the safest if you're on an older build. The automatic migration reconciles the `ObservabilityStack_Prometheus` datasource to add the new `prometheus.ruler.uri` / `alertmanager.uri` properties, cleans up the old saved-object wrapper, and removes stale vanilla-Prometheus directories from the data volume on first Cortex boot. + +See [Alerting](docs/starlight-docs/src/content/docs/alerting/index.md) for a tour of the new Cortex rules, Alertmanager routing, and the Alert Manager UI in OpenSearch Dashboards. + ## Instrumenting Your Agent Observability Stack accepts telemetry data via the OpenTelemetry Protocol (OTLP) and follows the [OpenTelemetry Gen-AI Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/) for standardized attribute naming and structure for AI agents. 
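For example, a minimal Python service or agent can hand traces to the stack's OTLP gRPC receiver on port 4317 (see the port table below). A sketch assuming the `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-grpc` packages are installed; the service name `my-agent` is a placeholder:

```python
from opentelemetry import trace
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

# Point the SDK at the stack's OTLP gRPC receiver (plaintext, no TLS).
provider = TracerProvider(resource=Resource.create({"service.name": "my-agent"}))
provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317", insecure=True))
)
trace.set_tracer_provider(provider)

tracer = trace.get_tracer(__name__)
with tracer.start_as_current_span("handle-request"):
    pass  # agent logic goes here; spans reach OpenSearch via the collector and Data Prepper
```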
@@ -264,8 +279,9 @@ docker compose ps |------|---------|----------|-------------| | **4317** | OTel Collector | gRPC | OTLP gRPC receiver — used by most OpenTelemetry SDKs | | **4318** | OTel Collector | HTTP | OTLP HTTP receiver — used by Strands SDK, browser-based exporters | -| **5601** | OpenSearch Dashboards | HTTP | Web UI for logs, traces, and dashboards | -| **9090** | Prometheus | HTTP | Prometheus Web UI and API | +| **5601** | OpenSearch Dashboards | HTTP | Web UI for logs, traces, dashboards, and Alert Manager | +| **9090** | Prometheus (Cortex) | HTTP | PromQL query API (`/prometheus/...`) and Ruler admin API (`/api/v1/rules/...`) | +| **9093** | Alertmanager | HTTP | Alert routing UI and API for Cortex-side PromQL alerts | | **9200** | OpenSearch | HTTPS | REST API (self-signed cert, use `curl -k`) | | **21890** | Data Prepper | gRPC | Internal OTLP receiver (from OTel Collector) | diff --git a/docker-compose.otel-demo.yml b/docker-compose.otel-demo.yml index 9c74c898..ae7c6c13 100644 --- a/docker-compose.otel-demo.yml +++ b/docker-compose.otel-demo.yml @@ -39,6 +39,8 @@ services: environment: - KAFKA_ADDR - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=accounting @@ -68,6 +70,8 @@ services: - FLAGD_HOST - FLAGD_PORT - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_LOGS_EXPORTER=otlp @@ -99,6 +103,8 @@ services: - FLAGD_PORT - VALKEY_ADDR - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=cart @@ -137,6 +143,8 @@ services: - KAFKA_ADDR - GOMEMLIMIT=16MiB - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=checkout @@ -178,6 +186,8 @@ services: - IPV6_ENABLED - VERSION=${IMAGE_VERSION} - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=currency @@ -204,6 +214,8 @@ services: - FLAGD_HOST - FLAGD_PORT - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=email @@ -227,6 +239,8 @@ services: - FLAGD_PORT - KAFKA_ADDR - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_INSTRUMENTATION_KAFKA_EXPERIMENTAL_SPAN_ATTRIBUTES=true - OTEL_INSTRUMENTATION_MESSAGING_EXPERIMENTAL_RECEIVE_TELEMETRY_ENABLED=true @@ -263,6 +277,8 @@ services: - RECOMMENDATION_ADDR - SHIPPING_ADDR - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_RESOURCE_ATTRIBUTES - ENV_PLATFORM - OTEL_SERVICE_NAME=frontend @@ -387,6 +403,8 @@ services: - LOCUST_AUTOSTART - LOCUST_BROWSER_TRAFFIC_ENABLED=false - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - 
OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=load-generator @@ -420,6 +438,8 @@ services: - FLAGD_HOST - FLAGD_PORT - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=payment @@ -448,6 +468,8 @@ services: - FLAGD_PORT - GOMEMLIMIT=16MiB - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=product-catalog @@ -478,6 +500,8 @@ services: - PRODUCT_REVIEWS_PORT - OTEL_PYTHON_LOG_CORRELATION=true - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=product-reviews @@ -518,6 +542,8 @@ services: environment: - IPV6_ENABLED - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_PHP_AUTOLOAD_ENABLED=true - QUOTE_PORT @@ -548,6 +574,8 @@ services: - FLAGD_PORT - OTEL_PYTHON_LOG_CORRELATION=true - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=recommendation @@ -578,6 +606,8 @@ services: - SHIPPING_PORT - QUOTE_ADDR - OTEL_EXPORTER_OTLP_ENDPOINT + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=shipping - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE @@ -631,6 +661,8 @@ services: environment: - FLAGD_UI_PORT - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=flagd-ui @@ -661,6 +693,8 @@ services: - KAFKA_LISTENERS=PLAINTEXT://${KAFKA_HOST}:9092,CONTROLLER://${KAFKA_HOST}:9093 - KAFKA_CONTROLLER_QUORUM_VOTERS=1@${KAFKA_HOST}:9093 - OTEL_EXPORTER_OTLP_ENDPOINT=http://${OTEL_COLLECTOR_HOST}:${OTEL_COLLECTOR_PORT_HTTP} + - OTEL_METRICS_EXPORTER + - OTEL_LOGS_EXPORTER - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE - OTEL_RESOURCE_ATTRIBUTES - OTEL_SERVICE_NAME=kafka @@ -731,4 +765,54 @@ services: <<: *network logging: *logging + # ****************** + # Demo-only Alerting Extensions + # ****************** + # Demo-only rule loader. Runs alongside the base cortex-rules-init container + # (defined in docker-compose.yml) and loads the `otel_demo` Cortex namespace + # only. Named separately rather than overlaying the base service because + # Docker Compose >= v2.38 rejects service-name overlays on `include:`-imported + # resources. Both containers hit the same idempotent Ruler upsert API. + cortex-rules-init-otel-demo: + image: python:3.11-alpine + container_name: cortex-rules-init-otel-demo + # `sleep infinity` after success so `docker compose up --wait` is happy. 
+ command: sh -c "pip install requests pyyaml && python /init.py && exec sleep infinity" + depends_on: + prometheus: + condition: service_healthy + volumes: + - ./docker-compose/cortex/init-cortex-rules.py:/init.py + - ./docker-compose/prometheus/rules-otel-demo:/rules/otel_demo:ro + <<: *network + restart: "no" + # Mirror the base cortex-rules-init healthcheck: the script touches + # /tmp/rules-loaded on a clean load, and `--wait` blocks on it so callers + # that query /api/v1/rules/otel_demo after --wait see the rules already in + # Cortex. 40×3s=120s covers pip install + load time. + healthcheck: + test: ["CMD", "test", "-f", "/tmp/rules-loaded"] + interval: 3s + timeout: 2s + retries: 40 + start_period: 10s + logging: *logging + + # OTel Demo Monitors Init - Creates OpenSearch alerting monitors for demo + # traces/logs (checkout, payment, cart, frontend). Idempotent. + otel-demo-monitors-init: + image: python:3.11-alpine + container_name: otel-demo-monitors-init + command: sh -c "pip install requests && python /init.py" + depends_on: + opensearch: + condition: service_healthy + environment: + - OPENSEARCH_USER=${OPENSEARCH_USER} + - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} + volumes: + - ./docker-compose/opentelemetry-demo/init-otel-demo-monitors.py:/init.py + <<: *network + restart: "no" + logging: *logging diff --git a/docker-compose.yml b/docker-compose.yml index 98c4d685..b51b7794 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,6 +26,8 @@ networks: volumes: prometheus-data: driver: local + alertmanager-data: + driver: local services: # OpenTelemetry Collector - Receives telemetry data via OTLP protocol @@ -110,30 +112,38 @@ services: memory: ${DATA_PREPPER_MEMORY_LIMIT} logging: *logging - # Prometheus - Time-series database for metrics storage + # Cortex - Prometheus-compatible metrics backend with Ruler + Alertmanager APIs + # Replaces vanilla Prometheus to expose the full Prometheus HTTP API surface + # (query, ruler, alertmanager) at a single endpoint for OpenSearch Dashboards. + # The service name is kept as "prometheus" so PROMETHEUS_HOST/PORT in .env + # continue to work everywhere without changes. prometheus: - image: prom/prometheus:${PROMETHEUS_VERSION} + image: cortexproject/cortex:${CORTEX_VERSION} container_name: prometheus - pull_policy: always + # One-time cleanup shim: if this is the first boot of Cortex on a volume + # that still has vanilla-Prometheus artifacts (/data/chunks_head with no + # /data/tsdb), remove them before starting Cortex so the volume isn't + # polluted with dormant dirs Cortex never reads. Skips on fresh deploys + # (chunks_head absent) and on subsequent restarts (tsdb present). + entrypoint: + - /bin/sh + - -c + - | + if [ ! 
-d /data/tsdb ] && [ -d /data/chunks_head ]; then + echo "First boot after upgrade from vanilla Prometheus — removing stale TSDB artifacts" + rm -rf /data/chunks_head /data/wal /data/wbl /data/lock /data/queries.active + fi + exec /bin/cortex "$$@" + - -- command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - # Retention period from environment variable - - '--storage.tsdb.retention.time=${PROMETHEUS_RETENTION}' - - '--web.console.libraries=/usr/share/prometheus/console_libraries' - - '--web.console.templates=/usr/share/prometheus/consoles' - # Enable remote write receiver for OpenTelemetry Collector - - '--web.enable-remote-write-receiver' - - '--web.enable-lifecycle' - - '--web.route-prefix=/' - - '--enable-feature=exemplar-storage' - - '--web.enable-otlp-receiver' + - '-config.file=/etc/cortex/cortex.yaml' + # Cortex retention mirrors PROMETHEUS_RETENTION. Compactor deletes + # blocks whose max-time is older than this; set to 0 to disable. + - '-compactor.blocks-retention-period=${PROMETHEUS_RETENTION}' volumes: - - ./docker-compose/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml - # Persist metrics data across container restarts - - prometheus-data:/prometheus + - ./docker-compose/cortex/cortex.yaml:/etc/cortex/cortex.yaml:ro + - prometheus-data:/data ports: - # Web UI and API endpoint - "${PROMETHEUS_PORT}:9090" networks: - observability-stack-network @@ -142,6 +152,109 @@ services: resources: limits: memory: ${PROMETHEUS_MEMORY_LIMIT} + healthcheck: + test: ["CMD", "wget", "--tries=1", "--spider", "-q", "http://localhost:9090/ready"] + start_period: 30s + interval: 5s + timeout: 5s + retries: 20 + logging: *logging + + # Prometheus Alertmanager - Alert routing, grouping, deduplication, and silencing. + # Runs whether or not the otel-demo is enabled: the base stack rules (collector + # health, scrape-target health) alert into it, and demo rules alert in when the + # demo overlay is enabled too. The OSD Prometheus datasource's alertmanager.uri + # points at this service's HTTP API. + alertmanager: + image: prom/alertmanager:${ALERTMANAGER_VERSION} + container_name: alertmanager + pull_policy: always + entrypoint: /bin/sh + command: + - -c + - | + cp /tmp/alertmanager.template.yml /tmp/alertmanager.yml && + sed -i 's|OPENSEARCH_USER|'$$OPENSEARCH_USER'|g' /tmp/alertmanager.yml && + sed -i 's|OPENSEARCH_PASSWORD|'$$OPENSEARCH_PASSWORD'|g' /tmp/alertmanager.yml && + exec /bin/alertmanager \ + --config.file=/tmp/alertmanager.yml \ + --storage.path=/alertmanager \ + --web.listen-address=:9093 + volumes: + - ./docker-compose/alertmanager/alertmanager.template.yml:/tmp/alertmanager.template.yml:ro + - alertmanager-data:/alertmanager + ports: + - "${ALERTMANAGER_PORT}:9093" + environment: + - OPENSEARCH_USER=${OPENSEARCH_USER} + - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} + networks: + - observability-stack-network + restart: unless-stopped + deploy: + resources: + limits: + memory: ${ALERTMANAGER_MEMORY_LIMIT} + healthcheck: + test: ["CMD", "wget", "--tries=1", "--spider", "-q", "http://localhost:9093/-/healthy"] + interval: 10s + timeout: 5s + retries: 10 + logging: *logging + + # Cortex Rules Initialization - Loads alerting rules via the Cortex Ruler API. + # Scans /rules//*.yml and POSTs each group. Idempotent. + # Base stack rules (/rules/stack) are always loaded. The otel-demo overlay + # extends this container to also mount /rules/otel_demo. 
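After `docker compose up --wait` returns, the namespaces and groups the init container loaded can be read back from the Ruler admin API. A minimal sketch, assuming the host port mapping from this file and the `auth_enabled: false` setting in `docker-compose/cortex/cortex.yaml` below:

```python
import requests
import yaml

# The ruler's rule CRUD API lives at the unprefixed root of the Cortex server.
resp = requests.get("http://localhost:9090/api/v1/rules", timeout=10)

if resp.status_code == 404:
    # Cortex answers 404 when no rule groups have been loaded yet.
    print("no rule groups loaded")
else:
    resp.raise_for_status()
    # The response is YAML keyed by namespace, each value a list of rule groups.
    namespaces = yaml.safe_load(resp.text) or {}
    for namespace, groups in namespaces.items():
        print(namespace, [g.get("name") for g in groups])
```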
+ cortex-rules-init: + image: python:3.11-alpine + container_name: cortex-rules-init + # `sleep infinity` after the script succeeds so `docker compose up --wait` + # doesn't trip over a clean exit — `--wait` treats any exited container as + # a failure unless a dependent uses `service_completed_successfully`. + command: sh -c "pip install requests pyyaml && python /init.py && exec sleep infinity" + depends_on: + prometheus: + condition: service_healthy + volumes: + - ./docker-compose/cortex/init-cortex-rules.py:/init.py + - ./docker-compose/prometheus/rules-stack:/rules/stack:ro + networks: + - observability-stack-network + restart: "no" + # The init script touches /tmp/rules-loaded after a clean load. Without + # this check `--wait` returns the moment pip starts (container "running"), + # well before rules are actually in Cortex — so any caller that queries + # /api/v1/rules immediately after --wait sees an empty list. 40×3s=120s + # covers pip install + load. + healthcheck: + test: ["CMD", "test", "-f", "/tmp/rules-loaded"] + interval: 3s + timeout: 2s + retries: 40 + start_period: 10s + logging: *logging + + # OpenSearch Stack Monitors Init - Creates alerting monitors that watch the + # health of the observability stack itself (cluster health, etc). Idempotent + # by monitor name. The init script hardcodes https://opensearch:9200, so it + # hard-depends on local OpenSearch — without it there's nothing to target. + opensearch-stack-monitors-init: + image: python:3.11-alpine + container_name: opensearch-stack-monitors-init + command: sh -c "pip install requests && python /init.py" + depends_on: + opensearch: + condition: service_healthy + required: true + environment: + - OPENSEARCH_USER=${OPENSEARCH_USER} + - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} + volumes: + - ./docker-compose/opensearch-dashboards/init/init-stack-monitors.py:/init.py + networks: + - observability-stack-network + restart: "no" logging: *logging # OpenSearch Prometheus Exporter - Exposes OpenSearch metrics for Prometheus scraping @@ -183,11 +296,24 @@ services: - OPENSEARCH_HOST=${OPENSEARCH_HOST} - OPENSEARCH_PORT=${OPENSEARCH_PORT} - OPENSEARCH_PROTOCOL=${OPENSEARCH_PROTOCOL} + # Overrides the endpoint stored on the seeded `local_cluster` data-source + # saved object. Blank default uses the intra-network hostname, which is + # correct when OpenSearch Dashboards runs inside the compose network. + # Set in `.env` (e.g. `https://localhost:9200`) when running OSD on the + # host, since the host process cannot resolve the `opensearch` service + # name — any MDS-scoped OSD feature dialing this SO's endpoint would + # otherwise fail with `getaddrinfo ENOTFOUND opensearch`. + - OSD_DATASOURCE_ENDPOINT=${OSD_DATASOURCE_ENDPOINT:-} - OPENSEARCH_DASHBOARDS_HOST=${OPENSEARCH_DASHBOARDS_HOST} - OPENSEARCH_DASHBOARDS_PORT=${OPENSEARCH_DASHBOARDS_PORT} - OPENSEARCH_DASHBOARDS_PROTOCOL=${OPENSEARCH_DASHBOARDS_PROTOCOL} - PROMETHEUS_HOST=${PROMETHEUS_HOST} - PROMETHEUS_PORT=${PROMETHEUS_PORT} + # alertmanager.uri is set on the Prometheus datasource unconditionally. + # Alertmanager now runs always (defined in docker-compose.yml, not the + # otel-demo overlay), so this URI is always valid. 
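To confirm the wiring end to end, alerts the ruler has handed off can be listed through Alertmanager's v2 API. A small sketch assuming the `ALERTMANAGER_PORT=9093` host mapping from `.env`:

```python
import requests

# Alertmanager's v2 API returns a JSON array of the alerts it currently holds.
resp = requests.get(
    "http://localhost:9093/api/v2/alerts",
    params={"active": "true", "silenced": "false", "inhibited": "false"},
    timeout=10,
)
resp.raise_for_status()
for alert in resp.json():
    labels = alert.get("labels", {})
    print(labels.get("alertname"), labels.get("severity"), alert.get("status", {}).get("state"))
```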
+ - ALERTMANAGER_HOST=alertmanager + - ALERTMANAGER_PORT=${ALERTMANAGER_PORT} - ISM_RETENTION_DAYS=${ISM_RETENTION_DAYS:-7} volumes: - ./docker-compose/opensearch-dashboards/init/init-opensearch-dashboards.py:/init.py diff --git a/docker-compose/alertmanager/alertmanager.template.yml b/docker-compose/alertmanager/alertmanager.template.yml new file mode 100644 index 00000000..eb8dbcdb --- /dev/null +++ b/docker-compose/alertmanager/alertmanager.template.yml @@ -0,0 +1,135 @@ +# Prometheus Alertmanager Configuration +# Alert routing, grouping, and notification management +# +# Alerts from Cortex ruler are routed here for grouping, deduplication, and delivery. +# All receivers index alerts into OpenSearch for persistent history and search. +# +# The OpenSearch index "alertmanager-alerts" is created automatically on first write. +# Browse alerts in OpenSearch Dashboards or query via: +# curl -sk -u admin:$OPENSEARCH_PASSWORD https://localhost:9200/alertmanager-alerts/_search?pretty +# +# To add Slack, PagerDuty, email, or other integrations see: +# https://prometheus.io/docs/alerting/latest/configuration/#receiver + +# Global configuration for all routes +global: + # Time to wait before declaring an alert resolved if not updated + resolve_timeout: 5m + +# Alert routing tree — determines which receiver handles each alert +# Routes are evaluated top-down; the first match wins. +route: + # Group alerts by name and service to reduce notification noise + group_by: ['alertname', 'service_name'] + # Wait before sending initial notification for a new group + group_wait: 30s + # Wait before sending updates to an existing group + group_interval: 5m + # Wait before re-sending a notification for an already-firing alert + repeat_interval: 4h + # Default receiver for all alerts + receiver: 'opensearch-webhook' + + routes: + # ── OTel Demo routes (most specific first) ────────────────────────── + # Critical demo alerts: checkout pipeline failures, payment errors, frontend 5xx + - match: + component: otel-demo + severity: critical + receiver: 'otel-demo-critical' + # Fast notification for user-facing breakage + group_by: ['alertname', 'service'] + group_wait: 10s + group_interval: 1m + repeat_interval: 30m + + # Warning demo alerts: latency degradation, cart errors, high CPU + - match: + component: otel-demo + severity: warning + receiver: 'otel-demo-warning' + group_by: ['alertname', 'service'] + group_wait: 30s + group_interval: 5m + repeat_interval: 2h + +# Notification receivers +# Each receiver indexes into the same OpenSearch index; the alert payload +# carries all labels so you can filter by component/service/severity there. 
+# Credentials must match OPENSEARCH_USER / OPENSEARCH_PASSWORD in .env +receivers: + # Default — catch-all for any unmatched alerts + - name: 'opensearch-webhook' + webhook_configs: + - &opensearch-webhook + url: 'https://opensearch:9200/alertmanager-alerts/_doc' + http_config: + basic_auth: + username: OPENSEARCH_USER + password: OPENSEARCH_PASSWORD + tls_config: + insecure_skip_verify: true + send_resolved: true + + # OTel Demo critical — checkout/payment/frontend failures + - name: 'otel-demo-critical' + webhook_configs: + - *opensearch-webhook + + # OTel Demo warning — latency, cart, ad-service issues + - name: 'otel-demo-warning' + webhook_configs: + - *opensearch-webhook + + # ── Dummy receiver definitions (placeholders — not routed to by default) ── + # These exist so the `amtool check-config` output and any UI showing + # configured receivers demonstrate the shape of a real integration without + # actually calling out to a third party. Drop real credentials in here and + # add a `match:` route above when you want alerts to reach them. + - name: 'dummy-slack' + slack_configs: + # Replace with a real Slack incoming webhook URL: + # https://hooks.slack.com/services/// + - api_url: 'https://example.invalid/slack-webhook-placeholder' + channel: '#alerts-placeholder' + send_resolved: true + title: '[{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}' + text: | + {{ range .Alerts }} + *Severity:* {{ .Labels.severity }} + *Service:* {{ .Labels.service_name }} + *Summary:* {{ .Annotations.summary }} + *Description:* {{ .Annotations.description }} + {{ end }} + + - name: 'dummy-email' + email_configs: + - to: 'alerts@example.com' + from: 'alertmanager@observability-stack.local' + smarthost: 'smtp.example.com:587' + auth_username: 'alertmanager@example.com' + auth_password: 'CHANGE_ME' + require_tls: true + send_resolved: true + + - name: 'dummy-pagerduty' + pagerduty_configs: + - routing_key: '00000000000000000000000000000000' + send_resolved: true + severity: '{{ .CommonLabels.severity }}' + description: '{{ .CommonAnnotations.summary }}' + + # No-op receiver — drops the alert on the floor. Useful as a deliberate + # silence destination for noisy alerts you don't want to mute entirely. + - name: 'null' + +# Inhibition rules — suppress lower-severity alerts when higher ones fire +inhibit_rules: + # Suppress otel-demo warnings when a critical alert fires for the same service + - source_match: + component: otel-demo + severity: critical + target_match: + component: otel-demo + severity: warning + equal: ['service'] diff --git a/docker-compose/cortex/cortex.yaml b/docker-compose/cortex/cortex.yaml new file mode 100644 index 00000000..960af7de --- /dev/null +++ b/docker-compose/cortex/cortex.yaml @@ -0,0 +1,88 @@ +# Cortex - Single-process mode for local development +# Provides Prometheus-compatible query API + Ruler API for rule CRUD + +target: all + +# Disable multi-tenancy for local dev (no X-Scope-OrgID header required) +auth_enabled: false + +server: + # Listen on 9090 to match the standard Prometheus port convention + # This ensures PROMETHEUS_PORT in .env works for both host and internal Docker networking + http_listen_port: 9090 + grpc_listen_port: 9095 + +distributor: + # shard_by_all_labels is required when using max_global_series_per_user / + # per_metric limits (see limits block below). Safe to enable in single- + # binary mode. 
+ shard_by_all_labels: true + ring: + instance_addr: 127.0.0.1 + kvstore: + store: inmemory + +ingester: + lifecycler: + ring: + kvstore: + store: inmemory + replication_factor: 1 + address: 127.0.0.1 + final_sleep: 0s + +storage: + engine: blocks + +blocks_storage: + backend: filesystem + filesystem: + dir: /data/blocks + tsdb: + dir: /data/tsdb + bucket_store: + sync_dir: /data/tsdb-sync + +ruler: + enable_api: true + # Route firing alerts to Alertmanager for routing, grouping, and notification + alertmanager_url: http://alertmanager:9093 + ring: + kvstore: + store: inmemory + instance_addr: 127.0.0.1 + +# Use S3-compatible storage via filesystem backend for ruler CRUD support +# The 'local' backend is read-only; 'filesystem' supports full CRUD +ruler_storage: + backend: filesystem + filesystem: + dir: /data/ruler-storage + +compactor: + data_dir: /data/compactor + sharding_ring: + kvstore: + store: inmemory + +store_gateway: + sharding_ring: + replication_factor: 1 + kvstore: + store: inmemory + +# Per-tenant ingestion limits. Defaults cap a single metric at 50k series, +# which span-derived RED metrics can blow through quickly when a noisy +# label is in play. Data-prepper already strips the `randomKey` UUID +# before remote-write, but keep the ceiling generous so experimental +# instrumentation doesn't silently start dropping samples. +limits: + max_global_series_per_metric: 500000 + max_global_series_per_user: 5000000 + ingestion_rate: 100000 + ingestion_burst_size: 200000 + # Cortex's default per-series label cap is 30. Full OTel resource sets for + # JVM/.NET/Node.js exceed that once the collector promotes resource attrs + # to labels (resource_to_telemetry_conversion: true), and the ruler's + # ALERTS series inherits the same labels, so keep the ceiling generous. + max_label_names_per_series: 50 diff --git a/docker-compose/cortex/init-cortex-rules.py b/docker-compose/cortex/init-cortex-rules.py new file mode 100644 index 00000000..52bb84d0 --- /dev/null +++ b/docker-compose/cortex/init-cortex-rules.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +"""Load Prometheus alerting/recording rules into Cortex via the Ruler API. + +This script runs as an init container. It scans /rules/ for subdirectories, +treating each subdirectory name as a Cortex ruler namespace. Every *.yml file +in the subdirectory is parsed and each rule group is POSTed individually. + +Directory layout expected: + /rules/ + stack/ ← namespace "stack" + alerts.yml ← contains groups: stack_health, otel_collector_health, … + otel_demo/ ← namespace "otel_demo" (mounted by otel-demo compose) + otel-demo-alerts.yml ← contains groups: otel_demo_frontend, otel_demo_checkout, … + +The main docker-compose.yml mounts only /rules/stack/. +The otel-demo compose override adds /rules/otel_demo/. +""" + +import glob +import os +import sys +import time + +import requests +import yaml + +CORTEX_URL = os.getenv("CORTEX_URL", "http://prometheus:9090") + +# Fixed 2s poll × 60 attempts = ~2 min to come up. Cortex's single-binary +# startup is consistent, so exponential backoff just slows recovery. 
+READY_POLL_INTERVAL_SECONDS = 2 +READY_POLL_MAX_ATTEMPTS = 60 + + +def wait_for_cortex(): + """Wait for Cortex to report ready, or exit non-zero on timeout.""" + print("⏳ Waiting for Cortex...") + for _ in range(READY_POLL_MAX_ATTEMPTS): + try: + r = requests.get(f"{CORTEX_URL}/ready", timeout=5) + if r.status_code == 200: + print("✅ Cortex is ready") + return + except requests.exceptions.RequestException: + pass + time.sleep(READY_POLL_INTERVAL_SECONDS) + print( + f"❌ Cortex did not become ready at {CORTEX_URL} within " + f"{READY_POLL_INTERVAL_SECONDS * READY_POLL_MAX_ATTEMPTS}s" + ) + sys.exit(1) + + +def load_rules_file(filepath, namespace): + """Upsert every rule group from a YAML file into Cortex. + + Cortex's POST /api/v1/rules/{namespace} is an upsert (HTTP 202 on both + create and replace), so always POST — no existence check, no skip path. + Returns (loaded, failed) counts for this file. + """ + print(f"\n📂 {filepath} → namespace '{namespace}'") + + with open(filepath) as f: + data = yaml.safe_load(f) + + if not data or "groups" not in data: + print(" (no groups found — skipping)") + return 0, 0 + + loaded = 0 + failed = 0 + + for group in data["groups"]: + group_name = group.get("name", "unknown") + rule_count = len(group.get("rules", [])) + + group_yaml = yaml.dump(group, default_flow_style=False) + + try: + r = requests.post( + f"{CORTEX_URL}/api/v1/rules/{namespace}", + headers={"Content-Type": "application/yaml"}, + data=group_yaml, + timeout=10, + ) + if r.status_code == 202: + print(f" ✅ {group_name} ({rule_count} rules) — loaded") + loaded += 1 + else: + print(f" ⚠️ {group_name}: HTTP {r.status_code} — {r.text[:200]}") + failed += 1 + except requests.exceptions.RequestException as e: + print(f" ❌ {group_name}: {e}") + failed += 1 + + return loaded, failed + + +def main(): + wait_for_cortex() + + rules_root = "/rules" + if not os.path.isdir(rules_root): + print(f"No rules directory at {rules_root}") + sys.exit(0) + + total_loaded = 0 + total_failed = 0 + for namespace_dir in sorted(glob.glob(f"{rules_root}/*")): + if not os.path.isdir(namespace_dir): + continue + namespace = os.path.basename(namespace_dir) + + for rules_file in sorted(glob.glob(f"{namespace_dir}/*.yml")): + loaded, failed = load_rules_file(rules_file, namespace) + total_loaded += loaded + total_failed += failed + + print( + f"\n📊 Summary — loaded: {total_loaded}, failed: {total_failed}" + ) + + if total_failed > 0: + sys.exit(1) + + if total_loaded == 0: + print("⚠️ No rule groups loaded") + + # Sentinel file consumed by the compose healthcheck so `docker compose + # up --wait` blocks until rules are actually loaded. Only written when + # no group failed so a partial load doesn't mark the container healthy. + try: + with open("/tmp/rules-loaded", "w") as f: + f.write("ok\n") + except OSError as e: + print(f"⚠️ Could not write /tmp/rules-loaded sentinel: {e}") + + +if __name__ == "__main__": + main() diff --git a/docker-compose/data-prepper/pipelines.template.yaml b/docker-compose/data-prepper/pipelines.template.yaml index 421b16ee..0b489e91 100644 --- a/docker-compose/data-prepper/pipelines.template.yaml +++ b/docker-compose/data-prepper/pipelines.template.yaml @@ -85,7 +85,9 @@ traces-raw-pipeline: index_type: trace-analytics-plain-raw # Service map generation pipeline (APM) -# Builds service dependency maps and RED metrics from trace relationships +# Builds service dependency maps and RED metrics from trace relationships. 
+# Splits into two sub-pipelines so we can strip high-cardinality `randomKey` +# labels from the Cortex branch without polluting the service-map branch. service-map-pipeline: delay: 100 source: @@ -107,10 +109,39 @@ service-map-pipeline: index_type: otel-v2-apm-service-map routes: [otel_apm_service_map_route] insecure: true - # Route RED metrics to local Prometheus via remote write + # Fan out service-derived RED metrics to a dedicated pipeline that strips + # the per-event randomKey UUID before Cortex rejects it for cardinality. + - pipeline: + name: "service-metrics-cortex-pipeline" + routes: [service_processed_metrics] + +# Strips the per-event `randomKey` UUID from span-derived RED metrics +# before the Cortex remote-write sink. Without this, data-prepper tags +# every latency bucket with a fresh UUID, blowing past Cortex's default +# 50k series-per-metric limit in minutes. +service-metrics-cortex-pipeline: + delay: 100 + source: + pipeline: + name: "service-map-pipeline" + processor: + # Drop only the per-event randomKey. Do NOT strip telemetry.sdk.language: + # otel_apm_service_map is configured to group_by it and emits one sample + # per (service, operation, remoteService, sdk.language) per window, so + # removing the label collapses multi-language services onto the same + # series+timestamp and Cortex rejects them as duplicate samples. + # Note: data-prepper event keys use JSON-pointer-style paths; labels + # set by otel_apm_service_map land under /attributes/, so both + # the top-level and attributes-scoped paths are listed to be safe. + - delete_entries: + with_keys: + - "/attributes/randomKey" + - "randomKey" + sink: + # Route RED metrics to local Cortex via remote write. + # Cortex's distributor push endpoint is /api/v1/push (not Prometheus's /api/v1/write). - prometheus: - url: "http://PROMETHEUS_HOST:PROMETHEUS_PORT/api/v1/write" + url: "http://PROMETHEUS_HOST:PROMETHEUS_PORT/api/v1/push" threshold: max_events: 500 - flush_interval: 5s - routes: [service_processed_metrics] \ No newline at end of file + flush_interval: 5s \ No newline at end of file diff --git a/docker-compose/opensearch-dashboards/init/init-opensearch-dashboards.py b/docker-compose/opensearch-dashboards/init/init-opensearch-dashboards.py index bab1d558..69ae18a8 100644 --- a/docker-compose/opensearch-dashboards/init/init-opensearch-dashboards.py +++ b/docker-compose/opensearch-dashboards/init/init-opensearch-dashboards.py @@ -13,6 +13,8 @@ PASSWORD = os.getenv("OPENSEARCH_PASSWORD", "My_password_123!@#") PROMETHEUS_HOST = os.getenv("PROMETHEUS_HOST", "prometheus.observability-stack-network") PROMETHEUS_PORT = os.getenv("PROMETHEUS_PORT", "9090") +ALERTMANAGER_HOST = os.getenv("ALERTMANAGER_HOST", "alertmanager") +ALERTMANAGER_PORT = os.getenv("ALERTMANAGER_PORT", "9093") _opensearch_protocol = os.getenv("OPENSEARCH_PROTOCOL", "https") OPENSEARCH_ENDPOINT = f"{_opensearch_protocol}://{os.getenv('OPENSEARCH_HOST', 'opensearch')}:{os.getenv('OPENSEARCH_PORT', '9200')}" ISM_RETENTION_DAYS = int(os.getenv("ISM_RETENTION_DAYS", "7")) @@ -290,33 +292,77 @@ def get_existing_prometheus_datasource(datasource_name): return None +def get_prometheus_datasource_properties(datasource_name): + """Fetch the full properties map for a Prometheus dataconnection. + + The /api/saved_objects/_find?type=data-connection endpoint only exposes + connectionId + type; the authoritative read for `properties` is the SQL + plugin's /api/dataconnections endpoint. 
+ """ + try: + response = requests.get( + f"{BASE_URL}/api/dataconnections", + auth=(USERNAME, PASSWORD), + headers={"Content-Type": "application/json", "osd-xsrf": "true"}, + verify=False, + timeout=10, + ) + if response.status_code != 200: + print(f"⚠️ GET /api/dataconnections returned {response.status_code}: {response.text[:200]}") + return None + for entry in response.json() or []: + if entry.get("name") == datasource_name: + return entry.get("properties") or {} + return None + except requests.exceptions.RequestException as e: + print(f"⚠️ Error reading dataconnections: {e}") + return None + + def create_prometheus_datasource(workspace_id): """Create Prometheus datasource using direct query API""" datasource_name = "ObservabilityStack_Prometheus" + # Cortex exposes the Prometheus-compatible query API under /prometheus + # (e.g. /prometheus/api/v1/query_range) while the Ruler admin API lives + # at the unprefixed root (/api/v1/rules/{namespace}). The SQL plugin's + # PrometheusClient exposes `prometheus.uri` and `prometheus.ruler.uri` + # exactly for this split — both must be set for query + rule management + # to work against Cortex. + prometheus_endpoint = f"http://{PROMETHEUS_HOST}:{PROMETHEUS_PORT}/prometheus" + ruler_endpoint = f"http://{PROMETHEUS_HOST}:{PROMETHEUS_PORT}" + alertmanager_endpoint = f"http://{ALERTMANAGER_HOST}:{ALERTMANAGER_PORT}" + + desired_properties = { + "prometheus.uri": prometheus_endpoint, + "prometheus.ruler.uri": ruler_endpoint, + "alertmanager.uri": alertmanager_endpoint, + } + # Check if datasource already exists existing_id = get_existing_prometheus_datasource(datasource_name) if existing_id: print(f"✅ Prometheus datasource already exists: {existing_id}") + reconciled = reconcile_prometheus_datasource_properties( + datasource_name, desired_properties + ) + # Reconciliation goes through DELETE + POST, so the saved-object + # id may have changed — re-read before associating. + datasource_id = existing_id + if reconciled: + datasource_id = get_existing_prometheus_datasource(datasource_name) or existing_id # Associate with workspace if provided if workspace_id and workspace_id != "default": - associate_prometheus_with_workspace(workspace_id, existing_id) - return existing_id + associate_prometheus_with_workspace(workspace_id, datasource_id) + return datasource_id print("🔧 Creating Prometheus datasource...") - prometheus_endpoint = f"http://{PROMETHEUS_HOST}:{PROMETHEUS_PORT}" - payload = { "name": datasource_name, "allowedRoles": [], "connector": "prometheus", - "properties": { - "prometheus.uri": prometheus_endpoint, - "prometheus.auth.type": "basicauth", - "prometheus.auth.username": "", - "prometheus.auth.password": "", - }, + "properties": desired_properties, } try: @@ -345,6 +391,9 @@ def create_prometheus_datasource(workspace_id): error_text = response.text if "already exists with name" in error_text: print(f"✅ Prometheus datasource already exists: {datasource_name}") + reconcile_prometheus_datasource_properties( + datasource_name, desired_properties + ) # Fetch the datasource ID and associate datasource_id = get_existing_prometheus_datasource(datasource_name) if datasource_id and workspace_id and workspace_id != "default": @@ -361,6 +410,203 @@ def create_prometheus_datasource(workspace_id): return None +def _delete_stale_data_connection_saved_object(saved_object_id): + """Delete the orphaned data-connection saved-object left behind by the + SQL plugin's DELETE /api/dataconnections/{name} path. 
+ + The SQL plugin removes its own dataconnection record but not the wrapper + OSD saved-object, so without this call an in-place upgrade ends up with + two data-connection saved-objects sharing the same connectionId — one + orphaned (no SQL backing), one live. The orphan pollutes workspace + listings and re-breaks every time the migration runs. + """ + url = f"{BASE_URL}/api/saved_objects/data-connection/{saved_object_id}?force=true" + try: + resp = requests.delete( + url, + auth=(USERNAME, PASSWORD), + headers={"osd-xsrf": "true"}, + verify=False, + timeout=10, + ) + if resp.status_code in (200, 204, 404): + print( + f"🧹 Deleted stale data-connection saved-object {saved_object_id}" + ) + return True + print( + f"⚠️ Failed to delete stale data-connection saved-object " + f"({resp.status_code}): {resp.text[:200]}" + ) + except requests.exceptions.RequestException as e: + print(f"⚠️ Error deleting stale data-connection saved-object: {e}") + return False + + +def _delete_correlations_referencing_data_connection(data_connection_id): + """Remove any correlation saved-objects whose references point at + `data_connection_id`. They'll be re-created idempotently later in the + init flow with the new id, so the net effect is "migrate reference, + not break it". Without this, the APM-config correlation from the pre-PR + install dangles: it still exists, but its references[2].dataConnection.id + points at a saved-object whose SQL-plugin backing is gone. + """ + try: + resp = requests.get( + f"{BASE_URL}/api/saved_objects/_find?type=correlations&per_page=1000", + auth=(USERNAME, PASSWORD), + headers={"osd-xsrf": "true"}, + verify=False, + timeout=10, + ) + if resp.status_code != 200: + print( + f"⚠️ Could not list correlations for dangling-reference scan " + f"({resp.status_code})" + ) + return + + for obj in resp.json().get("saved_objects", []): + refs = obj.get("references") or [] + if not any( + r.get("type") == "data-connection" and r.get("id") == data_connection_id + for r in refs + ): + continue + obj_id = obj.get("id") + workspaces = obj.get("workspaces") or [] + if workspaces and workspaces[0] != "default": + url = f"{BASE_URL}/w/{workspaces[0]}/api/saved_objects/correlations/{obj_id}" + else: + url = f"{BASE_URL}/api/saved_objects/correlations/{obj_id}" + del_resp = requests.delete( + url + "?force=true", + auth=(USERNAME, PASSWORD), + headers={"osd-xsrf": "true"}, + verify=False, + timeout=10, + ) + if del_resp.status_code in (200, 204, 404): + print( + f"🧹 Deleted stale correlation {obj_id} " + f"(referenced pre-migration dataconnection)" + ) + else: + print( + f"⚠️ Failed to delete stale correlation {obj_id} " + f"({del_resp.status_code}): {del_resp.text[:200]}" + ) + except requests.exceptions.RequestException as e: + print(f"⚠️ Error scanning correlations for stale references: {e}") + + +def reconcile_prometheus_datasource_properties(datasource_name, desired_properties): + """Ensure an existing Prometheus datasource carries all desired properties. + + Returns True when the datasource was rewritten (so callers can re-fetch + the saved-object id, which changes across DELETE+POST), False when it + was already in the desired state or when reconciliation could not run. + + Why: in-place upgrades keep the pre-PR datasource (prometheus.uri only) + so the OSD Alert Manager UI silently shows zero alerts because the + alertmanager.uri / prometheus.ruler.uri it needs were never added. 
This + reads the authoritative properties via /api/dataconnections, diffs them + against the desired set, and rewrites only on a mismatch to keep re-runs + idempotent. + + Update strategy: DELETE + POST. The SQL plugin does not expose a PUT/PATCH + update endpoint for Prometheus dataconnections — POST rejects with 400 + "already exists" and PUT/PATCH return 404. DELETE on + /api/dataconnections/{name} succeeds, after which a fresh POST recreates + the dataconnection with the full property set. + + The SQL plugin's DELETE does not remove the wrapping OSD saved-object or + update any correlation that references it, so this function also cleans + up the stale saved-object and any dangling correlation references before + re-POSTing. The correlations are re-created idempotently later in the + init flow against the new id. + """ + current = get_prometheus_datasource_properties(datasource_name) + if current is None: + print( + "⚠️ Could not read Prometheus datasource properties " + f"for '{datasource_name}' — skipping reconciliation" + ) + return False + + missing = [k for k in desired_properties if k not in current] + mismatched = [ + k for k in desired_properties + if k in current and current.get(k) != desired_properties[k] + ] + if not missing and not mismatched: + print("✅ Prometheus datasource properties already up to date") + return False + + if missing: + print(f"🔧 Prometheus datasource missing properties: {missing}") + if mismatched: + print(f"🔧 Prometheus datasource properties changed: {mismatched}") + + # Capture the pre-existing saved-object id BEFORE the SQL-plugin DELETE + # so we can clean up the orphaned saved-object and any correlations that + # still point at it. + stale_saved_object_id = get_existing_prometheus_datasource(datasource_name) + + delete_url = f"{BASE_URL}/api/dataconnections/{datasource_name}" + try: + delete_resp = requests.delete( + delete_url, + auth=(USERNAME, PASSWORD), + headers={"osd-xsrf": "true"}, + verify=False, + timeout=10, + ) + if delete_resp.status_code not in (200, 204, 404): + print( + f"⚠️ Prometheus datasource DELETE failed " + f"({delete_resp.status_code}): {delete_resp.text[:200]}" + ) + return False + except requests.exceptions.RequestException as e: + print(f"⚠️ Error deleting Prometheus datasource: {e}") + return False + + # Remove the now-orphaned saved-object wrapper + any correlations that + # reference it. Best-effort — failures here are logged but don't abort + # the migration, since the subsequent POST still restores a working + # datasource even if cleanup is incomplete. 
+ if stale_saved_object_id: + _delete_correlations_referencing_data_connection(stale_saved_object_id) + _delete_stale_data_connection_saved_object(stale_saved_object_id) + + payload = { + "name": datasource_name, + "allowedRoles": [], + "connector": "prometheus", + "properties": desired_properties, + } + try: + response = requests.post( + f"{BASE_URL}/api/directquery/dataconnections", + auth=(USERNAME, PASSWORD), + headers={"Content-Type": "application/json", "osd-xsrf": "true"}, + json=payload, + verify=False, + timeout=10, + ) + if response.status_code == 200: + print("✅ Recreated Prometheus datasource with updated properties") + return True + print( + f"⚠️ Prometheus datasource recreate after delete failed " + f"({response.status_code}): {response.text[:200]}" + ) + except requests.exceptions.RequestException as e: + print(f"⚠️ Error recreating Prometheus datasource: {e}") + return False + + def associate_prometheus_with_workspace(workspace_id, datasource_id): """Associate Prometheus datasource with workspace""" print(f"🔗 Associating Prometheus datasource with workspace {workspace_id}...") @@ -458,7 +704,11 @@ def create_opensearch_datasource(workspace_id): print("🔧 Creating OpenSearch datasource...") - opensearch_endpoint = OPENSEARCH_ENDPOINT + # OSD_DATASOURCE_ENDPOINT lets operators override the endpoint written + # onto the saved object — useful when OSD runs outside the compose + # network and cannot resolve the `opensearch` service name. Falls back + # to the intra-network hostname when unset. + opensearch_endpoint = os.getenv("OSD_DATASOURCE_ENDPOINT", OPENSEARCH_ENDPOINT) payload = { "attributes": { diff --git a/docker-compose/opensearch-dashboards/init/init-stack-monitors.py b/docker-compose/opensearch-dashboards/init/init-stack-monitors.py new file mode 100644 index 00000000..7d3cce2b --- /dev/null +++ b/docker-compose/opensearch-dashboards/init/init-stack-monitors.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +"""Create OpenSearch alerting monitors that watch the observability stack itself. + +Runs whether or not the otel-demo overlay is enabled. Monitors are idempotent +by name — existing monitors with the same `name` are skipped on re-run. 
+""" + +import os +import time +import requests + +OPENSEARCH_URL = "https://opensearch:9200" +USERNAME = os.getenv("OPENSEARCH_USER", "admin") +PASSWORD = os.getenv("OPENSEARCH_PASSWORD", "My_password_123!@#") + + +def wait_for_opensearch(): + print("Waiting for OpenSearch...") + while True: + try: + response = requests.get( + f"{OPENSEARCH_URL}/_cluster/health", + auth=(USERNAME, PASSWORD), + verify=False, + timeout=5, + ) + if response.status_code == 200: + break + except requests.exceptions.RequestException: + pass + time.sleep(5) + print("OpenSearch is ready") + + +def get_existing_monitor(monitor_name): + try: + response = requests.post( + f"{OPENSEARCH_URL}/_plugins/_alerting/monitors/_search", + auth=(USERNAME, PASSWORD), + headers={"Content-Type": "application/json"}, + json={ + "size": 1, + "query": {"term": {"monitor.name.keyword": monitor_name}} + }, + verify=False, + timeout=10, + ) + if response.status_code == 200: + hits = response.json().get("hits", {}).get("hits", []) + if hits: + return hits[0].get("_id") + return None + except requests.exceptions.RequestException as e: + print(f" Error checking monitor '{monitor_name}': {e}") + return None + + +# Cluster health GREEN is necessary but not sufficient: the alerting plugin's +# internal indices (.opendistro-alerting-*, .opensearch-alerting-*) finish +# allocating ~30-60s later. Until they do, POST /_plugins/_alerting/monitors +# returns 500 with "all shards failed"/"alerting_exception". Retry on those. +MONITOR_CREATE_MAX_ATTEMPTS = 12 +MONITOR_CREATE_RETRY_SLEEP_SECONDS = 5 + + +def create_monitor(monitor_payload): + monitor_name = monitor_payload.get("name", "unknown") + existing_id = get_existing_monitor(monitor_name) + if existing_id: + print(f" Monitor already exists: {monitor_name}") + return existing_id + + last_detail = "" + for attempt in range(1, MONITOR_CREATE_MAX_ATTEMPTS + 1): + try: + response = requests.post( + f"{OPENSEARCH_URL}/_plugins/_alerting/monitors", + auth=(USERNAME, PASSWORD), + headers={"Content-Type": "application/json"}, + json=monitor_payload, + verify=False, + timeout=10, + ) + if response.status_code in (200, 201): + monitor_id = response.json().get("_id") + print(f" Created monitor: {monitor_name}") + return monitor_id + + body = response.text or "" + last_detail = f"HTTP {response.status_code}: {body[:200]}" + transient = ( + 500 <= response.status_code < 600 + or "all shards failed" in body + or "alerting_exception" in body + ) + if transient and attempt < MONITOR_CREATE_MAX_ATTEMPTS: + print( + f" Monitor create attempt {attempt}/{MONITOR_CREATE_MAX_ATTEMPTS} " + f"for '{monitor_name}' got {last_detail} — retrying in " + f"{MONITOR_CREATE_RETRY_SLEEP_SECONDS}s" + ) + time.sleep(MONITOR_CREATE_RETRY_SLEEP_SECONDS) + continue + print(f" Monitor creation failed ({response.status_code}): {body[:200]}") + return None + except requests.exceptions.RequestException as e: + last_detail = f"RequestException: {e}" + if attempt < MONITOR_CREATE_MAX_ATTEMPTS: + print( + f" Monitor create attempt {attempt}/{MONITOR_CREATE_MAX_ATTEMPTS} " + f"for '{monitor_name}' hit {last_detail} — retrying in " + f"{MONITOR_CREATE_RETRY_SLEEP_SECONDS}s" + ) + time.sleep(MONITOR_CREATE_RETRY_SLEEP_SECONDS) + continue + print(f" Error creating monitor '{monitor_name}': {e}") + return None + + print( + f" Monitor creation for '{monitor_name}' exhausted " + f"{MONITOR_CREATE_MAX_ATTEMPTS} attempts; last detail: {last_detail}" + ) + return None + + +def create_stack_monitors(): + """Create alerting monitors for the 
observability stack itself. + + Targets the local OpenSearch cluster (the stack's own trace/log/metric + store). Lives here instead of in the otel-demo overlay so that stack + health is watched whether or not demo workloads are running. + """ + print("Creating Observability Stack health monitors...") + + monitors = [ + # Fires when the OpenSearch cluster health transitions to red, which + # means at least one primary shard is unassigned — traces/logs writes + # for that index will fail until the shard recovers. + # Only red is checked (not yellow): single-node dev clusters are + # always yellow because replicas can't be assigned, so triggering on + # yellow would be a permanent false positive. + { + "type": "monitor", + "name": "Observability Stack - Cluster Health Red", + "monitor_type": "cluster_metrics_monitor", + "enabled": True, + "schedule": {"period": {"interval": 1, "unit": "MINUTES"}}, + "inputs": [{ + "uri": { + "api_type": "CLUSTER_HEALTH", + "path": "/_cluster/health", + "path_params": "", + "url": "" + } + }], + "triggers": [{ + "query_level_trigger": { + "name": "Cluster health is red", + "severity": "1", + "condition": { + "script": { + "source": "ctx.results != null && ctx.results.length > 0 && ctx.results[0].status == 'red'", + "lang": "painless" + } + }, + "actions": [] + } + }] + }, + ] + + created = 0 + for monitor_payload in monitors: + result = create_monitor(monitor_payload) + if result: + created += 1 + + print(f"Processed {created}/{len(monitors)} stack monitors") + return created + + +def main(): + wait_for_opensearch() + create_stack_monitors() + print("Stack monitors initialization complete") + + +if __name__ == "__main__": + main() diff --git a/docker-compose/opensearch-dashboards/opensearch_dashboards.template.yml b/docker-compose/opensearch-dashboards/opensearch_dashboards.template.yml index 990d2304..aa640720 100644 --- a/docker-compose/opensearch-dashboards/opensearch_dashboards.template.yml +++ b/docker-compose/opensearch-dashboards/opensearch_dashboards.template.yml @@ -78,6 +78,9 @@ explore.enabled: true explore.discoverTraces.enabled: true explore.discoverMetrics.enabled: true explore.agentTraces.enabled: true +# Surfaces the Alert Manager UI in the Observability plugin, backed by the +# alertmanager.uri configured on the Prometheus datasource. +observability.alertManager.enabled: true workspace.enabled: true data_source.enabled: true data_source.ssl.verificationMode: none diff --git a/docker-compose/opentelemetry-demo/init-otel-demo-monitors.py b/docker-compose/opentelemetry-demo/init-otel-demo-monitors.py new file mode 100644 index 00000000..215e64df --- /dev/null +++ b/docker-compose/opentelemetry-demo/init-otel-demo-monitors.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 +"""Create OpenSearch alerting monitors for the OpenTelemetry Demo application. + +This script runs as an init container when the otel-demo compose file is enabled. +It creates monitors targeting demo service traces and logs in OpenSearch. + +Monitors are idempotent — existing monitors are skipped on re-run. 
+""" + +import os +import time +import requests + +OPENSEARCH_URL = "https://opensearch:9200" +USERNAME = os.getenv("OPENSEARCH_USER", "admin") +PASSWORD = os.getenv("OPENSEARCH_PASSWORD", "My_password_123!@#") + + +def wait_for_opensearch(): + """Wait for OpenSearch to be ready""" + print("Waiting for OpenSearch...") + while True: + try: + response = requests.get( + f"{OPENSEARCH_URL}/_cluster/health", + auth=(USERNAME, PASSWORD), + verify=False, + timeout=5, + ) + if response.status_code == 200: + break + except requests.exceptions.RequestException: + pass + time.sleep(5) + print("OpenSearch is ready") + + +def get_existing_monitor(monitor_name): + """Check if an alerting monitor with the given name already exists""" + try: + response = requests.post( + f"{OPENSEARCH_URL}/_plugins/_alerting/monitors/_search", + auth=(USERNAME, PASSWORD), + headers={"Content-Type": "application/json"}, + json={ + "size": 1, + "query": {"term": {"monitor.name.keyword": monitor_name}} + }, + verify=False, + timeout=10, + ) + if response.status_code == 200: + hits = response.json().get("hits", {}).get("hits", []) + if hits: + return hits[0].get("_id") + return None + except requests.exceptions.RequestException as e: + print(f" Error checking monitor '{monitor_name}': {e}") + return None + + +# Cluster health GREEN is necessary but not sufficient: the alerting plugin's +# internal indices (.opendistro-alerting-*, .opensearch-alerting-*) finish +# allocating ~30-60s later. Until they do, POST /_plugins/_alerting/monitors +# returns 500 with "all shards failed"/"alerting_exception". Retry on those. +MONITOR_CREATE_MAX_ATTEMPTS = 12 +MONITOR_CREATE_RETRY_SLEEP_SECONDS = 5 + + +def create_monitor(monitor_payload): + """Create an alerting monitor in OpenSearch (idempotent)""" + monitor_name = monitor_payload.get("name", "unknown") + + existing_id = get_existing_monitor(monitor_name) + if existing_id: + print(f" Monitor already exists: {monitor_name}") + return existing_id + + last_detail = "" + for attempt in range(1, MONITOR_CREATE_MAX_ATTEMPTS + 1): + try: + response = requests.post( + f"{OPENSEARCH_URL}/_plugins/_alerting/monitors", + auth=(USERNAME, PASSWORD), + headers={"Content-Type": "application/json"}, + json=monitor_payload, + verify=False, + timeout=10, + ) + if response.status_code in (200, 201): + monitor_id = response.json().get("_id") + print(f" Created monitor: {monitor_name}") + return monitor_id + + body = response.text or "" + last_detail = f"HTTP {response.status_code}: {body[:200]}" + transient = ( + 500 <= response.status_code < 600 + or "all shards failed" in body + or "alerting_exception" in body + ) + if transient and attempt < MONITOR_CREATE_MAX_ATTEMPTS: + print( + f" Monitor create attempt {attempt}/{MONITOR_CREATE_MAX_ATTEMPTS} " + f"for '{monitor_name}' got {last_detail} — retrying in " + f"{MONITOR_CREATE_RETRY_SLEEP_SECONDS}s" + ) + time.sleep(MONITOR_CREATE_RETRY_SLEEP_SECONDS) + continue + print(f" Monitor creation failed ({response.status_code}): {body[:200]}") + return None + except requests.exceptions.RequestException as e: + last_detail = f"RequestException: {e}" + if attempt < MONITOR_CREATE_MAX_ATTEMPTS: + print( + f" Monitor create attempt {attempt}/{MONITOR_CREATE_MAX_ATTEMPTS} " + f"for '{monitor_name}' hit {last_detail} — retrying in " + f"{MONITOR_CREATE_RETRY_SLEEP_SECONDS}s" + ) + time.sleep(MONITOR_CREATE_RETRY_SLEEP_SECONDS) + continue + print(f" Error creating monitor '{monitor_name}': {e}") + return None + + print( + f" Monitor creation for '{monitor_name}' 
exhausted " + f"{MONITOR_CREATE_MAX_ATTEMPTS} attempts; last detail: {last_detail}" + ) + return None + + +def create_otel_demo_monitors(): + """Create alerting monitors for the OpenTelemetry Demo services. + + These monitors target traces and logs produced by the demo's microservices. + They detect issues in the checkout flow, payment processing, and general + service health. All monitors are safe to keep even if demo services restart. + """ + print("Creating OTel Demo alerting monitors...") + + monitors = [ + # Checkout flow — fires when ANY checkout spans exist in the last 10 min + # The load generator continuously drives purchases, so this always fires. + { + "type": "monitor", + "name": "OTel Demo - Checkout Errors", + "monitor_type": "query_level_monitor", + "enabled": True, + "schedule": {"period": {"interval": 1, "unit": "MINUTES"}}, + "inputs": [{ + "search": { + "indices": ["otel-v1-apm-span*"], + "query": { + "size": 0, + "query": { + "bool": { + "filter": [ + {"range": {"endTime": {"gte": "now-10m"}}}, + {"term": {"serviceName": "checkout"}} + ] + } + } + } + } + }], + "triggers": [{ + "query_level_trigger": { + "name": "Checkout traces detected", + "severity": "1", + "condition": { + "script": { + "source": "ctx.results[0].hits.total.value > 0", + "lang": "painless" + } + }, + "actions": [] + } + }] + }, + # Payment service — fires when ANY payment spans exist (always true under load) + { + "type": "monitor", + "name": "OTel Demo - Payment Failures", + "monitor_type": "query_level_monitor", + "enabled": True, + "schedule": {"period": {"interval": 1, "unit": "MINUTES"}}, + "inputs": [{ + "search": { + "indices": ["otel-v1-apm-span*"], + "query": { + "size": 0, + "query": { + "bool": { + "filter": [ + {"range": {"endTime": {"gte": "now-10m"}}}, + {"term": {"serviceName": "payment"}} + ] + } + } + } + } + }], + "triggers": [{ + "query_level_trigger": { + "name": "Payment traces detected", + "severity": "1", + "condition": { + "script": { + "source": "ctx.results[0].hits.total.value > 0", + "lang": "painless" + } + }, + "actions": [] + } + }] + }, + # Frontend logs — fires when ANY logs exist from frontend services (always true) + { + "type": "monitor", + "name": "OTel Demo - Frontend Error Logs", + "monitor_type": "query_level_monitor", + "enabled": True, + "schedule": {"period": {"interval": 1, "unit": "MINUTES"}}, + "inputs": [{ + "search": { + "indices": ["logs-otel-v1*"], + "query": { + "size": 0, + "query": { + "bool": { + "filter": [ + {"range": {"time": {"gte": "now-10m"}}} + ] + } + } + } + } + }], + "triggers": [{ + "query_level_trigger": { + "name": "Log volume exceeds threshold", + "severity": "2", + "condition": { + "script": { + "source": "ctx.results[0].hits.total.value > 0", + "lang": "painless" + } + }, + "actions": [] + } + }] + }, + # Slow API responses — fires when ANY frontend spans exist (always true under load) + { + "type": "monitor", + "name": "OTel Demo - Slow Frontend Responses", + "monitor_type": "query_level_monitor", + "enabled": True, + "schedule": {"period": {"interval": 1, "unit": "MINUTES"}}, + "inputs": [{ + "search": { + "indices": ["otel-v1-apm-span*"], + "query": { + "size": 0, + "query": { + "bool": { + "filter": [ + {"range": {"endTime": {"gte": "now-10m"}}}, + {"term": {"serviceName": "frontend"}} + ] + } + } + } + } + }], + "triggers": [{ + "query_level_trigger": { + "name": "Frontend request volume detected", + "severity": "3", + "condition": { + "script": { + "source": "ctx.results[0].hits.total.value > 0", + "lang": "painless" + } + }, 
+ "actions": [] + } + }] + }, + # Cart service — fires when ANY cart spans exist (always true under load) + { + "type": "monitor", + "name": "OTel Demo - Cart Service Errors", + "monitor_type": "query_level_monitor", + "enabled": True, + "schedule": {"period": {"interval": 1, "unit": "MINUTES"}}, + "inputs": [{ + "search": { + "indices": ["otel-v1-apm-span*"], + "query": { + "size": 0, + "query": { + "bool": { + "filter": [ + {"range": {"endTime": {"gte": "now-10m"}}}, + {"term": {"serviceName": "cart"}} + ] + } + } + } + } + }], + "triggers": [{ + "query_level_trigger": { + "name": "Cart traces detected", + "severity": "2", + "condition": { + "script": { + "source": "ctx.results[0].hits.total.value > 0", + "lang": "painless" + } + }, + "actions": [] + } + }] + }, + ] + + created = 0 + for monitor_payload in monitors: + result = create_monitor(monitor_payload) + if result: + created += 1 + + print(f"Processed {created}/{len(monitors)} OTel Demo monitors") + return created + + +def main(): + wait_for_opensearch() + create_otel_demo_monitors() + print("OTel Demo monitors initialization complete") + + +if __name__ == "__main__": + main() diff --git a/docker-compose/otel-collector/config.yaml b/docker-compose/otel-collector/config.yaml index ec1501a4..16a0f429 100644 --- a/docker-compose/otel-collector/config.yaml +++ b/docker-compose/otel-collector/config.yaml @@ -17,6 +17,45 @@ receivers: - "http://*" - "https://*" + # Self-scrape: pulls the collector's own process and pipeline metrics + # from the Prometheus endpoint exposed on :8888 (see service.telemetry + # section below) so otelcol_* series land in Cortex. The base-stack + # alerts (OtelCollectorExportFailures, OtelCollectorHighMemory, + # OtelCollectorQueueNearCapacity, PrometheusTargetDown) depend on this. + prometheus/self: + config: + scrape_configs: + - job_name: otel-collector + scrape_interval: 15s + static_configs: + - targets: ["localhost:8888"] + relabel_configs: + - target_label: service.name + replacement: otel-collector + + # Scrape envoy's /stats/prometheus so ingress-level HTTP RPS/latency is + # visible to Cortex. Envoy translates downstream gRPC failures (cart, + # payment, ad, product-catalog — driven by flagd feature flags) into + # HTTP 5xx at the customer-facing boundary, so a single scrape unlocks + # full RED visibility from the edge. The scrape is a no-op when the + # otel-demo compose file isn't enabled (no DNS → drop). + prometheus/envoy: + config: + scrape_configs: + - job_name: envoy-frontend-proxy + scrape_interval: 15s + metrics_path: /stats/prometheus + # Use relabel_configs to set service.name (dotted key that the + # prometheusremotewrite exporter will demote to service_name). + # Setting a bare `labels: {service_name: ...}` collides with the + # collector's resourcedetection/target_info enrichment — both + # get emitted and Cortex joins them with a semicolon. + static_configs: + - targets: ["frontend-proxy:10000"] + relabel_configs: + - target_label: service.name + replacement: frontend-proxy + processors: # Memory limiter prevents OOM by dropping data when memory usage is high # Critical for stability under load @@ -100,11 +139,18 @@ exporters: insecure: true insecure_skip_verify: true - # Prometheus OTLP HTTP exporter sends metrics to Prometheus - otlphttp/prometheus: - endpoint: "http://prometheus:9090/api/v1/otlp" + # Prometheus remote-write exporter sends metrics to Cortex. 
+ # resource_to_telemetry_conversion promotes OTel resource attributes + # (service.name, service.version, deployment.environment, …) onto every + # exported sample, so every metric lands in Cortex with a `service_name` + # label. Without this, only flagd carried service_name and every other + # service was only addressable via `job="opentelemetry-demo/"`. + prometheusremotewrite/cortex: + endpoint: "http://prometheus:9090/api/v1/push" tls: insecure: true + resource_to_telemetry_conversion: + enabled: true service: # Pipelines define the flow: receivers -> processors -> exporters @@ -115,11 +161,11 @@ service: processors: [resourcedetection, memory_limiter, transform, batch] exporters: [otlp/opensearch, debug] - # Metrics pipeline: OTLP -> processing -> Prometheus + # Metrics pipeline: OTLP + self-scrape + envoy scrape -> processing -> Cortex metrics: - receivers: [otlp] + receivers: [otlp, prometheus/self, prometheus/envoy] processors: [resourcedetection, memory_limiter, batch] - exporters: [otlphttp/prometheus, debug] + exporters: [prometheusremotewrite/cortex, debug] # Logs pipeline: OTLP -> processing -> Data Prepper logs: diff --git a/docker-compose/prometheus/rules-otel-demo/otel-demo-alerts.yml b/docker-compose/prometheus/rules-otel-demo/otel-demo-alerts.yml new file mode 100644 index 00000000..7dc4a2cb --- /dev/null +++ b/docker-compose/prometheus/rules-otel-demo/otel-demo-alerts.yml @@ -0,0 +1,156 @@ +# Prometheus Alerting Rules for the OpenTelemetry Demo Application +# Only active when the otel-demo compose file is enabled. +# Mounted into Prometheus via the prometheus service override in docker-compose.otel-demo.yml. +# +# These rules target span-derived RED metrics emitted by Data Prepper's +# otel_apm_service_map processor (namespace="span_derived"). Every traced +# service produces latency_seconds_* regardless of whether it speaks gRPC +# or HTTP, so one expression shape covers the whole demo. +# The label is `service` (not `service_name`) for span-derived metrics. +# +# Rule groups: +# - otel_demo_frontend: Frontend and proxy latency/errors (span-derived) +# - otel_demo_checkout: Checkout pipeline health — checkout, payment, cart +# - otel_demo_services: General microservice health across all demo services +# +# NOTE: Thresholds are tuned to fire under normal OTel Demo load-generator traffic +# so the alerting pipeline can be validated end-to-end. + +groups: + # Frontend alerts — monitors user-facing latency and error rates + - name: otel_demo_frontend + interval: 30s + rules: + # Fires when the frontend is handling any traced requests + - alert: OtelDemoFrontendHighErrorRate + expr: > + sum(rate(latency_seconds_count{namespace="span_derived", service="frontend"}[5m])) > 0 + for: 1m + labels: + severity: critical + component: otel-demo + service: frontend + annotations: + summary: "OTel Demo frontend receiving traced traffic" + description: "Frontend is actively serving requests. In a real setup this would trigger on error rate > 5%." + + # Fires when frontend P99 latency exceeds 5ms (always true under load) + - alert: OtelDemoFrontendHighLatency + expr: > + histogram_quantile(0.99, + sum by (le) (rate(latency_seconds_bucket{namespace="span_derived", service="frontend"}[5m])) + ) > 0.005 + for: 1m + labels: + severity: warning + component: otel-demo + service: frontend + annotations: + summary: "OTel Demo frontend P99 latency above 5ms" + description: "Frontend tail latency elevated. Expected under load-generator traffic." 
+ + # Fires when frontend-proxy is routing any traced traffic + - alert: OtelDemoFrontendProxyErrors + expr: > + sum(rate(latency_seconds_count{namespace="span_derived", service="frontend-proxy"}[5m])) > 0 + for: 1m + labels: + severity: warning + component: otel-demo + service: frontend-proxy + annotations: + summary: "OTel Demo frontend proxy active traffic" + description: "Envoy proxy is actively routing requests. Monitor for elevated 5xx rates." + + # Checkout pipeline alerts — monitors the critical purchase flow + - name: otel_demo_checkout + interval: 30s + rules: + # Fires when the checkout service has active traced traffic + - alert: OtelDemoCheckoutErrors + expr: > + sum(rate(latency_seconds_count{namespace="span_derived", service="checkout"}[5m])) > 0 + for: 1m + labels: + severity: critical + component: otel-demo + service: checkout + annotations: + summary: "OTel Demo checkout service processing requests" + description: "Checkout flow is active. Monitor for error rate spikes." + + # Fires when the payment service has active traced traffic + - alert: OtelDemoPaymentFailures + expr: > + sum(rate(latency_seconds_count{namespace="span_derived", service="payment"}[5m])) > 0 + for: 1m + labels: + severity: critical + component: otel-demo + service: payment + annotations: + summary: "OTel Demo payment service processing requests" + description: "Payment pipeline is active. Check for paymentFailure feature flag if error rate increases." + + # Fires when the cart service has active traced traffic + - alert: OtelDemoCartErrors + expr: > + sum(rate(latency_seconds_count{namespace="span_derived", service="cart"}[5m])) > 0 + for: 1m + labels: + severity: warning + component: otel-demo + service: cart + annotations: + summary: "OTel Demo cart service processing requests" + description: "Cart operations active. Check for cartFailure feature flag if error rate increases." + + # General microservice health — monitors all demo services via span-derived metrics + - name: otel_demo_services + interval: 60s + rules: + # Fires when any demo service has active traced traffic + - alert: OtelDemoServiceHighErrorRate + expr: > + sum by (service) (rate(latency_seconds_count{ + namespace="span_derived", + service=~"ad|cart|checkout|currency|email|payment|product-catalog|product-reviews|recommendation|shipping|quote" + }[5m])) > 0 + for: 1m + labels: + severity: warning + component: otel-demo + annotations: + summary: "OTel Demo service {{ $labels.service }} handling traced traffic" + description: "Service is actively processing requests. Monitor for elevated error rates." + + # Fires when any demo service P99 latency exceeds 10ms (always true under load) + - alert: OtelDemoServiceHighLatency + expr: > + histogram_quantile(0.99, + sum by (le, service) (rate(latency_seconds_bucket{ + namespace="span_derived", + service=~"ad|cart|checkout|currency|email|payment|product-catalog|product-reviews|recommendation|shipping|quote" + }[5m])) + ) > 0.01 + for: 1m + labels: + severity: warning + component: otel-demo + annotations: + summary: "OTel Demo service {{ $labels.service }} P99 latency above 10ms" + description: "Service tail latency elevated. Check for resource constraints or downstream issues." + + # Fires when any demo service is using non-trivial memory (always true for running services). + # Uses an OTLP runtime metric — kept as a canary that the JVM/.NET/Node.js + # resource-heavy label sets still land in Cortex (exercises C2b's label cap). 
+ - alert: OtelDemoAdServiceHighCpu + expr: > + process_memory_usage_bytes{service_name=~"ad|cart|checkout|accounting|recommendation|product-reviews"} / 1024 / 1024 > 20 + for: 1m + labels: + severity: warning + component: otel-demo + annotations: + summary: "OTel Demo service {{ $labels.service_name }} memory above 20MB" + description: "Service memory usage elevated. The accounting service typically uses ~150MB." diff --git a/docker-compose/prometheus/rules-stack/stack-alerts.yml b/docker-compose/prometheus/rules-stack/stack-alerts.yml new file mode 100644 index 00000000..1f816ecb --- /dev/null +++ b/docker-compose/prometheus/rules-stack/stack-alerts.yml @@ -0,0 +1,85 @@ +# Observability Stack health alerting rules +# Loaded into Cortex's `stack` ruler namespace by cortex-rules-init (base compose). +# These rules run whether or not the otel-demo overlay is enabled. +# +# Every rule here targets metrics emitted or scraped by the stack itself +# (Cortex and the OTel collector) — not by applications — so they stay +# meaningful even with no workload pushing telemetry. +# +# Rule group: +# - stack_health: Cortex scrape targets + OTel collector pipeline health + +groups: + - name: stack_health + interval: 30s + rules: + # Fires when the OTel collector stops reporting to Cortex's self-scrape. + # Scoped to job="otel-collector" so the envoy scrape (which fails with + # DNS NXDOMAIN whenever the otel-demo overlay is off) can't flap this. + - alert: PrometheusTargetDown + expr: up{job="otel-collector"} == 0 + for: 2m + labels: + severity: critical + component: observability-stack + annotations: + summary: "OTel collector scrape target is down" + description: >- + Cortex has been unable to scrape the OTel collector's self-metrics + endpoint (localhost:8888) for 2 minutes. No telemetry is being + ingested until this is resolved. + + # Fires when the collector is failing to export any signal type to + # a downstream (Data Prepper for traces/logs, Cortex for metrics). + # This is the single most direct indicator of active data loss. + - alert: OtelCollectorExportFailures + expr: >- + rate(otelcol_exporter_send_failed_spans_total[5m]) > 0 + or rate(otelcol_exporter_send_failed_metric_points_total[5m]) > 0 + or rate(otelcol_exporter_send_failed_log_records_total[5m]) > 0 + for: 5m + labels: + severity: critical + component: observability-stack + annotations: + summary: "OTel collector failing to export to {{ $labels.exporter }}" + description: >- + The collector cannot deliver telemetry to a downstream backend. + Check that Data Prepper and Cortex (`prometheus`) are healthy and + accepting writes. + + # Fires when the collector process RSS approaches its 500MB memory + # limit configured in docker-compose.yml. Leading indicator of OOM + # kill + a gap in ingested telemetry. + - alert: OtelCollectorHighMemory + expr: otelcol_process_memory_rss{job="otel-collector"} / 1024 / 1024 > 400 + for: 5m + labels: + severity: warning + component: observability-stack + annotations: + summary: "OTel collector RSS at {{ $value | printf \"%.0f\" }}MB (limit: 500MB)" + description: >- + The collector is within 100MB of its memory limit. Raise + OTEL_COLLECTOR_MEMORY_LIMIT in .env or reduce throughput before + OOM kills the container. + + # Fires when the outbound exporter queue is >80% full. Usually means + # the downstream (Data Prepper or Cortex) is accepting writes too + # slowly — a leading indicator before OtelCollectorExportFailures. 
+ - alert: OtelCollectorQueueNearCapacity + expr: >- + (otelcol_exporter_queue_size / otelcol_exporter_queue_capacity) > 0.8 + and otelcol_exporter_queue_capacity > 0 + for: 5m + labels: + severity: warning + component: observability-stack + annotations: + summary: >- + OTel collector {{ $labels.exporter }} queue at + {{ $value | humanizePercentage }} + description: >- + The outbound queue is filling up. Check the downstream backend + for slow writes; sustained backpressure will eventually cause + OtelCollectorExportFailures. diff --git a/docs/starlight-docs/src/content/docs/alerting/index.md b/docs/starlight-docs/src/content/docs/alerting/index.md index bd986885..6c5a80ae 100644 --- a/docs/starlight-docs/src/content/docs/alerting/index.md +++ b/docs/starlight-docs/src/content/docs/alerting/index.md @@ -51,3 +51,61 @@ Set the trigger to fire when the document count exceeds your threshold, and conf ## Learn more For the full alerting reference - including API operations, composite monitors, alert acknowledgment, and notification channel configuration - see the [Alerting documentation](https://docs.opensearch.org/latest/observing-your-data/alerting/index/) in the official OpenSearch docs. + +## Prometheus/Cortex alerting + +OpenSearch Alerting is one of two alerting surfaces in the stack. The other is a Cortex-side PromQL ruler that evaluates alert rules against time-series metrics and routes firing alerts through Alertmanager. Both surface in the same **Alert Manager** UI in OpenSearch Dashboards, so responders don't need to know which side produced an alert. + +**When to use which:** + +| Signal | Use | +|---|---| +| Log-volume thresholds, trace counts, OpenSearch cluster state | OpenSearch Alerting monitors | +| Metric thresholds, rate-based SLO burn, RED-method alerts | Cortex PromQL rules | + +### Rule file locations + +Cortex rules are shipped as YAML files mounted into the `cortex-rules-init` container on startup. Two namespaces are loaded: + +- **`stack`** — watches the observability stack itself. Loaded always. + - File: `docker-compose/prometheus/rules-stack/stack-alerts.yml` + - Alerts: `PrometheusTargetDown`, `OtelCollectorExportFailures`, `OtelCollectorHighMemory`, `OtelCollectorQueueNearCapacity` +- **`otel_demo`** — RED-method alerts against the OpenTelemetry demo services. Loaded only when `INCLUDE_COMPOSE_OTEL_DEMO` is enabled in `.env`. + - File: `docker-compose/prometheus/rules-otel-demo/otel-demo-alerts.yml` + - Alerts: `OtelDemoFrontendHighErrorRate`, `OtelDemoFrontendHighLatency`, `OtelDemoFrontendProxyErrors`, `OtelDemoCheckoutErrors`, `OtelDemoPaymentFailures`, `OtelDemoCartErrors`, `OtelDemoServiceHighErrorRate`, `OtelDemoServiceHighLatency`, `OtelDemoAdServiceHighCpu` + +To add or edit rules, change the YAML file and re-run the loader: + +```bash +docker compose up -d --force-recreate cortex-rules-init +``` + +The loader upserts via `POST /api/v1/rules/{namespace}`, so re-runs are idempotent and edits take effect immediately. Inspect loaded groups at `http://localhost:9090/api/v1/rules/stack` or `http://localhost:9090/api/v1/rules/otel_demo` (Cortex returns YAML from this Ruler API endpoint). + +### Alertmanager routing + +Alertmanager runs on `localhost:9093` and is configured via `docker-compose/alertmanager/alertmanager.template.yml` (credentials are injected at container start). 
The default routing tree sends: + +- `component=observability-stack` alerts → `opensearch-webhook` receiver (posts to the stack's own OpenSearch indices for correlation with traces/logs). +- otel-demo critical alerts → `otel-demo-critical` receiver. +- otel-demo warnings → `otel-demo-warning` receiver. +- Everything else → `null` receiver (dropped). + +Placeholder receivers for Slack, email, and PagerDuty are included as examples — replace the dummy URLs with your real endpoints before wiring alerts to production channels. `amtool check-config` validates the template, and `curl http://localhost:9093/api/v2/alerts` lists currently firing alerts. + +### The Alert Manager UI + +In OpenSearch Dashboards, **Alert Manager** (under the main menu) renders both OpenSearch monitors and Cortex alerts in one list. It reads from two datasources: + +- **Local cluster** — OpenSearch Alerting monitors (the ones described earlier on this page). +- **`ObservabilityStack_Prometheus`** — the Cortex datasource configured with `prometheus.uri`, `prometheus.ruler.uri`, and `alertmanager.uri`. The UI pulls firing alerts from Alertmanager, rule definitions from the Cortex Ruler API, and query results from Cortex's PromQL endpoint. + +Filter by datasource in the UI's top-right to scope to just one source when investigating. + +If the UI shows zero Cortex alerts even though they are firing in Cortex (check `curl http://localhost:9090/prometheus/api/v1/alerts`), confirm the datasource has all three URI properties set: + +```bash +curl -u admin:PASSWORD http://localhost:5601/api/dataconnections | jq '.[] | select(.name=="ObservabilityStack_Prometheus") | .properties' +``` + +The stack's init container reconciles these properties automatically on every run; if they are still missing after a rerun, re-create the datasource with `docker compose down -v && docker compose up -d`. diff --git a/test/checks.sh b/test/checks.sh index 6143d0ed..834f0ddb 100755 --- a/test/checks.sh +++ b/test/checks.sh @@ -39,8 +39,10 @@ run_checks() { echo " OTel Collector OTLP HTTP: OK" echo "==> Checking Prometheus is up..." + # Cortex runs under the "prometheus" service name and exposes /ready + # (not the vanilla Prometheus /-/healthy endpoint). retry_check "Prometheus" "$HEALTH_CHECK_RETRIES" "200" \ - "http://localhost:${PROMETHEUS_PORT}/-/healthy" + "http://localhost:${PROMETHEUS_PORT}/ready" echo " Prometheus: OK" echo "==> Checking OpenSearch Dashboards is up..."
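
The two init scripts in this change create monitors idempotently but never remove them, so disabling the otel-demo overlay leaves its five monitors behind. The sketch below is one way to clean them up, assuming the same credentials and the Alerting API endpoints the scripts already use (`_search` on `monitor.name.keyword`, then `DELETE /_plugins/_alerting/monitors/{id}`); the script itself (`find_monitor_id`, `MONITOR_NAMES`) is illustrative and not part of this diff.

```python
#!/usr/bin/env python3
"""Delete the OTel Demo alerting monitors created by init-otel-demo-monitors.py.

Hedged sketch: reuses the _plugins/_alerting endpoints the init scripts call.
"""
import requests

OPENSEARCH_URL = "https://opensearch:9200"
AUTH = ("admin", "My_password_123!@#")

# Names must match the "name" fields in init-otel-demo-monitors.py exactly.
MONITOR_NAMES = [
    "OTel Demo - Checkout Errors",
    "OTel Demo - Payment Failures",
    "OTel Demo - Frontend Error Logs",
    "OTel Demo - Slow Frontend Responses",
    "OTel Demo - Cart Service Errors",
]


def find_monitor_id(name):
    """Look up a monitor id by exact name, mirroring get_existing_monitor()."""
    resp = requests.post(
        f"{OPENSEARCH_URL}/_plugins/_alerting/monitors/_search",
        auth=AUTH, verify=False, timeout=10,
        json={"size": 1, "query": {"term": {"monitor.name.keyword": name}}},
    )
    hits = resp.json().get("hits", {}).get("hits", [])
    return hits[0]["_id"] if hits else None


for name in MONITOR_NAMES:
    monitor_id = find_monitor_id(name)
    if monitor_id:
        # DELETE removes the monitor definition (and stops its schedule).
        requests.delete(
            f"{OPENSEARCH_URL}/_plugins/_alerting/monitors/{monitor_id}",
            auth=AUTH, verify=False, timeout=10,
        )
        print(f"Deleted monitor: {name}")
    else:
        print(f"Monitor not found (already removed?): {name}")
```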
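
The alerting docs above say the loader upserts rule groups via `POST /api/v1/rules/{namespace}` and that the Ruler API returns YAML. A minimal read-only check of what actually got loaded, assuming the host-published port 9090 and that Cortex multi-tenancy is off (so no `X-Scope-OrgID` header is needed); the `/prometheus/api/v1/alerts` path is the same one the docs suggest curl-ing, everything else is illustrative.

```python
#!/usr/bin/env python3
"""Print the rule groups loaded into each Cortex ruler namespace and any
currently firing alerts, using the endpoints referenced in the alerting docs."""
import requests

CORTEX = "http://localhost:9090"

# Ruler config API — returns YAML (as noted in the docs). A 404 simply means
# the namespace was never loaded, e.g. otel_demo with the overlay disabled.
for namespace in ("stack", "otel_demo"):
    resp = requests.get(f"{CORTEX}/api/v1/rules/{namespace}", timeout=10)
    print(f"--- ruler namespace: {namespace} (HTTP {resp.status_code}) ---")
    if resp.ok:
        print(resp.text)

# Prometheus-compatible alerts endpoint — lists alerts in pending/firing state.
alerts = requests.get(f"{CORTEX}/prometheus/api/v1/alerts", timeout=10).json()
for alert in alerts.get("data", {}).get("alerts", []):
    print(alert["labels"].get("alertname"), alert.get("state"))
```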
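
To exercise the Alertmanager routing tree without waiting for a Cortex rule to fire, alerts can be pushed directly through Alertmanager's v2 API. A sketch under the assumptions that port 9093 is published as documented and that the `component=observability-stack` label still routes to the `opensearch-webhook` receiver; the alert name `SyntheticRoutingCheck` is made up for the test and is not part of this diff.

```python
#!/usr/bin/env python3
"""Push a short-lived synthetic alert into Alertmanager to exercise routing."""
from datetime import datetime, timedelta, timezone
import requests

ALERTMANAGER = "http://localhost:9093"
now = datetime.now(timezone.utc)

synthetic_alert = [{
    # component=observability-stack should hit the opensearch-webhook receiver
    # per the default routing tree; the alertname is purely illustrative.
    "labels": {
        "alertname": "SyntheticRoutingCheck",
        "severity": "warning",
        "component": "observability-stack",
    },
    "annotations": {"summary": "Manual routing test - safe to ignore"},
    "startsAt": now.isoformat(),
    "endsAt": (now + timedelta(minutes=5)).isoformat(),
}]

resp = requests.post(f"{ALERTMANAGER}/api/v2/alerts", json=synthetic_alert, timeout=10)
resp.raise_for_status()

# The synthetic alert should now appear alongside anything Cortex is firing.
for alert in requests.get(f"{ALERTMANAGER}/api/v2/alerts", timeout=10).json():
    print(alert["labels"].get("alertname"), alert["status"]["state"])
```

The alert expires on its own at `endsAt`, so no cleanup call is needed.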
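
Finally, the `PrometheusTargetDown` and `OtelCollectorExportFailures` rules in `stack-alerts.yml` only mean anything if the collector's `prometheus/self` scrape is actually reaching Cortex over remote write. A quick instant-query check, assuming the query API lives under the same `/prometheus` prefix the docs use for the alerts endpoint; the metric names are taken from the rule expressions above.

```python
#!/usr/bin/env python3
"""Confirm the OTel collector self-scrape metrics are landing in Cortex."""
import requests

QUERY_URL = "http://localhost:9090/prometheus/api/v1/query"


def instant_query(expr):
    """Run a PromQL instant query and return the result vector."""
    resp = requests.get(QUERY_URL, params={"query": expr}, timeout=10)
    resp.raise_for_status()
    return resp.json()["data"]["result"]


# A value of 1 means the self-scrape target is up; an empty result means the
# prometheus/self receiver or the remote-write exporter is not working.
for sample in instant_query('up{job="otel-collector"}'):
    print(sample["metric"].get("instance"), "up =", sample["value"][1])

# Non-zero rates here are exactly what OtelCollectorExportFailures alerts on.
failures = instant_query(
    'sum by (exporter) (rate(otelcol_exporter_send_failed_spans_total[5m]))'
)
print("export-failure series returned:", len(failures))
```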