From 105844dc1cf3ed8f2aecff2a330e08d05721d015 Mon Sep 17 00:00:00 2001 From: Aidan Hall Date: Thu, 16 Apr 2026 10:56:58 +1000 Subject: [PATCH 01/36] feat(metrics): add v1 of chronos prometheus metrics --- Cargo.lock | 195 +++++++++++-- Cargo.toml | 4 + Dockerfile.chronos | 2 +- Dockerfile.chronos-pg-migrations | 2 +- Dockerfile.chronos-slim | 84 ++++++ Makefile | 19 +- chronos_bin/Cargo.toml | 4 + chronos_bin/src/bin/chronos.rs | 4 + chronos_bin/src/kafka/consumer.rs | 2 +- chronos_bin/src/kafka/producer.rs | 2 +- chronos_bin/src/lib.rs | 1 + chronos_bin/src/message_processor.rs | 66 ++++- chronos_bin/src/message_receiver.rs | 63 ++++- chronos_bin/src/metrics/mod.rs | 3 + chronos_bin/src/metrics/registry.rs | 257 +++++++++++++++++ chronos_bin/src/metrics/server.rs | 27 ++ chronos_bin/src/monitor.rs | 17 +- chronos_bin/src/runner.rs | 23 +- chronos_bin/src/utils/config.rs | 2 + docker-compose.yml | 260 ++++++++---------- examples/chronos_ex/Cargo.toml | 3 + examples/chronos_ex/examples/chronos_ex.rs | 4 + .../examples/publish_test_message.rs | 56 ++++ .../chronos_ex/examples/telemetry_simple.rs | 3 +- rust-toolchain.toml | 2 +- scripts/integration.sh | 191 +++++++++++++ 26 files changed, 1093 insertions(+), 203 deletions(-) create mode 100644 Dockerfile.chronos-slim create mode 100644 chronos_bin/src/metrics/mod.rs create mode 100644 chronos_bin/src/metrics/registry.rs create mode 100644 chronos_bin/src/metrics/server.rs create mode 100644 examples/chronos_ex/examples/publish_test_message.rs create mode 100755 scripts/integration.sh diff --git a/Cargo.lock b/Cargo.lock index 7df7f80..889f2bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -142,13 +142,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" dependencies = [ "async-trait", - "axum-core", + "axum-core 0.3.4", "bitflags 1.3.2", "bytes", "futures-util", - "http", - "http-body", - "hyper", + "http 0.2.9", + "http-body 
0.4.5", + "hyper 0.14.27", "itoa", "matchit", "memchr", @@ -157,7 +157,37 @@ dependencies = [ "pin-project-lite", "rustversion", "serde", - "sync_wrapper", + "sync_wrapper 0.1.2", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf" +dependencies = [ + "async-trait", + "axum-core 0.4.5", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper 1.5.2", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper 1.0.2", + "tokio", "tower", "tower-layer", "tower-service", @@ -172,14 +202,34 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http", - "http-body", + "http 0.2.9", + "http-body 0.4.5", "mime", "rustversion", "tower-layer", "tower-service", ] +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper 1.0.2", + "tower-layer", + "tower-service", +] + [[package]] name = "backtrace" version = "0.3.69" @@ -298,6 +348,7 @@ version = "0.2.1" dependencies = [ "anyhow", "async-trait", + "axum 0.7.5", "cargo-husky", "chrono", "clap", @@ -313,6 +364,7 @@ dependencies = [ "opentelemetry-otlp", "opentelemetry_api", "opentelemetry_sdk", + "prometheus", "rand", "rdkafka", "refinery", @@ -332,6 +384,7 @@ dependencies = [ name = "chronos_ex" version = "0.0.2" dependencies = [ + "chrono", "chronos_bin", "dotenv", "env_logger 0.9.3", @@ -343,10 +396,12 @@ dependencies = [ "opentelemetry-stdout", "opentelemetry_api", 
"opentelemetry_sdk", + "serde_json", "tokio", "tracing", "tracing-opentelemetry", "tracing-subscriber", + "uuid", ] [[package]] @@ -812,7 +867,7 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http", + "http 0.2.9", "indexmap 1.9.3", "slab", "tokio", @@ -873,6 +928,16 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + [[package]] name = "http-body" version = "0.4.5" @@ -880,7 +945,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", - "http", + "http 0.2.9", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.4.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http 1.4.0", + "http-body 1.0.1", "pin-project-lite", ] @@ -913,8 +1001,8 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http", - "http-body", + "http 0.2.9", + "http-body 0.4.5", "httparse", "httpdate", "itoa", @@ -926,18 +1014,52 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + 
"tokio", +] + [[package]] name = "hyper-timeout" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.27", "pin-project-lite", "tokio", "tokio-io-timeout", ] +[[package]] +name = "hyper-util" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" +dependencies = [ + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "hyper 1.5.2", + "pin-project-lite", + "tokio", +] + [[package]] name = "iana-time-zone" version = "0.1.57" @@ -1257,7 +1379,7 @@ checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b" dependencies = [ "async-trait", "bytes", - "http", + "http 0.2.9", "opentelemetry_api", "reqwest", ] @@ -1285,7 +1407,7 @@ checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275" dependencies = [ "async-trait", "futures-core", - "http", + "http 0.2.9", "opentelemetry-http", "opentelemetry-proto", "opentelemetry-semantic-conventions", @@ -1566,6 +1688,21 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prometheus" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" +dependencies = [ + "cfg-if", + "fnv", + "lazy_static", + "memchr", + "parking_lot", + "protobuf", + "thiserror", +] + [[package]] name = "prost" version = "0.11.9" @@ -1589,6 +1726,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "protobuf" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" + [[package]] name = "quote" version = "1.0.33" @@ -1772,9 +1915,9 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http", - "http-body", - "hyper", + 
"http 0.2.9", + "http-body 0.4.5", + "hyper 0.14.27", "ipnet", "js-sys", "log", @@ -2019,9 +2162,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.1" +version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" [[package]] name = "socket2" @@ -2162,6 +2305,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" + [[package]] name = "system-configuration" version = "0.5.1" @@ -2432,15 +2581,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a" dependencies = [ "async-trait", - "axum", + "axum 0.6.20", "base64 0.21.4", "bytes", "futures-core", "futures-util", "h2", - "http", - "http-body", - "hyper", + "http 0.2.9", + "http-body 0.4.5", + "hyper 0.14.27", "hyper-timeout", "percent-encoding", "pin-project", diff --git a/Cargo.toml b/Cargo.toml index 16987fa..2fca9a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,3 +43,7 @@ opentelemetry-stdout = { version = "0.1.0", features = ["trace"] } opentelemetry-otlp = { version = "0.13.0", features = ["http-proto", "reqwest-client"] } opentelemetry-http = "0.9.0" +# metrics +prometheus = "0.13" +axum = { version = "0.7", default-features = false, features = ["http1", "tokio"] } + diff --git a/Dockerfile.chronos b/Dockerfile.chronos index 0a081e8..2e90a53 100644 --- a/Dockerfile.chronos +++ b/Dockerfile.chronos @@ -1,4 +1,4 @@ -FROM rust:1.75.0-bookworm AS BUILD +FROM rust:1.94-bookworm AS BUILD # Install software RUN 
update-ca-certificates && apt-get update && apt-get install -y libsasl2-dev # Create appuser diff --git a/Dockerfile.chronos-pg-migrations b/Dockerfile.chronos-pg-migrations index d6e7026..d4a4f85 100644 --- a/Dockerfile.chronos-pg-migrations +++ b/Dockerfile.chronos-pg-migrations @@ -1,4 +1,4 @@ -FROM rust:1.75.0-bookworm AS BUILD +FROM rust:1.94-bookworm AS BUILD # Install software RUN update-ca-certificates && apt-get update && apt-get install -y libsasl2-dev # Create appuser diff --git a/Dockerfile.chronos-slim b/Dockerfile.chronos-slim new file mode 100644 index 0000000..90b6741 --- /dev/null +++ b/Dockerfile.chronos-slim @@ -0,0 +1,84 @@ +# syntax=docker/dockerfile:1 +# +# Dockerfile.chronos-slim — scratch image for minimal production deployments. +# +# Key differences from Dockerfile.chronos ("fat" / glibc image): +# - Uses Alpine + musl to produce a fully static binary (zero runtime OS deps) +# - Unit tests are executed during the build stage; the image build fails if +# any test fails +# - The final stage is FROM scratch — no OS, shell, or package manager +# +# To use this image in docker-compose, change the chronos service to: +# build: +# context: . +# dockerfile: Dockerfile.chronos-slim + +# ───────────────────────────────────────────────────────────────────────────── +# Build stage +# Alpine's musl toolchain is used throughout. rdkafka compiles librdkafka from +# source (cmake). The SASL feature requires Cyrus SASL, which Alpine packages +# include as static libs so the final binary has no shared-library dependencies. +# ───────────────────────────────────────────────────────────────────────────── +FROM rust:1.94-alpine AS builder + +RUN apk add --no-cache \ + musl-dev \ + cmake \ + make \ + g++ \ + cyrus-sasl-dev \ + openssl-dev \ + openssl-libs-static \ + pkgconfig \ + perl + +WORKDIR /build +COPY ./ . 
+ +# ── Run unit tests ───────────────────────────────────────────────────────── +# Library unit tests run without external services (no Kafka or Postgres). +# Building the test binary also verifies that the code compiles and links +# under musl (dev/test profile). The image build is aborted here if any test fails. +RUN PKG_CONFIG_ALL_STATIC=1 \ + RUSTFLAGS="-C target-feature=+crt-static" \ + cargo test --lib -p chronos_bin \ + --target x86_64-unknown-linux-musl + +# ── Build static release binary ───────────────────────────────────────────── +# PKG_CONFIG_ALL_STATIC=1 → pkg-config prefers static (.a) variants of all +# C libraries (sasl2, openssl, …) +# +crt-static → embed the musl C runtime; no libc.so at runtime +# +# Note: the test step above compiles under the dev profile, so only downloaded +# crate sources are reused here; all crates are rebuilt for the release profile. +RUN PKG_CONFIG_ALL_STATIC=1 \ + RUSTFLAGS="-C target-feature=+crt-static" \ + cargo build --release -p chronos_bin \ + --target x86_64-unknown-linux-musl + +# Minimal passwd/group entries for the non-root user in the scratch image +RUN printf 'chronos:x:1000:1000::/nonexistent:/sbin/nologin\n' > /tmp/passwd && \ + printf 'chronos:x:1000:\n' > /tmp/group + +# ───────────────────────────────────────────────────────────────────────────── +# Runtime stage — FROM scratch +# The binary is the entire filesystem contents (plus certs and user files). 
+# ───────────────────────────────────────────────────────────────────────────── +FROM scratch + +# TLS root certificates required for: +# - OTLP trace exporters (HTTPS) +# - Kafka with TLS listeners +COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ + +# User and group files so the container runs as non-root (uid 1000) +COPY --from=builder /tmp/passwd /etc/passwd +COPY --from=builder /tmp/group /etc/group + +# The statically compiled binary — the only executable in this image +COPY --from=builder \ + /build/target/x86_64-unknown-linux-musl/release/chronos \ + /chronos + +USER 1000:1000 +ENTRYPOINT ["/chronos"] diff --git a/Makefile b/Makefile index 84767ef..08c0ab7 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,8 @@ #!make SHELL:=/bin/bash +RUST_VERSION := $(shell grep 'channel' rust-toolchain.toml | sed 's/.*"\(.*\)"/\1/') + # pp - pretty print function yellow := $(shell tput setaf 3) normal := $(shell tput sgr0) @@ -30,7 +32,6 @@ withenv: # export CPPFLAGS=-I/opt/homebrew/opt/openssl@1.1/include dev.init: install $(call pp,install git hooks...) - cargo install cargo-watch cargo test ## dev.kafka_init: 🥁 Init kafka topic @@ -98,6 +99,22 @@ test.unit: $(call pp,rust unit tests...) cargo test +## integration: 🧪 Start deps, migrate, run Chronos, publish test message, verify metrics +integration: build + $(call pp,running integration test...) + @bash scripts/integration.sh + +## integration.down: 🛑 Stop docker services started by make integration +integration.down: + $(call pp,stopping integration services...) + docker compose stop postgres kafka jaeger-all-in-one otel-collector 2>/dev/null || true + docker compose rm -f postgres kafka jaeger-all-in-one otel-collector 2>/dev/null || true + +## metrics.check: 🔍 Verify /metrics endpoint responds (requires running app) +metrics.check: + $(call pp,check metrics endpoint...) 
+ curl -sf http://localhost:9090/metrics | head -20 + ## test.unit.coverage: 🧪 Runs rust unit tests with coverage 'cobertura' and 'junit' reports test.unit.coverage: $(call pp,rust unit tests...) diff --git a/chronos_bin/Cargo.toml b/chronos_bin/Cargo.toml index cee9e01..0a723c7 100644 --- a/chronos_bin/Cargo.toml +++ b/chronos_bin/Cargo.toml @@ -64,6 +64,10 @@ opentelemetry-jaeger.workspace = true # opentelemetry-jaeger.workspace = true opentelemetry-otlp.workspace = true +# metrics +prometheus.workspace = true +axum.workspace = true + [dev-dependencies] serial_test.workspace = true diff --git a/chronos_bin/src/bin/chronos.rs b/chronos_bin/src/bin/chronos.rs index 0d9a36b..b146037 100644 --- a/chronos_bin/src/bin/chronos.rs +++ b/chronos_bin/src/bin/chronos.rs @@ -1,6 +1,7 @@ use chronos_bin::kafka::config::KafkaConfig; use chronos_bin::kafka::consumer::KafkaConsumer; use chronos_bin::kafka::producer::KafkaProducer; +use chronos_bin::metrics::ChronosMetrics; use chronos_bin::postgres::config::PgConfig; use chronos_bin::postgres::pg::Pg; use chronos_bin::runner::Runner; @@ -40,11 +41,14 @@ async fn main() { }, }; + let metrics = Arc::new(ChronosMetrics::new().expect("Failed to initialize metrics registry")); + info!("starting chronos establish connections"); let r = Runner { data_store: Arc::new(data_store), producer: Arc::new(kafka_producer), consumer: Arc::new(kafka_consumer), + metrics, }; debug!("debug logs starting chronos"); diff --git a/chronos_bin/src/kafka/consumer.rs b/chronos_bin/src/kafka/consumer.rs index 336f0e8..04a0a3e 100644 --- a/chronos_bin/src/kafka/consumer.rs +++ b/chronos_bin/src/kafka/consumer.rs @@ -7,7 +7,7 @@ use rdkafka::message::BorrowedMessage; use super::config::KafkaConfig; -use tracing::{instrument, trace, warn}; +use tracing::{trace, warn}; // Kafka Consumer Client pub struct KafkaConsumer { diff --git a/chronos_bin/src/kafka/producer.rs b/chronos_bin/src/kafka/producer.rs index 3fc3b94..0192df3 100644 --- 
a/chronos_bin/src/kafka/producer.rs +++ b/chronos_bin/src/kafka/producer.rs @@ -25,7 +25,7 @@ impl KafkaProducer { Self { producer, topic } } #[instrument(skip_all, fields(topic = %self.topic))] - pub(crate) async fn kafka_publish(&self, message: String, headers: Option>, key: String) -> Result { + pub async fn kafka_publish(&self, message: String, headers: Option>, key: String) -> Result { // Only because never expecting wrong headers to reach here let unwrap_header = &headers.unwrap_or_default(); diff --git a/chronos_bin/src/lib.rs b/chronos_bin/src/lib.rs index 98bac13..3f969e8 100644 --- a/chronos_bin/src/lib.rs +++ b/chronos_bin/src/lib.rs @@ -2,6 +2,7 @@ pub mod core; mod message_processor; mod message_receiver; +pub mod metrics; mod monitor; pub mod runner; diff --git a/chronos_bin/src/message_processor.rs b/chronos_bin/src/message_processor.rs index 9e03172..4ca42e8 100644 --- a/chronos_bin/src/message_processor.rs +++ b/chronos_bin/src/message_processor.rs @@ -1,4 +1,5 @@ use crate::kafka::producer::KafkaProducer; +use crate::metrics::ChronosMetrics; use crate::postgres::pg::{GetReady, Pg, TableRow}; use crate::utils::config::ChronosConfig; use crate::utils::delay_controller::DelayController; @@ -12,6 +13,7 @@ use uuid::Uuid; pub struct MessageProcessor { pub(crate) data_store: Arc, pub(crate) producer: Arc, + pub(crate) metrics: Arc, } impl MessageProcessor { @@ -49,6 +51,9 @@ impl MessageProcessor { } }; + // Capture deadline before updated_row fields are moved into the publish call. + let deadline = updated_row.deadline; + let readied_by_column = Some(updated_row.readied_by.to_string()); tracing::Span::current().record("correlationId", &readied_by_column); @@ -60,6 +65,10 @@ impl MessageProcessor { .kafka_publish(updated_row.message_value.to_string(), Some(headers), updated_row.message_key.to_string()) .await { + // msg_jitter: difference between actual publish time and client-requested deadline. 
+ // Floored at 0 to guard against clock skew producing negative jitter. + let jitter_secs = (Utc::now() - deadline).num_milliseconds().max(0) as f64 / 1000.0; + self.metrics.msg_jitter.observe(jitter_secs); Ok(id) } else { Err("error occurred while publishing".to_string()) @@ -88,11 +97,14 @@ impl MessageProcessor { } } + /// Returns `(returned, status)` where: + /// - `returned = true` means the loop exited early (no rows ready to fire) + /// - `returned = false` means rows were processed (or a terminal error occurred) + /// - `status = "pass"` on success, `"fail"` on unrecoverable error #[tracing::instrument(skip_all)] - async fn processor_message_ready(&self, node_id: Uuid) { + async fn processor_message_ready(&self, node_id: Uuid) -> (bool, &'static str) { loop { log::debug!("retry loop"); - // thread::sleep(Duration::from_millis(100)); let max_retry_count = 3; let mut retry_count = 0; @@ -102,8 +114,6 @@ impl MessageProcessor { readied_at: deadline, readied_by: node_id, deadline, - // limit: 1000, - // order: "asc", }; let readied_by_column: Option = None; @@ -111,31 +121,28 @@ impl MessageProcessor { match resp { Ok(ready_to_publish_rows) => { if ready_to_publish_rows.is_empty() { - log::debug!("no rows ready to fire for dealine {}", deadline); - break; + log::debug!("no rows ready to fire for deadline {}", deadline); + return (true, "pass"); } else { let publish_futures = ready_to_publish_rows.into_iter().map(|row| self.prepare_to_publish(row)); let results = futures::future::join_all(publish_futures).await; - // closure to gather ids from results vector and ignore error from result - let ids: Vec = results.into_iter().filter_map(|result| result.ok()).collect(); if !ids.is_empty() { let _ = self.delete_fired_records_from_db(&ids).await; log::debug!("number of rows published successfully and deleted from DB {}", ids.len()); - break; + return (false, "pass"); } } } Err(e) => { if e.contains("could not serialize access due to concurrent update") && retry_count 
< max_retry_count { - //retry goes here retry_count += 1; if retry_count == max_retry_count { log::error!("Error: max retry count {} reached by node {:?} for row ", max_retry_count, readied_by_column); - break; + return (false, "fail"); } } log::error!("Error: error occurred in message processor while publishing {}", e); @@ -143,6 +150,7 @@ impl MessageProcessor { } } } + pub async fn run(&self) { log::info!("MessageProcessor ON!"); @@ -154,9 +162,43 @@ impl MessageProcessor { loop { log::debug!("MessageProcessor loop"); tokio::time::sleep(Duration::from_millis(10)).await; - self.processor_message_ready(node_id).await; + + // msg_process_latency: time the full processor_message_ready() call. + let timer = std::time::Instant::now(); + let (returned, status) = self.processor_message_ready(node_id).await; + let elapsed = timer.elapsed().as_secs_f64(); + if let Ok(obs) = self.metrics.msg_process_latency.get_metric_with_label_values(&[&returned.to_string(), status]) { + obs.observe(elapsed); + } else { + log::error!("metrics: failed to observe msg_process_latency"); + } delay_controller.sleep().await; } } } + +#[cfg(test)] +mod tests { + use crate::metrics::ChronosMetrics; + + #[test] + fn test_jitter_calculation_positive() { + use chrono::{Duration, Utc}; + let deadline = Utc::now() - Duration::milliseconds(300); + let jitter_ms = (Utc::now() - deadline).num_milliseconds().max(0); + assert!(jitter_ms >= 300, "jitter should be at least 300ms when deadline was 300ms ago"); + } + + #[test] + fn test_jitter_below_500ms_within_sla() { + let metrics = ChronosMetrics::new().unwrap(); + // A 300ms jitter is within the 500ms SLA — must land in the <=0.5s bucket + metrics.msg_jitter.observe(0.3); + let families = metrics.registry.gather(); + let fam = families.iter().find(|f| f.get_name() == "msg_jitter").unwrap(); + let hist = fam.get_metric()[0].get_histogram(); + let bucket_500 = hist.get_bucket().iter().find(|b| (b.get_upper_bound() - 0.5).abs() < 1e-9).unwrap(); + 
assert_eq!(bucket_500.get_cumulative_count(), 1, "300ms jitter must be counted in the <=500ms bucket"); + } +} diff --git a/chronos_bin/src/message_receiver.rs b/chronos_bin/src/message_receiver.rs index 93c0ea8..d79b548 100644 --- a/chronos_bin/src/message_receiver.rs +++ b/chronos_bin/src/message_receiver.rs @@ -1,18 +1,21 @@ use chrono::{DateTime, Utc}; +use rdkafka::message::BorrowedMessage; +use rdkafka::Message; use serde_json::json; +use std::{collections::HashMap, str::FromStr, sync::Arc}; use tracing::instrument; use crate::kafka::consumer::KafkaConsumer; use crate::kafka::producer::KafkaProducer; +use crate::metrics::ChronosMetrics; use crate::postgres::pg::{Pg, TableInsertRow}; use crate::utils::util::{get_message_key, get_payload_utf8, required_headers, CHRONOS_ID, DEADLINE}; -use rdkafka::message::BorrowedMessage; -use std::{collections::HashMap, str::FromStr, sync::Arc}; pub struct MessageReceiver { pub(crate) consumer: Arc, pub(crate) producer: Arc, pub(crate) data_store: Arc, + pub(crate) metrics: Arc, } impl MessageReceiver { @@ -81,21 +84,48 @@ impl MessageReceiver { #[tracing::instrument(name = "receiver_handle_message", skip_all, fields(correlationId, error))] pub async fn handle_message(&self, message: &BorrowedMessage<'_>) { + // msg_wait_time: record how long the message waited in the Kafka input queue. + // Uses the Kafka-assigned message timestamp; guards against clock skew with max(0). 
+ if let Some(kafka_ts_ms) = message.timestamp().to_millis() { + let wait_secs = (Utc::now().timestamp_millis() - kafka_ts_ms).max(0) as f64 / 1000.0; + self.metrics.msg_wait_time.observe(wait_secs); + } + + let timer = std::time::Instant::now(); + let mut destination = "unknown"; + let mut status = "pass"; + let new_message = &message; if let Some(reqd_headers) = required_headers(new_message) { tracing::Span::current().record("correlationId", &reqd_headers[CHRONOS_ID]); if let Ok(message_deadline) = DateTime::::from_str(&reqd_headers[DEADLINE]) { if message_deadline <= Utc::now() { + destination = "kafka"; if let Some(err) = self.prepare_and_publish(new_message, reqd_headers).await { + status = "fail"; log::error!("{}", err); tracing::Span::current().record("error", &err); } - } else if let Some(err_string) = self.insert_into_db(new_message, reqd_headers, message_deadline).await { - log::error!("{}", err_string); - tracing::Span::current().record("error", &err_string); + } else { + destination = "postgres"; + if let Some(err_string) = self.insert_into_db(new_message, reqd_headers, message_deadline).await { + status = "fail"; + log::error!("{}", err_string); + tracing::Span::current().record("error", &err_string); + } } } } + + // msg_consume_latency: only record when destination was determined (valid message headers). 
+ if destination != "unknown" { + let elapsed = timer.elapsed().as_secs_f64(); + if let Ok(obs) = self.metrics.msg_consume_latency.get_metric_with_label_values(&[destination, status]) { + obs.observe(elapsed); + } else { + log::error!("metrics: failed to observe msg_consume_latency"); + } + } } pub async fn run(&self) { @@ -110,9 +140,26 @@ impl MessageReceiver { log::error!("error while consuming message {:?}", e); } } - // if let Ok(message) = &self.consumer.kafka_consume_message().await { - // self.handle_message(message).await; - // } } } } + +#[cfg(test)] +mod tests { + #[test] + fn test_wait_time_calculation_non_negative() { + let kafka_ts_ms: i64 = 1_700_000_000_000; + let now_ms: i64 = kafka_ts_ms + 5_000; + let wait_secs = (now_ms - kafka_ts_ms).max(0) as f64 / 1000.0; + assert!((wait_secs - 5.0).abs() < 1e-9); + } + + #[test] + fn test_wait_time_calculation_clock_skew() { + // Simulates a future Kafka timestamp (clock skew) — should floor to 0.0 + let kafka_ts_ms: i64 = 9_999_999_999_999; + let now_ms: i64 = 1_700_000_000_000; + let wait_secs = (now_ms - kafka_ts_ms).max(0) as f64 / 1000.0; + assert_eq!(wait_secs, 0.0); + } +} diff --git a/chronos_bin/src/metrics/mod.rs b/chronos_bin/src/metrics/mod.rs new file mode 100644 index 0000000..ea3574c --- /dev/null +++ b/chronos_bin/src/metrics/mod.rs @@ -0,0 +1,3 @@ +pub mod registry; +pub mod server; +pub use registry::ChronosMetrics; diff --git a/chronos_bin/src/metrics/registry.rs b/chronos_bin/src/metrics/registry.rs new file mode 100644 index 0000000..2f547ad --- /dev/null +++ b/chronos_bin/src/metrics/registry.rs @@ -0,0 +1,257 @@ +use prometheus::{exponential_buckets, histogram_opts, opts, Counter, Histogram, HistogramVec, Registry}; + +/// All Prometheus metrics for Chronos. +/// Uses a per-instance Registry so tests can create isolated instances +/// without "already registered" collisions. +pub struct ChronosMetrics { + pub registry: Registry, + /// Duration of handle_message() in message_receiver. 
Labels: [destination, status] + /// destination = "kafka" | "postgres" + /// status = "pass" | "fail" + pub msg_consume_latency: HistogramVec, + /// Duration of processor_message_ready() loop in message_processor. Labels: [returned, status] + /// returned = "true" (no rows, loop returned early) | "false" (rows processed) + /// status = "pass" | "fail" + pub msg_process_latency: HistogramVec, + /// Time a message spent in the Kafka input queue before being processed. + pub msg_wait_time: Histogram, + /// Difference between actual publish time and client-requested deadline (jitter). + /// Includes an explicit 0.5s bucket matching the 500ms SLA. + pub msg_jitter: Histogram, + /// Number of records reset by reset_to_init_db() (the monitor task). + pub msg_reset: Counter, +} + +impl std::fmt::Debug for ChronosMetrics { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ChronosMetrics").finish() + } +} + +impl ChronosMetrics { + pub fn new() -> Result { + let registry = Registry::new(); + + let consume_buckets = exponential_buckets(0.001, 2.0, 12)?; + let msg_consume_latency = HistogramVec::new( + histogram_opts!("msg_consume_latency", "Duration of handle_message() in message_receiver", consume_buckets), + &["destination", "status"], + )?; + registry.register(Box::new(msg_consume_latency.clone()))?; + // Pre-warm all label combinations so the metric family always appears in gather() + // output from startup — HistogramVec is omitted from gather() until at least one + // label combination has been touched. 
+ for destination in &["kafka", "postgres"] { + for status in &["pass", "fail"] { + msg_consume_latency.get_metric_with_label_values(&[destination, status])?; + } + } + + let process_buckets = exponential_buckets(0.001, 2.0, 12)?; + let msg_process_latency = HistogramVec::new( + histogram_opts!( + "msg_process_latency", + "Duration of processor_message_ready() loop in message_processor", + process_buckets + ), + &["returned", "status"], + )?; + registry.register(Box::new(msg_process_latency.clone()))?; + // Pre-warm all label combinations for the same reason as msg_consume_latency above. + for returned in &["true", "false"] { + for status in &["pass", "fail"] { + msg_process_latency.get_metric_with_label_values(&[returned, status])?; + } + } + + let wait_buckets = exponential_buckets(0.1, 2.0, 14)?; + let msg_wait_time = Histogram::with_opts(histogram_opts!( + "msg_wait_time", + "Time a message spent in the Kafka input queue before processing", + wait_buckets + ))?; + registry.register(Box::new(msg_wait_time.clone()))?; + + // Custom buckets with explicit 0.5s boundary for the 500ms SLA + let jitter_buckets = vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]; + let msg_jitter = Histogram::with_opts(histogram_opts!( + "msg_jitter", + "Difference between actual publish time and client-requested deadline", + jitter_buckets + ))?; + registry.register(Box::new(msg_jitter.clone()))?; + + let msg_reset = Counter::with_opts(opts!("msg_reset", "Number of records reset by reset_to_init_db()"))?; + registry.register(Box::new(msg_reset.clone()))?; + + Ok(ChronosMetrics { + registry, + msg_consume_latency, + msg_process_latency, + msg_wait_time, + msg_jitter, + msg_reset, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use prometheus::{Encoder, TextEncoder}; + + #[test] + fn test_metrics_registry_creates_successfully() { + assert!(ChronosMetrics::new().is_ok()); + } + + #[test] + fn test_msg_consume_latency_records_observation() { + let metrics = 
ChronosMetrics::new().unwrap(); + metrics + .msg_consume_latency + .get_metric_with_label_values(&["kafka", "pass"]) + .unwrap() + .observe(0.05); + + let families = metrics.registry.gather(); + let fam = families.iter().find(|f| f.get_name() == "msg_consume_latency").unwrap(); + // With pre-warming there are 4 entries; find the kafka/pass one by its labels. + let kafka_pass = fam.get_metric().iter().find(|m| { + m.get_label().iter().any(|l| l.get_name() == "destination" && l.get_value() == "kafka") + && m.get_label().iter().any(|l| l.get_name() == "status" && l.get_value() == "pass") + }); + assert!(kafka_pass.is_some(), "kafka/pass label combination must exist"); + let sample_sum = kafka_pass.unwrap().get_histogram().get_sample_sum(); + assert!((sample_sum - 0.05).abs() < 1e-9); + } + + #[test] + fn test_msg_jitter_has_500ms_bucket() { + let metrics = ChronosMetrics::new().unwrap(); + // Observe a value just below 500ms + metrics.msg_jitter.observe(0.499); + + let families = metrics.registry.gather(); + let fam = families.iter().find(|f| f.get_name() == "msg_jitter").unwrap(); + let histogram = fam.get_metric()[0].get_histogram(); + + let bucket_500ms = histogram.get_bucket().iter().find(|b| (b.get_upper_bound() - 0.5).abs() < 1e-9); + assert!(bucket_500ms.is_some(), "0.5s bucket must exist in msg_jitter"); + assert_eq!( + bucket_500ms.unwrap().get_cumulative_count(), + 1, + "0.499s observation must be counted in the <=0.5s bucket" + ); + } + + #[test] + fn test_msg_reset_increments_correctly() { + let metrics = ChronosMetrics::new().unwrap(); + metrics.msg_reset.inc_by(3.0); + metrics.msg_reset.inc_by(2.0); + + let families = metrics.registry.gather(); + let fam = families.iter().find(|f| f.get_name() == "msg_reset").unwrap(); + let value = fam.get_metric()[0].get_counter().get_value(); + assert!((value - 5.0).abs() < 1e-9); + } + + #[test] + fn test_msg_wait_time_records_observation() { + let metrics = ChronosMetrics::new().unwrap(); + 
metrics.msg_wait_time.observe(1.5); + + let families = metrics.registry.gather(); + let fam = families.iter().find(|f| f.get_name() == "msg_wait_time").unwrap(); + let sample_count = fam.get_metric()[0].get_histogram().get_sample_count(); + assert_eq!(sample_count, 1); + } + + #[test] + fn test_msg_process_latency_label_values() { + let metrics = ChronosMetrics::new().unwrap(); + metrics + .msg_process_latency + .get_metric_with_label_values(&["true", "pass"]) + .unwrap() + .observe(0.01); + metrics + .msg_process_latency + .get_metric_with_label_values(&["false", "pass"]) + .unwrap() + .observe(0.05); + metrics + .msg_process_latency + .get_metric_with_label_values(&["false", "fail"]) + .unwrap() + .observe(0.1); + + let families = metrics.registry.gather(); + let fam = families.iter().find(|f| f.get_name() == "msg_process_latency").unwrap(); + // 3 explicit observations + pre-warming fills all 4 combos; de-dup means 4 entries. + assert_eq!(fam.get_metric().len(), 4); + } + + #[test] + fn test_metrics_text_encode_produces_output() { + let metrics = ChronosMetrics::new().unwrap(); + // All metrics should appear without any manual observations because HistogramVec + // combos are pre-warmed in new(). Scalar histograms and counters always appear. + let encoder = TextEncoder::new(); + let mut buffer = Vec::new(); + encoder.encode(&metrics.registry.gather(), &mut buffer).unwrap(); + let output = String::from_utf8(buffer).unwrap(); + + assert!(output.contains("msg_reset")); + assert!(output.contains("msg_jitter")); + assert!(output.contains("msg_wait_time")); + assert!(output.contains("msg_consume_latency")); + assert!(output.contains("msg_process_latency")); + } + + /// All 5 metric families must be present in a freshly constructed registry even + /// before any messages are processed (i.e., with zero observations). 
+ #[test] + fn test_all_metrics_present_in_fresh_registry() { + let metrics = ChronosMetrics::new().unwrap(); + let encoder = TextEncoder::new(); + let mut buffer = Vec::new(); + encoder.encode(&metrics.registry.gather(), &mut buffer).unwrap(); + let output = String::from_utf8(buffer).unwrap(); + + for name in &["msg_consume_latency", "msg_process_latency", "msg_wait_time", "msg_jitter", "msg_reset"] { + assert!( + output.contains(&format!("# HELP {}", name)), + "metric {} must appear in fresh registry output", + name + ); + } + } + + /// msg_consume_latency must have exactly 4 pre-initialized label combinations + /// (kafka/postgres × pass/fail) so it is always present in the scrape output. + #[test] + fn test_consume_latency_all_label_combos_initialized() { + let metrics = ChronosMetrics::new().unwrap(); + let families = metrics.registry.gather(); + let fam = families + .iter() + .find(|f| f.get_name() == "msg_consume_latency") + .expect("msg_consume_latency must be present in a fresh registry"); + assert_eq!(fam.get_metric().len(), 4, "expected 4 pre-warmed label combos (kafka/postgres × pass/fail)"); + } + + /// msg_process_latency must have exactly 4 pre-initialized label combinations + /// (true/false × pass/fail) so it is always present in the scrape output. 
+    #[test]
+    fn test_process_latency_all_label_combos_initialized() {
+        let metrics = ChronosMetrics::new().unwrap();
+        let families = metrics.registry.gather();
+        let fam = families
+            .iter()
+            .find(|f| f.get_name() == "msg_process_latency")
+            .expect("msg_process_latency must be present in a fresh registry");
+        assert_eq!(fam.get_metric().len(), 4, "expected 4 pre-warmed label combos (true/false × pass/fail)");
+    }
+}
diff --git a/chronos_bin/src/metrics/server.rs b/chronos_bin/src/metrics/server.rs
new file mode 100644
index 0000000..2d77776
--- /dev/null
+++ b/chronos_bin/src/metrics/server.rs
@@ -0,0 +1,42 @@
+use crate::metrics::ChronosMetrics;
+use axum::{extract::State, http::StatusCode, response::IntoResponse, routing::get, Router};
+use prometheus::{Encoder, TextEncoder};
+use std::sync::Arc;
+
+/// Handles `GET /metrics`: gathers every metric family from the registry and encodes it with `TextEncoder`.
+/// Responds with 500 (and logs the error) if encoding fails.
+async fn metrics_handler(State(metrics): State<Arc<ChronosMetrics>>) -> impl IntoResponse {
+    let encoder = TextEncoder::new();
+    let metric_families = metrics.registry.gather();
+    let mut buffer = Vec::new();
+    match encoder.encode(&metric_families, &mut buffer) {
+        Ok(_) => (StatusCode::OK, [("content-type", "text/plain; version=0.0.4; charset=utf-8")], buffer).into_response(),
+        Err(e) => {
+            log::error!("Failed to encode metrics: {}", e);
+            StatusCode::INTERNAL_SERVER_ERROR.into_response()
+        }
+    }
+}
+
+/// Serves the `/metrics` route on `0.0.0.0:<port>` until the server stops.
+///
+/// Bind and serve failures are logged and end this function instead of panicking, so a failure
+/// surfaces in the application log rather than as a panic inside whichever task awaits it.
+pub async fn run_metrics_server(metrics: Arc<ChronosMetrics>, port: u16) {
+    let app = Router::new().route("/metrics", get(metrics_handler)).with_state(metrics);
+
+    let addr = std::net::SocketAddr::from(([0, 0, 0, 0], port));
+    let listener = match tokio::net::TcpListener::bind(addr).await {
+        Ok(listener) => listener,
+        Err(e) => {
+            log::error!("Failed to bind metrics server to {}: {}", addr, e);
+            return;
+        }
+    };
+    // Announce the endpoint only once the socket is actually bound.
+    log::info!("Metrics server listening on {}", addr);
+
+    if let Err(e) = axum::serve(listener, app).await {
+        log::error!("Metrics server failed: {}", e);
+    }
+}
diff --git a/chronos_bin/src/monitor.rs b/chronos_bin/src/monitor.rs
index aaaffd3..abf4d60 100644
--- a/chronos_bin/src/monitor.rs
+++ b/chronos_bin/src/monitor.rs
@@ -1,3 +1,4 @@
+use crate::metrics::ChronosMetrics;
 use crate::postgres::pg::Pg;
use crate::utils::config::ChronosConfig; use chrono::Utc; @@ -7,6 +8,7 @@ use std::time::Duration; #[derive(Debug)] pub struct FailureDetector { pub(crate) data_store: Arc, + pub(crate) metrics: Arc, } impl FailureDetector { @@ -22,11 +24,16 @@ impl FailureDetector { #[tracing::instrument(skip_all, fields(error))] async fn reset_to_init_db(&self, fetched_rows: &std::vec::Vec) { if !fetched_rows.is_empty() { - if let Err(e) = &self.data_store.reset_to_init_db(fetched_rows).await { - tracing::Span::current().record("error", e); - log::error!("error in monitor reset_to_init {}", e); - } else { - log::debug!("reset_to_init_db success for {:?}", fetched_rows) + match &self.data_store.reset_to_init_db(fetched_rows).await { + Ok(reset_ids) => { + // msg_reset: count the number of messages reset by the monitor task. + self.metrics.msg_reset.inc_by(reset_ids.len() as f64); + log::debug!("reset_to_init_db success for {:?}", fetched_rows) + } + Err(e) => { + tracing::Span::current().record("error", e); + log::error!("error in monitor reset_to_init {}", e); + } } } } diff --git a/chronos_bin/src/runner.rs b/chronos_bin/src/runner.rs index 7431ec7..2b9231f 100644 --- a/chronos_bin/src/runner.rs +++ b/chronos_bin/src/runner.rs @@ -2,8 +2,11 @@ use crate::kafka::consumer::KafkaConsumer; use crate::kafka::producer::KafkaProducer; use crate::message_processor::MessageProcessor; use crate::message_receiver::MessageReceiver; +use crate::metrics::server::run_metrics_server; +use crate::metrics::ChronosMetrics; use crate::monitor::FailureDetector; use crate::postgres::pg::Pg; +use crate::utils::config::ChronosConfig; use std::fs::{create_dir, read, write}; use std::sync::Arc; @@ -11,27 +14,44 @@ pub struct Runner { pub consumer: Arc, pub producer: Arc, pub data_store: Arc, + pub metrics: Arc, } impl Runner { pub async fn run(&self) { let monitor_ds = Arc::clone(&self.data_store); + let monitor_metrics = Arc::clone(&self.metrics); let process_ds = Arc::clone(&self.data_store); let 
process_producer = self.producer.clone(); + let process_metrics = Arc::clone(&self.metrics); let receiver_ds = Arc::clone(&self.data_store); let receiver_prod = self.producer.clone(); let receiver_consumer = self.consumer.clone(); + let receiver_metrics = Arc::clone(&self.metrics); + + let metrics_port = ChronosConfig::from_env().metrics_port; + let metrics_for_server = Arc::clone(&self.metrics); + + // Spawn metrics server as an independent background task. + // A failure here is logged but does not stop the processing tasks. + tokio::task::spawn(async move { + run_metrics_server(metrics_for_server, metrics_port).await; + }); let monitor_handler = tokio::task::spawn(async { - let monitor = FailureDetector { data_store: monitor_ds }; + let monitor = FailureDetector { + data_store: monitor_ds, + metrics: monitor_metrics, + }; monitor.run().await; }); let message_processor_handler = tokio::task::spawn(async { let message_processor = MessageProcessor { producer: process_producer, data_store: process_ds, + metrics: process_metrics, }; message_processor.run().await; }); @@ -40,6 +60,7 @@ impl Runner { consumer: receiver_consumer, producer: receiver_prod, data_store: receiver_ds, + metrics: receiver_metrics, }; message_receiver.run().await; diff --git a/chronos_bin/src/utils/config.rs b/chronos_bin/src/utils/config.rs index 03d0fbb..134c15e 100644 --- a/chronos_bin/src/utils/config.rs +++ b/chronos_bin/src/utils/config.rs @@ -5,6 +5,7 @@ pub struct ChronosConfig { pub processor_db_poll: u64, pub time_advance: u64, pub fail_detect_interval: u64, + pub metrics_port: u16, } impl ChronosConfig { @@ -15,6 +16,7 @@ impl ChronosConfig { processor_db_poll: std::env::var("PROCESSOR_DB_POLL").unwrap_or_else(|_| 5.to_string()).parse().unwrap_or(5), time_advance: std::env::var("TIMING_ADVANCE").unwrap_or_else(|_| 0.to_string()).parse().unwrap_or(0), fail_detect_interval: std::env::var("FAIL_DETECT_INTERVAL").unwrap_or_else(|_| 10.to_string()).parse().unwrap_or(10), + metrics_port: 
std::env::var("METRICS_PORT").unwrap_or_else(|_| "9090".to_string()).parse().unwrap_or(9090), } } } diff --git a/docker-compose.yml b/docker-compose.yml index 0f4f21f..bafcd2b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,170 +1,142 @@ -version: '3.1' -services: - -#postgres DB - # postgres: - # image: postgres:13.3 - # ports: - # - 5432:5432 - # volumes: - # - postgres:/var/lib/postgresql/data/ - # environment: - # POSTGRES_USER: admin - # POSTGRES_PASSWORD: admin - # POSTGRES_DB: chronos_db - # networks: - # - chronos - # migration / init container - # chronos-pg-mig: - # image: mig - # networks: - # - chronos - # working_dir: /opt/chronos - # volumes: - # - ../../:/opt/chronos - # environment: - # PG_HOST: postgres - # PG_PORT: 5432 - # PG_USER: admin - # PG_PASSWORD: admin - # PG_DATABASE: chronos_db - # PG_POOL_SIZE: 50 - # RUST_LOG: "${RUST_LOG:-info}" - # depends_on: - # - postgres +version: '3.8' - # zookeeper: - # image: bitnami/zookeeper:3.7.0 - # ports: - # - 2180:2181 - # volumes: - # - zookeeper:/bitnami/zookeeper - # environment: - # ALLOW_ANONYMOUS_LOGIN: "yes" - # networks: - # - chronos +# ───────────────────────────────────────────────────────────────────────────── +# Core infrastructure +# ───────────────────────────────────────────────────────────────────────────── +services: - # kafka: - # image: bitnami/kafka:2.8.0 - # ports: - # - 9092:9092 - # - 9093:9093 - # - 9094:9094 - # volumes: - # - kafka:/bitnami/kafka - # - ./infra:/opt/infra - # environment: - # KAFKA_BROKER_ID: "1" - # KAFKA_CFG_LISTENERS: "INTERNAL://:9092, EXTERNAL://:9093, K8S://:9094" - # KAFKA_CFG_ADVERTISED_LISTENERS: "INTERNAL://kafka:9092, EXTERNAL://localhost:9093" - # KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP: "INTERNAL:PLAINTEXT, EXTERNAL:PLAINTEXT, K8S:PLAINTEXT" - # KAFKA_CFG_ZOOKEEPER_CONNECT: "zookeeper:2181" - # KAFKA_INTER_BROKER_LISTENER_NAME: "INTERNAL" - # KAFKA_INTER_BROKER_USER: "admin" - # KAFKA_INTER_BROKER_PASSWORD: "admin-secret" - # 
KAFKA_CFG_NUM_PARTITIONS: "1" - # KAFKA_LOG_RETENTION_BYTES: -1 - # KAFKA_LOG_RETENTION_MS: -1 - # KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE: "true" - # KAFKA_CFG_SUPER_USERS: "User:admin" - # KAFKA_CLIENT_USERS: "admin,kafdrop" - # KAFKA_CLIENT_PASSWORDS: "admin-secret,admin-secret" - # ALLOW_PLAINTEXT_LISTENER: "yes" - # networks: - # - chronos - # depends_on: - # - zookeeper + postgres: + image: postgres:16 + ports: + - "5432:5432" + environment: + POSTGRES_USER: admin + POSTGRES_PASSWORD: admin + POSTGRES_DB: chronos_db + volumes: + - postgres:/var/lib/postgresql/data/ + healthcheck: + test: ["CMD-SHELL", "pg_isready -U admin -d chronos_db"] + interval: 5s + timeout: 5s + retries: 10 + networks: + - chronos - # kowl: - # image: quay.io/cloudhut/kowl:master - # ports: - # - 9091:8080 - # environment: - # KAFKA_BROKERS: "kafka:9092" - # networks: - # - chronos - # depends_on: - # - kafka - # - zookeeper + kafka: + image: bitnami/kafka:latest + ports: + - "9094:9094" # External port for host-machine access + environment: + # KRaft (no ZooKeeper required) + KAFKA_CFG_NODE_ID: "0" + KAFKA_CFG_PROCESS_ROLES: controller,broker + KAFKA_CFG_CONTROLLER_QUORUM_VOTERS: 0@kafka:9093 + # Listeners + KAFKA_CFG_LISTENERS: PLAINTEXT://:9092,CONTROLLER://:9093,EXTERNAL://:9094 + KAFKA_CFG_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,EXTERNAL://localhost:9094 + KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,EXTERNAL:PLAINTEXT + KAFKA_CFG_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_CFG_INTER_BROKER_LISTENER_NAME: PLAINTEXT + KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE: "true" + healthcheck: + test: ["CMD-SHELL", "kafka-topics.sh --bootstrap-server localhost:9092 --list"] + interval: 10s + timeout: 10s + retries: 15 + start_period: 30s + networks: + - chronos - # chronos - # chronos-delay-scheduler: - # image: chronos - # networks: - # - chronos - # ports: - # - 8181:8181 - # working_dir: /opt/chronos - # volumes: - # - ../../:/opt/chronos - # environment: - 
# ENVIRONMENT: "dev" - # SERVICE_NAME: "chronos-delay-scheduler" - # BUILD_VERSION: "0.0.0" - # KAFKA_HOST: kafka - # KAFKA_PORT: 9092 - # KAFKA_CLIENT_ID: "chronos" - # KAFKA_GROUP_ID: "chronos" - # KAFKA_IN_TOPIC: "chronos.in" - # KAFKA_OUT_TOPIC: "chronos.out" - # KAFKA_USERNAME: - # KAFKA_PASSWORD: - # PG_HOST: postgres - # PG_PORT: 5432 - # PG_USER: admin - # PG_PASSWORD: admin - # PG_DATABASE: chronos_db - # PG_POOL_SIZE: 50 - # RUST_LOG: "${RUST_LOG:-info}" - # # App config (optional) - # # DELAY_TIME: 0 - # # RANDOMNESS_DELAY: 100 - # # MONITOR_POLL_INTERVAL: 5 - # # TIMING_ADVANCE: 0 - # # FAIL_DETECT_INTERVAL: 500 - # depends_on: - # - postgres - # - zookeeper - # - kafka +# ───────────────────────────────────────────────────────────────────────────── +# Database migrations (one-shot init container) +# ───────────────────────────────────────────────────────────────────────────── + chronos-pg-migrations: + build: + context: . + dockerfile: Dockerfile.chronos-pg-migrations + environment: + PG_HOST: postgres + PG_PORT: "5432" + PG_USER: admin + PG_PASSWORD: admin + PG_DATABASE: chronos_db + depends_on: + postgres: + condition: service_healthy + restart: "no" + networks: + - chronos - # ******************** - # Telemetry Components - # ******************** - # Jaeger +# ───────────────────────────────────────────────────────────────────────────── +# Chronos application +# ───────────────────────────────────────────────────────────────────────────── + chronos: + build: + context: . 
+ dockerfile: Dockerfile.chronos + ports: + - "9090:9090" # Prometheus /metrics endpoint + environment: + KAFKA_HOST: kafka + KAFKA_PORT: "9092" + KAFKA_CLIENT_ID: chronos + KAFKA_GROUP_ID: chronos + KAFKA_IN_TOPIC: chronos.in + KAFKA_OUT_TOPIC: chronos.out + KAFKA_USERNAME: "" + KAFKA_PASSWORD: "" + PG_HOST: postgres + PG_PORT: "5432" + PG_USER: admin + PG_PASSWORD: admin + PG_DATABASE: chronos_db + PG_POOL_SIZE: "10" + RUST_LOG: info + METRICS_PORT: "9090" + MONITOR_DB_POLL: "5" + PROCESSOR_DB_POLL: "5" + TIMING_ADVANCE: "0" + FAIL_DETECT_INTERVAL: "10" + depends_on: + postgres: + condition: service_healthy + kafka: + condition: service_healthy + chronos-pg-migrations: + condition: service_completed_successfully + networks: + - chronos +# ───────────────────────────────────────────────────────────────────────────── +# Telemetry (optional – used for distributed tracing) +# ───────────────────────────────────────────────────────────────────────────── jaeger-all-in-one: image: jaegertracing/all-in-one:latest ports: - "16686:16686" - - "14268" - - "14250" container_name: Jaeger environment: - # COLLECTOR_OTLP_ENABLED is false in case running Jaeger as Backend - COLLECTOR_OTLP_ENABLED=true networks: - chronos - # Collector otel-collector: image: otel/opentelemetry-collector:latest container_name: otelcol - networks: - - chronos restart: unless-stopped - command: [ "--config=/etc/otelcol-config.yml" ] + command: ["--config=/etc/otelcol-config.yml"] volumes: - ./infra/otelcol-config.yml:/etc/otelcol-config.yml ports: - - "1888:1888" # pprof extension - - "13133:13133" # health_check extension - - "4317:4317" # OTLP gRPC receiver - - "4318:4318" # OTLP HTTP receiver - - "55670:55679" # zpages extension + - "1888:1888" # pprof extension + - "13133:13133" # health_check extension + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver depends_on: - jaeger-all-in-one - - + networks: + - chronos networks: chronos: @@ -173,7 +145,3 @@ networks: volumes: 
postgres: driver: local - zookeeper: - driver: local - kafka: - driver: local \ No newline at end of file diff --git a/examples/chronos_ex/Cargo.toml b/examples/chronos_ex/Cargo.toml index 3f8d40a..4c17705 100644 --- a/examples/chronos_ex/Cargo.toml +++ b/examples/chronos_ex/Cargo.toml @@ -12,6 +12,9 @@ tokio.workspace = true futures.workspace = true chronos_bin={path="../../chronos_bin"} +chrono = "0.4.23" +uuid = { version = "1.3.0", features = ["v4", "fast-rng"] } +serde_json = "1.0.93" #tracing tracing.workspace = true diff --git a/examples/chronos_ex/examples/chronos_ex.rs b/examples/chronos_ex/examples/chronos_ex.rs index d1567b8..5cc314d 100644 --- a/examples/chronos_ex/examples/chronos_ex.rs +++ b/examples/chronos_ex/examples/chronos_ex.rs @@ -1,6 +1,7 @@ use chronos_bin::kafka::config::KafkaConfig; use chronos_bin::kafka::consumer::KafkaConsumer; use chronos_bin::kafka::producer::KafkaProducer; +use chronos_bin::metrics::ChronosMetrics; use chronos_bin::postgres::config::PgConfig; use chronos_bin::postgres::pg::Pg; use chronos_bin::runner::Runner; @@ -91,10 +92,13 @@ async fn main() { let kafka_producer = KafkaProducer::new(&kafka_config); let data_store = Pg::new(pg_config).await.unwrap(); + let metrics = Arc::new(ChronosMetrics::new().expect("Failed to initialize metrics registry")); + let r = Runner { data_store: Arc::new(data_store), producer: Arc::new(kafka_producer), consumer: Arc::new(kafka_consumer), + metrics, }; debug!("debug logs starting chronos"); diff --git a/examples/chronos_ex/examples/publish_test_message.rs b/examples/chronos_ex/examples/publish_test_message.rs new file mode 100644 index 0000000..3cf6998 --- /dev/null +++ b/examples/chronos_ex/examples/publish_test_message.rs @@ -0,0 +1,56 @@ +/// Publishes a single test message to the Chronos input Kafka topic. 
+///
+/// Required environment variables (same as the main Chronos service):
+///   KAFKA_HOST, KAFKA_PORT, KAFKA_CLIENT_ID, KAFKA_GROUP_ID,
+///   KAFKA_IN_TOPIC, KAFKA_OUT_TOPIC, KAFKA_USERNAME, KAFKA_PASSWORD
+///
+/// Optional environment variables:
+///   CHRONOS_DEADLINE   RFC3339 timestamp for the message deadline.
+///                      Defaults to 1 minute in the past, which causes
+///                      Chronos to fire the message immediately and generate
+///                      observable msg_jitter metrics.
+///   CHRONOS_MSG_ID     Override the generated message UUID.
+use chrono::{Duration, Utc};
+use chronos_bin::kafka::config::KafkaConfig;
+use chronos_bin::kafka::producer::KafkaProducer;
+use std::collections::HashMap;
+use uuid::Uuid;
+
+#[tokio::main]
+async fn main() {
+    dotenv::dotenv().ok(); // load .env first so a RUST_LOG defined there is visible to env_logger::init()
+    env_logger::init();
+
+    let msg_id = std::env::var("CHRONOS_MSG_ID").unwrap_or_else(|_| Uuid::new_v4().to_string());
+
+    // Default: 1 minute in the past so Chronos fires immediately (exercises jitter metrics).
+    // Override with a future timestamp to test the "store and delay" path.
+ let deadline = std::env::var("CHRONOS_DEADLINE").unwrap_or_else(|_| (Utc::now() - Duration::minutes(1)).to_rfc3339()); + + let payload = serde_json::json!({ + "source": "integration-test", + "message_id": msg_id, + "sent_at": Utc::now().to_rfc3339(), + }) + .to_string(); + + let mut headers = HashMap::new(); + headers.insert("chronosMessageId".to_string(), msg_id.clone()); + headers.insert("chronosDeadline".to_string(), deadline.clone()); + + println!("Publishing test message"); + println!(" id: {}", msg_id); + println!(" deadline: {}", deadline); + println!(" payload: {}", payload); + + let kafka_config = KafkaConfig::from_env(); + let producer = KafkaProducer::new(&kafka_config); + + match producer.kafka_publish(payload, Some(headers), msg_id.clone()).await { + Ok(id) => println!("✓ Published successfully (returned id: {})", id), + Err(e) => { + eprintln!("✗ Failed to publish: {}", e); + std::process::exit(1); + } + } +} diff --git a/examples/chronos_ex/examples/telemetry_simple.rs b/examples/chronos_ex/examples/telemetry_simple.rs index 25faded..b268c6a 100644 --- a/examples/chronos_ex/examples/telemetry_simple.rs +++ b/examples/chronos_ex/examples/telemetry_simple.rs @@ -1,8 +1,7 @@ use opentelemetry::trace::TracerProvider as _; -use opentelemetry_otlp::ExportConfig; + use opentelemetry_sdk::{runtime::Tokio, trace::TracerProvider}; use tracing::{info_span, instrument}; -use tracing_subscriber::prelude::*; use tokio::time::Duration; diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 6d833ff..4683c9e 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,2 @@ [toolchain] -channel = "1.75" +channel = "1.94" diff --git a/scripts/integration.sh b/scripts/integration.sh new file mode 100755 index 0000000..6e5d28d --- /dev/null +++ b/scripts/integration.sh @@ -0,0 +1,191 @@ +#!/usr/bin/env bash +# integration.sh — starts Docker dependencies, runs migrations, starts Chronos +# locally, publishes a test message, and verifies metrics are being 
recorded. +# +# Usage: called by `make integration` from the repo root. +set -euo pipefail + +# ─── configuration ──────────────────────────────────────────────────────────── +KAFKA_EXT_PORT="${KAFKA_EXT_PORT:-9094}" +PG_PORT="${PG_PORT:-5432}" +METRICS_PORT="${METRICS_PORT:-9090}" +CHRONOS_PID_FILE="/tmp/chronos_integration.pid" +CHRONOS_LOG="/tmp/chronos_integration.log" +MAX_WAIT=120 # seconds to wait for each readiness check + +# Unique ID for this test run — used to identify our message on the output topic +MSG_ID="integration-test-$(date +%s)" + +# ─── helpers ────────────────────────────────────────────────────────────────── +log() { printf '\033[0;33m%s\033[0m\n' "→ $*"; } +ok() { printf '\033[0;32m%s\033[0m\n' "✓ $*"; } +fail() { printf '\033[0;31m%s\033[0m\n' "✗ $*" >&2; exit 1; } + +wait_for() { + local label="$1"; shift + local elapsed=0 + printf '%s ' "→ Waiting for ${label}..." + until "$@" > /dev/null 2>&1; do + printf '.' + sleep 2 + elapsed=$((elapsed + 2)) + if [ "${elapsed}" -ge "${MAX_WAIT}" ]; then + echo "" + fail "Timed out waiting for ${label} after ${MAX_WAIT}s" + fi + done + echo " ready" +} + +# cleanup() { +# if [ -f "${CHRONOS_PID_FILE}" ]; then +# local pid +# pid="$(cat "${CHRONOS_PID_FILE}")" +# if kill -0 "${pid}" 2>/dev/null; then +# log "Stopping Chronos (pid ${pid})..." +# kill "${pid}" 2>/dev/null || true +# wait "${pid}" 2>/dev/null || true +# fi +# rm -f "${CHRONOS_PID_FILE}" +# fi +# } +# trap cleanup EXIT + +# ─── 1. start infrastructure ────────────────────────────────────────────────── +log "Starting infrastructure (postgres + kafka)..." +docker compose up -d postgres kafka + +# ─── 2. wait for postgres ───────────────────────────────────────────────────── +wait_for "postgres" \ + docker compose exec -T postgres pg_isready -U admin -d chronos_db + +# ─── 3. 
wait for kafka ──────────────────────────────────────────────────────── +wait_for "kafka" \ + docker compose exec -T kafka \ + /opt/bitnami/kafka/bin/kafka-topics.sh --bootstrap-server localhost:9092 --list + +# ─── 4. run migrations ──────────────────────────────────────────────────────── +log "Running database migrations..." +PG_HOST=localhost \ +PG_PORT="${PG_PORT}" \ +PG_USER=admin \ +PG_PASSWORD=admin \ +PG_DATABASE=chronos_db \ + cargo run --quiet --package pg_mig --bin chronos-pg-migrations +ok "Migrations complete" + +# ─── 5. start chronos in background ────────────────────────────────────────── +log "Starting Chronos (logs → ${CHRONOS_LOG})..." +KAFKA_HOST=localhost \ +KAFKA_PORT="${KAFKA_EXT_PORT}" \ +KAFKA_CLIENT_ID=chronos \ +KAFKA_GROUP_ID=chronos \ +KAFKA_IN_TOPIC=chronos.in \ +KAFKA_OUT_TOPIC=chronos.out \ +KAFKA_USERNAME="" \ +KAFKA_PASSWORD="" \ +PG_HOST=localhost \ +PG_PORT="${PG_PORT}" \ +PG_USER=admin \ +PG_PASSWORD=admin \ +PG_DATABASE=chronos_db \ +PG_POOL_SIZE=10 \ +RUST_LOG=warn \ +METRICS_PORT="${METRICS_PORT}" \ +MONITOR_DB_POLL=5 \ +PROCESSOR_DB_POLL=5 \ +TIMING_ADVANCE=0 \ +FAIL_DETECT_INTERVAL=10 \ + cargo run --quiet --package chronos_bin --bin chronos \ + > "${CHRONOS_LOG}" 2>&1 & +echo $! > "${CHRONOS_PID_FILE}" + +# ─── 6. wait for metrics endpoint ──────────────────────────────────────────── +wait_for "Chronos metrics endpoint" \ + curl -sf "http://localhost:${METRICS_PORT}/metrics" + +# ─── 7. publish test message ───────────────────────────────────────────────── +# The deadline is 1 minute in the past so Chronos fires the message immediately +# to the output topic, exercising the full consume → store-or-fire path. +log "Publishing test message (id: ${MSG_ID})..." 
+CHRONOS_MSG_ID="${MSG_ID}" \ +KAFKA_HOST=localhost \ +KAFKA_PORT="${KAFKA_EXT_PORT}" \ +KAFKA_CLIENT_ID=chronos-test-publisher \ +KAFKA_GROUP_ID=chronos-test-publisher \ +KAFKA_IN_TOPIC=chronos.in \ +KAFKA_OUT_TOPIC=chronos.out \ +KAFKA_USERNAME="" \ +KAFKA_PASSWORD="" \ + cargo run --quiet --package chronos_ex --example publish_test_message +ok "Message published" + +# ─── 8. verify message fired to output topic ───────────────────────────────── +# Consume from chronos.out from the beginning, waiting up to 30s for the message +# to appear. kafka-console-consumer exits when max-messages is reached OR when +# no new messages arrive within timeout-ms — whichever comes first. +# The || true prevents set -e from aborting on the consumer's non-zero exit +# (timeout reached) which is normal when the topic drains before max-messages. +log "Waiting for message ${MSG_ID} on chronos.out (up to 30s)..." +FIRED_OUTPUT=$( + docker compose exec -T kafka \ + /opt/bitnami/kafka/bin/kafka-console-consumer.sh \ + --bootstrap-server localhost:9092 \ + --topic chronos.out \ + --from-beginning \ + --max-messages 50 \ + --timeout-ms 30000 \ + 2>/dev/null || true +) + +if echo "${FIRED_OUTPUT}" | grep -q "${MSG_ID}"; then + ok "Message ${MSG_ID} arrived on chronos.out" +else + echo "" + printf '\033[0;31m%s\033[0m\n' "✗ Message ${MSG_ID} was NOT found on chronos.out" >&2 + echo " Last 20 lines of Chronos log:" >&2 + tail -20 "${CHRONOS_LOG}" >&2 + fail "Message delivery test failed" +fi + +# ─── 9. show metrics ───────────────────────────────────────────────────────── +echo "" +echo "══════════════════════════════════════════════════════" +echo " Chronos metrics (http://localhost:${METRICS_PORT}/metrics)" +echo "══════════════════════════════════════════════════════" +curl -sf "http://localhost:${METRICS_PORT}/metrics" \ + | grep -E "^(# HELP|# TYPE|msg_)" \ + | sort +echo "" + +# ─── 10. 
verify all five metric families are present ───────────────────────── +log "Verifying metric families..." +METRICS_OUTPUT="$(curl -sf "http://localhost:${METRICS_PORT}/metrics")" +EXPECTED_METRICS=( + "msg_consume_latency" + "msg_process_latency" + "msg_wait_time" + "msg_jitter" + "msg_reset" +) +ALL_OK=true +for metric in "${EXPECTED_METRICS[@]}"; do + if echo "${METRICS_OUTPUT}" | grep -q "^# HELP ${metric}"; then + ok "${metric} present" + else + printf '\033[0;31m%s\033[0m\n' "✗ ${metric} MISSING" >&2 + ALL_OK=false + fi +done + +echo "" +if [ "${ALL_OK}" = "true" ]; then + ok "All metrics verified" +else + fail "One or more metrics are missing — check ${CHRONOS_LOG}" +fi + +echo "" +ok "Integration test complete" +echo " Chronos logs: ${CHRONOS_LOG}" +echo " Run 'make integration.down' to stop Docker services." From bece16d404a55a4ae9de9be7e3ae804a16b495ec Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Wed, 29 Apr 2026 21:34:57 +1000 Subject: [PATCH 02/36] docs: add agent workflow guidance Document Chronos project context, verification commands, and the action-trail expectations future agents should follow. Verification: not run (docs-only change). Model-version: GPT-5 --- AGENTS.md | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..b8c46c8 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,129 @@ +# Agent Instructions for Chronos + +## Project Context + +Chronos is an open source Rust project. It listens for messages on a Kafka input topic, stores delayed messages in PostgreSQL, and publishes those messages to a Kafka output topic at a later time. More project background, design notes, and runtime details are in [README.md](README.md). + +This is an existing project. Agents must preserve the project's current structure, style, testing practices, and conventions unless the user explicitly asks for a larger change. 
+
+The active working branch is `feat/prom_metrics`.
+
+## Working Principles
+
+- Optimize for a clear action trail. Future agents may start with no conversation history, so decisions must be recoverable from files, commits, and command output summaries.
+- Document material changes when making them. At minimum, the commit message must explain the intent, relevant implementation notes, and verification performed.
+- Keep edits scoped to the requested change. Do not reformat unrelated files or rewrite working code for style only.
+- Do not discard or revert user changes. If the worktree has unrelated modifications, leave them alone.
+- Prefer existing module boundaries and patterns over new abstractions.
+- Update README, How-to notes, examples, or this file when behavior, setup, tests, or agent workflow expectations change.
+
+## Rust Conventions
+
+- This is a Cargo workspace with these members:
+  - `chronos_bin`: main Chronos binary and library code.
+  - `pg_mig`: PostgreSQL migration binary.
+  - `examples/*`: example clients and utilities.
+- The Rust toolchain is pinned in [rust-toolchain.toml](rust-toolchain.toml). Use that version unless the user asks to change it.
+- Formatting is controlled by [rustfmt.toml](rustfmt.toml): 4-space indentation, `max_width = 160`, Unix newlines.
+- Keep tests close to the code under `#[cfg(test)] mod tests` when following the existing unit-test style.
+- Prefer typed Rust APIs and project helpers over ad hoc parsing or shelling out from Rust code.
+- Preserve the project's async style based on Tokio, Kafka, PostgreSQL, tracing, and Prometheus metrics crates already in use.
+
+## Verification Commands
+
+Use the repository's Make targets and scripts as the source of truth.
+ +- Default pre-commit verification: + + ```sh + sh scripts/pre-commit-checks.sh + ``` + + This runs: + + ```sh + make withenv RECIPE=lint + make withenv RECIPE=test.unit + ``` + +- Lint-only check: + + ```sh + make withenv RECIPE=lint + ``` + + This runs `cargo check`, `cargo fmt -- --check`, and `cargo clippy --all-targets`. + +- Unit tests: + + ```sh + make withenv RECIPE=test.unit + ``` + + This runs `cargo test`. + +- Build: + + ```sh + make build + ``` + + This runs `cargo build`. + +- Metrics/integration verification: + + ```sh + make integration + ``` + + This starts Docker-backed PostgreSQL and Kafka dependencies, runs migrations, starts Chronos, publishes a test message, verifies delivery, and checks the Prometheus `/metrics` endpoint. + +- Stop integration services: + + ```sh + make integration.down + ``` + +Run the narrowest useful checks while iterating, then run the default pre-commit verification before committing. Run `make integration` for changes touching Kafka/PostgreSQL behavior, runtime wiring, Docker setup, migrations, metrics exposure, or end-to-end message flow. + +If a verification command cannot be run, document the reason in the final response and in the commit message. + +## Commit and Push Policy + +Agents should commit and push their changes unless the user explicitly says not to. + +Commit messages must include a footer named `Model-version` containing the model that generated the commit. Example: + +```text +docs: add agent workflow guidance + +Document Chronos project conventions, verification commands, and agent +handoff expectations. + +Verification: +- sh scripts/pre-commit-checks.sh + +Model-version: GPT-5 +``` + +Use concise subject lines that match the existing repository style, such as `feat(...)`, `fix(...)`, `docs:`, or `chore:`. Include enough body detail for a future agent to understand why the change was made and what was verified. 
+ +## Paper Trail Expectations + +For each non-trivial change, leave evidence in one or more of these places: + +- Code comments only where they clarify non-obvious behavior. +- Tests that encode behavioral expectations. +- Documentation updates for changed workflows, configuration, metrics, or operational behavior. +- Commit message body with the reasoning and verification. +- Final response summarizing changed files and checks run. + +When making tradeoffs, record the chosen path and the reason. Avoid relying on chat history as the only explanation. + +## Project-Specific Notes + +- Chronos treats Kafka message bodies opaquely and forwards messages after delay; avoid adding application-level assumptions about payload shape. +- The README describes at-least-once delivery semantics. Preserve behavior that supports persistence, recovery from suspected node failure, and duplicate-safe processing. +- Metrics work on the `feat/prom_metrics` branch currently includes a Prometheus endpoint and metric-family checks in the integration script. Changes to metrics should preserve unit tests for registry output and integration checks for expected metric families. +- Local development commonly uses `.env` copied from [.env.example](.env.example) through `make withenv`. +- Docker Compose is used for local PostgreSQL, Kafka, Jaeger, and OpenTelemetry dependencies. From b002ada4ba5f2b85a170b7f3da60fb3943f65844 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Wed, 29 Apr 2026 23:07:38 +1000 Subject: [PATCH 03/36] feat(metrics): sketch prometheus otlp backend design Add a mock metrics abstraction that defines Chronos metrics once and records through either a Prometheus client registry or an OTLP metrics provider selected by OTEL_METRICS_EXPORTER. The OTLP path uses the gRPC exporter configuration from standard OTLP environment variables, while the Prometheus path uses the prometheus client crate directly. 
Register the mock as an explicit chronos_ex example target and enable the OpenTelemetry metrics features required by the design. Verification: - cargo fmt --check - cargo check --manifest-path /tmp/chronos_prom_otlp_mock_check/Cargo.toml - cargo check -p chronos_ex --example prom_otlp_mock (blocked: missing system libsasl2 development package) Model-version: GPT-5 --- Cargo.toml | 7 +- examples/chronos_ex/Cargo.toml | 5 + examples/prom_otlp_mock.rs | 317 +++++++++++++++++++++++++++++++++ 3 files changed, 325 insertions(+), 4 deletions(-) create mode 100644 examples/prom_otlp_mock.rs diff --git a/Cargo.toml b/Cargo.toml index 2fca9a9..341386e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,16 +34,15 @@ tracing = "0.1" tracing-subscriber = "0.3" tracing-opentelemetry = "0.21.0" -opentelemetry = { version = "0.20.0", features = ["rt-tokio", "trace"]} -opentelemetry_sdk = { version = "0.20.0", features = ["rt-tokio", "trace"]} +opentelemetry = { version = "0.20.0", features = ["rt-tokio", "trace", "metrics"]} +opentelemetry_sdk = { version = "0.20.0", features = ["rt-tokio", "trace", "metrics"]} opentelemetry_api = { version = "0.20.0"} # Collector opentelemetry-jaeger = {version="0.19.0", features=["rt-tokio"]} opentelemetry-stdout = { version = "0.1.0", features = ["trace"] } -opentelemetry-otlp = { version = "0.13.0", features = ["http-proto", "reqwest-client"] } +opentelemetry-otlp = { version = "0.13.0", features = ["http-proto", "reqwest-client", "metrics", "tonic"] } opentelemetry-http = "0.9.0" # metrics prometheus = "0.13" axum = { version = "0.7", default-features = false, features = ["http1", "tokio"] } - diff --git a/examples/chronos_ex/Cargo.toml b/examples/chronos_ex/Cargo.toml index 4c17705..df6cb77 100644 --- a/examples/chronos_ex/Cargo.toml +++ b/examples/chronos_ex/Cargo.toml @@ -3,6 +3,10 @@ name = "chronos_ex" version = "0.0.2" edition = "2021" +[[example]] +name = "prom_otlp_mock" +path = "../prom_otlp_mock.rs" + [dependencies] env_logger = "0.9.0" 
dotenv = "0.15.0" @@ -25,6 +29,7 @@ opentelemetry_api.workspace = true opentelemetry_sdk.workspace = true opentelemetry-otlp.workspace = true opentelemetry-jaeger.workspace = true +prometheus.workspace = true # opentelemetry-stdout.workspace = true # opentelemetry = "0.20" diff --git a/examples/prom_otlp_mock.rs b/examples/prom_otlp_mock.rs new file mode 100644 index 0000000..e72bf7c --- /dev/null +++ b/examples/prom_otlp_mock.rs @@ -0,0 +1,317 @@ +//! Mock design for a Chronos metrics abstraction that can export through either +//! the Prometheus client library or OpenTelemetry OTLP metrics. +//! +//! Selection is intentionally driven by the standard OpenTelemetry metric +//! exporter variable: +//! +//! - `OTEL_METRICS_EXPORTER=prometheus` uses the `prometheus` crate registry. +//! - `OTEL_METRICS_EXPORTER=otlp` uses the OTLP gRPC exporter. +//! - unset defaults to Prometheus for local compatibility. +//! +//! This file is a design sketch for the Chronos rewrite, not wired into the +//! runtime yet. The important shape is that metric definitions live once in +//! `MetricDefinition`, while the backend-specific registrations stay behind the +//! `MetricsBackend` interface. 
+ +use std::collections::HashMap; +use std::env; + +use opentelemetry::global; +use opentelemetry::metrics::{Counter as OtlpCounter, Histogram as OtlpHistogram, Unit}; +use opentelemetry::KeyValue; +use opentelemetry_otlp::WithExportConfig; +use prometheus::{histogram_opts, opts, CounterVec as PromCounterVec, HistogramVec as PromHistogramVec, Registry}; + +const OTEL_METRICS_EXPORTER: &str = "OTEL_METRICS_EXPORTER"; +const OTEL_EXPORTER_OTLP_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_PROTOCOL"; +const OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"; + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +enum MetricId { + MsgConsumed, + MsgConsumeLatency, +} + +#[derive(Clone, Copy, Debug)] +enum MetricKind { + Counter, + Histogram, +} + +#[derive(Clone, Copy, Debug)] +struct MetricDefinition { + id: MetricId, + name: &'static str, + description: &'static str, + unit: Option<&'static str>, + label_names: &'static [&'static str], + kind: MetricKind, +} + +const METRIC_DEFINITIONS: &[MetricDefinition] = &[ + MetricDefinition { + id: MetricId::MsgConsumed, + name: "chronos_messages_consumed_total", + description: "Total number of Chronos input messages consumed", + unit: Some("1"), + label_names: &["destination", "status"], + kind: MetricKind::Counter, + }, + MetricDefinition { + id: MetricId::MsgConsumeLatency, + name: "chronos_message_consume_latency_seconds", + description: "Time spent handling a consumed Chronos message", + unit: Some("s"), + label_names: &["destination", "status"], + kind: MetricKind::Histogram, + }, +]; + +trait MetricsBackend: Send + Sync { + fn inc_counter(&self, id: MetricId, value: u64, labels: &[(&'static str, String)]); + fn observe_histogram(&self, id: MetricId, value: f64, labels: &[(&'static str, String)]); + fn render_prometheus(&self) -> Option; + fn shutdown(&self); +} + +struct ChronosMetrics { + backend: Box, +} + +impl ChronosMetrics { + fn from_env() -> Result> { + let backend: Box = match 
MetricsExporter::from_env()? { + MetricsExporter::Prometheus => Box::new(PrometheusMetricsBackend::new()?), + MetricsExporter::Otlp => Box::new(OtlpMetricsBackend::new()?), + }; + + Ok(Self { backend }) + } + + fn message_consumed(&self, destination: &'static str, status: &'static str) { + self.backend.inc_counter( + MetricId::MsgConsumed, + 1, + &[("destination", destination.to_string()), ("status", status.to_string())], + ); + } + + fn consume_latency(&self, seconds: f64, destination: &'static str, status: &'static str) { + self.backend.observe_histogram( + MetricId::MsgConsumeLatency, + seconds, + &[("destination", destination.to_string()), ("status", status.to_string())], + ); + } + + fn prometheus_text(&self) -> Option { + self.backend.render_prometheus() + } + + fn shutdown(&self) { + self.backend.shutdown(); + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum MetricsExporter { + Prometheus, + Otlp, +} + +impl MetricsExporter { + fn from_env() -> Result> { + match env::var(OTEL_METRICS_EXPORTER).unwrap_or_else(|_| "prometheus".to_string()).as_str() { + "prometheus" => Ok(Self::Prometheus), + "otlp" => { + require_grpc_protocol()?; + Ok(Self::Otlp) + } + "none" => Err("metrics exporter disabled by OTEL_METRICS_EXPORTER=none".into()), + other => Err(format!("unsupported {OTEL_METRICS_EXPORTER} value: {other}").into()), + } + } +} + +fn require_grpc_protocol() -> Result<(), Box> { + let protocol = env::var(OTEL_EXPORTER_OTLP_METRICS_PROTOCOL) + .or_else(|_| env::var(OTEL_EXPORTER_OTLP_PROTOCOL)) + .unwrap_or_else(|_| "grpc".to_string()); + + if protocol == "grpc" { + Ok(()) + } else { + Err(format!("unsupported OTLP metrics protocol {protocol:?}; use grpc for this design").into()) + } +} + +struct PrometheusMetricsBackend { + registry: Registry, + counters: HashMap, + histograms: HashMap, +} + +impl PrometheusMetricsBackend { + fn new() -> Result { + let registry = Registry::new(); + let mut counters = HashMap::new(); + let mut histograms = 
HashMap::new(); + + for definition in METRIC_DEFINITIONS { + match definition.kind { + MetricKind::Counter => { + let metric = PromCounterVec::new(opts!(definition.name, definition.description), definition.label_names)?; + registry.register(Box::new(metric.clone()))?; + counters.insert(definition.id, metric); + } + MetricKind::Histogram => { + let metric = PromHistogramVec::new(histogram_opts!(definition.name, definition.description), definition.label_names)?; + registry.register(Box::new(metric.clone()))?; + histograms.insert(definition.id, metric); + } + } + } + + Ok(Self { + registry, + counters, + histograms, + }) + } +} + +impl MetricsBackend for PrometheusMetricsBackend { + fn inc_counter(&self, id: MetricId, value: u64, labels: &[(&'static str, String)]) { + if let Some(counter) = self.counters.get(&id) { + let label_values = prometheus_label_values(id, labels); + if let Ok(metric) = counter.get_metric_with_label_values(&label_values) { + metric.inc_by(value as f64); + } + } + } + + fn observe_histogram(&self, id: MetricId, value: f64, labels: &[(&'static str, String)]) { + if let Some(histogram) = self.histograms.get(&id) { + let label_values = prometheus_label_values(id, labels); + if let Ok(metric) = histogram.get_metric_with_label_values(&label_values) { + metric.observe(value); + } + } + } + + fn render_prometheus(&self) -> Option { + use prometheus::{Encoder, TextEncoder}; + + let encoder = TextEncoder::new(); + let mut buffer = Vec::new(); + encoder.encode(&self.registry.gather(), &mut buffer).ok()?; + String::from_utf8(buffer).ok() + } + + fn shutdown(&self) {} +} + +struct OtlpMetricsBackend { + provider: opentelemetry_sdk::metrics::MeterProvider, + counters: HashMap>, + histograms: HashMap>, +} + +impl OtlpMetricsBackend { + fn new() -> Result> { + let exporter = opentelemetry_otlp::new_exporter().tonic().with_env(); + let provider = opentelemetry_otlp::new_pipeline() + .metrics(opentelemetry::runtime::Tokio) + .with_exporter(exporter) + .build()?; 
+ + global::set_meter_provider(provider.clone()); + let meter = global::meter("chronos"); + + let mut counters = HashMap::new(); + let mut histograms = HashMap::new(); + + for definition in METRIC_DEFINITIONS { + match definition.kind { + MetricKind::Counter => { + let mut builder = meter.u64_counter(definition.name).with_description(definition.description); + if let Some(unit) = definition.unit { + builder = builder.with_unit(Unit::new(unit)); + } + counters.insert(definition.id, builder.init()); + } + MetricKind::Histogram => { + let mut builder = meter.f64_histogram(definition.name).with_description(definition.description); + if let Some(unit) = definition.unit { + builder = builder.with_unit(Unit::new(unit)); + } + histograms.insert(definition.id, builder.init()); + } + } + } + + Ok(Self { + provider, + counters, + histograms, + }) + } +} + +impl MetricsBackend for OtlpMetricsBackend { + fn inc_counter(&self, id: MetricId, value: u64, labels: &[(&'static str, String)]) { + if let Some(counter) = self.counters.get(&id) { + counter.add(value, &labels_to_key_values(labels)); + } + } + + fn observe_histogram(&self, id: MetricId, value: f64, labels: &[(&'static str, String)]) { + if let Some(histogram) = self.histograms.get(&id) { + histogram.record(value, &labels_to_key_values(labels)); + } + } + + fn render_prometheus(&self) -> Option { + None + } + + fn shutdown(&self) { + let _ = self.provider.shutdown(); + } +} + +fn labels_to_key_values(labels: &[(&'static str, String)]) -> Vec { + labels.iter().map(|(key, value)| KeyValue::new(*key, value.clone())).collect() +} + +fn prometheus_label_values<'a>(id: MetricId, labels: &'a [(&'static str, String)]) -> Vec<&'a str> { + let Some(definition) = METRIC_DEFINITIONS.iter().find(|definition| definition.id == id) else { + return Vec::new(); + }; + + definition + .label_names + .iter() + .map(|name| { + labels + .iter() + .find(|(label_name, _)| label_name == name) + .map(|(_, value)| value.as_str()) + 
.unwrap_or("unknown") + }) + .collect() +} + +fn main() -> Result<(), Box> { + let metrics = ChronosMetrics::from_env()?; + + metrics.message_consumed("postgres", "pass"); + metrics.consume_latency(0.042, "postgres", "pass"); + + if let Some(text) = metrics.prometheus_text() { + println!("{text}"); + } + + metrics.shutdown(); + Ok(()) +} From 16fe597ab4dce3844ff5138491b07b57d37ab34c Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Wed, 29 Apr 2026 23:12:11 +1000 Subject: [PATCH 04/36] chore(metrics): add mock exporter make target Add a metrics.mock recipe that runs the Prometheus/OTLP mock with EXPORTER=prom or EXPORTER=otlp. Use a minimal standalone example package so the mock can run without pulling in Chronos Kafka dependencies and the local libsasl2 development package. The OTLP mode sets OTEL_METRICS_EXPORTER=otlp and OTEL_EXPORTER_OTLP_PROTOCOL=grpc, while Prometheus mode sets OTEL_METRICS_EXPORTER=prometheus and prints the text exposition. Verification: - cargo fmt --check - make -n metrics.mock EXPORTER=prom - make -n metrics.mock EXPORTER=otlp - make metrics.mock EXPORTER=prom - make metrics.mock EXPORTER=otlp - make metrics.mock EXPORTER=bad Model-version: GPT-5 --- Cargo.lock | 13 ++++++++++++- Makefile | 10 ++++++++++ examples/chronos_ex/Cargo.toml | 5 ----- examples/prom_otlp_mock.rs | 3 ++- examples/prom_otlp_mock_runner/Cargo.toml | 15 +++++++++++++++ 5 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 examples/prom_otlp_mock_runner/Cargo.toml diff --git a/Cargo.lock b/Cargo.lock index 889f2bc..1bd8021 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] name = "addr2line" @@ -1688,6 +1688,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prom_otlp_mock_runner" +version = "0.0.0" +dependencies = [ + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry_sdk", + "prometheus", + "tokio", +] + [[package]] name = "prometheus" version = "0.13.4" diff --git a/Makefile b/Makefile index 08c0ab7..629b7d5 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ SHELL:=/bin/bash RUST_VERSION := $(shell grep 'channel' rust-toolchain.toml | sed 's/.*"\(.*\)"/\1/') +EXPORTER ?= prom # pp - pretty print function yellow := $(shell tput setaf 3) @@ -115,6 +116,15 @@ metrics.check: $(call pp,check metrics endpoint...) curl -sf http://localhost:9090/metrics | head -20 +## metrics.mock: 🔍 Run Prometheus/OTLP metrics mock example with EXPORTER=prom|otlp +metrics.mock: + $(call pp,run metrics mock example with exporter $(EXPORTER)...) + @case "$(EXPORTER)" in \ + prom|prometheus) OTEL_METRICS_EXPORTER=prometheus cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ + otlp) OTEL_METRICS_EXPORTER=otlp OTEL_EXPORTER_OTLP_PROTOCOL=grpc cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ + *) echo "unsupported EXPORTER=$(EXPORTER); use EXPORTER=prom or EXPORTER=otlp" >&2; exit 2 ;; \ + esac + ## test.unit.coverage: 🧪 Runs rust unit tests with coverage 'cobertura' and 'junit' reports test.unit.coverage: $(call pp,rust unit tests...) 
diff --git a/examples/chronos_ex/Cargo.toml b/examples/chronos_ex/Cargo.toml index df6cb77..4c17705 100644 --- a/examples/chronos_ex/Cargo.toml +++ b/examples/chronos_ex/Cargo.toml @@ -3,10 +3,6 @@ name = "chronos_ex" version = "0.0.2" edition = "2021" -[[example]] -name = "prom_otlp_mock" -path = "../prom_otlp_mock.rs" - [dependencies] env_logger = "0.9.0" dotenv = "0.15.0" @@ -29,7 +25,6 @@ opentelemetry_api.workspace = true opentelemetry_sdk.workspace = true opentelemetry-otlp.workspace = true opentelemetry-jaeger.workspace = true -prometheus.workspace = true # opentelemetry-stdout.workspace = true # opentelemetry = "0.20" diff --git a/examples/prom_otlp_mock.rs b/examples/prom_otlp_mock.rs index e72bf7c..ef7d394 100644 --- a/examples/prom_otlp_mock.rs +++ b/examples/prom_otlp_mock.rs @@ -302,7 +302,8 @@ fn prometheus_label_values<'a>(id: MetricId, labels: &'a [(&'static str, String) .collect() } -fn main() -> Result<(), Box> { +#[tokio::main] +async fn main() -> Result<(), Box> { let metrics = ChronosMetrics::from_env()?; metrics.message_consumed("postgres", "pass"); diff --git a/examples/prom_otlp_mock_runner/Cargo.toml b/examples/prom_otlp_mock_runner/Cargo.toml new file mode 100644 index 0000000..404e90b --- /dev/null +++ b/examples/prom_otlp_mock_runner/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "prom_otlp_mock_runner" +version = "0.0.0" +edition = "2021" + +[[bin]] +name = "prom_otlp_mock" +path = "../prom_otlp_mock.rs" + +[dependencies] +opentelemetry.workspace = true +opentelemetry_sdk.workspace = true +opentelemetry-otlp.workspace = true +prometheus.workspace = true +tokio.workspace = true From be271d12191143f8119f1f1382fa00d914486786 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Wed, 29 Apr 2026 23:28:16 +1000 Subject: [PATCH 05/36] feat: add Grafana LGTM dev stack Set up a pinned grafana/otel-lgtm:0.24.1 compose overlay with local Prometheus, OpenTelemetry Collector, and Grafana dashboard provisioning overrides. 
The Prometheus config keeps the upstream OTLP/resource defaults and scrapes the LGTM services plus Chronos on chronos:9091. Wire Chronos metrics binding through OTEL_EXPORTER_PROMETHEUS_HOST and OTEL_EXPORTER_PROMETHEUS_PORT, keeping METRICS_PORT as a backward-compatible fallback. Update local integration helpers and docs to use the OpenTelemetry Prometheus exporter variables. Verification: - cargo fmt --check - docker compose -f docker-compose.yml -f dev/docker-compose-lgtm.yaml config - make lgtm.validate - sh scripts/pre-commit-checks.sh (fails: host is missing libsasl2 development headers required by sasl2-sys) Model-version: GPT-5 --- How-to.md | 20 +++++++-- Makefile | 16 ++++++- chronos_bin/src/metrics/server.rs | 6 +-- chronos_bin/src/runner.rs | 6 ++- chronos_bin/src/utils/config.rs | 48 ++++++++++++++++++++- dev/dashboards.yaml | 9 ++++ dev/dashboards/.gitkeep | 1 + dev/docker-compose-lgtm.yaml | 31 ++++++++++++++ dev/otelcol-contrib.yaml | 68 ++++++++++++++++++++++++++++++ dev/prometheus.yaml | 69 +++++++++++++++++++++++++++++++ docker-compose.yml | 5 ++- scripts/integration.sh | 5 ++- 12 files changed, 270 insertions(+), 14 deletions(-) create mode 100644 dev/dashboards.yaml create mode 100644 dev/dashboards/.gitkeep create mode 100644 dev/docker-compose-lgtm.yaml create mode 100644 dev/otelcol-contrib.yaml create mode 100644 dev/prometheus.yaml diff --git a/How-to.md b/How-to.md index a7cd568..dc9b538 100644 --- a/How-to.md +++ b/How-to.md @@ -51,6 +51,8 @@ These values are set to fine tune performance Chrono in need, refer to [Chronos] | TIMING_ADVANCE|0 sec | FAIL_DETECT_INTERVAL|10 sec | HEALTHCHECK_FILE|healthcheck/chronos_healthcheck +| OTEL_EXPORTER_PROMETHEUS_HOST|0.0.0.0 +| OTEL_EXPORTER_PROMETHEUS_PORT|9090 ## Observability @@ -62,6 +64,21 @@ At this time Chronos supports Http protocol based connectivity to the Otel colle | OTEL_EXPORTER_OTLP_TRACES_ENDPOINT|"http://localhost:4318/v1/traces" | OTEL_EXPORTER_OTLP_PROTOCOL|"http/json" +### Local 
Grafana LGTM stack +Use the Grafana LGTM compose overlay with the main Docker Compose file to run Grafana, Loki, Tempo, Prometheus, Pyroscope, and the OpenTelemetry Collector in one container: + +```sh +docker compose -f docker-compose.yml -f dev/docker-compose-lgtm.yaml up -d +``` + +The overlay mounts local override files for Prometheus, the OpenTelemetry Collector, and Grafana dashboard provisioning. Chronos exposes its Prometheus metrics endpoint with `OTEL_EXPORTER_PROMETHEUS_HOST` and `OTEL_EXPORTER_PROMETHEUS_PORT`; when run from `docker-compose.yml` the endpoint is `chronos:9091`. + +Validate the LGTM configuration files with: + +```sh +make lgtm.validate +``` + ## Chronos Images Two images are published for each [RELEASE]( `https://github.com/kindredgroup/chronos/pkgs/container/chronos`) - migrations image @@ -71,6 +88,3 @@ Two images are published for each [RELEASE]( `https://github.com/kindredgroup/ch - - - diff --git a/Makefile b/Makefile index 629b7d5..18fd4fb 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ SHELL:=/bin/bash RUST_VERSION := $(shell grep 'channel' rust-toolchain.toml | sed 's/.*"\(.*\)"/\1/') EXPORTER ?= prom +LGTM_IMAGE ?= grafana/otel-lgtm:0.24.1 # pp - pretty print function yellow := $(shell tput setaf 3) @@ -114,7 +115,7 @@ integration.down: ## metrics.check: 🔍 Verify /metrics endpoint responds (requires running app) metrics.check: $(call pp,check metrics endpoint...) - curl -sf http://localhost:9090/metrics | head -20 + curl -sf "http://localhost:$${OTEL_EXPORTER_PROMETHEUS_PORT:-$${METRICS_PORT:-9090}}/metrics" | head -20 ## metrics.mock: 🔍 Run Prometheus/OTLP metrics mock example with EXPORTER=prom|otlp metrics.mock: @@ -125,6 +126,19 @@ metrics.mock: *) echo "unsupported EXPORTER=$(EXPORTER); use EXPORTER=prom or EXPORTER=otlp" >&2; exit 2 ;; \ esac +## lgtm.validate: 🔍 Validate LGTM Prometheus and OpenTelemetry Collector configs +lgtm.validate: + $(call pp,validate LGTM Prometheus config with $(LGTM_IMAGE)...) 
+ docker run --rm \ + -v "$(PWD)/dev/prometheus.yaml:/otel-lgtm/prometheus.yaml:ro" \ + --entrypoint /otel-lgtm/prometheus/promtool \ + $(LGTM_IMAGE) check config /otel-lgtm/prometheus.yaml + $(call pp,validate LGTM OpenTelemetry Collector config with $(LGTM_IMAGE)...) + docker run --rm \ + -v "$(PWD)/dev/otelcol-contrib.yaml:/otel-lgtm/otelcol-config.yaml:ro" \ + --entrypoint /otel-lgtm/otelcol-contrib/otelcol-contrib \ + $(LGTM_IMAGE) validate --config=file:/otel-lgtm/otelcol-config.yaml --feature-gates=service.profilesSupport + ## test.unit.coverage: 🧪 Runs rust unit tests with coverage 'cobertura' and 'junit' reports test.unit.coverage: $(call pp,rust unit tests...) diff --git a/chronos_bin/src/metrics/server.rs b/chronos_bin/src/metrics/server.rs index 2d77776..8ac687c 100644 --- a/chronos_bin/src/metrics/server.rs +++ b/chronos_bin/src/metrics/server.rs @@ -16,12 +16,12 @@ async fn metrics_handler(State(metrics): State>) -> impl Int } } -pub async fn run_metrics_server(metrics: Arc, port: u16) { +pub async fn run_metrics_server(metrics: Arc, host: String, port: u16) { let app = Router::new().route("/metrics", get(metrics_handler)).with_state(metrics); - let addr = std::net::SocketAddr::from(([0, 0, 0, 0], port)); + let addr = format!("{}:{}", host, port); log::info!("Metrics server listening on {}", addr); - let listener = tokio::net::TcpListener::bind(addr).await.expect("Failed to bind metrics server port"); + let listener = tokio::net::TcpListener::bind(&addr).await.expect("Failed to bind metrics server port"); axum::serve(listener, app).await.expect("Metrics server failed"); } diff --git a/chronos_bin/src/runner.rs b/chronos_bin/src/runner.rs index 2b9231f..0649af5 100644 --- a/chronos_bin/src/runner.rs +++ b/chronos_bin/src/runner.rs @@ -31,13 +31,15 @@ impl Runner { let receiver_consumer = self.consumer.clone(); let receiver_metrics = Arc::clone(&self.metrics); - let metrics_port = ChronosConfig::from_env().metrics_port; + let chronos_config = 
ChronosConfig::from_env(); + let metrics_host = chronos_config.metrics_host; + let metrics_port = chronos_config.metrics_port; let metrics_for_server = Arc::clone(&self.metrics); // Spawn metrics server as an independent background task. // A failure here is logged but does not stop the processing tasks. tokio::task::spawn(async move { - run_metrics_server(metrics_for_server, metrics_port).await; + run_metrics_server(metrics_for_server, metrics_host, metrics_port).await; }); let monitor_handler = tokio::task::spawn(async { diff --git a/chronos_bin/src/utils/config.rs b/chronos_bin/src/utils/config.rs index 134c15e..fa6f14f 100644 --- a/chronos_bin/src/utils/config.rs +++ b/chronos_bin/src/utils/config.rs @@ -5,6 +5,7 @@ pub struct ChronosConfig { pub processor_db_poll: u64, pub time_advance: u64, pub fail_detect_interval: u64, + pub metrics_host: String, pub metrics_port: u16, } @@ -16,7 +17,52 @@ impl ChronosConfig { processor_db_poll: std::env::var("PROCESSOR_DB_POLL").unwrap_or_else(|_| 5.to_string()).parse().unwrap_or(5), time_advance: std::env::var("TIMING_ADVANCE").unwrap_or_else(|_| 0.to_string()).parse().unwrap_or(0), fail_detect_interval: std::env::var("FAIL_DETECT_INTERVAL").unwrap_or_else(|_| 10.to_string()).parse().unwrap_or(10), - metrics_port: std::env::var("METRICS_PORT").unwrap_or_else(|_| "9090".to_string()).parse().unwrap_or(9090), + metrics_host: std::env::var("OTEL_EXPORTER_PROMETHEUS_HOST").unwrap_or_else(|_| "0.0.0.0".to_string()), + metrics_port: std::env::var("OTEL_EXPORTER_PROMETHEUS_PORT") + .or_else(|_| std::env::var("METRICS_PORT")) + .unwrap_or_else(|_| "9090".to_string()) + .parse() + .unwrap_or(9090), } } } + +#[cfg(test)] +mod tests { + use super::ChronosConfig; + use serial_test::serial; + + fn remove_metrics_env() { + std::env::remove_var("OTEL_EXPORTER_PROMETHEUS_HOST"); + std::env::remove_var("OTEL_EXPORTER_PROMETHEUS_PORT"); + std::env::remove_var("METRICS_PORT"); + } + + #[test] + #[serial] + fn 
prometheus_spec_env_overrides_metrics_binding() { + remove_metrics_env(); + std::env::set_var("OTEL_EXPORTER_PROMETHEUS_HOST", "127.0.0.1"); + std::env::set_var("OTEL_EXPORTER_PROMETHEUS_PORT", "9464"); + std::env::set_var("METRICS_PORT", "9090"); + + let config = ChronosConfig::from_env(); + + assert_eq!(config.metrics_host, "127.0.0.1"); + assert_eq!(config.metrics_port, 9464); + remove_metrics_env(); + } + + #[test] + #[serial] + fn metrics_port_remains_backward_compatible_fallback() { + remove_metrics_env(); + std::env::set_var("METRICS_PORT", "9091"); + + let config = ChronosConfig::from_env(); + + assert_eq!(config.metrics_host, "0.0.0.0"); + assert_eq!(config.metrics_port, 9091); + remove_metrics_env(); + } +} diff --git a/dev/dashboards.yaml b/dev/dashboards.yaml new file mode 100644 index 0000000..ee8b9a1 --- /dev/null +++ b/dev/dashboards.yaml @@ -0,0 +1,9 @@ +apiVersion: 1 + +providers: + - name: "Chronos" + type: file + updateIntervalSeconds: 10 + options: + path: /otel-lgtm/grafana/conf/provisioning/dashboards/chronos + foldersFromFilesStructure: true diff --git a/dev/dashboards/.gitkeep b/dev/dashboards/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/dev/dashboards/.gitkeep @@ -0,0 +1 @@ + diff --git a/dev/docker-compose-lgtm.yaml b/dev/docker-compose-lgtm.yaml new file mode 100644 index 0000000..8a0350a --- /dev/null +++ b/dev/docker-compose-lgtm.yaml @@ -0,0 +1,31 @@ +services: + chronos: + environment: + OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: http://lgtm:4318/v1/traces + + jaeger-all-in-one: + profiles: + - legacy-otel + + otel-collector: + profiles: + - legacy-otel + + lgtm: + image: grafana/otel-lgtm:0.24.1 + container_name: lgtm + ports: + - "3000:3000" # Grafana + - "3100:3100" # Loki + - "3200:3200" # Tempo + - "4040:4040" # Pyroscope + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + - "9090:9090" # Prometheus + volumes: + - ./dev/prometheus.yaml:/otel-lgtm/prometheus.yaml:ro + - 
./dev/otelcol-contrib.yaml:/otel-lgtm/otelcol-config.yaml:ro + - ./dev/dashboards.yaml:/otel-lgtm/grafana/conf/provisioning/dashboards/chronos.yaml:ro + - ./dev/dashboards:/otel-lgtm/grafana/conf/provisioning/dashboards/chronos:ro + networks: + - chronos diff --git a/dev/otelcol-contrib.yaml b/dev/otelcol-contrib.yaml new file mode 100644 index 0000000..06a1f06 --- /dev/null +++ b/dev/otelcol-contrib.yaml @@ -0,0 +1,68 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + cors: + allowed_origins: + - http://* + prometheus/collector: + config: + scrape_configs: + - job_name: "opentelemetry-collector" + scrape_interval: 1s + static_configs: + - targets: ["127.0.0.1:8888"] + +extensions: + health_check: + endpoint: 0.0.0.0:13133 + path: "/ready" + +processors: + batch: + +exporters: + otlp_http/metrics: + endpoint: http://127.0.0.1:9090/api/v1/otlp + tls: + insecure: true + otlp_http/traces: + endpoint: http://127.0.0.1:4418 + tls: + insecure: true + otlp_http/logs: + endpoint: http://127.0.0.1:3100/otlp + tls: + insecure: true + otlp/profiles: + endpoint: http://127.0.0.1:4040 + tls: + insecure: true + debug/metrics: + verbosity: detailed + debug/traces: + verbosity: detailed + debug/logs: + verbosity: detailed + +service: + extensions: [health_check] + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp_http/traces] + metrics: + receivers: [otlp, prometheus/collector] + processors: [batch] + exporters: [otlp_http/metrics] + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlp_http/logs] + profiles: + receivers: [otlp] + exporters: [otlp/profiles] diff --git a/dev/prometheus.yaml b/dev/prometheus.yaml new file mode 100644 index 0000000..16b2260 --- /dev/null +++ b/dev/prometheus.yaml @@ -0,0 +1,69 @@ +--- +global: + scrape_interval: 15s + scrape_native_histograms: true + +otlp: + keep_identifying_resource_attributes: true + promote_resource_attributes: + - 
service.instance.id + - service.name + - service.namespace + - service.version + - cloud.availability_zone + - cloud.region + - container.name + - deployment.environment + - deployment.environment.name + - k8s.cluster.name + - k8s.container.name + - k8s.cronjob.name + - k8s.daemonset.name + - k8s.deployment.name + - k8s.job.name + - k8s.namespace.name + - k8s.node.name + - k8s.pod.name + - k8s.replicaset.name + - k8s.statefulset.name + - host.name + - postgresql.database.name + - postgresql.schema.name + - postgresql.table.name + - postgresql.index.name + - database + - kafka.cluster.alias + +storage: + tsdb: + out_of_order_time_window: 10m + +scrape_configs: + - job_name: chronos + static_configs: + - targets: ["chronos:9091"] + + - job_name: grafana + static_configs: + - targets: ["127.0.0.1:3000"] + + - job_name: loki + static_configs: + - targets: ["127.0.0.1:3100"] + + - job_name: prometheus + static_configs: + - targets: ["127.0.0.1:9090"] + + - job_name: pyroscope + static_configs: + - targets: ["127.0.0.1:4040"] + + - job_name: tempo + static_configs: + - targets: ["127.0.0.1:3200"] + + - job_name: opentelemetry-collector + scrape_interval: 1s + static_configs: + - targets: ["127.0.0.1:8888"] diff --git a/docker-compose.yml b/docker-compose.yml index bafcd2b..2ee777f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -76,7 +76,7 @@ services: context: . 
dockerfile: Dockerfile.chronos ports: - - "9090:9090" # Prometheus /metrics endpoint + - "9091:9091" # Prometheus /metrics endpoint environment: KAFKA_HOST: kafka KAFKA_PORT: "9092" @@ -93,7 +93,8 @@ services: PG_DATABASE: chronos_db PG_POOL_SIZE: "10" RUST_LOG: info - METRICS_PORT: "9090" + OTEL_EXPORTER_PROMETHEUS_HOST: "0.0.0.0" + OTEL_EXPORTER_PROMETHEUS_PORT: "9091" MONITOR_DB_POLL: "5" PROCESSOR_DB_POLL: "5" TIMING_ADVANCE: "0" diff --git a/scripts/integration.sh b/scripts/integration.sh index 6e5d28d..f88fc38 100755 --- a/scripts/integration.sh +++ b/scripts/integration.sh @@ -8,7 +8,7 @@ set -euo pipefail # ─── configuration ──────────────────────────────────────────────────────────── KAFKA_EXT_PORT="${KAFKA_EXT_PORT:-9094}" PG_PORT="${PG_PORT:-5432}" -METRICS_PORT="${METRICS_PORT:-9090}" +METRICS_PORT="${OTEL_EXPORTER_PROMETHEUS_PORT:-${METRICS_PORT:-9090}}" CHRONOS_PID_FILE="/tmp/chronos_integration.pid" CHRONOS_LOG="/tmp/chronos_integration.log" MAX_WAIT=120 # seconds to wait for each readiness check @@ -91,7 +91,8 @@ PG_PASSWORD=admin \ PG_DATABASE=chronos_db \ PG_POOL_SIZE=10 \ RUST_LOG=warn \ -METRICS_PORT="${METRICS_PORT}" \ +OTEL_EXPORTER_PROMETHEUS_HOST=0.0.0.0 \ +OTEL_EXPORTER_PROMETHEUS_PORT="${METRICS_PORT}" \ MONITOR_DB_POLL=5 \ PROCESSOR_DB_POLL=5 \ TIMING_ADVANCE=0 \ From a55939ab741a21f3bec41c462e2b16c867757f23 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Wed, 29 Apr 2026 23:29:38 +1000 Subject: [PATCH 06/36] chore: enable detailed otelcol telemetry Configure the LGTM OpenTelemetry Collector override to emit JSON logs and detailed internal metrics via service.telemetry. 
Verification: - make lgtm.validate Model-version: GPT-5 --- dev/otelcol-contrib.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dev/otelcol-contrib.yaml b/dev/otelcol-contrib.yaml index 06a1f06..29250c3 100644 --- a/dev/otelcol-contrib.yaml +++ b/dev/otelcol-contrib.yaml @@ -49,6 +49,11 @@ exporters: verbosity: detailed service: + telemetry: + logs: + encoding: json + metrics: + level: detailed extensions: [health_check] pipelines: traces: From d333b2c73a3ce13aa62628503ef3e20024b042a4 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Wed, 29 Apr 2026 23:32:14 +1000 Subject: [PATCH 07/36] chore: add standalone LGTM make recipe Add lgtm.up and lgtm.down recipes that operate only on the LGTM service while using the main compose file for the shared chronos network. Update the local observability docs to point at the new start command. Verification: - docker compose -f docker-compose.yml -f dev/docker-compose-lgtm.yaml config --services - make -n lgtm.up - make lgtm.validate Model-version: GPT-5 --- How-to.md | 3 +-- Makefile | 11 +++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/How-to.md b/How-to.md index dc9b538..17b9792 100644 --- a/How-to.md +++ b/How-to.md @@ -68,7 +68,7 @@ At this time Chronos supports Http protocol based connectivity to the Otel colle Use the Grafana LGTM compose overlay with the main Docker Compose file to run Grafana, Loki, Tempo, Prometheus, Pyroscope, and the OpenTelemetry Collector in one container: ```sh -docker compose -f docker-compose.yml -f dev/docker-compose-lgtm.yaml up -d +make lgtm.up ``` The overlay mounts local override files for Prometheus, the OpenTelemetry Collector, and Grafana dashboard provisioning. Chronos exposes its Prometheus metrics endpoint with `OTEL_EXPORTER_PROMETHEUS_HOST` and `OTEL_EXPORTER_PROMETHEUS_PORT`; when run from `docker-compose.yml` the endpoint is `chronos:9091`. 
@@ -87,4 +87,3 @@ Two images are published for each [RELEASE]( `https://github.com/kindredgroup/ch - diff --git a/Makefile b/Makefile index 18fd4fb..2ab5861 100644 --- a/Makefile +++ b/Makefile @@ -139,6 +139,17 @@ lgtm.validate: --entrypoint /otel-lgtm/otelcol-contrib/otelcol-contrib \ $(LGTM_IMAGE) validate --config=file:/otel-lgtm/otelcol-config.yaml --feature-gates=service.profilesSupport +## lgtm.up: 📈 Start standalone Grafana LGTM stack +lgtm.up: + $(call pp,start standalone LGTM stack...) + docker compose -f docker-compose.yml -f dev/docker-compose-lgtm.yaml up -d lgtm + +## lgtm.down: 🛑 Stop standalone Grafana LGTM stack +lgtm.down: + $(call pp,stop standalone LGTM stack...) + docker compose -f docker-compose.yml -f dev/docker-compose-lgtm.yaml stop lgtm 2>/dev/null || true + docker compose -f docker-compose.yml -f dev/docker-compose-lgtm.yaml rm -f lgtm 2>/dev/null || true + ## test.unit.coverage: 🧪 Runs rust unit tests with coverage 'cobertura' and 'junit' reports test.unit.coverage: $(call pp,rust unit tests...) From f40c5ee8717d658def1a7d8d83f9f118bbb9bab7 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Wed, 29 Apr 2026 23:52:20 +1000 Subject: [PATCH 08/36] chore: collect LGTM service logs Enable LGTM service logging, route backend stdout and stderr through a JSON-line logging wrapper, and have the embedded OpenTelemetry Collector re-ingest the generated log files with the file_log receiver. Add an LGTM healthcheck script that checks Grafana, Loki, Tempo, Pyroscope, Prometheus, and the OpenTelemetry Collector readiness endpoints. Tempo 2.10.3 does not expose a JSON log-format flag, so the wrapper normalizes non-JSON service lines into JSON records while preserving native JSON records from services that support them. 
Verification: - make lgtm.validate - docker compose -f docker-compose.yml -f dev/docker-compose-lgtm.yaml config - make lgtm.up - docker ps --filter name=lgtm --format '{{.Names}} {{.Status}}' - docker exec lgtm sh -c 'sh /otel-lgtm/chronos-healthcheck.sh' - docker exec lgtm sh -c 'ls -1 /data/lgtm/logs && for f in /data/lgtm/logs/*.jsonl; do head -n 2 "$f"; done' - docker exec lgtm sh -c 'curl -sfG http://127.0.0.1:3100/loki/api/v1/query_range --data-urlencode query={service_name="unknown_service"} --data-urlencode limit=1' Model-version: GPT-5 --- dev/docker-compose-lgtm.yaml | 17 +++++++++++++ dev/lgtm-healthcheck.sh | 20 +++++++++++++++ dev/lgtm-logging.sh | 48 ++++++++++++++++++++++++++++++++++++ dev/otelcol-contrib.yaml | 8 +++++- 4 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 dev/lgtm-healthcheck.sh create mode 100644 dev/lgtm-logging.sh diff --git a/dev/docker-compose-lgtm.yaml b/dev/docker-compose-lgtm.yaml index 8a0350a..f67ae08 100644 --- a/dev/docker-compose-lgtm.yaml +++ b/dev/docker-compose-lgtm.yaml @@ -14,6 +14,15 @@ services: lgtm: image: grafana/otel-lgtm:0.24.1 container_name: lgtm + environment: + ENABLE_LOGS_ALL: "true" + GF_LOG_CONSOLE_FORMAT: json + GF_LOG_FORMAT: json + GF_LOG_MODE: console + LGTM_LOG_DIR: /data/lgtm/logs + LOKI_EXTRA_ARGS: -log.format=json + PROMETHEUS_EXTRA_ARGS: --log.format=json + PYROSCOPE_EXTRA_ARGS: -log.format=json ports: - "3000:3000" # Grafana - "3100:3100" # Loki @@ -22,9 +31,17 @@ services: - "4317:4317" # OTLP gRPC - "4318:4318" # OTLP HTTP - "9090:9090" # Prometheus + healthcheck: + test: ["CMD-SHELL", "sh /otel-lgtm/chronos-healthcheck.sh"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s volumes: - ./dev/prometheus.yaml:/otel-lgtm/prometheus.yaml:ro - ./dev/otelcol-contrib.yaml:/otel-lgtm/otelcol-config.yaml:ro + - ./dev/lgtm-healthcheck.sh:/otel-lgtm/chronos-healthcheck.sh:ro + - ./dev/lgtm-logging.sh:/otel-lgtm/logging.sh:ro -
./dev/dashboards.yaml:/otel-lgtm/grafana/conf/provisioning/dashboards/chronos.yaml:ro - ./dev/dashboards:/otel-lgtm/grafana/conf/provisioning/dashboards/chronos:ro networks: diff --git a/dev/lgtm-healthcheck.sh b/dev/lgtm-healthcheck.sh new file mode 100644 index 0000000..fecbbe6 --- /dev/null +++ b/dev/lgtm-healthcheck.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env sh + +set -eu + +check_service() { + name=$1 + url=$2 + + echo "Checking ${name} at ${url}" + curl -sf "${url}" >/dev/null +} + +check_service "Grafana" "http://127.0.0.1:3000/api/health" +check_service "Loki" "http://127.0.0.1:3100/ready" +check_service "Tempo" "http://127.0.0.1:3200/ready" +check_service "Pyroscope" "http://127.0.0.1:4040/ready" +check_service "Prometheus" "http://127.0.0.1:9090/-/ready" +check_service "OpenTelemetry Collector" "http://127.0.0.1:13133/ready" + +echo "All LGTM services healthy" diff --git a/dev/lgtm-logging.sh b/dev/lgtm-logging.sh new file mode 100644 index 0000000..445a811 --- /dev/null +++ b/dev/lgtm-logging.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +set -euo pipefail + +function json_lines() { + service=$1 + stream=$2 + + awk -v service="${service}" -v stream="${stream}" ' + function escape_json(value) { + gsub(/\\/, "\\\\", value) + gsub(/"/, "\\\"", value) + gsub(/\t/, "\\t", value) + gsub(/\r/, "\\r", value) + return value + } + /^[[:space:]]*\{/ { + print + fflush() + next + } + { + message = escape_json($0) + printf("{\"service\":\"%s\",\"stream\":\"%s\",\"message\":\"%s\"}\n", service, stream, message) + fflush() + } + ' +} + +function run_with_logging() { + name=$1 + shift + envvar=$1 + shift + + safe_name=$(printf '%s' "${name}" | tr '[:upper:] ' '[:lower:]_' | tr -cd '[:alnum:]_.-') + log_dir="${LGTM_LOG_DIR:-/data/lgtm/logs}" + log_file="${log_dir}/${safe_name}.jsonl" + + if [[ ${envvar} == "true" || ${ENABLE_LOGS_ALL:-false} == "true" ]]; then + echo "Running ${name} logging=true file=${log_file}" + mkdir -p "${log_dir}" + exec "$@" > >(json_lines "${name}" stdout 
| tee -a "${log_file}") 2> >(json_lines "${name}" stderr | tee -a "${log_file}" >&2) + else + echo "Running ${name} logging=false" + exec "$@" >/dev/null 2>&1 + fi +} diff --git a/dev/otelcol-contrib.yaml b/dev/otelcol-contrib.yaml index 29250c3..f1f207d 100644 --- a/dev/otelcol-contrib.yaml +++ b/dev/otelcol-contrib.yaml @@ -15,6 +15,12 @@ receivers: scrape_interval: 1s static_configs: - targets: ["127.0.0.1:8888"] + file_log/lgtm: + include: + - /data/lgtm/logs/*.jsonl + include_file_name: true + include_file_path: true + start_at: beginning extensions: health_check: @@ -65,7 +71,7 @@ service: processors: [batch] exporters: [otlp_http/metrics] logs: - receivers: [otlp] + receivers: [otlp, file_log/lgtm] processors: [batch] exporters: [otlp_http/logs] profiles: From c6daeece32e7449007e9ab115770a3b240c8e976 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Thu, 30 Apr 2026 00:07:48 +1000 Subject: [PATCH 09/36] chore: use service names for LGTM log files Derive LGTM log filenames from service names instead of the full service label that includes component versions. Map the OpenTelemetry Collector label to otelcol.jsonl for a stable service-specific file name. 
Verification: - make lgtm.validate - docker compose -f docker-compose.yml -f dev/docker-compose-lgtm.yaml up -d --force-recreate lgtm - docker exec lgtm sh -c 'ls -1 /data/lgtm/logs' - docker ps --filter name=lgtm --format '{{.Names}} {{.Status}}' Model-version: GPT-5 --- dev/lgtm-logging.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dev/lgtm-logging.sh b/dev/lgtm-logging.sh index 445a811..ffebc13 100644 --- a/dev/lgtm-logging.sh +++ b/dev/lgtm-logging.sh @@ -33,7 +33,11 @@ function run_with_logging() { envvar=$1 shift - safe_name=$(printf '%s' "${name}" | tr '[:upper:] ' '[:lower:]_' | tr -cd '[:alnum:]_.-') + case "${name}" in + "OpenTelemetry Collector"*) service_name=otelcol ;; + *) service_name=${name%% *} ;; + esac + safe_name=$(printf '%s' "${service_name}" | tr '[:upper:]' '[:lower:]' | tr -cd '[:alnum:]_.-') log_dir="${LGTM_LOG_DIR:-/data/lgtm/logs}" log_file="${log_dir}/${safe_name}.jsonl" From 96f70eb0987e17c4df15850d282133c6dfdc903c Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Thu, 30 Apr 2026 00:19:08 +1000 Subject: [PATCH 10/36] fix: point OTLP metrics mock at LGTM Make the OTLP metrics mock explicit about the LGTM gRPC endpoint and service resource metadata so make metrics.mock EXPORTER=otlp sends to the local LGTM collector instead of relying on older SDK endpoint defaults. Flush the metrics provider before shutdown so short-lived mock runs export their points. Set LGTM file-ingested logs through a transform processor that fills missing resource service.name from log.file.name without the .jsonl extension, preserving records that already carry a service name. 
Verification: - cargo fmt --check - cargo check -p prom_otlp_mock_runner - make lgtm.validate - make metrics.mock EXPORTER=otlp - docker exec lgtm sh -c 'curl -sf http://127.0.0.1:3100/loki/api/v1/label/service_name/values' Note: direct curl to localhost:4318/9090 from this sandbox network namespace fails even while Docker reports LGTM ports published; Prometheus/Loki checks were run from inside the LGTM container. Model-version: GPT-5 --- Makefile | 2 +- dev/otelcol-contrib.yaml | 7 ++++++- examples/prom_otlp_mock.rs | 18 ++++++++++++++++-- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 2ab5861..5f56f83 100644 --- a/Makefile +++ b/Makefile @@ -122,7 +122,7 @@ metrics.mock: $(call pp,run metrics mock example with exporter $(EXPORTER)...) @case "$(EXPORTER)" in \ prom|prometheus) OTEL_METRICS_EXPORTER=prometheus cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ - otlp) OTEL_METRICS_EXPORTER=otlp OTEL_EXPORTER_OTLP_PROTOCOL=grpc cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ + otlp) OTEL_SERVICE_NAME=chronos-metrics-mock OTEL_RESOURCE_ATTRIBUTES=service.instance.id=chronos-metrics-mock-local OTEL_METRICS_EXPORTER=otlp OTEL_EXPORTER_OTLP_PROTOCOL=grpc OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=http://127.0.0.1:4317 OTEL_METRIC_EXPORT_INTERVAL=1000 cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ *) echo "unsupported EXPORTER=$(EXPORTER); use EXPORTER=prom or EXPORTER=otlp" >&2; exit 2 ;; \ esac diff --git a/dev/otelcol-contrib.yaml b/dev/otelcol-contrib.yaml index f1f207d..7dc0dbb 100644 --- a/dev/otelcol-contrib.yaml +++ b/dev/otelcol-contrib.yaml @@ -28,6 +28,11 @@ extensions: path: "/ready" processors: + transform/lgtm_logs: + log_statements: + - context: log + statements: + - set(resource.attributes["service.name"], ExtractPatterns(attributes["log.file.name"], "^(?P<service_name>.*)\\.jsonl$")["service_name"]) where resource.attributes["service.name"] == nil batch: exporters: @@ -72,7
+77,7 @@ service: exporters: [otlp_http/metrics] logs: receivers: [otlp, file_log/lgtm] - processors: [batch] + processors: [transform/lgtm_logs, batch] exporters: [otlp_http/logs] profiles: receivers: [otlp] diff --git a/examples/prom_otlp_mock.rs b/examples/prom_otlp_mock.rs index ef7d394..9d91faa 100644 --- a/examples/prom_otlp_mock.rs +++ b/examples/prom_otlp_mock.rs @@ -23,6 +23,8 @@ use opentelemetry_otlp::WithExportConfig; use prometheus::{histogram_opts, opts, CounterVec as PromCounterVec, HistogramVec as PromHistogramVec, Registry}; const OTEL_METRICS_EXPORTER: &str = "OTEL_METRICS_EXPORTER"; +const OTEL_EXPORTER_OTLP_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_ENDPOINT"; +const OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"; const OTEL_EXPORTER_OTLP_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_PROTOCOL"; const OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"; @@ -219,7 +221,10 @@ struct OtlpMetricsBackend { impl OtlpMetricsBackend { fn new() -> Result> { - let exporter = opentelemetry_otlp::new_exporter().tonic().with_env(); + let endpoint = env::var(OTEL_EXPORTER_OTLP_METRICS_ENDPOINT) + .or_else(|_| env::var(OTEL_EXPORTER_OTLP_ENDPOINT)) + .unwrap_or_else(|_| "http://127.0.0.1:4317".to_string()); + let exporter = opentelemetry_otlp::new_exporter().tonic().with_env().with_endpoint(endpoint); let provider = opentelemetry_otlp::new_pipeline() .metrics(opentelemetry::runtime::Tokio) .with_exporter(exporter) @@ -276,7 +281,12 @@ impl MetricsBackend for OtlpMetricsBackend { } fn shutdown(&self) { - let _ = self.provider.shutdown(); + if let Err(err) = self.provider.force_flush(&opentelemetry::Context::current()) { + eprintln!("failed to flush OTLP metrics: {err}"); + } + if let Err(err) = self.provider.shutdown() { + eprintln!("failed to shut down OTLP metrics provider: {err}"); + } } } @@ -313,6 +323,10 @@ async fn main() -> Result<(), Box> { println!("{text}"); } + if MetricsExporter::from_env()? 
== MetricsExporter::Otlp { + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + } + metrics.shutdown(); Ok(()) } From 0d668c89277c57f419e0420029158333250a91b8 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Thu, 30 Apr 2026 20:13:15 +1000 Subject: [PATCH 11/36] docs: add weaver metrics proposal Add an example OpenTelemetry Weaver registry for Chronos metrics, a Rust template skeleton, and a checked-in generated definition example that follows the Prometheus/OTLP abstraction in examples/prom_otlp_mock.rs. The template has been verified with the otel/weaver:v0.23.0 Docker image. The generated Rust example is checked in after rustfmt so future work has a concrete target for integrating generated metric definitions. Verification: - docker run --rm otel/weaver:v0.23.0 --version - docker run --rm -v /home/ah34/work/opensource/chronos:/work -w /work otel/weaver:v0.23.0 registry check -r examples/weaver/registry - docker run --rm -v /home/ah34/work/opensource/chronos:/work -v /tmp/chronos-weaver-out:/out -w /work otel/weaver:v0.23.0 registry generate -r examples/weaver/registry --templates examples/weaver/templates rust /out - rustfmt --check examples/weaver/generated/chronos_metric_definitions.rs - rustfmt --config-path rustfmt.toml /tmp/chronos-weaver-out/chronos_metric_definitions.rs && diff -u examples/weaver/generated/chronos_metric_definitions.rs /tmp/chronos-weaver-out/chronos_metric_definitions.rs - python3 -c 'import yaml; yaml.safe_load(open("examples/weaver/registry/chronos/metrics.yaml")); yaml.safe_load(open("examples/weaver/templates/registry/rust/weaver.yaml")); print("yaml ok")' - git diff --cached --check Model-version: GPT-5 --- docs/weaver-metrics-proposal.md | 52 ++++++++ .../generated/chronos_metric_definitions.rs | 93 ++++++++++++++ examples/weaver/registry/chronos/metrics.yaml | 117 ++++++++++++++++++ .../templates/registry/rust/registry.rs.j2 | 44 +++++++ .../templates/registry/rust/weaver.yaml | 20 +++ 5 files changed, 326 
insertions(+) create mode 100644 docs/weaver-metrics-proposal.md create mode 100644 examples/weaver/generated/chronos_metric_definitions.rs create mode 100644 examples/weaver/registry/chronos/metrics.yaml create mode 100644 examples/weaver/templates/registry/rust/registry.rs.j2 create mode 100644 examples/weaver/templates/registry/rust/weaver.yaml diff --git a/docs/weaver-metrics-proposal.md b/docs/weaver-metrics-proposal.md new file mode 100644 index 0000000..023309f --- /dev/null +++ b/docs/weaver-metrics-proposal.md @@ -0,0 +1,52 @@ +# Managing Chronos Metrics with OpenTelemetry Weaver + +This proposal keeps the model from `examples/prom_otlp_mock.rs`: metric definitions live once, then the Prometheus and OTLP backends register instruments from that shared definition set. Weaver becomes the source of truth for the shared definition set. + +## Example Spec + +The example registry is in `examples/weaver/registry/chronos/metrics.yaml`. It defines Chronos metrics using OpenTelemetry-style names: + +| OpenTelemetry metric | Prometheus output name | Instrument | +| --- | --- | --- | +| `chronos.message.consumed` | `chronos_messages_consumed_total` | counter | +| `chronos.message.consume.duration` | `chronos_message_consume_duration_seconds` | histogram | +| `chronos.message.process.duration` | `chronos_message_process_duration_seconds` | histogram | +| `chronos.message.wait.duration` | `chronos_message_wait_duration_seconds` | histogram | +| `chronos.message.jitter` | `chronos_message_jitter_seconds` | histogram | +| `chronos.message.reset` | `chronos_messages_reset_total` | counter | + +The checked-in generated example is `examples/weaver/generated/chronos_metric_definitions.rs`. It mirrors the `MetricDefinition` table in `examples/prom_otlp_mock.rs`, with both `otel_name` and `prometheus_name` so each exporter can use the native naming convention it expects. 
+ +## Suggested Workflow + +Pin Weaver to the version used by the branch and make the generated file reproducible: + +```sh +WEAVER_VERSION=0.23.0 +docker run --rm \ + -v "$(pwd):/work" \ + -w /work \ + "otel/weaver:v${WEAVER_VERSION}" \ + registry check -r examples/weaver/registry +docker run --rm \ + -v "$(pwd):/work" \ + -w /work \ + "otel/weaver:v${WEAVER_VERSION}" \ + registry generate \ + -r examples/weaver/registry \ + --templates examples/weaver/templates \ + rust chronos_bin/src/metrics/generated +rustfmt chronos_bin/src/metrics/generated/chronos_metric_definitions.rs +``` + +Add a `make metrics.generate` target for the `registry generate` command and a `make metrics.check` target that runs `weaver registry check` plus a diff check that generated files are current. The pre-commit script can then call `make metrics.check` once Weaver is a documented development dependency. + +## Implementation Path + +1. Keep the current Prometheus registry working while introducing generated definitions behind a small module such as `chronos_bin/src/metrics/generated/definitions.rs`. +2. Replace the hand-written metric creation in `chronos_bin/src/metrics/registry.rs` with a loop over generated `METRIC_DEFINITIONS`, following the backend loop already sketched in `examples/prom_otlp_mock.rs`. +3. Preserve compatibility temporarily by either exporting the current `msg_*` Prometheus names or by dual-registering old and new names for one release. The example spec prefers OpenTelemetry names and Prometheus-conventional rendered names. +4. Use generated attribute constants for label names so call sites record attributes by typed identifiers instead of string literals. +5. After the generated table is in use, add a test that gathers the registry and asserts every generated `prometheus_name` appears in the text output. 
+ +Weaver can generate all of the static definition layer: metric IDs, names, descriptions, units, label names, bucket boundaries, and eventually attribute constants. Runtime behavior should remain hand-written because it contains Chronos-specific decisions: which events record which metric, pre-warming label combinations, exporter selection, and shutdown behavior. diff --git a/examples/weaver/generated/chronos_metric_definitions.rs b/examples/weaver/generated/chronos_metric_definitions.rs new file mode 100644 index 0000000..d6e0847 --- /dev/null +++ b/examples/weaver/generated/chronos_metric_definitions.rs @@ -0,0 +1,93 @@ +// Generated from examples/weaver/registry/chronos/metrics.yaml by OpenTelemetry Weaver. +// Do not edit by hand. + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub enum MetricId { + MsgConsumeLatency, + MsgConsumed, + MsgJitter, + MsgProcessLatency, + MsgReset, + MsgWaitTime, +} + +#[derive(Clone, Copy, Debug)] +pub enum MetricKind { + Counter, + Histogram, +} + +#[derive(Clone, Copy, Debug)] +pub struct MetricDefinition { + pub id: MetricId, + pub otel_name: &'static str, + pub prometheus_name: &'static str, + pub description: &'static str, + pub unit: Option<&'static str>, + pub label_names: &'static [&'static str], + pub kind: MetricKind, + pub buckets: Option<&'static [f64]>, +} + +pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ + MetricDefinition { + id: MetricId::MsgConsumeLatency, + otel_name: "chronos.message.consume.duration", + prometheus_name: "chronos_message_consume_duration_seconds", + description: "Duration of handle_message() in message_receiver.", + unit: Some("s"), + label_names: &["chronos.message.destination", "chronos.operation.status"], + kind: MetricKind::Histogram, + buckets: Some(&[0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048]), + }, + MetricDefinition { + id: MetricId::MsgConsumed, + otel_name: "chronos.message.consumed", + prometheus_name: 
"chronos_messages_consumed_total", + description: "Total number of Chronos input messages consumed.", + unit: Some("{message}"), + label_names: &["chronos.message.destination", "chronos.operation.status"], + kind: MetricKind::Counter, + buckets: None, + }, + MetricDefinition { + id: MetricId::MsgJitter, + otel_name: "chronos.message.jitter", + prometheus_name: "chronos_message_jitter_seconds", + description: "Difference between actual publish time and client-requested deadline.", + unit: Some("s"), + label_names: &[], + kind: MetricKind::Histogram, + buckets: Some(&[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]), + }, + MetricDefinition { + id: MetricId::MsgProcessLatency, + otel_name: "chronos.message.process.duration", + prometheus_name: "chronos_message_process_duration_seconds", + description: "Duration of processor_message_ready() loop in message_processor.", + unit: Some("s"), + label_names: &["chronos.operation.status", "chronos.processor.returned"], + kind: MetricKind::Histogram, + buckets: Some(&[0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048]), + }, + MetricDefinition { + id: MetricId::MsgReset, + otel_name: "chronos.message.reset", + prometheus_name: "chronos_messages_reset_total", + description: "Number of records reset by reset_to_init_db() in the monitor task.", + unit: Some("{message}"), + label_names: &[], + kind: MetricKind::Counter, + buckets: None, + }, + MetricDefinition { + id: MetricId::MsgWaitTime, + otel_name: "chronos.message.wait.duration", + prometheus_name: "chronos_message_wait_duration_seconds", + description: "Time a message spent in the Kafka input queue before processing.", + unit: Some("s"), + label_names: &[], + kind: MetricKind::Histogram, + buckets: Some(&[0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2]), + }, +]; diff --git a/examples/weaver/registry/chronos/metrics.yaml b/examples/weaver/registry/chronos/metrics.yaml new file mode 100644 index 
0000000..9127985 --- /dev/null +++ b/examples/weaver/registry/chronos/metrics.yaml @@ -0,0 +1,117 @@ +groups: + - id: metric_attributes.chronos.message_store + type: attribute_group + stability: development + brief: Common attributes used by Chronos message store metrics. + attributes: + - id: chronos.message.destination + type: string + stability: development + brief: Destination used by Chronos while handling a consumed message. + examples: ["kafka", "postgres"] + requirement_level: required + - id: chronos.operation.status + type: string + stability: development + brief: Low-cardinality operation result. + examples: ["pass", "fail"] + requirement_level: required + + - id: metric_attributes.chronos.processor + type: attribute_group + stability: development + brief: Common attributes used by Chronos processor loop metrics. + attributes: + - id: chronos.processor.returned + type: boolean + stability: development + brief: Whether the processor loop returned before processing any rows. + examples: [true, false] + requirement_level: required + - ref: chronos.operation.status + requirement_level: required + + - id: metric.chronos.message.consumed + type: metric + metric_name: chronos.message.consumed + stability: development + brief: Total number of Chronos input messages consumed. + instrument: counter + unit: "{message}" + extends: metric_attributes.chronos.message_store + annotations: + code_generation: + rust_name: msg_consumed + metric_value_type: int + prometheus_name: chronos_messages_consumed_total + + - id: metric.chronos.message.consume.duration + type: metric + metric_name: chronos.message.consume.duration + stability: development + brief: Duration of handle_message() in message_receiver. 
+ instrument: histogram + unit: s + extends: metric_attributes.chronos.message_store + annotations: + code_generation: + rust_name: msg_consume_latency + metric_value_type: double + prometheus_name: chronos_message_consume_duration_seconds + buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] + + - id: metric.chronos.message.process.duration + type: metric + metric_name: chronos.message.process.duration + stability: development + brief: Duration of processor_message_ready() loop in message_processor. + instrument: histogram + unit: s + extends: metric_attributes.chronos.processor + annotations: + code_generation: + rust_name: msg_process_latency + metric_value_type: double + prometheus_name: chronos_message_process_duration_seconds + buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] + + - id: metric.chronos.message.wait.duration + type: metric + metric_name: chronos.message.wait.duration + stability: development + brief: Time a message spent in the Kafka input queue before processing. + instrument: histogram + unit: s + annotations: + code_generation: + rust_name: msg_wait_time + metric_value_type: double + prometheus_name: chronos_message_wait_duration_seconds + buckets: [0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2] + + - id: metric.chronos.message.jitter + type: metric + metric_name: chronos.message.jitter + stability: development + brief: Difference between actual publish time and client-requested deadline. + instrument: histogram + unit: s + annotations: + code_generation: + rust_name: msg_jitter + metric_value_type: double + prometheus_name: chronos_message_jitter_seconds + buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0] + + - id: metric.chronos.message.reset + type: metric + metric_name: chronos.message.reset + stability: development + brief: Number of records reset by reset_to_init_db() in the monitor task. 
+ instrument: counter + unit: "{message}" + annotations: + code_generation: + rust_name: msg_reset + metric_value_type: int + prometheus_name: chronos_messages_reset_total diff --git a/examples/weaver/templates/registry/rust/registry.rs.j2 b/examples/weaver/templates/registry/rust/registry.rs.j2 new file mode 100644 index 0000000..1e707ec --- /dev/null +++ b/examples/weaver/templates/registry/rust/registry.rs.j2 @@ -0,0 +1,44 @@ +// Generated from examples/weaver/registry/chronos/metrics.yaml by OpenTelemetry Weaver. +// Do not edit by hand. + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub enum MetricId { +{%- for metric in ctx.metrics %} + {{ metric.rust_name | pascal_case }}, +{%- endfor %} +} + +#[derive(Clone, Copy, Debug)] +pub enum MetricKind { + Counter, + Histogram, +} + +#[derive(Clone, Copy, Debug)] +pub struct MetricDefinition { + pub id: MetricId, + pub otel_name: &'static str, + pub prometheus_name: &'static str, + pub description: &'static str, + pub unit: Option<&'static str>, + pub label_names: &'static [&'static str], + pub kind: MetricKind, + pub buckets: Option<&'static [f64]>, +} + +pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ +{%- for metric in ctx.metrics %} + MetricDefinition { + id: MetricId::{{ metric.rust_name | pascal_case }}, + otel_name: "{{ metric.metric_name }}", + prometheus_name: "{{ metric.prometheus_name }}", + description: "{{ metric.brief }}", + unit: {% if metric.unit %}Some("{{ metric.unit }}"){% else %}None{% endif %}, + label_names: &[{% for attribute in metric.attributes %}"{{ attribute }}"{% if not loop.last %}, {% endif %}{% endfor %}], + kind: MetricKind::{{ metric.instrument | pascal_case }}, + buckets: {% if metric.buckets %}{% if metric.buckets | length > 10 %}Some(&[ + {{ metric.buckets | join(", ") }}, + ]){% else %}Some(&[{{ metric.buckets | join(", ") }}]){% endif %}{% else %}None{% endif %}, + }, +{%- endfor %} +]; diff --git a/examples/weaver/templates/registry/rust/weaver.yaml 
b/examples/weaver/templates/registry/rust/weaver.yaml new file mode 100644 index 0000000..8351daf --- /dev/null +++ b/examples/weaver/templates/registry/rust/weaver.yaml @@ -0,0 +1,20 @@ +templates: + - pattern: registry.rs.j2 + filter: > + { + metrics: (.groups + | map(select(.type == "metric")) + | map({ + id, + metric_name, + rust_name: .annotations.code_generation.rust_name, + prometheus_name: .annotations.code_generation.prometheus_name, + brief, + instrument, + unit, + attributes: (.attributes // [] | map(.name // .id // .ref)), + buckets: .annotations.code_generation.buckets + })) + } + application_mode: single + file_name: chronos_metric_definitions.rs From 4c89b2840f56aeebf163baaf0a4951c7c2b1e61f Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Thu, 30 Apr 2026 21:33:43 +1000 Subject: [PATCH 12/36] feat(metrics): add weaver live-check workflow Keep the Prometheus/OTLP metrics mock running until interrupted and record counter/histogram samples on every cycle. Use OpenTelemetry environment variables and messaging semantic convention names for emitted metric attributes, with Prometheus-safe rendered names where required. Add Docker-backed Weaver Make recipes for registry checks, Rust generation, markdown generation, JSON schema generation, and live-check. Check in the generated markdown and resolved-registry schema outputs. 
Verification: - cargo check --package prom_otlp_mock_runner - cargo fmt --package prom_otlp_mock_runner -- --check - rustfmt --edition 2021 --check examples/prom_otlp_mock.rs examples/weaver/generated/chronos_metric_definitions.rs - python3 -c 'import yaml; yaml.safe_load(open("examples/weaver/registry/chronos/metrics.yaml")); yaml.safe_load(open("examples/weaver/templates/registry/rust/weaver.yaml")); yaml.safe_load(open("examples/weaver/templates/registry/markdown/weaver.yaml")); print("yaml ok")' - make weaver.generate - make weaver.check - make weaver.live-check - Prometheus mock smoke test on http://127.0.0.1:19092/metrics - git diff --cached --check Model-version: GPT-5 --- Cargo.toml | 3 + Makefile | 71 +- docs/weaver-metrics-proposal.md | 21 +- examples/prom_otlp_mock.rs | 160 +- .../generated/chronos_metric_definitions.rs | 25 +- examples/weaver/generated/chronos_metrics.md | 12 + .../generated/resolved-registry.schema.json | 1830 +++++++++++++++++ examples/weaver/registry/chronos/metrics.yaml | 58 +- .../templates/registry/markdown/metrics.md.j2 | 9 + .../templates/registry/markdown/weaver.yaml | 17 + .../templates/registry/rust/registry.rs.j2 | 2 + .../templates/registry/rust/weaver.yaml | 1 + 12 files changed, 2147 insertions(+), 62 deletions(-) create mode 100644 examples/weaver/generated/chronos_metrics.md create mode 100644 examples/weaver/generated/resolved-registry.schema.json create mode 100644 examples/weaver/templates/registry/markdown/metrics.md.j2 create mode 100644 examples/weaver/templates/registry/markdown/weaver.yaml diff --git a/Cargo.toml b/Cargo.toml index 341386e..d327280 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,9 @@ members = [ # migrations binary "pg_mig" ] +exclude = [ + "examples/weaver" +] [workspace.dependencies] diff --git a/Makefile b/Makefile index 5f56f83..e281e83 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,14 @@ SHELL:=/bin/bash RUST_VERSION := $(shell grep 'channel' rust-toolchain.toml | sed 
's/.*"\(.*\)"/\1/') EXPORTER ?= prom LGTM_IMAGE ?= grafana/otel-lgtm:0.24.1 +WEAVER_VERSION ?= 0.23.0 +WEAVER_IMAGE ?= otel/weaver:v$(WEAVER_VERSION) +WEAVER_REGISTRY ?= examples/weaver/registry +WEAVER_TEMPLATES ?= examples/weaver/templates +WEAVER_GENERATED_DIR ?= examples/weaver/generated +WEAVER_LIVE_CHECK_PORT ?= 4319 +WEAVER_LIVE_CHECK_ADMIN_PORT ?= 4320 +WEAVER_LIVE_CHECK_OUT ?= /tmp/chronos-weaver-live-check # pp - pretty print function yellow := $(shell tput setaf 3) @@ -121,11 +129,70 @@ metrics.check: metrics.mock: $(call pp,run metrics mock example with exporter $(EXPORTER)...) @case "$(EXPORTER)" in \ - prom|prometheus) OTEL_METRICS_EXPORTER=prometheus cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ - otlp) OTEL_SERVICE_NAME=chronos-metrics-mock OTEL_RESOURCE_ATTRIBUTES=service.instance.id=chronos-metrics-mock-local OTEL_METRICS_EXPORTER=otlp OTEL_EXPORTER_OTLP_PROTOCOL=grpc OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=http://127.0.0.1:4317 OTEL_METRIC_EXPORT_INTERVAL=1000 cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ + prom|prometheus) OTEL_METRICS_EXPORTER=prometheus OTEL_EXPORTER_PROMETHEUS_HOST=$${OTEL_EXPORTER_PROMETHEUS_HOST:-127.0.0.1} OTEL_EXPORTER_PROMETHEUS_PORT=$${OTEL_EXPORTER_PROMETHEUS_PORT:-9092} cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ + otlp) OTEL_SERVICE_NAME=chronos-metrics-mock OTEL_RESOURCE_ATTRIBUTES=service.instance.id=chronos-metrics-mock-local OTEL_METRICS_EXPORTER=otlp OTEL_EXPORTER_OTLP_PROTOCOL=grpc OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=$${OTEL_EXPORTER_OTLP_METRICS_ENDPOINT:-http://127.0.0.1:4317} OTEL_METRIC_EXPORT_INTERVAL=$${OTEL_METRIC_EXPORT_INTERVAL:-1000} cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ *) echo "unsupported EXPORTER=$(EXPORTER); use EXPORTER=prom or EXPORTER=otlp" >&2; exit 2 ;; \ esac +## weaver.check: 🔍 Validate the Chronos Weaver registry +weaver.check: + $(call pp,check Weaver registry with $(WEAVER_IMAGE)...) 
+ docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry check -r $(WEAVER_REGISTRY) + +## weaver.generate.rust: 🧵 Generate Rust metric definitions with Weaver +weaver.generate.rust: + $(call pp,generate Rust metric definitions with $(WEAVER_IMAGE)...) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_REGISTRY) --templates $(WEAVER_TEMPLATES) rust $(WEAVER_GENERATED_DIR) + rustfmt --config-path rustfmt.toml $(WEAVER_GENERATED_DIR)/chronos_metric_definitions.rs + +## weaver.generate.markdown: 🧵 Generate Chronos metrics markdown docs with Weaver +weaver.generate.markdown: + $(call pp,generate metrics markdown docs with $(WEAVER_IMAGE)...) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_REGISTRY) --templates $(WEAVER_TEMPLATES) markdown $(WEAVER_GENERATED_DIR) + +## weaver.generate.json-schema: 🧵 Generate Weaver resolved-registry JSON schema +weaver.generate.json-schema: + $(call pp,generate Weaver JSON schema with $(WEAVER_IMAGE)...) + mkdir -p $(WEAVER_GENERATED_DIR) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry json-schema -o $(WEAVER_GENERATED_DIR)/resolved-registry.schema.json + +## weaver.generate: 🧵 Generate all Weaver artifacts +weaver.generate: weaver.generate.rust weaver.generate.markdown weaver.generate.json-schema + +## weaver.live-check: 🔍 Run Weaver live-check against the OTLP metrics mock +weaver.live-check: + $(call pp,run Weaver live-check against metrics mock...) 
+ @set -euo pipefail; \ + cargo build --package prom_otlp_mock_runner; \ + rm -rf "$(WEAVER_LIVE_CHECK_OUT)"; \ + mkdir -p "$(WEAVER_LIVE_CHECK_OUT)"; \ + docker run --rm --network host \ + -v "$(PWD):/work" \ + -v "$(WEAVER_LIVE_CHECK_OUT):/out" \ + -w /work \ + $(WEAVER_IMAGE) registry live-check \ + -r $(WEAVER_REGISTRY) \ + --input-source otlp \ + --otlp-grpc-address 127.0.0.1 \ + --otlp-grpc-port $(WEAVER_LIVE_CHECK_PORT) \ + --admin-port $(WEAVER_LIVE_CHECK_ADMIN_PORT) \ + --inactivity-timeout 5 \ + --no-stream \ + --format json \ + -o /out & \ + live_check_pid=$$!; \ + trap 'kill "$$live_check_pid" 2>/dev/null || true' EXIT; \ + sleep 2; \ + OTEL_SERVICE_NAME=chronos-metrics-mock \ + OTEL_RESOURCE_ATTRIBUTES=service.instance.id=chronos-metrics-mock-live-check \ + OTEL_METRICS_EXPORTER=otlp \ + OTEL_EXPORTER_OTLP_PROTOCOL=grpc \ + OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=http://127.0.0.1:$(WEAVER_LIVE_CHECK_PORT) \ + OTEL_METRIC_EXPORT_INTERVAL=500 \ + timeout -s INT 10 cargo run --quiet --package prom_otlp_mock_runner --bin prom_otlp_mock || test "$$?" -eq 124; \ + wait "$$live_check_pid"; \ + find "$(WEAVER_LIVE_CHECK_OUT)" -maxdepth 1 -type f -print + ## lgtm.validate: 🔍 Validate LGTM Prometheus and OpenTelemetry Collector configs lgtm.validate: $(call pp,validate LGTM Prometheus config with $(LGTM_IMAGE)...) diff --git a/docs/weaver-metrics-proposal.md b/docs/weaver-metrics-proposal.md index 023309f..4d5524f 100644 --- a/docs/weaver-metrics-proposal.md +++ b/docs/weaver-metrics-proposal.md @@ -8,14 +8,14 @@ The example registry is in `examples/weaver/registry/chronos/metrics.yaml`. 
It d | OpenTelemetry metric | Prometheus output name | Instrument | | --- | --- | --- | -| `chronos.message.consumed` | `chronos_messages_consumed_total` | counter | -| `chronos.message.consume.duration` | `chronos_message_consume_duration_seconds` | histogram | -| `chronos.message.process.duration` | `chronos_message_process_duration_seconds` | histogram | +| `messaging.client.consumed.messages` | `messaging_client_consumed_messages_total` | counter | +| `messaging.client.operation.duration` | `messaging_client_operation_duration_seconds` | histogram | +| `messaging.process.duration` | `messaging_process_duration_seconds` | histogram | | `chronos.message.wait.duration` | `chronos_message_wait_duration_seconds` | histogram | | `chronos.message.jitter` | `chronos_message_jitter_seconds` | histogram | | `chronos.message.reset` | `chronos_messages_reset_total` | counter | -The checked-in generated example is `examples/weaver/generated/chronos_metric_definitions.rs`. It mirrors the `MetricDefinition` table in `examples/prom_otlp_mock.rs`, with both `otel_name` and `prometheus_name` so each exporter can use the native naming convention it expects. +The checked-in generated example is `examples/weaver/generated/chronos_metric_definitions.rs`. It mirrors the `MetricDefinition` table in `examples/prom_otlp_mock.rs`, with both `otel_name` and `prometheus_name` so each exporter can use the native naming convention it expects. The messaging metrics and attributes use OpenTelemetry semantic convention names; Chronos-specific timing and recovery metrics remain under the `chronos.*` namespace. ## Suggested Workflow @@ -39,7 +39,18 @@ docker run --rm \ rustfmt chronos_bin/src/metrics/generated/chronos_metric_definitions.rs ``` -Add a `make metrics.generate` target for the `registry generate` command and a `make metrics.check` target that runs `weaver registry check` plus a diff check that generated files are current. 
The pre-commit script can then call `make metrics.check` once Weaver is a documented development dependency. +The repository now has Make targets for the main Weaver workflows: + +```sh +make weaver.check +make weaver.generate +make weaver.generate.rust +make weaver.generate.markdown +make weaver.generate.json-schema +make weaver.live-check +``` + +`make weaver.live-check` starts Weaver's OTLP live-check receiver with Docker, runs the mock with `OTEL_METRICS_EXPORTER=otlp`, and writes the report to `/tmp/chronos-weaver-live-check/live_check.json`. ## Implementation Path diff --git a/examples/prom_otlp_mock.rs b/examples/prom_otlp_mock.rs index 9d91faa..5aa53ed 100644 --- a/examples/prom_otlp_mock.rs +++ b/examples/prom_otlp_mock.rs @@ -15,6 +15,8 @@ use std::collections::HashMap; use std::env; +use std::sync::Arc; +use std::time::Duration; use opentelemetry::global; use opentelemetry::metrics::{Counter as OtlpCounter, Histogram as OtlpHistogram, Unit}; @@ -27,6 +29,9 @@ const OTEL_EXPORTER_OTLP_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_ENDPOINT"; const OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"; const OTEL_EXPORTER_OTLP_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_PROTOCOL"; const OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"; +const OTEL_METRIC_EXPORT_INTERVAL: &str = "OTEL_METRIC_EXPORT_INTERVAL"; +const OTEL_EXPORTER_PROMETHEUS_HOST: &str = "OTEL_EXPORTER_PROMETHEUS_HOST"; +const OTEL_EXPORTER_PROMETHEUS_PORT: &str = "OTEL_EXPORTER_PROMETHEUS_PORT"; #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] enum MetricId { @@ -43,28 +48,34 @@ enum MetricKind { #[derive(Clone, Copy, Debug)] struct MetricDefinition { id: MetricId, - name: &'static str, + otel_name: &'static str, + prometheus_name: &'static str, description: &'static str, unit: Option<&'static str>, - label_names: &'static [&'static str], + attribute_names: &'static [&'static str], + prometheus_label_names: &'static [&'static str], kind: 
MetricKind, } const METRIC_DEFINITIONS: &[MetricDefinition] = &[ MetricDefinition { id: MetricId::MsgConsumed, - name: "chronos_messages_consumed_total", + otel_name: "messaging.client.consumed.messages", + prometheus_name: "messaging_client_consumed_messages", description: "Total number of Chronos input messages consumed", - unit: Some("1"), - label_names: &["destination", "status"], + unit: Some("{message}"), + attribute_names: &["messaging.system", "messaging.operation.name", "messaging.destination.name"], + prometheus_label_names: &["messaging_system", "messaging_operation_name", "messaging_destination_name"], kind: MetricKind::Counter, }, MetricDefinition { id: MetricId::MsgConsumeLatency, - name: "chronos_message_consume_latency_seconds", + otel_name: "messaging.process.duration", + prometheus_name: "messaging_process_duration_seconds", description: "Time spent handling a consumed Chronos message", unit: Some("s"), - label_names: &["destination", "status"], + attribute_names: &["messaging.system", "messaging.operation.name", "messaging.destination.name"], + prometheus_label_names: &["messaging_system", "messaging_operation_name", "messaging_destination_name"], kind: MetricKind::Histogram, }, ]; @@ -90,22 +101,38 @@ impl ChronosMetrics { Ok(Self { backend }) } - fn message_consumed(&self, destination: &'static str, status: &'static str) { + fn message_consumed(&self, destination: &'static str) { self.backend.inc_counter( MetricId::MsgConsumed, 1, - &[("destination", destination.to_string()), ("status", status.to_string())], + &[ + ("messaging.system", "kafka".to_string()), + ("messaging.operation.name", "receive".to_string()), + ("messaging.destination.name", destination.to_string()), + ], ); } - fn consume_latency(&self, seconds: f64, destination: &'static str, status: &'static str) { + fn consume_latency(&self, seconds: f64, destination: &'static str) { self.backend.observe_histogram( MetricId::MsgConsumeLatency, seconds, - &[("destination", 
destination.to_string()), ("status", status.to_string())], + &[ + ("messaging.system", "kafka".to_string()), + ("messaging.operation.name", "process".to_string()), + ("messaging.destination.name", destination.to_string()), + ], ); } + fn record_cycle(&self, cycle: u64) { + let destination = if cycle % 2 == 0 { "chronos-input" } else { "chronos-retry" }; + let latency_seconds = 0.005 + ((cycle % 20) as f64 * 0.0025); + + self.message_consumed(destination); + self.consume_latency(latency_seconds, destination); + } + fn prometheus_text(&self) -> Option { self.backend.render_prometheus() } @@ -115,6 +142,33 @@ impl ChronosMetrics { } } +struct MockRuntimeConfig { + interval: Duration, + prometheus_host: String, + prometheus_port: u16, +} + +impl MockRuntimeConfig { + fn from_env() -> Result> { + Ok(Self { + interval: env_duration_ms(OTEL_METRIC_EXPORT_INTERVAL, 1_000)?, + prometheus_host: env::var(OTEL_EXPORTER_PROMETHEUS_HOST).unwrap_or_else(|_| "127.0.0.1".to_string()), + prometheus_port: env::var(OTEL_EXPORTER_PROMETHEUS_PORT) + .unwrap_or_else(|_| "9092".to_string()) + .parse() + .map_err(|err| format!("invalid {OTEL_EXPORTER_PROMETHEUS_PORT}: {err}"))?, + }) + } +} + +fn env_duration_ms(name: &'static str, default_ms: u64) -> Result> { + let millis = env::var(name) + .unwrap_or_else(|_| default_ms.to_string()) + .parse() + .map_err(|err| format!("invalid {name}: {err}"))?; + Ok(Duration::from_millis(millis)) +} + #[derive(Clone, Copy, Debug, Eq, PartialEq)] enum MetricsExporter { Prometheus, @@ -162,12 +216,15 @@ impl PrometheusMetricsBackend { for definition in METRIC_DEFINITIONS { match definition.kind { MetricKind::Counter => { - let metric = PromCounterVec::new(opts!(definition.name, definition.description), definition.label_names)?; + let metric = PromCounterVec::new(opts!(definition.prometheus_name, definition.description), definition.prometheus_label_names)?; registry.register(Box::new(metric.clone()))?; counters.insert(definition.id, metric); } 
MetricKind::Histogram => { - let metric = PromHistogramVec::new(histogram_opts!(definition.name, definition.description), definition.label_names)?; + let metric = PromHistogramVec::new( + histogram_opts!(definition.prometheus_name, definition.description), + definition.prometheus_label_names, + )?; registry.register(Box::new(metric.clone()))?; histograms.insert(definition.id, metric); } @@ -239,14 +296,14 @@ impl OtlpMetricsBackend { for definition in METRIC_DEFINITIONS { match definition.kind { MetricKind::Counter => { - let mut builder = meter.u64_counter(definition.name).with_description(definition.description); + let mut builder = meter.u64_counter(definition.otel_name).with_description(definition.description); if let Some(unit) = definition.unit { builder = builder.with_unit(Unit::new(unit)); } counters.insert(definition.id, builder.init()); } MetricKind::Histogram => { - let mut builder = meter.f64_histogram(definition.name).with_description(definition.description); + let mut builder = meter.f64_histogram(definition.otel_name).with_description(definition.description); if let Some(unit) = definition.unit { builder = builder.with_unit(Unit::new(unit)); } @@ -300,7 +357,7 @@ fn prometheus_label_values<'a>(id: MetricId, labels: &'a [(&'static str, String) }; definition - .label_names + .attribute_names .iter() .map(|name| { labels @@ -312,21 +369,80 @@ fn prometheus_label_values<'a>(id: MetricId, labels: &'a [(&'static str, String) .collect() } +async fn spawn_prometheus_server( + metrics: Arc, + host: String, + port: u16, +) -> Result, Box> { + use tokio::io::{AsyncReadExt, AsyncWriteExt}; + + let listener = tokio::net::TcpListener::bind(format!("{host}:{port}")).await?; + eprintln!("Prometheus metrics mock listening on http://{host}:{port}/metrics"); + + Ok(tokio::spawn(async move { + loop { + let Ok((mut stream, _)) = listener.accept().await else { + continue; + }; + let metrics = Arc::clone(&metrics); + tokio::spawn(async move { + let mut request = [0_u8; 
1024]; + let bytes_read = stream.read(&mut request).await.unwrap_or(0); + let request_line = String::from_utf8_lossy(&request[..bytes_read]); + let (status, body) = if request_line.starts_with("GET /metrics ") { + ("200 OK", metrics.prometheus_text().unwrap_or_default()) + } else { + ("404 Not Found", "not found\n".to_string()) + }; + let response = format!( + "HTTP/1.1 {status}\r\ncontent-type: text/plain; version=0.0.4; charset=utf-8\r\ncontent-length: {}\r\nconnection: close\r\n\r\n{body}", + body.len() + ); + let _ = stream.write_all(response.as_bytes()).await; + }); + } + })) +} + +async fn run_workload(metrics: Arc, config: &MockRuntimeConfig) { + let mut cycle = 0_u64; + loop { + cycle += 1; + metrics.record_cycle(cycle); + + tokio::time::sleep(config.interval).await; + } +} + #[tokio::main] async fn main() -> Result<(), Box> { - let metrics = ChronosMetrics::from_env()?; + let exporter = MetricsExporter::from_env()?; + let config = MockRuntimeConfig::from_env()?; + let metrics = Arc::new(ChronosMetrics::from_env()?); - metrics.message_consumed("postgres", "pass"); - metrics.consume_latency(0.042, "postgres", "pass"); + let prometheus_server = if exporter == MetricsExporter::Prometheus { + let metrics_for_server = Arc::clone(&metrics); + Some(spawn_prometheus_server(metrics_for_server, config.prometheus_host.clone(), config.prometheus_port).await?) + } else { + None + }; - if let Some(text) = metrics.prometheus_text() { - println!("{text}"); + eprintln!("Metrics mock running until interrupted"); + + tokio::select! { + _ = run_workload(Arc::clone(&metrics), &config) => {} + result = tokio::signal::ctrl_c() => { + result?; + } } - if MetricsExporter::from_env()? 
== MetricsExporter::Otlp { + if exporter == MetricsExporter::Otlp { tokio::time::sleep(std::time::Duration::from_secs(2)).await; } metrics.shutdown(); + if let Some(server) = prometheus_server { + server.abort(); + } Ok(()) } diff --git a/examples/weaver/generated/chronos_metric_definitions.rs b/examples/weaver/generated/chronos_metric_definitions.rs index d6e0847..38ed8c8 100644 --- a/examples/weaver/generated/chronos_metric_definitions.rs +++ b/examples/weaver/generated/chronos_metric_definitions.rs @@ -25,6 +25,7 @@ pub struct MetricDefinition { pub description: &'static str, pub unit: Option<&'static str>, pub label_names: &'static [&'static str], + pub prometheus_label_names: &'static [&'static str], pub kind: MetricKind, pub buckets: Option<&'static [f64]>, } @@ -32,21 +33,23 @@ pub struct MetricDefinition { pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ MetricDefinition { id: MetricId::MsgConsumeLatency, - otel_name: "chronos.message.consume.duration", - prometheus_name: "chronos_message_consume_duration_seconds", + otel_name: "messaging.client.operation.duration", + prometheus_name: "messaging_client_operation_duration_seconds", description: "Duration of handle_message() in message_receiver.", unit: Some("s"), - label_names: &["chronos.message.destination", "chronos.operation.status"], + label_names: &["messaging.destination.name", "messaging.operation.name", "messaging.system"], + prometheus_label_names: &["messaging_destination_name", "messaging_operation_name", "messaging_system"], kind: MetricKind::Histogram, buckets: Some(&[0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048]), }, MetricDefinition { id: MetricId::MsgConsumed, - otel_name: "chronos.message.consumed", - prometheus_name: "chronos_messages_consumed_total", + otel_name: "messaging.client.consumed.messages", + prometheus_name: "messaging_client_consumed_messages_total", description: "Total number of Chronos input messages consumed.", unit: 
Some("{message}"), - label_names: &["chronos.message.destination", "chronos.operation.status"], + label_names: &["messaging.destination.name", "messaging.operation.name", "messaging.system"], + prometheus_label_names: &["messaging_destination_name", "messaging_operation_name", "messaging_system"], kind: MetricKind::Counter, buckets: None, }, @@ -57,16 +60,18 @@ pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ description: "Difference between actual publish time and client-requested deadline.", unit: Some("s"), label_names: &[], + prometheus_label_names: &[], kind: MetricKind::Histogram, buckets: Some(&[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]), }, MetricDefinition { id: MetricId::MsgProcessLatency, - otel_name: "chronos.message.process.duration", - prometheus_name: "chronos_message_process_duration_seconds", + otel_name: "messaging.process.duration", + prometheus_name: "messaging_process_duration_seconds", description: "Duration of processor_message_ready() loop in message_processor.", unit: Some("s"), - label_names: &["chronos.operation.status", "chronos.processor.returned"], + label_names: &["messaging.destination.name", "messaging.operation.name", "messaging.system"], + prometheus_label_names: &["messaging_destination_name", "messaging_operation_name", "messaging_system"], kind: MetricKind::Histogram, buckets: Some(&[0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048]), }, @@ -77,6 +82,7 @@ pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ description: "Number of records reset by reset_to_init_db() in the monitor task.", unit: Some("{message}"), label_names: &[], + prometheus_label_names: &[], kind: MetricKind::Counter, buckets: None, }, @@ -87,6 +93,7 @@ pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ description: "Time a message spent in the Kafka input queue before processing.", unit: Some("s"), label_names: &[], + prometheus_label_names: &[], kind: MetricKind::Histogram, buckets: Some(&[0.1, 0.2, 0.4, 
0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2]), }, diff --git a/examples/weaver/generated/chronos_metrics.md b/examples/weaver/generated/chronos_metrics.md new file mode 100644 index 0000000..81797f4 --- /dev/null +++ b/examples/weaver/generated/chronos_metrics.md @@ -0,0 +1,12 @@ +# Chronos Metrics + +Generated from `examples/weaver/registry/chronos/metrics.yaml` by OpenTelemetry Weaver. + +| Metric | Prometheus Name | Instrument | Unit | Attributes | Description | +| --- | --- | --- | --- | --- | --- | +| `messaging.client.operation.duration` | `messaging_client_operation_duration_seconds` | `histogram` | `s` | `messaging.destination.name`, `messaging.operation.name`, `messaging.system` | Duration of handle_message() in message_receiver. | +| `messaging.client.consumed.messages` | `messaging_client_consumed_messages_total` | `counter` | `{message}` | `messaging.destination.name`, `messaging.operation.name`, `messaging.system` | Total number of Chronos input messages consumed. | +| `chronos.message.jitter` | `chronos_message_jitter_seconds` | `histogram` | `s` | - | Difference between actual publish time and client-requested deadline. | +| `messaging.process.duration` | `messaging_process_duration_seconds` | `histogram` | `s` | `messaging.destination.name`, `messaging.operation.name`, `messaging.system` | Duration of processor_message_ready() loop in message_processor. | +| `chronos.message.reset` | `chronos_messages_reset_total` | `counter` | `{message}` | - | Number of records reset by reset_to_init_db() in the monitor task. | +| `chronos.message.wait.duration` | `chronos_message_wait_duration_seconds` | `histogram` | `s` | - | Time a message spent in the Kafka input queue before processing. 
| diff --git a/examples/weaver/generated/resolved-registry.schema.json b/examples/weaver/generated/resolved-registry.schema.json new file mode 100644 index 0000000..081f975 --- /dev/null +++ b/examples/weaver/generated/resolved-registry.schema.json @@ -0,0 +1,1830 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ResolvedRegistry", + "description": "A resolved semantic convention registry used in the context of the template and policy\nengines.", + "type": "object", + "properties": { + "groups": { + "description": "A list of semantic convention groups.", + "type": "array", + "items": { + "$ref": "#/$defs/ResolvedGroup" + } + }, + "registry_url": { + "description": "The semantic convention registry url.", + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "registry_url", + "groups" + ], + "$defs": { + "AnyValueSpec": { + "description": "The AnyValueTypeSpec is a specification of a value that can be of any type.", + "oneOf": [ + { + "description": "A boolean attribute.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "boolean" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "A integer attribute (signed 64 bit integer).", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "int" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "A double attribute (double precision floating point (IEEE 754-1985)).", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "double" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "A string attribute.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "string" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "An array of strings attribute.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "string[]" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "An array of integer attribute.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "int[]" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "An array of double attribute.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "double[]" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "An array of boolean attribute.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "boolean[]" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "The value type is a map of key, value pairs", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "fields": { + "description": "The collection of key, values where the value is an `AnyValueSpec`", + "type": "array", + "items": { + "$ref": "#/$defs/AnyValueSpec" + } + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "map" + } + }, + "required": [ + "type", + "id", + "requirement_level", + "fields" + ] + }, + { + "description": "The value type is a map of key, value pairs", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "fields": { + "description": "The collection of key, values where the value is an `AnyValueSpec`", + "type": "array", + "items": { + "$ref": "#/$defs/AnyValueSpec" + } + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "map[]" + } + }, + "required": [ + "type", + "id", + "requirement_level", + "fields" + ] + }, + { + "description": "The value type will just be a bytes.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "bytes" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "The value type is not specified.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "undefined" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "An enum definition type.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "members": { + "description": "List of enum entries.", + "type": "array", + "items": { + "$ref": "#/$defs/EnumEntriesSpec" + } + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "enum" + } + }, + "required": [ + "type", + "id", + "requirement_level", + "members" + ] + } + ] + }, + "Attribute": { + "description": "An attribute definition.", + "type": "object", + "properties": { + "annotations": { + "description": "Annotations for the group.", + "type": [ + "object", + "null" + ], + "additionalProperties": { + "$ref": "#/$defs/YamlValue" + } + }, + "brief": { + "description": "A brief description of the attribute.", + "type": "string" + }, + "deprecated": { + "description": "Specifies if the attribute is deprecated.", + "anyOf": [ + { + "$ref": "#/$defs/Deprecated" + }, + { + "type": "null" + } + ] + }, + "examples": { + "description": "Sequence of example values for the attribute or single example\nvalue. They are required only for string and string array\nattributes. Example values must be of the same type of the\nattribute. If only a single example is provided, it can directly\nbe reported without encapsulating it into a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "name": { + "description": "Attribute name.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the attribute.\nIt defaults to an empty string.", + "type": "string" + }, + "prefix": { + "description": "Specifies the prefix of the attribute.\nIf this parameter is set, the resolved id of the referenced attribute will\nhave group prefix added to it.\nIt defaults to false.", + "type": "boolean" + }, + "requirement_level": { + "description": "Specifies if the attribute is mandatory. 
Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe attribute is \"recommended\". When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the attribute is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "role": { + "description": "Whether the attribute is identifying or descriptive.", + "anyOf": [ + { + "$ref": "#/$defs/AttributeRole" + }, + { + "type": "null" + } + ] + }, + "sampling_relevant": { + "description": "Specifies if the attribute is (especially) relevant for sampling\nand thus should be set at span start. It defaults to false.\nNote: this field is experimental.", + "type": [ + "boolean", + "null" + ] + }, + "stability": { + "description": "Specifies the stability of the attribute.\nNote that, if stability is missing but deprecated is present, it will\nautomatically set the stability to deprecated. If deprecated is\npresent and stability differs from deprecated, this will result in an\nerror.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "tag": { + "description": "Associates a tag (\"sub-group\") to the attribute. It carries no\nparticular semantic meaning but can be used e.g. 
for filtering\nin the markdown generator.", + "type": [ + "string", + "null" + ] + }, + "tags": { + "description": "A set of tags for the attribute.", + "anyOf": [ + { + "$ref": "#/$defs/Tags" + }, + { + "type": "null" + } + ] + }, + "type": { + "description": "Either a string literal denoting the type as a primitive or an\narray type, a template type or an enum definition.", + "$ref": "#/$defs/AttributeType" + }, + "value": { + "description": "The value of the attribute.\nNote: This is only used in a telemetry schema specification.", + "anyOf": [ + { + "$ref": "#/$defs/Value" + }, + { + "type": "null" + } + ] + } + }, + "additionalProperties": false, + "required": [ + "name", + "type", + "brief", + "requirement_level" + ] + }, + "AttributeLineage": { + "description": "Attribute lineage (at the field level).", + "type": "object", + "properties": { + "inherited_fields": { + "description": "A list of fields that are inherited from the source group.", + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "locally_overridden_fields": { + "description": "A list of fields that are overridden in the local group.", + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "source_group": { + "description": "The group id where the attribute is coming from.", + "type": "string" + } + }, + "required": [ + "source_group" + ] + }, + "AttributeRole": { + "description": "The different roles for attributes in groups.", + "oneOf": [ + { + "description": "The attribute is considered identifying for the signal it is associated with.", + "type": "string", + "const": "identifying" + }, + { + "description": "The attribute is considered descriptive for the signal it is associated with.", + "type": "string", + "const": "descriptive" + } + ] + }, + "AttributeType": { + "description": "The different types of attributes (specification).", + "anyOf": [ + { + "description": "Primitive or array type.", + "$ref": 
"#/$defs/PrimitiveOrArrayTypeSpec" + }, + { + "description": "A template type.", + "$ref": "#/$defs/TemplateTypeSpec" + }, + { + "description": "An enum definition type.", + "type": "object", + "properties": { + "members": { + "description": "List of enum entries.", + "type": "array", + "items": { + "$ref": "#/$defs/EnumEntriesSpec" + } + } + }, + "required": [ + "members" + ] + } + ] + }, + "BasicRequirementLevelSpec": { + "description": "The different types of basic requirement levels.", + "oneOf": [ + { + "description": "A required requirement level.", + "type": "string", + "const": "required" + }, + { + "description": "An optional requirement level.", + "type": "string", + "const": "recommended" + }, + { + "description": "An opt-in requirement level.", + "type": "string", + "const": "opt_in" + } + ] + }, + "Deprecated": { + "description": "The different ways to deprecate an attribute, a metric, ...", + "oneOf": [ + { + "description": "The telemetry object containing the deprecated field has been renamed to an\nexisting or a new telemetry object.", + "type": "object", + "properties": { + "note": { + "description": "The note to provide more context about the deprecation.", + "type": "string" + }, + "reason": { + "type": "string", + "const": "renamed" + }, + "renamed_to": { + "description": "The new name of the telemetry object.", + "type": "string" + } + }, + "required": [ + "reason", + "renamed_to", + "note" + ] + }, + { + "description": "The telemetry object containing the deprecated field has been obsoleted\nbecause it no longer exists and has no valid replacement.\n\nThe `brief` field should contain the reason why the field has been obsoleted.", + "type": "object", + "properties": { + "note": { + "description": "The note to provide more context about the deprecation.", + "type": "string" + }, + "reason": { + "type": "string", + "const": "obsoleted" + } + }, + "required": [ + "reason", + "note" + ] + }, + { + "description": "The telemetry object containing the 
deprecated field has been deprecated for\ncomplex reasons (split, merge, ...) which are currently not precisely defined\nin the supported deprecation reasons.\n\nThe `brief` field should contain the reason for this uncategorized deprecation.", + "type": "object", + "properties": { + "note": { + "description": "The note to provide more context about the deprecation.", + "type": "string" + }, + "reason": { + "type": "string", + "const": "uncategorized" + } + }, + "required": [ + "reason", + "note" + ] + }, + { + "description": "This variant is used to capture old, unstructured deprecated \"string\".\nUsed for backward-compatibility only.", + "type": "object", + "properties": { + "note": { + "description": "The note to provide more context about the deprecation.", + "type": "string" + }, + "reason": { + "type": "string", + "const": "unspecified" + } + }, + "required": [ + "reason", + "note" + ] + } + ] + }, + "EnumEntriesSpec": { + "description": "Possible enum entries.", + "type": "object", + "properties": { + "annotations": { + "description": "Annotations for the member.", + "type": [ + "object", + "null" + ], + "additionalProperties": { + "$ref": "#/$defs/YamlValue" + } + }, + "brief": { + "description": "Brief description of the enum entry value.\nIt defaults to the value of id.", + "type": [ + "string", + "null" + ] + }, + "deprecated": { + "description": "Deprecation note.", + "anyOf": [ + { + "$ref": "#/$defs/Deprecated" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "Longer description.\nIt defaults to an empty string.", + "type": [ + "string", + "null" + ] + }, + "stability": { + "description": "Stability of this enum value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "value": { + "description": "String, int, or boolean; value of the enum entry.", + "$ref": "#/$defs/ValueSpec" + } + }, + 
"additionalProperties": false, + "required": [ + "id", + "value" + ] + }, + "Examples": { + "description": "The different types of examples.", + "anyOf": [ + { + "description": "A boolean example.", + "type": "boolean" + }, + { + "description": "A integer example.", + "type": "integer", + "format": "int64" + }, + { + "description": "A double example.", + "type": "number", + "format": "double" + }, + { + "description": "A string example.", + "type": "string" + }, + { + "description": "A any example.", + "$ref": "#/$defs/ValueSpec" + }, + { + "description": "A array of integers example.", + "type": "array", + "items": { + "type": "integer", + "format": "int64" + } + }, + { + "description": "A array of doubles example.", + "type": "array", + "items": { + "type": "number", + "format": "double" + } + }, + { + "description": "A array of bools example.", + "type": "array", + "items": { + "type": "boolean" + } + }, + { + "description": "A array of strings example.", + "type": "array", + "items": { + "type": "string" + } + }, + { + "description": "A array of anys example.", + "type": "array", + "items": { + "$ref": "#/$defs/ValueSpec" + } + }, + { + "description": "List of arrays of integers example.", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer", + "format": "int64" + } + } + }, + { + "description": "List of arrays of doubles example.", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "number", + "format": "double" + } + } + }, + { + "description": "List of arrays of bools example.", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "boolean" + } + } + }, + { + "description": "List of arrays of strings example.", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "string" + } + } + } + ] + }, + "GroupLineage": { + "description": "Group lineage.", + "type": "object", + "properties": { + "attributes": { + "description": "The lineage per attribute.\n\nNote: Use a 
BTreeMap to ensure a deterministic order of attributes.\nThis is important to keep unit tests stable.", + "type": "object", + "additionalProperties": { + "$ref": "#/$defs/AttributeLineage" + } + }, + "extends_group": { + "description": "The group that this group extended, if available.", + "type": [ + "string", + "null" + ] + }, + "includes_group": { + "description": "(V2 Only) Attribute groups included in this group.", + "type": "array", + "items": { + "type": "string" + } + }, + "provenance": { + "description": "The provenance of the source file where the group is defined.", + "$ref": "#/$defs/Provenance" + } + }, + "required": [ + "provenance" + ] + }, + "GroupType": { + "description": "The different types of groups: `attribute_group`, `span`, `event`, `metric`, `entity`, `scope`.\n\nNote: The `resource` type is no longer used and is an alias for `entity`.", + "oneOf": [ + { + "description": "Attribute group (attribute_group type) defines a set of attributes that\ncan be declared once and referenced by semantic conventions for\ndifferent signals, for example spans and logs. 
Attribute groups don't\nhave any specific fields and follow the general semconv semantics.", + "type": "string", + "const": "attribute_group" + }, + { + "description": "Span semantic convention.", + "type": "string", + "const": "span" + }, + { + "description": "Event semantic convention.", + "type": "string", + "const": "event" + }, + { + "description": "Metric semantic convention.", + "type": "string", + "const": "metric" + }, + { + "description": "The metric group semconv is a group where related metric attributes can\nbe defined and then referenced from other metric groups using ref.", + "type": "string", + "const": "metric_group" + }, + { + "description": "Entity semantic convention.", + "type": "string", + "const": "entity" + }, + { + "description": "Scope.", + "type": "string", + "const": "scope" + }, + { + "description": "Undefined group type.", + "type": "string", + "const": "undefined" + } + ] + }, + "InstrumentSpec": { + "description": "The type of the metric.", + "oneOf": [ + { + "description": "An up-down counter metric.", + "type": "string", + "const": "updowncounter" + }, + { + "description": "A counter metric.", + "type": "string", + "const": "counter" + }, + { + "description": "A gauge metric.", + "type": "string", + "const": "gauge" + }, + { + "description": "A histogram metric.", + "type": "string", + "const": "histogram" + } + ] + }, + "PrimitiveOrArrayTypeSpec": { + "description": "Primitive or array types.", + "oneOf": [ + { + "description": "A boolean attribute.", + "type": "string", + "const": "boolean" + }, + { + "description": "A integer attribute (signed 64 bit integer).", + "type": "string", + "const": "int" + }, + { + "description": "A double attribute (double precision floating point (IEEE 754-1985)).", + "type": "string", + "const": "double" + }, + { + "description": "A string attribute.", + "type": "string", + "const": "string" + }, + { + "description": "An any type attribute (accepts any valid value).", + "type": "string", + "const": 
"any" + }, + { + "description": "An array of strings attribute.", + "type": "string", + "const": "string[]" + }, + { + "description": "An array of integer attribute.", + "type": "string", + "const": "int[]" + }, + { + "description": "An array of double attribute.", + "type": "string", + "const": "double[]" + }, + { + "description": "An array of boolean attribute.", + "type": "string", + "const": "boolean[]" + } + ] + }, + "Provenance": { + "description": "The provenance a semantic convention specification file.", + "type": "object", + "properties": { + "path": { + "description": "The path to the specification file.\n\nThis is the path is only available *locally*. When publishing resolved schemas,\nthis field is not included.", + "type": "string" + }, + "schema_url": { + "description": "The schema URL where this was specified.\n\nThe Schema url contains the registry id and the version of the schema.\nIt can be used to detect conflicts or resolve multiple \"ids\" existing across\ndependency chains but being the same thing, conceptually.", + "$ref": "#/$defs/SchemaUrl" + } + }, + "required": [ + "schema_url", + "path" + ] + }, + "RequirementLevel": { + "description": "The different requirement level specifications.", + "anyOf": [ + { + "description": "A basic requirement level.", + "$ref": "#/$defs/BasicRequirementLevelSpec" + }, + { + "description": "A conditional requirement level.", + "type": "object", + "properties": { + "conditionally_required": { + "description": "The description of the condition.", + "type": "string" + } + }, + "required": [ + "conditionally_required" + ] + }, + { + "description": "A recommended requirement level.", + "type": "object", + "properties": { + "recommended": { + "description": "The description of the recommendation.", + "type": "string" + } + }, + "required": [ + "recommended" + ] + }, + { + "description": "An opt in requirement level.", + "type": "object", + "properties": { + "opt_in": { + "description": "The description of the 
recommendation.", + "type": "string" + } + }, + "required": [ + "opt_in" + ] + } + ] + }, + "ResolvedGroup": { + "description": "Resolved group specification used in the context of the template engine.", + "type": "object", + "properties": { + "annotations": { + "description": "Annotations for the group.", + "type": [ + "object", + "null" + ], + "additionalProperties": { + "$ref": "#/$defs/YamlValue" + } + }, + "attributes": { + "description": "List of attributes that belong to the semantic convention.", + "type": "array", + "items": { + "$ref": "#/$defs/Attribute" + } + }, + "body": { + "description": "The body specification used for event semantic conventions.", + "anyOf": [ + { + "$ref": "#/$defs/AnyValueSpec" + }, + { + "type": "null" + } + ] + }, + "brief": { + "description": "A brief description of the semantic convention.", + "type": "string" + }, + "deprecated": { + "description": "Specifies if the semantic convention is deprecated. The string\nprovided as `description` MUST specify why it's deprecated and/or what\nto use instead. See also stability.", + "anyOf": [ + { + "$ref": "#/$defs/Deprecated" + }, + { + "type": "null" + } + ] + }, + "display_name": { + "description": "The readable name for attribute groups used when generating registry tables.", + "type": [ + "string", + "null" + ] + }, + "entity_associations": { + "description": "The associated entities of this group.", + "type": "array", + "items": { + "type": "string" + } + }, + "events": { + "description": "List of strings that specify the ids of event semantic conventions\nassociated with this span semantic convention.\nNote: only valid if type is span", + "type": "array", + "items": { + "type": "string" + } + }, + "extends": { + "description": "Reference another semantic convention id. 
It inherits\nall attributes defined in the specified semantic\nconvention.", + "type": [ + "string", + "null" + ] + }, + "id": { + "description": "The id that uniquely identifies the semantic convention.", + "type": "string" + }, + "instrument": { + "description": "The instrument type that should be used to record the metric. Note that\nthe semantic conventions must be written using the names of the\nsynchronous instrument types (counter, gauge, updowncounter and\nhistogram).\nFor more details: [Metrics semantic conventions - Instrument types](https://github.com/open-telemetry/opentelemetry-specification/tree/main/specification/metrics/semantic_conventions#instrument-types).\nNote: This field is required if type is metric.", + "anyOf": [ + { + "$ref": "#/$defs/InstrumentSpec" + }, + { + "type": "null" + } + ] + }, + "lineage": { + "description": "The lineage of the group.", + "anyOf": [ + { + "$ref": "#/$defs/GroupLineage" + }, + { + "type": "null" + } + ] + }, + "metric_name": { + "description": "The metric name as described by the [OpenTelemetry Specification](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/data-model.md#timeseries-model).\nNote: This field is required if type is metric.", + "type": [ + "string", + "null" + ] + }, + "name": { + "description": "The name of the event. 
If not specified, the prefix is used.\nIf prefix is empty (or unspecified), name is required.", + "type": [ + "string", + "null" + ] + }, + "note": { + "description": "A more elaborate description of the semantic convention.\nIt defaults to an empty string.", + "type": "string" + }, + "prefix": { + "description": "Prefix for the attributes for this semantic convention.\nIt defaults to an empty string.", + "type": "string" + }, + "span_kind": { + "description": "Specifies the kind of the span.\nNote: only valid if type is span", + "anyOf": [ + { + "$ref": "#/$defs/SpanKindSpec" + }, + { + "type": "null" + } + ] + }, + "stability": { + "description": "Specifies the stability of the semantic convention.\nNote that, if stability is missing but deprecated is present, it will\nautomatically set the stability to deprecated. If deprecated is\npresent and stability differs from deprecated, this will result in an\nerror.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "description": "The type of the group including the specific fields for each type.", + "$ref": "#/$defs/GroupType" + }, + "unit": { + "description": "The unit in which the metric is measured, which should adhere to the\n[guidelines](https://github.com/open-telemetry/opentelemetry-specification/tree/main/specification/metrics/semantic_conventions#instrument-units).\nNote: This field is required if type is metric.", + "type": [ + "string", + "null" + ] + } + }, + "required": [ + "id", + "type", + "brief" + ] + }, + "SchemaUrl": { + "description": "Represents the schema URL of a registry, which serves as a unique identifier for the registry\nalong with its version.", + "type": "object", + "properties": { + "url": { + "description": "The schema URL string.", + "type": "string" + } + }, + "required": [ + "url" + ] + }, + "SpanKindSpec": { + "description": "The span kind.", + "oneOf": [ + { + "description": "An internal span.", + "type": "string", + "const": 
"internal" + }, + { + "description": "A client span.", + "type": "string", + "const": "client" + }, + { + "description": "A server span.", + "type": "string", + "const": "server" + }, + { + "description": "A producer span.", + "type": "string", + "const": "producer" + }, + { + "description": "A consumer span.", + "type": "string", + "const": "consumer" + } + ] + }, + "Stability": { + "description": "The level of stability for a definition. Defined in [OTEP-232](https://github.com/open-telemetry/oteps/blob/main/text/0232-maturity-of-otel.md)", + "oneOf": [ + { + "description": "A deprecated definition.", + "type": "string", + "const": "deprecated", + "deprecated": true + }, + { + "description": "A stable definition.", + "type": "string", + "const": "stable" + }, + { + "description": "A definition in development. Formally known as experimental.", + "type": "string", + "const": "development" + }, + { + "description": "An alpha definition.", + "type": "string", + "const": "alpha" + }, + { + "description": "A beta definition.", + "type": "string", + "const": "beta" + }, + { + "description": "A release candidate definition.", + "type": "string", + "const": "release_candidate" + } + ] + }, + "Tags": { + "description": "A set of tags.\n\nExamples of tags:\n- sensitivity: pii\n- sensitivity: phi\n- data_classification: restricted\n- semantic_type: email\n- semantic_type: first_name\n- owner:\n- provenance: browser_sensor", + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "TemplateTypeSpec": { + "description": "Template types.", + "oneOf": [ + { + "description": "A boolean attribute.", + "type": "string", + "const": "template[boolean]" + }, + { + "description": "A integer attribute.", + "type": "string", + "const": "template[int]" + }, + { + "description": "A double attribute.", + "type": "string", + "const": "template[double]" + }, + { + "description": "A string attribute.", + "type": "string", + "const": "template[string]" + }, + { + 
"description": "A any attribute.", + "type": "string", + "const": "template[any]" + }, + { + "description": "An array of strings attribute.", + "type": "string", + "const": "template[string[]]" + }, + { + "description": "An array of integer attribute.", + "type": "string", + "const": "template[int[]]" + }, + { + "description": "An array of double attribute.", + "type": "string", + "const": "template[double[]]" + }, + { + "description": "An array of boolean attribute.", + "type": "string", + "const": "template[boolean[]]" + } + ] + }, + "Value": { + "description": "The different types of values.", + "oneOf": [ + { + "description": "A integer value.", + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "Int" + }, + "value": { + "description": "The value", + "type": "integer", + "format": "int64" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "description": "A double value.", + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "Double" + }, + "value": { + "description": "The value", + "type": "number", + "format": "double" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "description": "A string value.", + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "String" + }, + "value": { + "description": "The value", + "type": "string" + } + }, + "required": [ + "type", + "value" + ] + } + ] + }, + "ValueSpec": { + "description": "The different types of values.", + "anyOf": [ + { + "description": "A integer value.", + "type": "integer", + "format": "int64" + }, + { + "description": "A double value.", + "type": "number", + "format": "double" + }, + { + "description": "A string value.", + "type": "string" + }, + { + "description": "A boolean value.", + "type": "boolean" + } + ] + }, + "YamlValue": { + "type": [ + "null", + "boolean", + "object", + "array", + "number", + "string" + ] + } + } +} \ No newline at end of file diff --git 
a/examples/weaver/registry/chronos/metrics.yaml b/examples/weaver/registry/chronos/metrics.yaml index 9127985..9b1010e 100644 --- a/examples/weaver/registry/chronos/metrics.yaml +++ b/examples/weaver/registry/chronos/metrics.yaml @@ -1,39 +1,49 @@ groups: - - id: metric_attributes.chronos.message_store + - id: resource.chronos.service type: attribute_group stability: development - brief: Common attributes used by Chronos message store metrics. + brief: Resource attributes emitted by the Chronos metrics mock. attributes: - - id: chronos.message.destination + - id: service.name type: string - stability: development - brief: Destination used by Chronos while handling a consumed message. - examples: ["kafka", "postgres"] + stability: stable + brief: Logical name of the service. + examples: ["chronos-metrics-mock"] requirement_level: required - - id: chronos.operation.status + - id: service.instance.id type: string - stability: development - brief: Low-cardinality operation result. - examples: ["pass", "fail"] + stability: stable + brief: The string ID of the service instance. + examples: ["chronos-metrics-mock-live-check"] requirement_level: required - - id: metric_attributes.chronos.processor + - id: metric_attributes.chronos.message_store type: attribute_group stability: development - brief: Common attributes used by Chronos processor loop metrics. + brief: Common attributes used by Chronos message store metrics. attributes: - - id: chronos.processor.returned - type: boolean + - id: messaging.system + type: string stability: development - brief: Whether the processor loop returned before processing any rows. - examples: [true, false] + brief: The messaging system as identified by the client instrumentation. + examples: ["kafka"] requirement_level: required - - ref: chronos.operation.status + - id: messaging.operation.name + type: string + stability: development + brief: The system-specific name of the messaging operation. 
+ examples: ["receive", "process"] + requirement_level: required + - id: messaging.destination.name + type: string + stability: development + brief: The message destination name. + examples: ["chronos-input", "chronos-output"] requirement_level: required - id: metric.chronos.message.consumed type: metric - metric_name: chronos.message.consumed + metric_name: messaging.client.consumed.messages stability: development brief: Total number of Chronos input messages consumed. instrument: counter @@ -43,11 +53,11 @@ groups: code_generation: rust_name: msg_consumed metric_value_type: int - prometheus_name: chronos_messages_consumed_total + prometheus_name: messaging_client_consumed_messages_total - id: metric.chronos.message.consume.duration type: metric - metric_name: chronos.message.consume.duration + metric_name: messaging.client.operation.duration stability: development brief: Duration of handle_message() in message_receiver. instrument: histogram @@ -57,22 +67,22 @@ groups: code_generation: rust_name: msg_consume_latency metric_value_type: double - prometheus_name: chronos_message_consume_duration_seconds + prometheus_name: messaging_client_operation_duration_seconds buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] - id: metric.chronos.message.process.duration type: metric - metric_name: chronos.message.process.duration + metric_name: messaging.process.duration stability: development brief: Duration of processor_message_ready() loop in message_processor. 
instrument: histogram unit: s - extends: metric_attributes.chronos.processor + extends: metric_attributes.chronos.message_store annotations: code_generation: rust_name: msg_process_latency metric_value_type: double - prometheus_name: chronos_message_process_duration_seconds + prometheus_name: messaging_process_duration_seconds buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] - id: metric.chronos.message.wait.duration diff --git a/examples/weaver/templates/registry/markdown/metrics.md.j2 b/examples/weaver/templates/registry/markdown/metrics.md.j2 new file mode 100644 index 0000000..4dd3ffa --- /dev/null +++ b/examples/weaver/templates/registry/markdown/metrics.md.j2 @@ -0,0 +1,9 @@ +# Chronos Metrics + +Generated from `examples/weaver/registry/chronos/metrics.yaml` by OpenTelemetry Weaver. + +| Metric | Prometheus Name | Instrument | Unit | Attributes | Description | +| --- | --- | --- | --- | --- | --- | +{% for metric in ctx.metrics -%} +| `{{ metric.metric_name }}` | `{{ metric.prometheus_name }}` | `{{ metric.instrument }}` | `{{ metric.unit }}` | {% if metric.attributes %}{% for attribute in metric.attributes %}`{{ attribute }}`{% if not loop.last %}, {% endif %}{% endfor %}{% else %}-{% endif %} | {{ metric.brief }} | +{% endfor -%} diff --git a/examples/weaver/templates/registry/markdown/weaver.yaml b/examples/weaver/templates/registry/markdown/weaver.yaml new file mode 100644 index 0000000..9cb398b --- /dev/null +++ b/examples/weaver/templates/registry/markdown/weaver.yaml @@ -0,0 +1,17 @@ +templates: + - pattern: metrics.md.j2 + filter: > + { + metrics: (.groups + | map(select(.type == "metric")) + | map({ + metric_name, + prometheus_name: .annotations.code_generation.prometheus_name, + brief, + instrument, + unit, + attributes: (.attributes // [] | map(.name // .id // .ref)) + })) + } + application_mode: single + file_name: chronos_metrics.md diff --git a/examples/weaver/templates/registry/rust/registry.rs.j2 
b/examples/weaver/templates/registry/rust/registry.rs.j2 index 1e707ec..e63546b 100644 --- a/examples/weaver/templates/registry/rust/registry.rs.j2 +++ b/examples/weaver/templates/registry/rust/registry.rs.j2 @@ -22,6 +22,7 @@ pub struct MetricDefinition { pub description: &'static str, pub unit: Option<&'static str>, pub label_names: &'static [&'static str], + pub prometheus_label_names: &'static [&'static str], pub kind: MetricKind, pub buckets: Option<&'static [f64]>, } @@ -35,6 +36,7 @@ pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ description: "{{ metric.brief }}", unit: {% if metric.unit %}Some("{{ metric.unit }}"){% else %}None{% endif %}, label_names: &[{% for attribute in metric.attributes %}"{{ attribute }}"{% if not loop.last %}, {% endif %}{% endfor %}], + prometheus_label_names: &[{% for label in metric.prometheus_labels %}"{{ label }}"{% if not loop.last %}, {% endif %}{% endfor %}], kind: MetricKind::{{ metric.instrument | pascal_case }}, buckets: {% if metric.buckets %}{% if metric.buckets | length > 10 %}Some(&[ {{ metric.buckets | join(", ") }}, diff --git a/examples/weaver/templates/registry/rust/weaver.yaml b/examples/weaver/templates/registry/rust/weaver.yaml index 8351daf..05afd4d 100644 --- a/examples/weaver/templates/registry/rust/weaver.yaml +++ b/examples/weaver/templates/registry/rust/weaver.yaml @@ -13,6 +13,7 @@ templates: instrument, unit, attributes: (.attributes // [] | map(.name // .id // .ref)), + prometheus_labels: (.attributes // [] | map((.name // .id // .ref) | gsub("\\."; "_"))), buckets: .annotations.code_generation.buckets })) } From e346d194a38979fa91c3d310793a65a0eb375bb7 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Thu, 30 Apr 2026 21:45:01 +1000 Subject: [PATCH 13/36] feat(metrics): add generated metric spec Capture the current Chronos Prometheus metric surface in chronos_bin/src/metrics/spec.yaml, cross-referenced with issue #12 and the Prometheus/OTLP abstraction sketched in examples/prom_otlp_mock.rs. 
Add an unused generated Rust definition table with metric IDs, Prometheus names, OTLP names, labels, buckets, and pre-warm label values. The generated module is intentionally not imported yet, so runtime metrics remain hand-written. Verification: - cargo fmt -- --check - sh scripts/pre-commit-checks.sh (fails: missing libsasl2 development package; cargo check also reports time 0.3.30 type inference error before tests run) Model-version: GPT-5 --- .../generated/chronos_metric_definitions.rs | 105 +++++++++++++++++ chronos_bin/src/metrics/spec.yaml | 107 ++++++++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 chronos_bin/src/metrics/generated/chronos_metric_definitions.rs create mode 100644 chronos_bin/src/metrics/spec.yaml diff --git a/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs b/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs new file mode 100644 index 0000000..19a2bc0 --- /dev/null +++ b/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs @@ -0,0 +1,105 @@ +// Generated from chronos_bin/src/metrics/spec.yaml. +// Do not edit by hand. +// +// This generated definition table is intentionally not imported by +// chronos_bin/src/metrics/mod.rs yet. The current hand-written Prometheus +// registry remains the runtime implementation until the generated registry +// replacement is wired in explicitly. 
+ +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub enum MetricId { + MsgConsumeLatency, + MsgJitter, + MsgProcessLatency, + MsgReset, + MsgWaitTime, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MetricKind { + Counter, + Histogram, +} + +#[derive(Clone, Copy, Debug)] +pub struct MetricDefinition { + pub id: MetricId, + pub rust_name: &'static str, + pub prometheus_name: &'static str, + pub otel_name: &'static str, + pub description: &'static str, + pub unit: Option<&'static str>, + pub label_names: &'static [&'static str], + pub otel_label_names: &'static [&'static str], + pub kind: MetricKind, + pub buckets: Option<&'static [f64]>, + pub prewarm_label_values: &'static [&'static [&'static str]], +} + +pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ + MetricDefinition { + id: MetricId::MsgConsumeLatency, + rust_name: "msg_consume_latency", + prometheus_name: "msg_consume_latency", + otel_name: "chronos.message.consume.duration", + description: "Duration of message_receiver::MessageReceiver::handle_message().", + unit: Some("s"), + label_names: &["destination", "status"], + otel_label_names: &["chronos.destination", "chronos.status"], + kind: MetricKind::Histogram, + buckets: Some(&[0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048]), + prewarm_label_values: &[&["kafka", "pass"], &["kafka", "fail"], &["postgres", "pass"], &["postgres", "fail"]], + }, + MetricDefinition { + id: MetricId::MsgJitter, + rust_name: "msg_jitter", + prometheus_name: "msg_jitter", + otel_name: "chronos.message.jitter", + description: "Difference between actual publish time and client-requested deadline.", + unit: Some("s"), + label_names: &[], + otel_label_names: &[], + kind: MetricKind::Histogram, + buckets: Some(&[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]), + prewarm_label_values: &[], + }, + MetricDefinition { + id: MetricId::MsgProcessLatency, + rust_name: "msg_process_latency", + prometheus_name: 
"msg_process_latency", + otel_name: "chronos.message.process.duration", + description: "Duration of message_processor::MessageProcessor::processor_message_ready().", + unit: Some("s"), + label_names: &["returned", "status"], + otel_label_names: &["chronos.processor.returned", "chronos.status"], + kind: MetricKind::Histogram, + buckets: Some(&[0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048]), + prewarm_label_values: &[&["true", "pass"], &["true", "fail"], &["false", "pass"], &["false", "fail"]], + }, + MetricDefinition { + id: MetricId::MsgReset, + rust_name: "msg_reset", + prometheus_name: "msg_reset", + otel_name: "chronos.message.reset", + description: "Number of records reset by postgres::pg::Pg::reset_to_init_db() in the monitor task.", + unit: Some("{message}"), + label_names: &[], + otel_label_names: &[], + kind: MetricKind::Counter, + buckets: None, + prewarm_label_values: &[], + }, + MetricDefinition { + id: MetricId::MsgWaitTime, + rust_name: "msg_wait_time", + prometheus_name: "msg_wait_time", + otel_name: "chronos.message.wait.duration", + description: "Time a message spent in the Kafka input queue before processing.", + unit: Some("s"), + label_names: &[], + otel_label_names: &[], + kind: MetricKind::Histogram, + buckets: Some(&[0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2]), + prewarm_label_values: &[], + }, +]; diff --git a/chronos_bin/src/metrics/spec.yaml b/chronos_bin/src/metrics/spec.yaml new file mode 100644 index 0000000..1e0728e --- /dev/null +++ b/chronos_bin/src/metrics/spec.yaml @@ -0,0 +1,107 @@ +# Chronos metrics specification. +# +# This file is intentionally not wired into the runtime yet. It captures the +# current Prometheus metric surface from registry.rs while carrying the +# OpenTelemetry-style names needed by a future generated registry. 
+# +# Sources: +# - https://github.com/kindredgroup/chronos/issues/12 +# - examples/prom_otlp_mock.rs + +schema_version: 1 +service: chronos +stability: development +source_issue: https://github.com/kindredgroup/chronos/issues/12 +notes: + - >- + Metrics operations must stay outside the critical message path. Failures to + record metrics must not block consuming, storing, publishing, or deleting messages. + - >- + Chronos exposes Prometheus pull metrics today; OTLP names are included so the + same definitions can generate an OTLP backend later. + - >- + The message jitter histogram keeps an explicit 0.5 second bucket for the + 500ms scheduling SLA discussed in issue #12. + +label_sets: + consume_result: + labels: + - name: destination + otel_name: chronos.destination + description: Downstream selected by message_receiver::handle_message. + values: [kafka, postgres] + - name: status + otel_name: chronos.status + description: Whether the consume path completed successfully. + values: [pass, fail] + prewarm: + - [kafka, pass] + - [kafka, fail] + - [postgres, pass] + - [postgres, fail] + process_result: + labels: + - name: returned + otel_name: chronos.processor.returned + description: Whether the processor loop returned early because no rows were ready. + values: ["true", "false"] + - name: status + otel_name: chronos.status + description: Whether the processor loop completed successfully. + values: [pass, fail] + prewarm: + - ["true", pass] + - ["true", fail] + - ["false", pass] + - ["false", fail] + +metrics: + - id: msg_consume_latency + kind: histogram + prometheus_name: msg_consume_latency + otel_name: chronos.message.consume.duration + description: Duration of message_receiver::MessageReceiver::handle_message(). 
+ unit: s + labels: consume_result + buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] + issue_12_signal: latency + + - id: msg_process_latency + kind: histogram + prometheus_name: msg_process_latency + otel_name: chronos.message.process.duration + description: Duration of message_processor::MessageProcessor::processor_message_ready(). + unit: s + labels: process_result + buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] + issue_12_signal: latency + + - id: msg_wait_time + kind: histogram + prometheus_name: msg_wait_time + otel_name: chronos.message.wait.duration + description: Time a message spent in the Kafka input queue before processing. + unit: s + labels: [] + buckets: [0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2] + issue_12_signal: latency + + - id: msg_jitter + kind: histogram + prometheus_name: msg_jitter + otel_name: chronos.message.jitter + description: Difference between actual publish time and client-requested deadline. + unit: s + labels: [] + buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0] + issue_12_signal: latency + sla_bucket_seconds: 0.5 + + - id: msg_reset + kind: counter + prometheus_name: msg_reset + otel_name: chronos.message.reset + description: Number of records reset by postgres::pg::Pg::reset_to_init_db() in the monitor task. + unit: "{message}" + labels: [] + issue_12_signal: errors From 1d7d5fb48e37a0ce4149e32241b9232ca3699c47 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Thu, 30 Apr 2026 22:24:09 +1000 Subject: [PATCH 14/36] ci: enforce warning-free pre-commit checks Make the standard lint and unit-test targets run with warnings denied, remove the obsolete clippy crate dependency, and fix the existing Rust and clippy warnings that blocked that policy. Update the lockfile for the Rust 1.94-compatible time dependency set and refreshed native Kafka build dependencies. 
Add a pre-commit GitHub Actions workflow for pushes to non-main branches, plus an act-backed Make target to run that workflow locally. Add .github/config.json and a repo.config.apply target for applying repository settings, Actions permissions, and main branch protection through gh. Verification: - sh scripts/pre-commit-checks.sh - make workflow.pre-commit.act Model-version: GPT-5 --- .env.example | 2 +- .github/config.json | 40 ++++ .github/workflows/pre-commit.yml | 28 +++ Cargo.lock | 207 +++++------------- Cargo.toml | 1 - Makefile | 24 +- chronos_bin/Cargo.toml | 1 - chronos_bin/src/kafka/consumer.rs | 2 +- chronos_bin/src/postgres/pg.rs | 4 +- chronos_bin/src/telemetry/otlp_collector.rs | 5 +- chronos_bin/src/utils/env.rs | 4 +- chronos_bin/src/utils/util.rs | 14 +- examples/chronos_ex/examples/chronos_ex.rs | 5 +- .../chronos_ex/examples/telemetry_simple.rs | 6 +- .../chronos_ex/examples/tracing_example.rs | 2 +- examples/prom_otlp_mock.rs | 2 +- scripts/apply-github-config.sh | 31 +++ 17 files changed, 194 insertions(+), 184 deletions(-) create mode 100644 .github/config.json create mode 100644 .github/workflows/pre-commit.yml create mode 100755 scripts/apply-github-config.sh diff --git a/.env.example b/.env.example index 40a1135..68def42 100644 --- a/.env.example +++ b/.env.example @@ -1,5 +1,5 @@ # WHEN DEVELOPING LOCALLY, WE NEED TO ACCESS THE HOST NETWORK FROM K8S (FOR POSTGRES/KAFKA/ELASTIC/ETC) -LOCAL_HOST_IP=$(ifconfig en0 | grep inet | grep -v inet6 | awk '{print $2}') +LOCAL_HOST_IP=${LOCAL_HOST_IP:-$(hostname -I 2>/dev/null | awk '{print $1}')} # RUST version RUST_VERSION=stable diff --git a/.github/config.json b/.github/config.json new file mode 100644 index 0000000..8fbbb2a --- /dev/null +++ b/.github/config.json @@ -0,0 +1,40 @@ +{ + "repository": { + "allow_auto_merge": true, + "allow_merge_commit": false, + "allow_rebase_merge": true, + "allow_squash_merge": true, + "delete_branch_on_merge": true, + "squash_merge_commit_message": "PR_BODY", + 
"squash_merge_commit_title": "PR_TITLE", + "web_commit_signoff_required": false + }, + "actions": { + "default_workflow_permissions": "read", + "can_approve_pull_request_reviews": false + }, + "branches": { + "main": { + "protection": { + "required_status_checks": { + "strict": true, + "contexts": [ + "pre-commit" + ] + }, + "enforce_admins": true, + "required_pull_request_reviews": { + "dismiss_stale_reviews": true, + "require_code_owner_reviews": false, + "required_approving_review_count": 1 + }, + "restrictions": null, + "required_linear_history": true, + "allow_force_pushes": false, + "allow_deletions": false, + "block_creations": false, + "required_conversation_resolution": true + } + } + } +} diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000..1b35497 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,28 @@ +name: pre-commit + +on: + push: + branches-ignore: + - main + +permissions: + contents: read + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@master + with: + toolchain: 1.94 + components: rustfmt,clippy + + - name: Install system dependencies + run: scripts/ubuntu-setup.sh + + - name: Run pre-commit checks + run: sh scripts/pre-commit-checks.sh diff --git a/Cargo.lock b/Cargo.lock index 1bd8021..2ca93fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -95,18 +95,6 @@ version = "1.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" -[[package]] -name = "arrayref" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" - -[[package]] -name = "arrayvec" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" - [[package]] name = "async-trait" version = "0.1.74" @@ -245,12 +233,6 @@ dependencies = [ "rustc-demangle", ] -[[package]] -name = "base64" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" - [[package]] name = "base64" version = "0.21.4" @@ -269,17 +251,6 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" -[[package]] -name = "blake2b_simd" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afa748e348ad3be8263be728124b24a24f268266f6f5d58af9d75f6a40b5c587" -dependencies = [ - "arrayref", - "arrayvec", - "constant_time_eq", -] - [[package]] name = "block-buffer" version = "0.10.4" @@ -315,11 +286,12 @@ checksum = "7b02b629252fe8ef6460461409564e2c21d0c8e77e0944f3d189ff06c4e932ad" [[package]] name = "cc" -version = "1.0.83" +version = "1.2.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d" dependencies = [ - "libc", + "find-msvc-tools", + "shlex", ] [[package]] @@ -352,7 +324,6 @@ dependencies = [ "cargo-husky", "chrono", "clap", - "clippy", "deadpool-postgres", "dotenvy", "env_logger 0.10.0", @@ -444,27 +415,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd7cc57abe963c6d3b9d8be5b06ba7c8957a930305ca90304f24ef040aa6f961" -[[package]] -name = "clippy" -version = "0.0.302" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d911ee15579a3f50880d8c1d59ef6e79f9533127a3bd342462f5d584f5e8c294" -dependencies = [ - "term", -] - [[package]] name = "colorchoice" version = "1.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" -[[package]] -name = "constant_time_eq" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" - [[package]] name = "core-foundation" version = "0.9.3" @@ -568,9 +524,9 @@ dependencies = [ [[package]] name = "deranged" -version = "0.3.9" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f32d04922c60427da6f9fef14d042d9edddef64cb9d4ce0d64d0685fbeb1fd3" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ "powerfmt", ] @@ -586,17 +542,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "dirs" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fd78930633bd1c6e35c4b42b1df7b0cbc6bc191146e512bb3bedf243fcc3901" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - [[package]] name = "dotenv" version = "0.15.0" @@ -693,6 +638,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + [[package]] name = "finl_unicode" version = "1.2.0" @@ -828,17 +779,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "getrandom" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" -dependencies = [ - "cfg-if", - "libc", - "wasi 0.9.0+wasi-snapshot-preview1", -] - [[package]] name = "getrandom" version = "0.2.10" @@ -847,7 +787,7 @@ checksum = 
"be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", ] [[package]] @@ -1168,15 +1108,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.149" +version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" [[package]] name = "libz-sys" -version = "1.1.12" +version = "1.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" +checksum = "fc3a226e576f50782b3305c5ccf458698f92798987f551c6a02efe8276721e22" dependencies = [ "cc", "libc", @@ -1254,7 +1194,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" dependencies = [ "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "windows-sys", ] @@ -1268,6 +1208,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-traits" version = "0.2.17" @@ -1289,23 +1235,24 @@ dependencies = [ [[package]] name = "num_enum" -version = "0.5.11" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f646caf906c20226733ed5b1374287eb97e3c2a5c227ce668c1f2ce20ae57c9" +checksum = "5d0bca838442ec211fa11de3a8b0e0e8f3a4522575b5c4c06ed722e005036f26" dependencies = [ "num_enum_derive", + "rustversion", ] [[package]] name = "num_enum_derive" -version = "0.5.11" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"dcbff9bc912032c62bf65ef1d5aea88983b420f4f839db1e9b0c281a25c9c799" +checksum = "680998035259dcfcafe653688bf2aa6d3e2dc05e98be6ab46afb089dc84f1df8" dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.38", ] [[package]] @@ -1351,9 +1298,9 @@ dependencies = [ [[package]] name = "openssl-sys" -version = "0.9.93" +version = "0.9.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db4d56a4c0478783083cfafcc42493dd4a981d41669da64b4572a2a089b51b1d" +checksum = "13ce1245cd07fcc4cfdb438f7507b0c7e4f3849a69fd84d52374c66d83741bb6" dependencies = [ "cc", "libc", @@ -1546,7 +1493,7 @@ checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.3.5", + "redox_syscall", "smallvec", "windows-targets", ] @@ -1620,9 +1567,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" [[package]] name = "postgres-protocol" @@ -1630,7 +1577,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49b6c5ef183cd3ab4ba005f1ca64c21e8bd97ce4699cfea9e8d9a2c4958ca520" dependencies = [ - "base64 0.21.4", + "base64", "byteorder", "bytes", "fallible-iterator", @@ -1779,7 +1726,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.10", + "getrandom", ] [[package]] @@ -1802,9 +1749,9 @@ dependencies = [ [[package]] name = "rdkafka-sys" -version = "4.6.0+2.2.0" +version = "4.10.0+2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ad63c279fca41a27c231c450a2d2ad18288032e9cbb159ad16c9d96eba35aaaf" +checksum = "e234cf318915c1059d4921ef7f75616b5219b10b46e9f3a511a15eb4b56a3f77" dependencies = [ "libc", "libz-sys", @@ -1814,12 +1761,6 @@ dependencies = [ "sasl2-sys", ] -[[package]] -name = "redox_syscall" -version = "0.1.57" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" - [[package]] name = "redox_syscall" version = "0.3.5" @@ -1829,17 +1770,6 @@ dependencies = [ "bitflags 1.3.2", ] -[[package]] -name = "redox_users" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de0737333e7a9502c789a36d7c7fa6092a49895d4faa31ca5df163857ded2e9d" -dependencies = [ - "getrandom 0.1.16", - "redox_syscall 0.1.57", - "rust-argon2", -] - [[package]] name = "refinery" version = "0.8.11" @@ -1920,7 +1850,7 @@ version = "0.11.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "046cd98826c46c2ac8ddecae268eb5c2e58628688a5fc7a2643704a73faba95b" dependencies = [ - "base64 0.21.4", + "base64", "bytes", "encoding_rs", "futures-core", @@ -1955,18 +1885,6 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4389f1d5789befaf6029ebd9f7dac4af7f7e3d61b69d4f30e2ac02b57e7712b0" -[[package]] -name = "rust-argon2" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b18820d944b33caa75a71378964ac46f58517c92b6ae5f762636247c09e78fb" -dependencies = [ - "base64 0.13.1", - "blake2b_simd", - "constant_time_eq", - "crossbeam-utils", -] - [[package]] name = "rustc-demangle" version = "0.1.23" @@ -2009,9 +1927,9 @@ dependencies = [ [[package]] name = "sasl2-sys" -version = "0.1.20+2.1.28" +version = "0.1.22+2.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e645bd98535fc8fd251c43ba7c7c1f9be1e0369c99b6a5ea719052a773e655c" +checksum = 
"05f2a7f7efd9fc98b3a9033272df10709f5ee3fa0eabbd61a527a3a1ed6bd3c6" dependencies = [ "cc", "duct", @@ -2141,6 +2059,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "signal-hook-registry" version = "1.4.1" @@ -2343,17 +2267,6 @@ dependencies = [ "libc", ] -[[package]] -name = "term" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edd106a334b7657c10b7c540a0106114feadeb4dc314513e97df481d5d966f42" -dependencies = [ - "byteorder", - "dirs", - "winapi", -] - [[package]] name = "termcolor" version = "1.3.0" @@ -2417,12 +2330,13 @@ dependencies = [ [[package]] name = "time" -version = "0.3.30" +version = "0.3.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4a34ab300f2dee6e562c10a046fc05e358b29f9bf92277f30c3c8d82275f6f5" +checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" dependencies = [ "deranged", "itoa", + "num-conv", "powerfmt", "serde", "time-core", @@ -2431,16 +2345,17 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.2" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" +checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" [[package]] name = "time-macros" -version = "0.2.15" +version = "0.2.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ad70d68dba9e1f8aceda7aa6711965dfec1cac869f311a51bd08b3a2ccbce20" +checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" dependencies = [ + "num-conv", "time-core", ] @@ -2593,7 +2508,7 @@ checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a" dependencies = [ "async-trait", "axum 0.6.20", - 
"base64 0.21.4", + "base64", "bytes", "futures-core", "futures-util", @@ -2780,7 +2695,7 @@ version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79daa5ed5740825c40b389c5e50312b9c86df53fccd33f281df655642b43869d" dependencies = [ - "getrandom 0.2.10", + "getrandom", "rand", "uuid-macro-internal", ] @@ -2869,12 +2784,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" diff --git a/Cargo.toml b/Cargo.toml index d327280..7e48054 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,6 @@ deadpool-postgres = "0.10" refinery = { version = "0.8.7", features = ["tokio-postgres"] } -clippy = "0.0.302" rand = "0.8.5" serial_test = "2.0.0" diff --git a/Makefile b/Makefile index e281e83..72ff23b 100644 --- a/Makefile +++ b/Makefile @@ -12,10 +12,14 @@ WEAVER_GENERATED_DIR ?= examples/weaver/generated WEAVER_LIVE_CHECK_PORT ?= 4319 WEAVER_LIVE_CHECK_ADMIN_PORT ?= 4320 WEAVER_LIVE_CHECK_OUT ?= /tmp/chronos-weaver-live-check +GITHUB_CONFIG ?= .github/config.json +ACT_WORKFLOW ?= .github/workflows/pre-commit.yml +ACT_EVENT ?= push +ACT_JOB ?= pre-commit # pp - pretty print function -yellow := $(shell tput setaf 3) -normal := $(shell tput sgr0) +yellow := $(shell tput setaf 3 2>/dev/null || true) +normal := $(shell tput sgr0 2>/dev/null || true) define pp @printf '$(yellow)$(1)$(normal)\n' endef @@ -100,14 +104,14 @@ run.release: ## lint: 🧹 Checks for lint failures on rust lint: $(call pp,lint rust...) - cargo check + RUSTFLAGS="-D warnings" cargo check cargo fmt -- --check - cargo clippy --all-targets + RUSTFLAGS="-D warnings" cargo clippy --all-targets -- -D warnings ## test.unit: 🧪 Runs unit tests test.unit: $(call pp,rust unit tests...) 
- cargo test + RUSTFLAGS="-D warnings" cargo test ## integration: 🧪 Start deps, migrate, run Chronos, publish test message, verify metrics integration: build @@ -139,6 +143,16 @@ weaver.check: $(call pp,check Weaver registry with $(WEAVER_IMAGE)...) docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry check -r $(WEAVER_REGISTRY) +## repo.config.apply: 🔐 Apply GitHub repository and branch settings from .github/config.json +repo.config.apply: + $(call pp,apply GitHub repository config from $(GITHUB_CONFIG)...) + scripts/apply-github-config.sh "$(GITHUB_CONFIG)" + +## workflow.pre-commit.act: 🎬 Run the pre-commit GitHub Actions workflow locally with act +workflow.pre-commit.act: + $(call pp,run $(ACT_WORKFLOW) locally with act...) + act "$(ACT_EVENT)" -W "$(ACT_WORKFLOW)" -j "$(ACT_JOB)" + ## weaver.generate.rust: 🧵 Generate Rust metric definitions with Weaver weaver.generate.rust: $(call pp,generate Rust metric definitions with $(WEAVER_IMAGE)...) diff --git a/chronos_bin/Cargo.toml b/chronos_bin/Cargo.toml index 0a723c7..fd11e2d 100644 --- a/chronos_bin/Cargo.toml +++ b/chronos_bin/Cargo.toml @@ -39,7 +39,6 @@ anyhow = "1.0.42" chrono = "0.4.23" #config -clippy.workspace = true clap = { version="4.1.4", features = ["derive"] } dotenvy = "0.15" uuid = { version="1.3.0", features = [ diff --git a/chronos_bin/src/kafka/consumer.rs b/chronos_bin/src/kafka/consumer.rs index 04a0a3e..04caa3e 100644 --- a/chronos_bin/src/kafka/consumer.rs +++ b/chronos_bin/src/kafka/consumer.rs @@ -65,7 +65,7 @@ impl KafkaConsumer { }; } - pub(crate) async fn kafka_consume_message(&self) -> Result { + pub(crate) async fn kafka_consume_message(&self) -> Result, KafkaAdapterError> { self.consumer.recv().await.map_err(KafkaAdapterError::ReceiveMessage) } } diff --git a/chronos_bin/src/postgres/pg.rs b/chronos_bin/src/postgres/pg.rs index 9e39462..4be47e9 100644 --- a/chronos_bin/src/postgres/pg.rs +++ b/chronos_bin/src/postgres/pg.rs @@ -63,7 +63,7 @@ struct PgAccess { } impl 
PgAccess { - pub async fn build_txn(&mut self) -> Result { + pub async fn build_txn(&mut self) -> Result, PgError> { let txn = self .client .build_transaction() @@ -191,7 +191,7 @@ impl Pg { } #[tracing::instrument(skip_all)] - pub(crate) async fn delete_fired(&self, ids: &Vec) -> Result { + pub(crate) async fn delete_fired(&self, ids: &[String]) -> Result { // let query_execute_instant = Instant::now(); let pg_client = match self.get_client().await { Ok(client) => client, diff --git a/chronos_bin/src/telemetry/otlp_collector.rs b/chronos_bin/src/telemetry/otlp_collector.rs index db972d0..a045bdc 100644 --- a/chronos_bin/src/telemetry/otlp_collector.rs +++ b/chronos_bin/src/telemetry/otlp_collector.rs @@ -31,10 +31,7 @@ impl OtlpCollector { log::error!("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT not set"); // trace error - Err(TraceError::Other(Box::new(std::io::Error::new( - std::io::ErrorKind::Other, - "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT not set", - )))) + Err(TraceError::Other(Box::new(std::io::Error::other("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT not set")))) } } } diff --git a/chronos_bin/src/utils/env.rs b/chronos_bin/src/utils/env.rs index 7dc5803..1860be9 100644 --- a/chronos_bin/src/utils/env.rs +++ b/chronos_bin/src/utils/env.rs @@ -56,7 +56,7 @@ pub fn get_env_vars_with_prefix(prefix: &str) -> Option> /// ``` /// /// - When the `key` and value return `type` is passed, the environment variable is -/// read for the key and the value is parsed into the `type` passed as argument. +/// read for the key and the value is parsed into the `type` passed as argument. /// /// ## Example /// ```ignore @@ -65,7 +65,7 @@ pub fn get_env_vars_with_prefix(prefix: &str) -> Option> /// ``` /// /// - Special scenario to convert the string value to Vector. -/// When the `key` and value return `type` is passed as `Vec` +/// When the `key` and value return `type` is passed as `Vec` /// - the environment variable is read for the key. 
/// - the string value returned is split on `,` to create a Vec. /// - each value of the vec is parsed into the `type` passed as argument. diff --git a/chronos_bin/src/utils/util.rs b/chronos_bin/src/utils/util.rs index b9375ae..2eae6f2 100644 --- a/chronos_bin/src/utils/util.rs +++ b/chronos_bin/src/utils/util.rs @@ -9,14 +9,10 @@ pub fn required_headers(message: &BorrowedMessage) -> Option::new(), |mut acc, header| { - if let Ok(key) = header.key.parse() { - if let Some(value) = header.value { - let value: String = String::from_utf8_lossy(value).into_owned(); - acc.insert(key, value); - acc - } else { - acc - } + if let Some(value) = header.value { + let value: String = String::from_utf8_lossy(value).into_owned(); + acc.insert(header.key.to_string(), value); + acc } else { acc } @@ -48,7 +44,7 @@ pub fn headers_check(headers: &BorrowedHeaders) -> bool { outcome == 2 } -pub fn get_payload_utf8<'a>(message: &'a BorrowedMessage) -> Option<&'a [u8]> { +pub fn get_payload_utf8<'a>(message: &'a BorrowedMessage<'_>) -> Option<&'a [u8]> { message.payload() } diff --git a/examples/chronos_ex/examples/chronos_ex.rs b/examples/chronos_ex/examples/chronos_ex.rs index 5cc314d..4afd691 100644 --- a/examples/chronos_ex/examples/chronos_ex.rs +++ b/examples/chronos_ex/examples/chronos_ex.rs @@ -55,10 +55,7 @@ fn init_tracer() -> Result { log::error!("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT not set"); // trace error - Err(TraceError::Other(Box::new(std::io::Error::new( - std::io::ErrorKind::Other, - "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT not set", - )))) + Err(TraceError::Other(Box::new(std::io::Error::other("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT not set")))) } } diff --git a/examples/chronos_ex/examples/telemetry_simple.rs b/examples/chronos_ex/examples/telemetry_simple.rs index b268c6a..5131a25 100644 --- a/examples/chronos_ex/examples/telemetry_simple.rs +++ b/examples/chronos_ex/examples/telemetry_simple.rs @@ -121,9 +121,9 @@ async fn main() { let handler = tokio::task::spawn(async { 
println!("this is spawning"); - // let runner = Runner {}; - // runner.run(); - // runner.run_sub_db().await; + let runner = Runner {}; + runner.run(); + runner.run_sub_db().await; let mut count = 0; loop { count += 1; diff --git a/examples/chronos_ex/examples/tracing_example.rs b/examples/chronos_ex/examples/tracing_example.rs index 358d738..d8c5866 100644 --- a/examples/chronos_ex/examples/tracing_example.rs +++ b/examples/chronos_ex/examples/tracing_example.rs @@ -39,7 +39,7 @@ pub fn shave(yak: usize) -> Result<(), Box> { // note that this is intended to demonstrate `tracing`'s features, not idiomatic // error handling! in a library or application, you should consider returning // a dedicated `YakError`. libraries like snafu or thiserror make this easy. - return Err(io::Error::new(io::ErrorKind::Other, "shaving yak failed!").into()); + return Err(io::Error::other("shaving yak failed!").into()); } else { debug!("yak shaved successfully"); } diff --git a/examples/prom_otlp_mock.rs b/examples/prom_otlp_mock.rs index 5aa53ed..621d8c2 100644 --- a/examples/prom_otlp_mock.rs +++ b/examples/prom_otlp_mock.rs @@ -126,7 +126,7 @@ impl ChronosMetrics { } fn record_cycle(&self, cycle: u64) { - let destination = if cycle % 2 == 0 { "chronos-input" } else { "chronos-retry" }; + let destination = if cycle.is_multiple_of(2) { "chronos-input" } else { "chronos-retry" }; let latency_seconds = 0.005 + ((cycle % 20) as f64 * 0.0025); self.message_consumed(destination); diff --git a/scripts/apply-github-config.sh b/scripts/apply-github-config.sh new file mode 100755 index 0000000..1c0cd1c --- /dev/null +++ b/scripts/apply-github-config.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +set -euo pipefail + +config_path="${1:-.github/config.json}" +repo="${GITHUB_REPOSITORY:-}" + +if [[ -z "${repo}" ]]; then + repo="$(gh repo view --json nameWithOwner --jq '.nameWithOwner')" +fi + +if [[ -z "${repo}" ]]; then + echo "Unable to determine GitHub repository. Set GITHUB_REPOSITORY=owner/name." 
>&2 + exit 2 +fi + +if [[ ! -f "${config_path}" ]]; then + echo "GitHub config file not found: ${config_path}" >&2 + exit 2 +fi + +echo "Applying repository settings to ${repo}" +jq -c '.repository' "${config_path}" | gh api --method PATCH "repos/${repo}" --input - + +echo "Applying Actions workflow permissions to ${repo}" +jq -c '.actions' "${config_path}" | gh api --method PUT "repos/${repo}/actions/permissions/workflow" --input - + +for branch in $(jq -r '.branches | keys[]' "${config_path}"); do + echo "Applying branch protection to ${repo}:${branch}" + jq -c --arg branch "${branch}" '.branches[$branch].protection' "${config_path}" | + gh api --method PUT "repos/${repo}/branches/${branch}/protection" --input - +done From 85e2432eb268a3ac1644b9e17b0ab19dbce8aae2 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Thu, 30 Apr 2026 22:46:08 +1000 Subject: [PATCH 15/36] feat(metrics): use generated metric definitions Wire Chronos runtime metrics through the Weaver-generated metric definition table and add Prometheus/OTLP exporter selection via OTEL_METRICS_EXPORTER. Prometheus metrics now use the chronos namespace while preserving the issue #12 metric dimensions and buckets. Generate Weaver Rust definitions, Markdown metric docs, and resolved registry JSON schema into chronos_bin/src/metrics/generated as part of make build. 
Verification: - make weaver.generate - make lint - env CARGO_HUSKY_DONT_INSTALL_HOOKS=true cargo test -p chronos_bin - make build - sh scripts/pre-commit-checks.sh - make integration Model-version: GPT-5 --- How-to.md | 5 +- Makefile | 4 +- chronos_bin/src/message_processor.rs | 20 +- chronos_bin/src/message_receiver.rs | 8 +- .../generated/chronos_metric_definitions.rs | 55 +- .../src/metrics/generated/chronos_metrics.md | 11 + chronos_bin/src/metrics/generated/mod.rs | 3 + .../generated/resolved-registry.schema.json | 1830 +++++++++++++++++ chronos_bin/src/metrics/mod.rs | 1 + chronos_bin/src/metrics/registry.rs | 588 ++++-- chronos_bin/src/metrics/server.rs | 18 +- chronos_bin/src/monitor.rs | 2 +- examples/weaver/registry/chronos/metrics.yaml | 80 +- .../templates/registry/markdown/weaver.yaml | 2 +- .../templates/registry/rust/registry.rs.j2 | 12 + .../templates/registry/rust/weaver.yaml | 7 +- scripts/integration.sh | 12 +- 17 files changed, 2349 insertions(+), 309 deletions(-) create mode 100644 chronos_bin/src/metrics/generated/chronos_metrics.md create mode 100644 chronos_bin/src/metrics/generated/mod.rs create mode 100644 chronos_bin/src/metrics/generated/resolved-registry.schema.json diff --git a/How-to.md b/How-to.md index 17b9792..26d6b10 100644 --- a/How-to.md +++ b/How-to.md @@ -73,6 +73,10 @@ make lgtm.up The overlay mounts local override files for Prometheus, the OpenTelemetry Collector, and Grafana dashboard provisioning. Chronos exposes its Prometheus metrics endpoint with `OTEL_EXPORTER_PROMETHEUS_HOST` and `OTEL_EXPORTER_PROMETHEUS_PORT`; when run from `docker-compose.yml` the endpoint is `chronos:9091`. +Chronos metrics are generated from the OpenTelemetry Weaver registry in `examples/weaver/registry/chronos/metrics.yaml` into `chronos_bin/src/metrics/generated`. `OTEL_METRICS_EXPORTER=prometheus` is the default and exposes `/metrics` with the `chronos_` Prometheus namespace, for example `chronos_msg_jitter`. 
`OTEL_METRICS_EXPORTER=otlp` records the same generated metric IDs through the OTLP gRPC metrics exporter. + +`make build` runs `make weaver.generate` before compiling, which refreshes the generated Rust definitions, Markdown metric docs, and resolved registry JSON schema. + Validate the LGTM configuration files with: ```sh @@ -86,4 +90,3 @@ Two images are published for each [RELEASE]( `https://github.com/kindredgroup/ch - diff --git a/Makefile b/Makefile index 72ff23b..9940b78 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ WEAVER_VERSION ?= 0.23.0 WEAVER_IMAGE ?= otel/weaver:v$(WEAVER_VERSION) WEAVER_REGISTRY ?= examples/weaver/registry WEAVER_TEMPLATES ?= examples/weaver/templates -WEAVER_GENERATED_DIR ?= examples/weaver/generated +WEAVER_GENERATED_DIR ?= chronos_bin/src/metrics/generated WEAVER_LIVE_CHECK_PORT ?= 4319 WEAVER_LIVE_CHECK_ADMIN_PORT ?= 4320 WEAVER_LIVE_CHECK_OUT ?= /tmp/chronos-weaver-live-check @@ -80,7 +80,7 @@ install: cargo fetch ## build: 🧪 Compiles rust -build: +build: weaver.generate $(call pp,build rust...) cargo build diff --git a/chronos_bin/src/message_processor.rs b/chronos_bin/src/message_processor.rs index 4ca42e8..ab52eb7 100644 --- a/chronos_bin/src/message_processor.rs +++ b/chronos_bin/src/message_processor.rs @@ -68,7 +68,7 @@ impl MessageProcessor { // msg_jitter: difference between actual publish time and client-requested deadline. // Floored at 0 to guard against clock skew producing negative jitter. 
let jitter_secs = (Utc::now() - deadline).num_milliseconds().max(0) as f64 / 1000.0; - self.metrics.msg_jitter.observe(jitter_secs); + self.metrics.observe_jitter(jitter_secs); Ok(id) } else { Err("error occurred while publishing".to_string()) @@ -167,11 +167,7 @@ impl MessageProcessor { let timer = std::time::Instant::now(); let (returned, status) = self.processor_message_ready(node_id).await; let elapsed = timer.elapsed().as_secs_f64(); - if let Ok(obs) = self.metrics.msg_process_latency.get_metric_with_label_values(&[&returned.to_string(), status]) { - obs.observe(elapsed); - } else { - log::error!("metrics: failed to observe msg_process_latency"); - } + self.metrics.observe_process_latency(elapsed, returned, status); delay_controller.sleep().await; } @@ -194,11 +190,11 @@ mod tests { fn test_jitter_below_500ms_within_sla() { let metrics = ChronosMetrics::new().unwrap(); // A 300ms jitter is within the 500ms SLA — must land in the <=0.5s bucket - metrics.msg_jitter.observe(0.3); - let families = metrics.registry.gather(); - let fam = families.iter().find(|f| f.get_name() == "msg_jitter").unwrap(); - let hist = fam.get_metric()[0].get_histogram(); - let bucket_500 = hist.get_bucket().iter().find(|b| (b.get_upper_bound() - 0.5).abs() < 1e-9).unwrap(); - assert_eq!(bucket_500.get_cumulative_count(), 1, "300ms jitter must be counted in the <=500ms bucket"); + metrics.observe_jitter(0.3); + let output = metrics.render_prometheus().unwrap(); + assert!( + output.contains("chronos_msg_jitter_bucket{le=\"0.5\"} 1"), + "300ms jitter must be counted in the <=500ms bucket" + ); } } diff --git a/chronos_bin/src/message_receiver.rs b/chronos_bin/src/message_receiver.rs index d79b548..6b420c1 100644 --- a/chronos_bin/src/message_receiver.rs +++ b/chronos_bin/src/message_receiver.rs @@ -88,7 +88,7 @@ impl MessageReceiver { // Uses the Kafka-assigned message timestamp; guards against clock skew with max(0). 
if let Some(kafka_ts_ms) = message.timestamp().to_millis() { let wait_secs = (Utc::now().timestamp_millis() - kafka_ts_ms).max(0) as f64 / 1000.0; - self.metrics.msg_wait_time.observe(wait_secs); + self.metrics.observe_wait_time(wait_secs); } let timer = std::time::Instant::now(); @@ -120,11 +120,7 @@ impl MessageReceiver { // msg_consume_latency: only record when destination was determined (valid message headers). if destination != "unknown" { let elapsed = timer.elapsed().as_secs_f64(); - if let Ok(obs) = self.metrics.msg_consume_latency.get_metric_with_label_values(&[destination, status]) { - obs.observe(elapsed); - } else { - log::error!("metrics: failed to observe msg_consume_latency"); - } + self.metrics.observe_consume_latency(elapsed, destination, status); } } diff --git a/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs b/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs index 19a2bc0..f849fb9 100644 --- a/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs +++ b/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs @@ -1,10 +1,5 @@ -// Generated from chronos_bin/src/metrics/spec.yaml. +// Generated from examples/weaver/registry/chronos/metrics.yaml by OpenTelemetry Weaver. // Do not edit by hand. -// -// This generated definition table is intentionally not imported by -// chronos_bin/src/metrics/mod.rs yet. The current hand-written Prometheus -// registry remains the runtime implementation until the generated registry -// replacement is wired in explicitly. 
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub enum MetricId { @@ -15,22 +10,31 @@ pub enum MetricId { MsgWaitTime, } -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug)] pub enum MetricKind { Counter, Histogram, } +impl MetricKind { + pub fn is_counter(self) -> bool { + matches!(self, Self::Counter) + } + + pub fn is_histogram(self) -> bool { + matches!(self, Self::Histogram) + } +} + #[derive(Clone, Copy, Debug)] pub struct MetricDefinition { pub id: MetricId, - pub rust_name: &'static str, - pub prometheus_name: &'static str, pub otel_name: &'static str, + pub prometheus_name: &'static str, pub description: &'static str, pub unit: Option<&'static str>, pub label_names: &'static [&'static str], - pub otel_label_names: &'static [&'static str], + pub prometheus_label_names: &'static [&'static str], pub kind: MetricKind, pub buckets: Option<&'static [f64]>, pub prewarm_label_values: &'static [&'static [&'static str]], @@ -39,65 +43,60 @@ pub struct MetricDefinition { pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ MetricDefinition { id: MetricId::MsgConsumeLatency, - rust_name: "msg_consume_latency", - prometheus_name: "msg_consume_latency", otel_name: "chronos.message.consume.duration", - description: "Duration of message_receiver::MessageReceiver::handle_message().", + prometheus_name: "msg_consume_latency", + description: "Duration of handle_message() in message_receiver.", unit: Some("s"), label_names: &["destination", "status"], - otel_label_names: &["chronos.destination", "chronos.status"], + prometheus_label_names: &["destination", "status"], kind: MetricKind::Histogram, buckets: Some(&[0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048]), prewarm_label_values: &[&["kafka", "pass"], &["kafka", "fail"], &["postgres", "pass"], &["postgres", "fail"]], }, MetricDefinition { id: MetricId::MsgJitter, - rust_name: "msg_jitter", - prometheus_name: "msg_jitter", otel_name: "chronos.message.jitter", 
+ prometheus_name: "msg_jitter", description: "Difference between actual publish time and client-requested deadline.", unit: Some("s"), label_names: &[], - otel_label_names: &[], + prometheus_label_names: &[], kind: MetricKind::Histogram, buckets: Some(&[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]), prewarm_label_values: &[], }, MetricDefinition { id: MetricId::MsgProcessLatency, - rust_name: "msg_process_latency", - prometheus_name: "msg_process_latency", otel_name: "chronos.message.process.duration", - description: "Duration of message_processor::MessageProcessor::processor_message_ready().", + prometheus_name: "msg_process_latency", + description: "Duration of processor_message_ready() loop in message_processor.", unit: Some("s"), label_names: &["returned", "status"], - otel_label_names: &["chronos.processor.returned", "chronos.status"], + prometheus_label_names: &["returned", "status"], kind: MetricKind::Histogram, buckets: Some(&[0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048]), prewarm_label_values: &[&["true", "pass"], &["true", "fail"], &["false", "pass"], &["false", "fail"]], }, MetricDefinition { id: MetricId::MsgReset, - rust_name: "msg_reset", - prometheus_name: "msg_reset", otel_name: "chronos.message.reset", - description: "Number of records reset by postgres::pg::Pg::reset_to_init_db() in the monitor task.", + prometheus_name: "msg_reset", + description: "Number of records reset by reset_to_init_db() in the monitor task.", unit: Some("{message}"), label_names: &[], - otel_label_names: &[], + prometheus_label_names: &[], kind: MetricKind::Counter, buckets: None, prewarm_label_values: &[], }, MetricDefinition { id: MetricId::MsgWaitTime, - rust_name: "msg_wait_time", - prometheus_name: "msg_wait_time", otel_name: "chronos.message.wait.duration", + prometheus_name: "msg_wait_time", description: "Time a message spent in the Kafka input queue before processing.", unit: Some("s"), label_names: &[], - 
otel_label_names: &[], + prometheus_label_names: &[], kind: MetricKind::Histogram, buckets: Some(&[0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2]), prewarm_label_values: &[], diff --git a/chronos_bin/src/metrics/generated/chronos_metrics.md b/chronos_bin/src/metrics/generated/chronos_metrics.md new file mode 100644 index 0000000..123cbe5 --- /dev/null +++ b/chronos_bin/src/metrics/generated/chronos_metrics.md @@ -0,0 +1,11 @@ +# Chronos Metrics + +Generated from `examples/weaver/registry/chronos/metrics.yaml` by OpenTelemetry Weaver. + +| Metric | Prometheus Name | Instrument | Unit | Attributes | Description | +| --- | --- | --- | --- | --- | --- | +| `chronos.message.consume.duration` | `msg_consume_latency` | `histogram` | `s` | `destination`, `status` | Duration of handle_message() in message_receiver. | +| `chronos.message.jitter` | `msg_jitter` | `histogram` | `s` | - | Difference between actual publish time and client-requested deadline. | +| `chronos.message.process.duration` | `msg_process_latency` | `histogram` | `s` | `returned`, `status` | Duration of processor_message_ready() loop in message_processor. | +| `chronos.message.reset` | `msg_reset` | `counter` | `{message}` | - | Number of records reset by reset_to_init_db() in the monitor task. | +| `chronos.message.wait.duration` | `msg_wait_time` | `histogram` | `s` | - | Time a message spent in the Kafka input queue before processing. 
| diff --git a/chronos_bin/src/metrics/generated/mod.rs b/chronos_bin/src/metrics/generated/mod.rs new file mode 100644 index 0000000..6851991 --- /dev/null +++ b/chronos_bin/src/metrics/generated/mod.rs @@ -0,0 +1,3 @@ +pub mod chronos_metric_definitions; + +pub use chronos_metric_definitions::*; diff --git a/chronos_bin/src/metrics/generated/resolved-registry.schema.json b/chronos_bin/src/metrics/generated/resolved-registry.schema.json new file mode 100644 index 0000000..081f975 --- /dev/null +++ b/chronos_bin/src/metrics/generated/resolved-registry.schema.json @@ -0,0 +1,1830 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ResolvedRegistry", + "description": "A resolved semantic convention registry used in the context of the template and policy\nengines.", + "type": "object", + "properties": { + "groups": { + "description": "A list of semantic convention groups.", + "type": "array", + "items": { + "$ref": "#/$defs/ResolvedGroup" + } + }, + "registry_url": { + "description": "The semantic convention registry url.", + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "registry_url", + "groups" + ], + "$defs": { + "AnyValueSpec": { + "description": "The AnyValueTypeSpec is a specification of a value that can be of any type.", + "oneOf": [ + { + "description": "A boolean attribute.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. 
If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "boolean" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "A integer attribute (signed 64 bit integer).", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. 
Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "int" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "A double attribute (double precision floating point (IEEE 754-1985)).", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "double" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "A string attribute.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "string" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "An array of strings attribute.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "string[]" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "An array of integer attribute.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "int[]" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "An array of double attribute.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "double[]" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "An array of boolean attribute.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "boolean[]" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "The value type is a map of key, value pairs", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "fields": { + "description": "The collection of key, values where the value is an `AnyValueSpec`", + "type": "array", + "items": { + "$ref": "#/$defs/AnyValueSpec" + } + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "map" + } + }, + "required": [ + "type", + "id", + "requirement_level", + "fields" + ] + }, + { + "description": "The value type is a map of key, value pairs", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "fields": { + "description": "The collection of key, values where the value is an `AnyValueSpec`", + "type": "array", + "items": { + "$ref": "#/$defs/AnyValueSpec" + } + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "map[]" + } + }, + "required": [ + "type", + "id", + "requirement_level", + "fields" + ] + }, + { + "description": "The value type will just be a bytes.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "bytes" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "The value type is not specified.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "undefined" + } + }, + "required": [ + "type", + "id", + "requirement_level" + ] + }, + { + "description": "An enum definition type.", + "type": "object", + "properties": { + "brief": { + "description": "A brief description of the value", + "type": "string" + }, + "examples": { + "description": "Sequence of examples for the value or single example\nvalue. If only a single example is provided, it can\ndirectly be reported without encapsulating it\ninto a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "members": { + "description": "List of enum entries.", + "type": "array", + "items": { + "$ref": "#/$defs/EnumEntriesSpec" + } + }, + "note": { + "description": "A more elaborate description of the value.\nIt defaults to an empty string.", + "type": "string" + }, + "requirement_level": { + "description": "Specifies if the field is mandatory. Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe field is \"recommended\". 
When set to\n\"conditionally_required\", the string provided as MUST\nspecify the conditions under which the field is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "stability": { + "description": "Specifies the stability of the value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "type": "string", + "const": "enum" + } + }, + "required": [ + "type", + "id", + "requirement_level", + "members" + ] + } + ] + }, + "Attribute": { + "description": "An attribute definition.", + "type": "object", + "properties": { + "annotations": { + "description": "Annotations for the group.", + "type": [ + "object", + "null" + ], + "additionalProperties": { + "$ref": "#/$defs/YamlValue" + } + }, + "brief": { + "description": "A brief description of the attribute.", + "type": "string" + }, + "deprecated": { + "description": "Specifies if the attribute is deprecated.", + "anyOf": [ + { + "$ref": "#/$defs/Deprecated" + }, + { + "type": "null" + } + ] + }, + "examples": { + "description": "Sequence of example values for the attribute or single example\nvalue. They are required only for string and string array\nattributes. Example values must be of the same type of the\nattribute. If only a single example is provided, it can directly\nbe reported without encapsulating it into a sequence/dictionary.", + "anyOf": [ + { + "$ref": "#/$defs/Examples" + }, + { + "type": "null" + } + ] + }, + "name": { + "description": "Attribute name.", + "type": "string" + }, + "note": { + "description": "A more elaborate description of the attribute.\nIt defaults to an empty string.", + "type": "string" + }, + "prefix": { + "description": "Specifies the prefix of the attribute.\nIf this parameter is set, the resolved id of the referenced attribute will\nhave group prefix added to it.\nIt defaults to false.", + "type": "boolean" + }, + "requirement_level": { + "description": "Specifies if the attribute is mandatory. 
Can be \"required\",\n\"conditionally_required\", \"recommended\" or \"opt_in\". When omitted,\nthe attribute is \"recommended\". When set to\n\"conditionally_required\", the string provided as the condition MUST\nspecify the conditions under which the attribute is required.", + "$ref": "#/$defs/RequirementLevel" + }, + "role": { + "description": "Whether the attribute is identifying or descriptive.", + "anyOf": [ + { + "$ref": "#/$defs/AttributeRole" + }, + { + "type": "null" + } + ] + }, + "sampling_relevant": { + "description": "Specifies if the attribute is (especially) relevant for sampling\nand thus should be set at span start. It defaults to false.\nNote: this field is experimental.", + "type": [ + "boolean", + "null" + ] + }, + "stability": { + "description": "Specifies the stability of the attribute.\nNote that, if stability is missing but deprecated is present, it will\nautomatically set the stability to deprecated. If deprecated is\npresent and stability differs from deprecated, this will result in an\nerror.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "tag": { + "description": "Associates a tag (\"sub-group\") to the attribute. It carries no\nparticular semantic meaning but can be used e.g.
for filtering\nin the markdown generator.", + "type": [ + "string", + "null" + ] + }, + "tags": { + "description": "A set of tags for the attribute.", + "anyOf": [ + { + "$ref": "#/$defs/Tags" + }, + { + "type": "null" + } + ] + }, + "type": { + "description": "Either a string literal denoting the type as a primitive or an\narray type, a template type or an enum definition.", + "$ref": "#/$defs/AttributeType" + }, + "value": { + "description": "The value of the attribute.\nNote: This is only used in a telemetry schema specification.", + "anyOf": [ + { + "$ref": "#/$defs/Value" + }, + { + "type": "null" + } + ] + } + }, + "additionalProperties": false, + "required": [ + "name", + "type", + "brief", + "requirement_level" + ] + }, + "AttributeLineage": { + "description": "Attribute lineage (at the field level).", + "type": "object", + "properties": { + "inherited_fields": { + "description": "A list of fields that are inherited from the source group.", + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "locally_overridden_fields": { + "description": "A list of fields that are overridden in the local group.", + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "source_group": { + "description": "The group id where the attribute is coming from.", + "type": "string" + } + }, + "required": [ + "source_group" + ] + }, + "AttributeRole": { + "description": "The different roles for attributes in groups.", + "oneOf": [ + { + "description": "The attribute is considered identifying for the signal it is associated with.", + "type": "string", + "const": "identifying" + }, + { + "description": "The attribute is considered descriptive for the signal it is associated with.", + "type": "string", + "const": "descriptive" + } + ] + }, + "AttributeType": { + "description": "The different types of attributes (specification).", + "anyOf": [ + { + "description": "Primitive or array type.", + "$ref": 
"#/$defs/PrimitiveOrArrayTypeSpec" + }, + { + "description": "A template type.", + "$ref": "#/$defs/TemplateTypeSpec" + }, + { + "description": "An enum definition type.", + "type": "object", + "properties": { + "members": { + "description": "List of enum entries.", + "type": "array", + "items": { + "$ref": "#/$defs/EnumEntriesSpec" + } + } + }, + "required": [ + "members" + ] + } + ] + }, + "BasicRequirementLevelSpec": { + "description": "The different types of basic requirement levels.", + "oneOf": [ + { + "description": "A required requirement level.", + "type": "string", + "const": "required" + }, + { + "description": "An optional requirement level.", + "type": "string", + "const": "recommended" + }, + { + "description": "An opt-in requirement level.", + "type": "string", + "const": "opt_in" + } + ] + }, + "Deprecated": { + "description": "The different ways to deprecate an attribute, a metric, ...", + "oneOf": [ + { + "description": "The telemetry object containing the deprecated field has been renamed to an\nexisting or a new telemetry object.", + "type": "object", + "properties": { + "note": { + "description": "The note to provide more context about the deprecation.", + "type": "string" + }, + "reason": { + "type": "string", + "const": "renamed" + }, + "renamed_to": { + "description": "The new name of the telemetry object.", + "type": "string" + } + }, + "required": [ + "reason", + "renamed_to", + "note" + ] + }, + { + "description": "The telemetry object containing the deprecated field has been obsoleted\nbecause it no longer exists and has no valid replacement.\n\nThe `brief` field should contain the reason why the field has been obsoleted.", + "type": "object", + "properties": { + "note": { + "description": "The note to provide more context about the deprecation.", + "type": "string" + }, + "reason": { + "type": "string", + "const": "obsoleted" + } + }, + "required": [ + "reason", + "note" + ] + }, + { + "description": "The telemetry object containing the 
deprecated field has been deprecated for\ncomplex reasons (split, merge, ...) which are currently not precisely defined\nin the supported deprecation reasons.\n\nThe `brief` field should contain the reason for this uncategorized deprecation.", + "type": "object", + "properties": { + "note": { + "description": "The note to provide more context about the deprecation.", + "type": "string" + }, + "reason": { + "type": "string", + "const": "uncategorized" + } + }, + "required": [ + "reason", + "note" + ] + }, + { + "description": "This variant is used to capture old, unstructured deprecated \"string\".\nUsed for backward-compatibility only.", + "type": "object", + "properties": { + "note": { + "description": "The note to provide more context about the deprecation.", + "type": "string" + }, + "reason": { + "type": "string", + "const": "unspecified" + } + }, + "required": [ + "reason", + "note" + ] + } + ] + }, + "EnumEntriesSpec": { + "description": "Possible enum entries.", + "type": "object", + "properties": { + "annotations": { + "description": "Annotations for the member.", + "type": [ + "object", + "null" + ], + "additionalProperties": { + "$ref": "#/$defs/YamlValue" + } + }, + "brief": { + "description": "Brief description of the enum entry value.\nIt defaults to the value of id.", + "type": [ + "string", + "null" + ] + }, + "deprecated": { + "description": "Deprecation note.", + "anyOf": [ + { + "$ref": "#/$defs/Deprecated" + }, + { + "type": "null" + } + ] + }, + "id": { + "description": "String that uniquely identifies the enum entry.", + "type": "string" + }, + "note": { + "description": "Longer description.\nIt defaults to an empty string.", + "type": [ + "string", + "null" + ] + }, + "stability": { + "description": "Stability of this enum value.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "value": { + "description": "String, int, or boolean; value of the enum entry.", + "$ref": "#/$defs/ValueSpec" + } + }, + 
"additionalProperties": false, + "required": [ + "id", + "value" + ] + }, + "Examples": { + "description": "The different types of examples.", + "anyOf": [ + { + "description": "A boolean example.", + "type": "boolean" + }, + { + "description": "A integer example.", + "type": "integer", + "format": "int64" + }, + { + "description": "A double example.", + "type": "number", + "format": "double" + }, + { + "description": "A string example.", + "type": "string" + }, + { + "description": "A any example.", + "$ref": "#/$defs/ValueSpec" + }, + { + "description": "A array of integers example.", + "type": "array", + "items": { + "type": "integer", + "format": "int64" + } + }, + { + "description": "A array of doubles example.", + "type": "array", + "items": { + "type": "number", + "format": "double" + } + }, + { + "description": "A array of bools example.", + "type": "array", + "items": { + "type": "boolean" + } + }, + { + "description": "A array of strings example.", + "type": "array", + "items": { + "type": "string" + } + }, + { + "description": "A array of anys example.", + "type": "array", + "items": { + "$ref": "#/$defs/ValueSpec" + } + }, + { + "description": "List of arrays of integers example.", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer", + "format": "int64" + } + } + }, + { + "description": "List of arrays of doubles example.", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "number", + "format": "double" + } + } + }, + { + "description": "List of arrays of bools example.", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "boolean" + } + } + }, + { + "description": "List of arrays of strings example.", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "string" + } + } + } + ] + }, + "GroupLineage": { + "description": "Group lineage.", + "type": "object", + "properties": { + "attributes": { + "description": "The lineage per attribute.\n\nNote: Use a 
BTreeMap to ensure a deterministic order of attributes.\nThis is important to keep unit tests stable.", + "type": "object", + "additionalProperties": { + "$ref": "#/$defs/AttributeLineage" + } + }, + "extends_group": { + "description": "The group that this group extended, if available.", + "type": [ + "string", + "null" + ] + }, + "includes_group": { + "description": "(V2 Only) Attribute groups included in this group.", + "type": "array", + "items": { + "type": "string" + } + }, + "provenance": { + "description": "The provenance of the source file where the group is defined.", + "$ref": "#/$defs/Provenance" + } + }, + "required": [ + "provenance" + ] + }, + "GroupType": { + "description": "The different types of groups: `attribute_group`, `span`, `event`, `metric`, `entity`, `scope`.\n\nNote: The `resource` type is no longer used and is an alias for `entity`.", + "oneOf": [ + { + "description": "Attribute group (attribute_group type) defines a set of attributes that\ncan be declared once and referenced by semantic conventions for\ndifferent signals, for example spans and logs. 
Attribute groups don't\nhave any specific fields and follow the general semconv semantics.", + "type": "string", + "const": "attribute_group" + }, + { + "description": "Span semantic convention.", + "type": "string", + "const": "span" + }, + { + "description": "Event semantic convention.", + "type": "string", + "const": "event" + }, + { + "description": "Metric semantic convention.", + "type": "string", + "const": "metric" + }, + { + "description": "The metric group semconv is a group where related metric attributes can\nbe defined and then referenced from other metric groups using ref.", + "type": "string", + "const": "metric_group" + }, + { + "description": "Entity semantic convention.", + "type": "string", + "const": "entity" + }, + { + "description": "Scope.", + "type": "string", + "const": "scope" + }, + { + "description": "Undefined group type.", + "type": "string", + "const": "undefined" + } + ] + }, + "InstrumentSpec": { + "description": "The type of the metric.", + "oneOf": [ + { + "description": "An up-down counter metric.", + "type": "string", + "const": "updowncounter" + }, + { + "description": "A counter metric.", + "type": "string", + "const": "counter" + }, + { + "description": "A gauge metric.", + "type": "string", + "const": "gauge" + }, + { + "description": "A histogram metric.", + "type": "string", + "const": "histogram" + } + ] + }, + "PrimitiveOrArrayTypeSpec": { + "description": "Primitive or array types.", + "oneOf": [ + { + "description": "A boolean attribute.", + "type": "string", + "const": "boolean" + }, + { + "description": "A integer attribute (signed 64 bit integer).", + "type": "string", + "const": "int" + }, + { + "description": "A double attribute (double precision floating point (IEEE 754-1985)).", + "type": "string", + "const": "double" + }, + { + "description": "A string attribute.", + "type": "string", + "const": "string" + }, + { + "description": "An any type attribute (accepts any valid value).", + "type": "string", + "const": 
"any" + }, + { + "description": "An array of strings attribute.", + "type": "string", + "const": "string[]" + }, + { + "description": "An array of integer attribute.", + "type": "string", + "const": "int[]" + }, + { + "description": "An array of double attribute.", + "type": "string", + "const": "double[]" + }, + { + "description": "An array of boolean attribute.", + "type": "string", + "const": "boolean[]" + } + ] + }, + "Provenance": { + "description": "The provenance a semantic convention specification file.", + "type": "object", + "properties": { + "path": { + "description": "The path to the specification file.\n\nThis is the path is only available *locally*. When publishing resolved schemas,\nthis field is not included.", + "type": "string" + }, + "schema_url": { + "description": "The schema URL where this was specified.\n\nThe Schema url contains the registry id and the version of the schema.\nIt can be used to detect conflicts or resolve multiple \"ids\" existing across\ndependency chains but being the same thing, conceptually.", + "$ref": "#/$defs/SchemaUrl" + } + }, + "required": [ + "schema_url", + "path" + ] + }, + "RequirementLevel": { + "description": "The different requirement level specifications.", + "anyOf": [ + { + "description": "A basic requirement level.", + "$ref": "#/$defs/BasicRequirementLevelSpec" + }, + { + "description": "A conditional requirement level.", + "type": "object", + "properties": { + "conditionally_required": { + "description": "The description of the condition.", + "type": "string" + } + }, + "required": [ + "conditionally_required" + ] + }, + { + "description": "A recommended requirement level.", + "type": "object", + "properties": { + "recommended": { + "description": "The description of the recommendation.", + "type": "string" + } + }, + "required": [ + "recommended" + ] + }, + { + "description": "An opt in requirement level.", + "type": "object", + "properties": { + "opt_in": { + "description": "The description of the 
recommendation.", + "type": "string" + } + }, + "required": [ + "opt_in" + ] + } + ] + }, + "ResolvedGroup": { + "description": "Resolved group specification used in the context of the template engine.", + "type": "object", + "properties": { + "annotations": { + "description": "Annotations for the group.", + "type": [ + "object", + "null" + ], + "additionalProperties": { + "$ref": "#/$defs/YamlValue" + } + }, + "attributes": { + "description": "List of attributes that belong to the semantic convention.", + "type": "array", + "items": { + "$ref": "#/$defs/Attribute" + } + }, + "body": { + "description": "The body specification used for event semantic conventions.", + "anyOf": [ + { + "$ref": "#/$defs/AnyValueSpec" + }, + { + "type": "null" + } + ] + }, + "brief": { + "description": "A brief description of the semantic convention.", + "type": "string" + }, + "deprecated": { + "description": "Specifies if the semantic convention is deprecated. The string\nprovided as `description` MUST specify why it's deprecated and/or what\nto use instead. See also stability.", + "anyOf": [ + { + "$ref": "#/$defs/Deprecated" + }, + { + "type": "null" + } + ] + }, + "display_name": { + "description": "The readable name for attribute groups used when generating registry tables.", + "type": [ + "string", + "null" + ] + }, + "entity_associations": { + "description": "The associated entities of this group.", + "type": "array", + "items": { + "type": "string" + } + }, + "events": { + "description": "List of strings that specify the ids of event semantic conventions\nassociated with this span semantic convention.\nNote: only valid if type is span", + "type": "array", + "items": { + "type": "string" + } + }, + "extends": { + "description": "Reference another semantic convention id. 
It inherits\nall attributes defined in the specified semantic\nconvention.", + "type": [ + "string", + "null" + ] + }, + "id": { + "description": "The id that uniquely identifies the semantic convention.", + "type": "string" + }, + "instrument": { + "description": "The instrument type that should be used to record the metric. Note that\nthe semantic conventions must be written using the names of the\nsynchronous instrument types (counter, gauge, updowncounter and\nhistogram).\nFor more details: [Metrics semantic conventions - Instrument types](https://github.com/open-telemetry/opentelemetry-specification/tree/main/specification/metrics/semantic_conventions#instrument-types).\nNote: This field is required if type is metric.", + "anyOf": [ + { + "$ref": "#/$defs/InstrumentSpec" + }, + { + "type": "null" + } + ] + }, + "lineage": { + "description": "The lineage of the group.", + "anyOf": [ + { + "$ref": "#/$defs/GroupLineage" + }, + { + "type": "null" + } + ] + }, + "metric_name": { + "description": "The metric name as described by the [OpenTelemetry Specification](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/data-model.md#timeseries-model).\nNote: This field is required if type is metric.", + "type": [ + "string", + "null" + ] + }, + "name": { + "description": "The name of the event. 
If not specified, the prefix is used.\nIf prefix is empty (or unspecified), name is required.", + "type": [ + "string", + "null" + ] + }, + "note": { + "description": "A more elaborate description of the semantic convention.\nIt defaults to an empty string.", + "type": "string" + }, + "prefix": { + "description": "Prefix for the attributes for this semantic convention.\nIt defaults to an empty string.", + "type": "string" + }, + "span_kind": { + "description": "Specifies the kind of the span.\nNote: only valid if type is span", + "anyOf": [ + { + "$ref": "#/$defs/SpanKindSpec" + }, + { + "type": "null" + } + ] + }, + "stability": { + "description": "Specifies the stability of the semantic convention.\nNote that, if stability is missing but deprecated is present, it will\nautomatically set the stability to deprecated. If deprecated is\npresent and stability differs from deprecated, this will result in an\nerror.", + "anyOf": [ + { + "$ref": "#/$defs/Stability" + }, + { + "type": "null" + } + ] + }, + "type": { + "description": "The type of the group including the specific fields for each type.", + "$ref": "#/$defs/GroupType" + }, + "unit": { + "description": "The unit in which the metric is measured, which should adhere to the\n[guidelines](https://github.com/open-telemetry/opentelemetry-specification/tree/main/specification/metrics/semantic_conventions#instrument-units).\nNote: This field is required if type is metric.", + "type": [ + "string", + "null" + ] + } + }, + "required": [ + "id", + "type", + "brief" + ] + }, + "SchemaUrl": { + "description": "Represents the schema URL of a registry, which serves as a unique identifier for the registry\nalong with its version.", + "type": "object", + "properties": { + "url": { + "description": "The schema URL string.", + "type": "string" + } + }, + "required": [ + "url" + ] + }, + "SpanKindSpec": { + "description": "The span kind.", + "oneOf": [ + { + "description": "An internal span.", + "type": "string", + "const": 
"internal" + }, + { + "description": "A client span.", + "type": "string", + "const": "client" + }, + { + "description": "A server span.", + "type": "string", + "const": "server" + }, + { + "description": "A producer span.", + "type": "string", + "const": "producer" + }, + { + "description": "A consumer span.", + "type": "string", + "const": "consumer" + } + ] + }, + "Stability": { + "description": "The level of stability for a definition. Defined in [OTEP-232](https://github.com/open-telemetry/oteps/blob/main/text/0232-maturity-of-otel.md)", + "oneOf": [ + { + "description": "A deprecated definition.", + "type": "string", + "const": "deprecated", + "deprecated": true + }, + { + "description": "A stable definition.", + "type": "string", + "const": "stable" + }, + { + "description": "A definition in development. Formally known as experimental.", + "type": "string", + "const": "development" + }, + { + "description": "An alpha definition.", + "type": "string", + "const": "alpha" + }, + { + "description": "A beta definition.", + "type": "string", + "const": "beta" + }, + { + "description": "A release candidate definition.", + "type": "string", + "const": "release_candidate" + } + ] + }, + "Tags": { + "description": "A set of tags.\n\nExamples of tags:\n- sensitivity: pii\n- sensitivity: phi\n- data_classification: restricted\n- semantic_type: email\n- semantic_type: first_name\n- owner:\n- provenance: browser_sensor", + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "TemplateTypeSpec": { + "description": "Template types.", + "oneOf": [ + { + "description": "A boolean attribute.", + "type": "string", + "const": "template[boolean]" + }, + { + "description": "A integer attribute.", + "type": "string", + "const": "template[int]" + }, + { + "description": "A double attribute.", + "type": "string", + "const": "template[double]" + }, + { + "description": "A string attribute.", + "type": "string", + "const": "template[string]" + }, + { + 
"description": "A any attribute.", + "type": "string", + "const": "template[any]" + }, + { + "description": "An array of strings attribute.", + "type": "string", + "const": "template[string[]]" + }, + { + "description": "An array of integer attribute.", + "type": "string", + "const": "template[int[]]" + }, + { + "description": "An array of double attribute.", + "type": "string", + "const": "template[double[]]" + }, + { + "description": "An array of boolean attribute.", + "type": "string", + "const": "template[boolean[]]" + } + ] + }, + "Value": { + "description": "The different types of values.", + "oneOf": [ + { + "description": "A integer value.", + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "Int" + }, + "value": { + "description": "The value", + "type": "integer", + "format": "int64" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "description": "A double value.", + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "Double" + }, + "value": { + "description": "The value", + "type": "number", + "format": "double" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "description": "A string value.", + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "String" + }, + "value": { + "description": "The value", + "type": "string" + } + }, + "required": [ + "type", + "value" + ] + } + ] + }, + "ValueSpec": { + "description": "The different types of values.", + "anyOf": [ + { + "description": "A integer value.", + "type": "integer", + "format": "int64" + }, + { + "description": "A double value.", + "type": "number", + "format": "double" + }, + { + "description": "A string value.", + "type": "string" + }, + { + "description": "A boolean value.", + "type": "boolean" + } + ] + }, + "YamlValue": { + "type": [ + "null", + "boolean", + "object", + "array", + "number", + "string" + ] + } + } +} \ No newline at end of file diff --git a/chronos_bin/src/metrics/mod.rs 
b/chronos_bin/src/metrics/mod.rs index ea3574c..311bf4e 100644 --- a/chronos_bin/src/metrics/mod.rs +++ b/chronos_bin/src/metrics/mod.rs @@ -1,3 +1,4 @@ +pub mod generated; pub mod registry; pub mod server; pub use registry::ChronosMetrics; diff --git a/chronos_bin/src/metrics/registry.rs b/chronos_bin/src/metrics/registry.rs index 2f547ad..2b8750c 100644 --- a/chronos_bin/src/metrics/registry.rs +++ b/chronos_bin/src/metrics/registry.rs @@ -1,257 +1,435 @@ -use prometheus::{exponential_buckets, histogram_opts, opts, Counter, Histogram, HistogramVec, Registry}; +use std::collections::HashMap; +use std::env; -/// All Prometheus metrics for Chronos. -/// Uses a per-instance Registry so tests can create isolated instances -/// without "already registered" collisions. +use opentelemetry::global; +use opentelemetry::metrics::{Counter as OtlpCounter, Histogram as OtlpHistogram, Unit}; +use opentelemetry::KeyValue; +use opentelemetry_otlp::WithExportConfig; +use prometheus::{histogram_opts, opts, CounterVec as PromCounterVec, HistogramVec as PromHistogramVec, Registry}; + +use crate::metrics::generated::{MetricDefinition, MetricId, MetricKind, METRIC_DEFINITIONS}; + +const OTEL_METRICS_EXPORTER: &str = "OTEL_METRICS_EXPORTER"; +const OTEL_EXPORTER_OTLP_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_ENDPOINT"; +const OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"; +const OTEL_EXPORTER_OTLP_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_PROTOCOL"; +const OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"; +const PROMETHEUS_NAMESPACE: &str = "chronos"; + +type MetricLabels<'a> = &'a [(&'static str, String)]; + +trait MetricsBackend: Send + Sync { + fn inc_counter(&self, id: MetricId, value: u64, labels: MetricLabels<'_>); + fn observe_histogram(&self, id: MetricId, value: f64, labels: MetricLabels<'_>); + fn render_prometheus(&self) -> Option; + fn shutdown(&self); +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum 
MetricsExporter { + Prometheus, + Otlp, +} + +impl MetricsExporter { + pub fn from_env() -> Result> { + match env::var(OTEL_METRICS_EXPORTER).unwrap_or_else(|_| "prometheus".to_string()).as_str() { + "prometheus" => Ok(Self::Prometheus), + "otlp" => { + require_grpc_protocol()?; + Ok(Self::Otlp) + } + "none" => Err("metrics exporter disabled by OTEL_METRICS_EXPORTER=none".into()), + other => Err(format!("unsupported {OTEL_METRICS_EXPORTER} value: {other}").into()), + } + } +} + +/// Runtime metrics facade for Chronos. +/// +/// Metric definitions are generated by OpenTelemetry Weaver in +/// `metrics/generated/chronos_metric_definitions.rs`; this facade only chooses +/// a Prometheus or OTLP backend and records by generated metric IDs. pub struct ChronosMetrics { - pub registry: Registry, - /// Duration of handle_message() in message_receiver. Labels: [destination, status] - /// destination = "kafka" | "postgres" - /// status = "pass" | "fail" - pub msg_consume_latency: HistogramVec, - /// Duration of processor_message_ready() loop in message_processor. Labels: [returned, status] - /// returned = "true" (no rows, loop returned early) | "false" (rows processed) - /// status = "pass" | "fail" - pub msg_process_latency: HistogramVec, - /// Time a message spent in the Kafka input queue before being processed. - pub msg_wait_time: Histogram, - /// Difference between actual publish time and client-requested deadline (jitter). - /// Includes an explicit 0.5s bucket matching the 500ms SLA. - pub msg_jitter: Histogram, - /// Number of records reset by reset_to_init_db() (the monitor task). 
- pub msg_reset: Counter, + exporter: MetricsExporter, + backend: Box, } impl std::fmt::Debug for ChronosMetrics { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("ChronosMetrics").finish() + f.debug_struct("ChronosMetrics").field("exporter", &self.exporter).finish() } } impl ChronosMetrics { - pub fn new() -> Result { - let registry = Registry::new(); - - let consume_buckets = exponential_buckets(0.001, 2.0, 12)?; - let msg_consume_latency = HistogramVec::new( - histogram_opts!("msg_consume_latency", "Duration of handle_message() in message_receiver", consume_buckets), - &["destination", "status"], - )?; - registry.register(Box::new(msg_consume_latency.clone()))?; - // Pre-warm all label combinations so the metric family always appears in gather() - // output from startup — HistogramVec is omitted from gather() until at least one - // label combination has been touched. - for destination in &["kafka", "postgres"] { - for status in &["pass", "fail"] { - msg_consume_latency.get_metric_with_label_values(&[destination, status])?; + pub fn new() -> Result> { + Self::from_env() + } + + pub fn from_env() -> Result> { + let exporter = MetricsExporter::from_env()?; + let backend: Box = match exporter { + MetricsExporter::Prometheus => Box::new(PrometheusMetricsBackend::new()?), + MetricsExporter::Otlp => Box::new(OtlpMetricsBackend::new()?), + }; + + Ok(Self { exporter, backend }) + } + + pub fn is_prometheus(&self) -> bool { + self.exporter == MetricsExporter::Prometheus + } + + pub fn render_prometheus(&self) -> Option { + self.backend.render_prometheus() + } + + pub fn shutdown(&self) { + self.backend.shutdown(); + } + + pub fn observe_consume_latency(&self, seconds: f64, destination: &'static str, status: &'static str) { + self.observe_histogram(MetricId::MsgConsumeLatency, seconds, consume_labels(destination, status)); + } + + pub fn observe_process_latency(&self, seconds: f64, returned: bool, status: &'static str) { + 
self.observe_histogram(MetricId::MsgProcessLatency, seconds, process_labels(returned, status)); + } + + pub fn observe_wait_time(&self, seconds: f64) { + self.observe_histogram(MetricId::MsgWaitTime, seconds, Vec::new()); + } + + pub fn observe_jitter(&self, seconds: f64) { + self.observe_histogram(MetricId::MsgJitter, seconds, Vec::new()); + } + + pub fn messages_reset(&self, count: u64) { + self.inc_counter(MetricId::MsgReset, count, Vec::new()); + } + + fn inc_counter(&self, id: MetricId, value: u64, labels: Vec<(&'static str, String)>) { + self.backend.inc_counter(id, value, &labels); + } + + fn observe_histogram(&self, id: MetricId, value: f64, labels: Vec<(&'static str, String)>) { + self.backend.observe_histogram(id, value, &labels); + } +} + +impl Drop for ChronosMetrics { + fn drop(&mut self) { + self.backend.shutdown(); + } +} + +struct PrometheusMetricsBackend { + registry: Registry, + counters: HashMap, + histograms: HashMap, +} + +impl PrometheusMetricsBackend { + fn new() -> Result { + let registry = Registry::new_custom(Some(PROMETHEUS_NAMESPACE.to_string()), None)?; + let mut counters = HashMap::new(); + let mut histograms = HashMap::new(); + + for definition in METRIC_DEFINITIONS { + match definition.kind { + MetricKind::Counter => { + let metric = PromCounterVec::new(opts!(definition.prometheus_name, definition.description), definition.prometheus_label_names)?; + registry.register(Box::new(metric.clone()))?; + prewarm_counter(definition, &metric)?; + counters.insert(definition.id, metric); + } + MetricKind::Histogram => { + let opts = match definition.buckets { + Some(buckets) => histogram_opts!(definition.prometheus_name, definition.description, buckets.to_vec()), + None => histogram_opts!(definition.prometheus_name, definition.description), + }; + let metric = PromHistogramVec::new(opts, definition.prometheus_label_names)?; + registry.register(Box::new(metric.clone()))?; + prewarm_histogram(definition, &metric)?; + 
histograms.insert(definition.id, metric); + } } } - let process_buckets = exponential_buckets(0.001, 2.0, 12)?; - let msg_process_latency = HistogramVec::new( - histogram_opts!( - "msg_process_latency", - "Duration of processor_message_ready() loop in message_processor", - process_buckets - ), - &["returned", "status"], - )?; - registry.register(Box::new(msg_process_latency.clone()))?; - // Pre-warm all label combinations for the same reason as msg_consume_latency above. - for returned in &["true", "false"] { - for status in &["pass", "fail"] { - msg_process_latency.get_metric_with_label_values(&[returned, status])?; + Ok(Self { + registry, + counters, + histograms, + }) + } +} + +fn prewarm_counter(definition: &MetricDefinition, metric: &PromCounterVec) -> Result<(), prometheus::Error> { + if definition.prometheus_label_names.is_empty() { + metric.get_metric_with_label_values(&[])?; + return Ok(()); + } + + for label_values in definition.prewarm_label_values { + metric.get_metric_with_label_values(label_values)?; + } + + Ok(()) +} + +fn prewarm_histogram(definition: &MetricDefinition, metric: &PromHistogramVec) -> Result<(), prometheus::Error> { + if definition.prometheus_label_names.is_empty() { + metric.get_metric_with_label_values(&[])?; + return Ok(()); + } + + for label_values in definition.prewarm_label_values { + metric.get_metric_with_label_values(label_values)?; + } + + Ok(()) +} + +impl MetricsBackend for PrometheusMetricsBackend { + fn inc_counter(&self, id: MetricId, value: u64, labels: MetricLabels<'_>) { + if let Some(counter) = self.counters.get(&id) { + let label_values = prometheus_label_values(id, labels); + match counter.get_metric_with_label_values(&label_values) { + Ok(metric) => metric.inc_by(value as f64), + Err(err) => log::error!("metrics: failed to record counter {:?}: {}", id, err), } } + } - let wait_buckets = exponential_buckets(0.1, 2.0, 14)?; - let msg_wait_time = Histogram::with_opts(histogram_opts!( - "msg_wait_time", - "Time a 
message spent in the Kafka input queue before processing", - wait_buckets - ))?; - registry.register(Box::new(msg_wait_time.clone()))?; - - // Custom buckets with explicit 0.5s boundary for the 500ms SLA - let jitter_buckets = vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]; - let msg_jitter = Histogram::with_opts(histogram_opts!( - "msg_jitter", - "Difference between actual publish time and client-requested deadline", - jitter_buckets - ))?; - registry.register(Box::new(msg_jitter.clone()))?; - - let msg_reset = Counter::with_opts(opts!("msg_reset", "Number of records reset by reset_to_init_db()"))?; - registry.register(Box::new(msg_reset.clone()))?; - - Ok(ChronosMetrics { - registry, - msg_consume_latency, - msg_process_latency, - msg_wait_time, - msg_jitter, - msg_reset, + fn observe_histogram(&self, id: MetricId, value: f64, labels: MetricLabels<'_>) { + if let Some(histogram) = self.histograms.get(&id) { + let label_values = prometheus_label_values(id, labels); + match histogram.get_metric_with_label_values(&label_values) { + Ok(metric) => metric.observe(value), + Err(err) => log::error!("metrics: failed to observe histogram {:?}: {}", id, err), + } + } + } + + fn render_prometheus(&self) -> Option { + use prometheus::{Encoder, TextEncoder}; + + let encoder = TextEncoder::new(); + let mut buffer = Vec::new(); + encoder.encode(&self.registry.gather(), &mut buffer).ok()?; + String::from_utf8(buffer).ok() + } + + fn shutdown(&self) {} +} + +struct OtlpMetricsBackend { + provider: opentelemetry_sdk::metrics::MeterProvider, + counters: HashMap>, + histograms: HashMap>, +} + +impl OtlpMetricsBackend { + fn new() -> Result> { + let endpoint = env::var(OTEL_EXPORTER_OTLP_METRICS_ENDPOINT) + .or_else(|_| env::var(OTEL_EXPORTER_OTLP_ENDPOINT)) + .unwrap_or_else(|_| "http://127.0.0.1:4317".to_string()); + let exporter = opentelemetry_otlp::new_exporter().tonic().with_env().with_endpoint(endpoint); + let provider = opentelemetry_otlp::new_pipeline() + 
.metrics(opentelemetry::runtime::Tokio) + .with_exporter(exporter) + .build()?; + + global::set_meter_provider(provider.clone()); + let meter = global::meter("chronos"); + + let mut counters = HashMap::new(); + let mut histograms = HashMap::new(); + + for definition in METRIC_DEFINITIONS { + match definition.kind { + MetricKind::Counter => { + let mut builder = meter.u64_counter(definition.otel_name).with_description(definition.description); + if let Some(unit) = definition.unit { + builder = builder.with_unit(Unit::new(unit)); + } + counters.insert(definition.id, builder.init()); + } + MetricKind::Histogram => { + let mut builder = meter.f64_histogram(definition.otel_name).with_description(definition.description); + if let Some(unit) = definition.unit { + builder = builder.with_unit(Unit::new(unit)); + } + histograms.insert(definition.id, builder.init()); + } + } + } + + Ok(Self { + provider, + counters, + histograms, }) } } +impl MetricsBackend for OtlpMetricsBackend { + fn inc_counter(&self, id: MetricId, value: u64, labels: MetricLabels<'_>) { + if let Some(counter) = self.counters.get(&id) { + counter.add(value, &labels_to_key_values(labels)); + } + } + + fn observe_histogram(&self, id: MetricId, value: f64, labels: MetricLabels<'_>) { + if let Some(histogram) = self.histograms.get(&id) { + histogram.record(value, &labels_to_key_values(labels)); + } + } + + fn render_prometheus(&self) -> Option { + None + } + + fn shutdown(&self) { + if let Err(err) = self.provider.force_flush(&opentelemetry::Context::current()) { + log::error!("failed to flush OTLP metrics: {}", err); + } + if let Err(err) = self.provider.shutdown() { + log::error!("failed to shut down OTLP metrics provider: {}", err); + } + } +} + +fn require_grpc_protocol() -> Result<(), Box> { + let protocol = env::var(OTEL_EXPORTER_OTLP_METRICS_PROTOCOL) + .or_else(|_| env::var(OTEL_EXPORTER_OTLP_PROTOCOL)) + .unwrap_or_else(|_| "grpc".to_string()); + + if protocol == "grpc" { + Ok(()) + } else { + 
Err(format!("unsupported OTLP metrics protocol {protocol:?}; use grpc").into()) + } +} + +fn consume_labels(destination: &'static str, status: &'static str) -> Vec<(&'static str, String)> { + vec![("destination", destination.to_string()), ("status", status.to_string())] +} + +fn process_labels(returned: bool, status: &'static str) -> Vec<(&'static str, String)> { + vec![("returned", returned.to_string()), ("status", status.to_string())] +} + +fn metric_definition(id: MetricId) -> Option<&'static MetricDefinition> { + METRIC_DEFINITIONS.iter().find(|definition| definition.id == id) +} + +fn prometheus_label_values<'a>(id: MetricId, labels: MetricLabels<'a>) -> Vec<&'a str> { + let Some(definition) = metric_definition(id) else { + return Vec::new(); + }; + + definition + .label_names + .iter() + .map(|name| { + labels + .iter() + .find(|(label_name, _)| label_name == name) + .map(|(_, value)| value.as_str()) + .unwrap_or("unknown") + }) + .collect() +} + +fn labels_to_key_values(labels: MetricLabels<'_>) -> Vec { + labels.iter().map(|(key, value)| KeyValue::new(*key, value.clone())).collect() +} + #[cfg(test)] mod tests { use super::*; - use prometheus::{Encoder, TextEncoder}; + use crate::metrics::generated::METRIC_DEFINITIONS; + use serial_test::serial; - #[test] - fn test_metrics_registry_creates_successfully() { - assert!(ChronosMetrics::new().is_ok()); + fn prometheus_metrics() -> ChronosMetrics { + env::remove_var(OTEL_METRICS_EXPORTER); + ChronosMetrics::new().unwrap() } #[test] - fn test_msg_consume_latency_records_observation() { - let metrics = ChronosMetrics::new().unwrap(); - metrics - .msg_consume_latency - .get_metric_with_label_values(&["kafka", "pass"]) - .unwrap() - .observe(0.05); - - let families = metrics.registry.gather(); - let fam = families.iter().find(|f| f.get_name() == "msg_consume_latency").unwrap(); - // With pre-warming there are 4 entries; find the kafka/pass one by its labels. 
- let kafka_pass = fam.get_metric().iter().find(|m| { - m.get_label().iter().any(|l| l.get_name() == "destination" && l.get_value() == "kafka") - && m.get_label().iter().any(|l| l.get_name() == "status" && l.get_value() == "pass") - }); - assert!(kafka_pass.is_some(), "kafka/pass label combination must exist"); - let sample_sum = kafka_pass.unwrap().get_histogram().get_sample_sum(); - assert!((sample_sum - 0.05).abs() < 1e-9); + #[serial] + fn metrics_registry_creates_successfully() { + env::remove_var(OTEL_METRICS_EXPORTER); + assert!(ChronosMetrics::new().is_ok()); } #[test] - fn test_msg_jitter_has_500ms_bucket() { - let metrics = ChronosMetrics::new().unwrap(); - // Observe a value just below 500ms - metrics.msg_jitter.observe(0.499); + #[serial] + fn generated_definitions_drive_all_runtime_metrics() { + let metrics = prometheus_metrics(); + let output = metrics.render_prometheus().unwrap(); - let families = metrics.registry.gather(); - let fam = families.iter().find(|f| f.get_name() == "msg_jitter").unwrap(); - let histogram = fam.get_metric()[0].get_histogram(); - - let bucket_500ms = histogram.get_bucket().iter().find(|b| (b.get_upper_bound() - 0.5).abs() < 1e-9); - assert!(bucket_500ms.is_some(), "0.5s bucket must exist in msg_jitter"); - assert_eq!( - bucket_500ms.unwrap().get_cumulative_count(), - 1, - "0.499s observation must be counted in the <=0.5s bucket" - ); + for definition in METRIC_DEFINITIONS { + assert!( + output.contains(&format!("# HELP {PROMETHEUS_NAMESPACE}_{}", definition.prometheus_name)), + "metric {} must be registered from generated definitions", + definition.prometheus_name + ); + } } #[test] - fn test_msg_reset_increments_correctly() { - let metrics = ChronosMetrics::new().unwrap(); - metrics.msg_reset.inc_by(3.0); - metrics.msg_reset.inc_by(2.0); + #[serial] + fn prometheus_metrics_use_chronos_namespace() { + let metrics = prometheus_metrics(); + metrics.observe_jitter(0.499); + let output = metrics.render_prometheus().unwrap(); - 
let families = metrics.registry.gather(); - let fam = families.iter().find(|f| f.get_name() == "msg_reset").unwrap(); - let value = fam.get_metric()[0].get_counter().get_value(); - assert!((value - 5.0).abs() < 1e-9); + assert!(output.contains("# HELP chronos_msg_jitter")); + assert!(!output.contains("# HELP msg_jitter")); } #[test] - fn test_msg_wait_time_records_observation() { - let metrics = ChronosMetrics::new().unwrap(); - metrics.msg_wait_time.observe(1.5); + #[serial] + fn msg_jitter_has_500ms_bucket() { + let metrics = prometheus_metrics(); + metrics.observe_jitter(0.499); + let output = metrics.render_prometheus().unwrap(); - let families = metrics.registry.gather(); - let fam = families.iter().find(|f| f.get_name() == "msg_wait_time").unwrap(); - let sample_count = fam.get_metric()[0].get_histogram().get_sample_count(); - assert_eq!(sample_count, 1); + assert!(output.contains("chronos_msg_jitter_bucket{le=\"0.5\"} 1")); } #[test] - fn test_msg_process_latency_label_values() { - let metrics = ChronosMetrics::new().unwrap(); - metrics - .msg_process_latency - .get_metric_with_label_values(&["true", "pass"]) - .unwrap() - .observe(0.01); - metrics - .msg_process_latency - .get_metric_with_label_values(&["false", "pass"]) - .unwrap() - .observe(0.05); - metrics - .msg_process_latency - .get_metric_with_label_values(&["false", "fail"]) - .unwrap() - .observe(0.1); - - let families = metrics.registry.gather(); - let fam = families.iter().find(|f| f.get_name() == "msg_process_latency").unwrap(); - // 3 explicit observations + pre-warming fills all 4 combos; de-dup means 4 entries. - assert_eq!(fam.get_metric().len(), 4); - } - - #[test] - fn test_metrics_text_encode_produces_output() { - let metrics = ChronosMetrics::new().unwrap(); - // All metrics should appear without any manual observations because HistogramVec - // combos are pre-warmed in new(). Scalar histograms and counters always appear. 
- let encoder = TextEncoder::new(); - let mut buffer = Vec::new(); - encoder.encode(&metrics.registry.gather(), &mut buffer).unwrap(); - let output = String::from_utf8(buffer).unwrap(); + #[serial] + fn msg_reset_increments_correctly() { + let metrics = prometheus_metrics(); + metrics.messages_reset(3); + metrics.messages_reset(2); + let output = metrics.render_prometheus().unwrap(); - assert!(output.contains("msg_reset")); - assert!(output.contains("msg_jitter")); - assert!(output.contains("msg_wait_time")); - assert!(output.contains("msg_consume_latency")); - assert!(output.contains("msg_process_latency")); + assert!(output.contains("chronos_msg_reset 5")); } - /// All 5 metric families must be present in a freshly constructed registry even - /// before any messages are processed (i.e., with zero observations). #[test] - fn test_all_metrics_present_in_fresh_registry() { - let metrics = ChronosMetrics::new().unwrap(); - let encoder = TextEncoder::new(); - let mut buffer = Vec::new(); - encoder.encode(&metrics.registry.gather(), &mut buffer).unwrap(); - let output = String::from_utf8(buffer).unwrap(); + #[serial] + fn msg_wait_time_records_observation() { + let metrics = prometheus_metrics(); + metrics.observe_wait_time(1.5); + let output = metrics.render_prometheus().unwrap(); - for name in &["msg_consume_latency", "msg_process_latency", "msg_wait_time", "msg_jitter", "msg_reset"] { - assert!( - output.contains(&format!("# HELP {}", name)), - "metric {} must appear in fresh registry output", - name - ); - } + assert!(output.contains("chronos_msg_wait_time_count 1")); } - /// msg_consume_latency must have exactly 4 pre-initialized label combinations - /// (kafka/postgres × pass/fail) so it is always present in the scrape output. 
#[test] - fn test_consume_latency_all_label_combos_initialized() { - let metrics = ChronosMetrics::new().unwrap(); - let families = metrics.registry.gather(); - let fam = families - .iter() - .find(|f| f.get_name() == "msg_consume_latency") - .expect("msg_consume_latency must be present in a fresh registry"); - assert_eq!(fam.get_metric().len(), 4, "expected 4 pre-warmed label combos (kafka/postgres × pass/fail)"); - } - - /// msg_process_latency must have exactly 4 pre-initialized label combinations - /// (true/false × pass/fail) so it is always present in the scrape output. - #[test] - fn test_process_latency_all_label_combos_initialized() { - let metrics = ChronosMetrics::new().unwrap(); - let families = metrics.registry.gather(); - let fam = families - .iter() - .find(|f| f.get_name() == "msg_process_latency") - .expect("msg_process_latency must be present in a fresh registry"); - assert_eq!(fam.get_metric().len(), 4, "expected 4 pre-warmed label combos (true/false × pass/fail)"); + #[serial] + fn labeled_metrics_record_issue_dimensions() { + let metrics = prometheus_metrics(); + metrics.observe_consume_latency(0.05, "postgres", "pass"); + metrics.observe_process_latency(0.01, false, "fail"); + let output = metrics.render_prometheus().unwrap(); + + assert!(output.contains("chronos_msg_consume_latency_count{destination=\"postgres\",status=\"pass\"} 1")); + assert!(output.contains("chronos_msg_process_latency_count{returned=\"false\",status=\"fail\"} 1")); } } diff --git a/chronos_bin/src/metrics/server.rs b/chronos_bin/src/metrics/server.rs index 8ac687c..9ef1995 100644 --- a/chronos_bin/src/metrics/server.rs +++ b/chronos_bin/src/metrics/server.rs @@ -1,22 +1,20 @@ use crate::metrics::ChronosMetrics; use axum::{extract::State, http::StatusCode, response::IntoResponse, routing::get, Router}; -use prometheus::{Encoder, TextEncoder}; use std::sync::Arc; async fn metrics_handler(State(metrics): State>) -> impl IntoResponse { - let encoder = TextEncoder::new(); - 
let metric_families = metrics.registry.gather(); - let mut buffer = Vec::new(); - match encoder.encode(&metric_families, &mut buffer) { - Ok(_) => (StatusCode::OK, [("content-type", "text/plain; version=0.0.4; charset=utf-8")], buffer).into_response(), - Err(e) => { - log::error!("Failed to encode metrics: {}", e); - StatusCode::INTERNAL_SERVER_ERROR.into_response() - } + match metrics.render_prometheus() { + Some(body) => (StatusCode::OK, [("content-type", "text/plain; version=0.0.4; charset=utf-8")], body).into_response(), + None => StatusCode::NOT_FOUND.into_response(), } } pub async fn run_metrics_server(metrics: Arc, host: String, port: u16) { + if !metrics.is_prometheus() { + log::info!("Prometheus metrics server disabled because OTEL_METRICS_EXPORTER is not prometheus"); + return; + } + let app = Router::new().route("/metrics", get(metrics_handler)).with_state(metrics); let addr = format!("{}:{}", host, port); diff --git a/chronos_bin/src/monitor.rs b/chronos_bin/src/monitor.rs index abf4d60..30405a3 100644 --- a/chronos_bin/src/monitor.rs +++ b/chronos_bin/src/monitor.rs @@ -27,7 +27,7 @@ impl FailureDetector { match &self.data_store.reset_to_init_db(fetched_rows).await { Ok(reset_ids) => { // msg_reset: count the number of messages reset by the monitor task. 
- self.metrics.msg_reset.inc_by(reset_ids.len() as f64); + self.metrics.messages_reset(reset_ids.len() as u64); log::debug!("reset_to_init_db success for {:?}", fetched_rows) } Err(e) => { diff --git a/examples/weaver/registry/chronos/metrics.yaml b/examples/weaver/registry/chronos/metrics.yaml index 9b1010e..e2eb38d 100644 --- a/examples/weaver/registry/chronos/metrics.yaml +++ b/examples/weaver/registry/chronos/metrics.yaml @@ -17,73 +17,85 @@ groups: examples: ["chronos-metrics-mock-live-check"] requirement_level: required - - id: metric_attributes.chronos.message_store + - id: metric_attributes.chronos.consume_result type: attribute_group stability: development - brief: Common attributes used by Chronos message store metrics. + brief: Attributes for Chronos input message handling outcomes. attributes: - - id: messaging.system + - id: destination type: string stability: development - brief: The messaging system as identified by the client instrumentation. - examples: ["kafka"] + brief: Downstream selected by message_receiver::handle_message. + examples: ["kafka", "postgres"] requirement_level: required - - id: messaging.operation.name + - id: chronos.consume.status type: string stability: development - brief: The system-specific name of the messaging operation. - examples: ["receive", "process"] + brief: Whether the consume path completed successfully. + examples: ["pass", "fail"] requirement_level: required - - id: messaging.destination.name + + - id: metric_attributes.chronos.process_result + type: attribute_group + stability: development + brief: Attributes for Chronos ready-message processor loop outcomes. + attributes: + - id: returned type: string stability: development - brief: The message destination name. - examples: ["chronos-input", "chronos-output"] + brief: Whether the processor loop returned early because no rows were ready. 
+ examples: ["true", "false"] + requirement_level: required + - id: chronos.process.status + type: string + stability: development + brief: Whether the processor loop completed successfully. + examples: ["pass", "fail"] requirement_level: required - - - id: metric.chronos.message.consumed - type: metric - metric_name: messaging.client.consumed.messages - stability: development - brief: Total number of Chronos input messages consumed. - instrument: counter - unit: "{message}" - extends: metric_attributes.chronos.message_store - annotations: - code_generation: - rust_name: msg_consumed - metric_value_type: int - prometheus_name: messaging_client_consumed_messages_total - id: metric.chronos.message.consume.duration type: metric - metric_name: messaging.client.operation.duration + metric_name: chronos.message.consume.duration stability: development brief: Duration of handle_message() in message_receiver. instrument: histogram unit: s - extends: metric_attributes.chronos.message_store + extends: metric_attributes.chronos.consume_result annotations: code_generation: rust_name: msg_consume_latency metric_value_type: double - prometheus_name: messaging_client_operation_duration_seconds + prometheus_name: msg_consume_latency + label_names: [destination, status] + prometheus_label_names: [destination, status] buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] + prewarm_label_values: + - [kafka, pass] + - [kafka, fail] + - [postgres, pass] + - [postgres, fail] - id: metric.chronos.message.process.duration type: metric - metric_name: messaging.process.duration + metric_name: chronos.message.process.duration stability: development brief: Duration of processor_message_ready() loop in message_processor. 
instrument: histogram unit: s - extends: metric_attributes.chronos.message_store + extends: metric_attributes.chronos.process_result annotations: code_generation: rust_name: msg_process_latency metric_value_type: double - prometheus_name: messaging_process_duration_seconds + prometheus_name: msg_process_latency + label_names: [returned, status] + prometheus_label_names: [returned, status] buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] + prewarm_label_values: + - ["true", pass] + - ["true", fail] + - ["false", pass] + - ["false", fail] - id: metric.chronos.message.wait.duration type: metric @@ -96,7 +108,7 @@ groups: code_generation: rust_name: msg_wait_time metric_value_type: double - prometheus_name: chronos_message_wait_duration_seconds + prometheus_name: msg_wait_time buckets: [0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2] - id: metric.chronos.message.jitter @@ -110,7 +122,7 @@ groups: code_generation: rust_name: msg_jitter metric_value_type: double - prometheus_name: chronos_message_jitter_seconds + prometheus_name: msg_jitter buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0] - id: metric.chronos.message.reset @@ -124,4 +136,4 @@ groups: code_generation: rust_name: msg_reset metric_value_type: int - prometheus_name: chronos_messages_reset_total + prometheus_name: msg_reset diff --git a/examples/weaver/templates/registry/markdown/weaver.yaml b/examples/weaver/templates/registry/markdown/weaver.yaml index 9cb398b..61b5edf 100644 --- a/examples/weaver/templates/registry/markdown/weaver.yaml +++ b/examples/weaver/templates/registry/markdown/weaver.yaml @@ -10,7 +10,7 @@ templates: brief, instrument, unit, - attributes: (.attributes // [] | map(.name // .id // .ref)) + attributes: (.annotations.code_generation.label_names // (.attributes // [] | map(.name // .id // .ref))) })) } application_mode: single diff --git a/examples/weaver/templates/registry/rust/registry.rs.j2 
b/examples/weaver/templates/registry/rust/registry.rs.j2 index e63546b..a319be7 100644 --- a/examples/weaver/templates/registry/rust/registry.rs.j2 +++ b/examples/weaver/templates/registry/rust/registry.rs.j2 @@ -14,6 +14,16 @@ pub enum MetricKind { Histogram, } +impl MetricKind { + pub fn is_counter(self) -> bool { + matches!(self, Self::Counter) + } + + pub fn is_histogram(self) -> bool { + matches!(self, Self::Histogram) + } +} + #[derive(Clone, Copy, Debug)] pub struct MetricDefinition { pub id: MetricId, @@ -25,6 +35,7 @@ pub struct MetricDefinition { pub prometheus_label_names: &'static [&'static str], pub kind: MetricKind, pub buckets: Option<&'static [f64]>, + pub prewarm_label_values: &'static [&'static [&'static str]], } pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ @@ -41,6 +52,7 @@ pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ buckets: {% if metric.buckets %}{% if metric.buckets | length > 10 %}Some(&[ {{ metric.buckets | join(", ") }}, ]){% else %}Some(&[{{ metric.buckets | join(", ") }}]){% endif %}{% else %}None{% endif %}, + prewarm_label_values: &[{% for values in metric.prewarm_label_values %}&[{% for value in values %}"{{ value }}"{% if not loop.last %}, {% endif %}{% endfor %}]{% if not loop.last %}, {% endif %}{% endfor %}], }, {%- endfor %} ]; diff --git a/examples/weaver/templates/registry/rust/weaver.yaml b/examples/weaver/templates/registry/rust/weaver.yaml index 05afd4d..ed7f9fb 100644 --- a/examples/weaver/templates/registry/rust/weaver.yaml +++ b/examples/weaver/templates/registry/rust/weaver.yaml @@ -12,9 +12,10 @@ templates: brief, instrument, unit, - attributes: (.attributes // [] | map(.name // .id // .ref)), - prometheus_labels: (.attributes // [] | map((.name // .id // .ref) | gsub("\\."; "_"))), - buckets: .annotations.code_generation.buckets + attributes: (.annotations.code_generation.label_names // (.attributes // [] | map(.name // .id // .ref))), + prometheus_labels: 
(.annotations.code_generation.prometheus_label_names // (.attributes // [] | map((.name // .id // .ref) | gsub("\\."; "_")))), + buckets: .annotations.code_generation.buckets, + prewarm_label_values: (.annotations.code_generation.prewarm_label_values // []) })) } application_mode: single diff --git a/scripts/integration.sh b/scripts/integration.sh index f88fc38..7ab3a2d 100755 --- a/scripts/integration.sh +++ b/scripts/integration.sh @@ -155,7 +155,7 @@ echo "════════════════════════ echo " Chronos metrics (http://localhost:${METRICS_PORT}/metrics)" echo "══════════════════════════════════════════════════════" curl -sf "http://localhost:${METRICS_PORT}/metrics" \ - | grep -E "^(# HELP|# TYPE|msg_)" \ + | grep -E "^(# HELP|# TYPE|chronos_msg_)" \ | sort echo "" @@ -163,11 +163,11 @@ echo "" log "Verifying metric families..." METRICS_OUTPUT="$(curl -sf "http://localhost:${METRICS_PORT}/metrics")" EXPECTED_METRICS=( - "msg_consume_latency" - "msg_process_latency" - "msg_wait_time" - "msg_jitter" - "msg_reset" + "chronos_msg_consume_latency" + "chronos_msg_process_latency" + "chronos_msg_wait_time" + "chronos_msg_jitter" + "chronos_msg_reset" ) ALL_OK=true for metric in "${EXPECTED_METRICS[@]}"; do From 483aaac8f70f00957405ec92b461c738fa416a70 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Thu, 30 Apr 2026 23:16:53 +1000 Subject: [PATCH 16/36] ci: add composable branch checks Add reusable GitHub Actions for pre-commit, unit tests, Trivy scanning, static binary builds, container builds, and SBOM generation. CI.yaml now runs the required non-main branch checks in parallel and gates merge protection through a final CI aggregate job. Add act recipes under dev/makefiles/act.mk for running the central CI workflow, individual jobs, and SBOM workflow inputs locally. Keep sbom reusable/manual only so it is not called from CI yet. 
Verification: - git diff --check - python3 YAML parse for .github/workflows - make -f dev/makefiles/act.mk -n act.ci act.ci.job act.sbom.release - docker run --rm -v /home/ah34/work/opensource/chronos:/repo -w /repo rhysd/actionlint:latest - sh scripts/pre-commit-checks.sh Model-version: GPT-5 --- .github/config.json | 2 +- .github/workflows/CI.yaml | 56 +++++++++++++++++++++ .github/workflows/build-binary.yml | 37 ++++++++++++++ .github/workflows/build-container.yml | 25 ++++++++++ .github/workflows/build.yml | 10 ---- .github/workflows/pre-commit.yml | 9 ++-- .github/workflows/release.yml | 14 +++--- .github/workflows/rust_build.yml | 4 +- .github/workflows/sbom.yml | 72 +++++++++++++++++++++++++++ .github/workflows/scan.yml | 41 +++++++++++++++ .github/workflows/test.yml | 30 +++++++++++ dev/makefiles/act.mk | 57 +++++++++++++++++++++ 12 files changed, 334 insertions(+), 23 deletions(-) create mode 100644 .github/workflows/CI.yaml create mode 100644 .github/workflows/build-binary.yml create mode 100644 .github/workflows/build-container.yml delete mode 100644 .github/workflows/build.yml create mode 100644 .github/workflows/sbom.yml create mode 100644 .github/workflows/scan.yml create mode 100644 .github/workflows/test.yml create mode 100644 dev/makefiles/act.mk diff --git a/.github/config.json b/.github/config.json index 8fbbb2a..1a17ae0 100644 --- a/.github/config.json +++ b/.github/config.json @@ -19,7 +19,7 @@ "required_status_checks": { "strict": true, "contexts": [ - "pre-commit" + "CI" ] }, "enforce_admins": true, diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml new file mode 100644 index 0000000..681fd02 --- /dev/null +++ b/.github/workflows/CI.yaml @@ -0,0 +1,56 @@ +name: CI + +on: + push: + branches-ignore: + - main + +permissions: + contents: read + security-events: write + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + pre-commit: + uses: ./.github/workflows/pre-commit.yml + + 
test: + uses: ./.github/workflows/test.yml + + scan: + uses: ./.github/workflows/scan.yml + + build-binary: + uses: ./.github/workflows/build-binary.yml + + build-container: + uses: ./.github/workflows/build-container.yml + + ci: + name: CI + runs-on: ubuntu-latest + needs: + - pre-commit + - test + - scan + - build-binary + - build-container + if: ${{ always() }} + steps: + - name: Require all CI jobs to pass + run: | + for result in \ + "${{ needs.pre-commit.result }}" \ + "${{ needs.test.result }}" \ + "${{ needs.scan.result }}" \ + "${{ needs.build-binary.result }}" \ + "${{ needs.build-container.result }}" + do + if [ "${result}" != "success" ]; then + echo "One or more CI jobs did not succeed." >&2 + exit 1 + fi + done diff --git a/.github/workflows/build-binary.yml b/.github/workflows/build-binary.yml new file mode 100644 index 0000000..662498b --- /dev/null +++ b/.github/workflows/build-binary.yml @@ -0,0 +1,37 @@ +name: build-binary + +on: + workflow_call: + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-build-binary-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-binary: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Build static binary image stage + run: docker build --target builder -f Dockerfile.chronos-slim -t chronos-static-builder:${{ github.sha }} . 
+ + - name: Extract static binary + run: | + mkdir -p dist + container_id="$(docker create chronos-static-builder:${{ github.sha }})" + trap 'docker rm -f "${container_id}" >/dev/null 2>&1 || true' EXIT + docker cp "${container_id}:/build/target/x86_64-unknown-linux-musl/release/chronos" dist/chronos-linux-x86_64-musl + chmod 0755 dist/chronos-linux-x86_64-musl + + - name: Upload binary artifact + uses: actions/upload-artifact@v4 + with: + name: chronos-linux-x86_64-musl + path: dist/chronos-linux-x86_64-musl + if-no-files-found: error diff --git a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml new file mode 100644 index 0000000..5b48e92 --- /dev/null +++ b/.github/workflows/build-container.yml @@ -0,0 +1,25 @@ +name: build-container + +on: + workflow_call: + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-build-container-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-container: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Build regular container + run: docker build -f Dockerfile.chronos -t chronos:${{ github.sha }} . + + - name: Build scratch container + run: docker build -f Dockerfile.chronos-slim -t chronos-scratch:${{ github.sha }} . 
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 152c4e0..0000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: release app binary on tag - -# push to branch -on: - push: - -jobs: - build: - uses: ./.github/workflows/rust_build.yml - \ No newline at end of file diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 1b35497..dc63593 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -1,13 +1,16 @@ name: pre-commit on: - push: - branches-ignore: - - main + workflow_call: + workflow_dispatch: permissions: contents: read +concurrency: + group: ${{ github.workflow }}-pre-commit-${{ github.ref }} + cancel-in-progress: true + jobs: pre-commit: runs-on: ubuntu-latest diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c26d2a7..b04289a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,9 +25,9 @@ jobs: if: needs.build.result == 'success' steps: - name: Checkout the repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Login to GitHub Container Registry - uses: docker/login-action@v1 + uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} @@ -35,14 +35,14 @@ jobs: - name: Build and publish chronos for chronos image with ver run: | - docker build -f Dockerfile.chronos . --tag ghcr.io/$GITHUB_REPOSITORY:$GITHUB_REF_NAME --tag ghcr.io/$GITHUB_REPOSITORY:latest - docker push ghcr.io/$GITHUB_REPOSITORY:$GITHUB_REF_NAME + docker build -f Dockerfile.chronos . 
--tag "ghcr.io/${GITHUB_REPOSITORY}:${GITHUB_REF_NAME}" --tag "ghcr.io/${GITHUB_REPOSITORY}:latest" + docker push "ghcr.io/${GITHUB_REPOSITORY}:${GITHUB_REF_NAME}" - name: publish chronos latest for chronos image replace latest run: | - docker push ghcr.io/$GITHUB_REPOSITORY:latest + docker push "ghcr.io/${GITHUB_REPOSITORY}:latest" - name: Build and publish the chronos-pg-migration Docker image run: | - docker build -f Dockerfile.chronos-pg-migrations . --tag ghcr.io/$GITHUB_REPOSITORY/db-migration:$GITHUB_REF_NAME - docker push ghcr.io/$GITHUB_REPOSITORY/db-migration:$GITHUB_REF_NAME \ No newline at end of file + docker build -f Dockerfile.chronos-pg-migrations . --tag "ghcr.io/${GITHUB_REPOSITORY}/db-migration:${GITHUB_REF_NAME}" + docker push "ghcr.io/${GITHUB_REPOSITORY}/db-migration:${GITHUB_REF_NAME}" diff --git a/.github/workflows/rust_build.yml b/.github/workflows/rust_build.yml index 0a74de4..a8b0651 100644 --- a/.github/workflows/rust_build.yml +++ b/.github/workflows/rust_build.yml @@ -8,7 +8,7 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: toolchain: stable @@ -18,4 +18,4 @@ jobs: run: | cargo clean cargo build --release - - run: scripts/pre-commit-checks.sh \ No newline at end of file + - run: scripts/pre-commit-checks.sh diff --git a/.github/workflows/sbom.yml b/.github/workflows/sbom.yml new file mode 100644 index 0000000..537d954 --- /dev/null +++ b/.github/workflows/sbom.yml @@ -0,0 +1,72 @@ +name: sbom + +on: + workflow_call: + inputs: + target-type: + description: Use "container" for an image SBOM or "release" for a filesystem/release artifact SBOM. + required: true + type: string + target-ref: + description: Container image reference or release artifact path to scan. + required: true + type: string + workflow_dispatch: + inputs: + target-type: + description: Use "container" for an image SBOM or "release" for a filesystem/release artifact SBOM. 
+ required: true + type: choice + options: + - container + - release + target-ref: + description: Container image reference or release artifact path to scan. + required: true + type: string + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-sbom-${{ github.ref }} + cancel-in-progress: true + +jobs: + sbom: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Validate inputs + env: + TARGET_TYPE: ${{ inputs.target-type }} + run: | + case "${TARGET_TYPE}" in container|release) exit 0 ;; esac + echo "target-type must be container or release" >&2; exit 2 + + - name: Generate container SBOM + if: inputs.target-type == 'container' + uses: aquasecurity/trivy-action@0.32.0 + with: + scan-type: image + scan-ref: ${{ inputs.target-ref }} + format: cyclonedx + output: chronos-sbom.cdx.json + + - name: Generate release SBOM + if: inputs.target-type == 'release' + uses: aquasecurity/trivy-action@0.32.0 + with: + scan-type: fs + scan-ref: ${{ inputs.target-ref }} + format: cyclonedx + output: chronos-sbom.cdx.json + + - name: Upload SBOM artifact + uses: actions/upload-artifact@v4 + with: + name: chronos-sbom-${{ inputs.target-type }} + path: chronos-sbom.cdx.json + if-no-files-found: error diff --git a/.github/workflows/scan.yml b/.github/workflows/scan.yml new file mode 100644 index 0000000..440395a --- /dev/null +++ b/.github/workflows/scan.yml @@ -0,0 +1,41 @@ +name: scan + +on: + workflow_call: + workflow_dispatch: + +permissions: + contents: read + security-events: write + +concurrency: + group: ${{ github.workflow }}-scan-${{ github.ref }} + cancel-in-progress: true + +jobs: + scan: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@master + with: + toolchain: 1.94 + + - name: Install system dependencies + run: scripts/ubuntu-setup.sh + + - name: Build release binary + run: cargo build --release -p chronos_bin + + - name: Scan Rust build
output + uses: aquasecurity/trivy-action@0.32.0 + with: + scan-type: fs + scan-ref: target/release/chronos + scanners: vuln + severity: CRITICAL,HIGH + exit-code: "1" + ignore-unfixed: true diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..3a5ded1 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,30 @@ +name: test + +on: + workflow_call: + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-test-${{ github.ref }} + cancel-in-progress: true + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@master + with: + toolchain: 1.94 + + - name: Install system dependencies + run: scripts/ubuntu-setup.sh + + - name: Run unit tests + run: cargo test diff --git a/dev/makefiles/act.mk b/dev/makefiles/act.mk new file mode 100644 index 0000000..bbabc79 --- /dev/null +++ b/dev/makefiles/act.mk @@ -0,0 +1,57 @@ +#!make +SHELL := /bin/bash + +ACT_EVENT ?= push +ACT_JOB ?= pre-commit +ACT_RUNNER_IMAGE ?= catthehacker/ubuntu:act-latest +ACT_ARTIFACT_DIR ?= /tmp/chronos-act-artifacts +ACT_EVENT_DIR ?= /tmp/chronos-act-events +ACT_FLAGS ?= -P ubuntu-latest=$(ACT_RUNNER_IMAGE) --artifact-server-path $(ACT_ARTIFACT_DIR) + +CI_WORKFLOW ?= .github/workflows/CI.yaml +PRE_COMMIT_WORKFLOW ?= .github/workflows/pre-commit.yml +TEST_WORKFLOW ?= .github/workflows/test.yml +SCAN_WORKFLOW ?= .github/workflows/scan.yml +BUILD_BINARY_WORKFLOW ?= .github/workflows/build-binary.yml +BUILD_CONTAINER_WORKFLOW ?= .github/workflows/build-container.yml +SBOM_WORKFLOW ?= .github/workflows/sbom.yml + +SBOM_TARGET_TYPE ?= release +SBOM_TARGET_REF ?= . 
+ +.PHONY: act.ci act.ci.job act.pre-commit act.test act.scan act.build-binary act.build-container act.sbom act.sbom.container act.sbom.release + +act.ci: + mkdir -p "$(ACT_ARTIFACT_DIR)" + act push -W "$(CI_WORKFLOW)" $(ACT_FLAGS) + +act.ci.job: + mkdir -p "$(ACT_ARTIFACT_DIR)" + act push -W "$(CI_WORKFLOW)" -j "$(ACT_JOB)" $(ACT_FLAGS) + +act.pre-commit: + act workflow_dispatch -W "$(PRE_COMMIT_WORKFLOW)" $(ACT_FLAGS) + +act.test: + act workflow_dispatch -W "$(TEST_WORKFLOW)" $(ACT_FLAGS) + +act.scan: + act workflow_dispatch -W "$(SCAN_WORKFLOW)" $(ACT_FLAGS) + +act.build-binary: + mkdir -p "$(ACT_ARTIFACT_DIR)" + act workflow_dispatch -W "$(BUILD_BINARY_WORKFLOW)" $(ACT_FLAGS) + +act.build-container: + act workflow_dispatch -W "$(BUILD_CONTAINER_WORKFLOW)" $(ACT_FLAGS) + +act.sbom: + mkdir -p "$(ACT_ARTIFACT_DIR)" "$(ACT_EVENT_DIR)" + printf '{"inputs":{"target-type":"%s","target-ref":"%s"}}\n' "$(SBOM_TARGET_TYPE)" "$(SBOM_TARGET_REF)" > "$(ACT_EVENT_DIR)/sbom.json" + act workflow_dispatch -W "$(SBOM_WORKFLOW)" -e "$(ACT_EVENT_DIR)/sbom.json" $(ACT_FLAGS) + +act.sbom.container: + $(MAKE) -f dev/makefiles/act.mk act.sbom SBOM_TARGET_TYPE=container + +act.sbom.release: + $(MAKE) -f dev/makefiles/act.mk act.sbom SBOM_TARGET_TYPE=release From 34c0f192eaa8a60a934f574ba4a7a8bbc746b003 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Thu, 30 Apr 2026 23:20:22 +1000 Subject: [PATCH 17/36] fix(ci): push ci --- .github/workflows/CI.yaml | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml index 681fd02..21ce7d2 100644 --- a/.github/workflows/CI.yaml +++ b/.github/workflows/CI.yaml @@ -28,29 +28,3 @@ jobs: build-container: uses: ./.github/workflows/build-container.yml - - ci: - name: CI - runs-on: ubuntu-latest - needs: - - pre-commit - - test - - scan - - build-binary - - build-container - if: ${{ always() }} - steps: - - name: Require all CI jobs to pass - run: | - for result in \ - "${{ 
needs.pre-commit.result }}" \ - "${{ needs.test.result }}" \ - "${{ needs.scan.result }}" \ - "${{ needs.build-binary.result }}" \ - "${{ needs.build-container.result }}" - do - if [ "${result}" != "success" ]; then - echo "One or more CI jobs did not succeed." >&2 - exit 1 - fi - done From 17fff7ee6cf3eab99ece2a9247b360f348495ccf Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Thu, 30 Apr 2026 23:57:23 +1000 Subject: [PATCH 18/36] fix(container): build slim image with alpine runtime Switch the slim image away from fully static scratch output because the current rdkafka SASL/OpenSSL feature set pulls in system libraries that are not practical to link statically on Alpine. Keep the image small by using an Alpine runtime with only the required shared libraries, add bash for librdkafka configure, and disable Rust musl crt-static during the build. Update the composable binary workflow to extract the Alpine-built binary path and add a .dockerignore so image builds do not send target or git state as context. Verification: - docker build -f Dockerfile.chronos-slim -t chronos-slim:test . 
- docker run --rm --entrypoint /bin/sh chronos-slim:test -lc 'ldd /chronos' - sh scripts/pre-commit-checks.sh Model-version: GPT-5 --- .dockerignore | 12 +++++++ .github/workflows/build-binary.yml | 16 ++++----- Dockerfile.chronos-slim | 55 ++++++++++++++---------------- 3 files changed, 45 insertions(+), 38 deletions(-) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..7e8270d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,12 @@ +target/ +.git/ +.github/ +.codex/ + +.env +healthcheck/ + +.DS_Store +*.iml +.idea/ +.vscode/ diff --git a/.github/workflows/build-binary.yml b/.github/workflows/build-binary.yml index 662498b..0581f7e 100644 --- a/.github/workflows/build-binary.yml +++ b/.github/workflows/build-binary.yml @@ -18,20 +18,20 @@ jobs: - name: Checkout uses: actions/checkout@v4 - - name: Build static binary image stage - run: docker build --target builder -f Dockerfile.chronos-slim -t chronos-static-builder:${{ github.sha }} . + - name: Build binary image stage + run: docker build --target builder -f Dockerfile.chronos-slim -t chronos-binary-builder:${{ github.sha }} . 
- - name: Extract static binary + - name: Extract binary run: | mkdir -p dist - container_id="$(docker create chronos-static-builder:${{ github.sha }})" + container_id="$(docker create chronos-binary-builder:${{ github.sha }})" trap 'docker rm -f "${container_id}" >/dev/null 2>&1 || true' EXIT - docker cp "${container_id}:/build/target/x86_64-unknown-linux-musl/release/chronos" dist/chronos-linux-x86_64-musl - chmod 0755 dist/chronos-linux-x86_64-musl + docker cp "${container_id}:/build/target/release/chronos" dist/chronos-linux-x86_64-alpine + chmod 0755 dist/chronos-linux-x86_64-alpine - name: Upload binary artifact uses: actions/upload-artifact@v4 with: - name: chronos-linux-x86_64-musl - path: dist/chronos-linux-x86_64-musl + name: chronos-linux-x86_64-alpine + path: dist/chronos-linux-x86_64-alpine if-no-files-found: error diff --git a/Dockerfile.chronos-slim b/Dockerfile.chronos-slim index 90b6741..637f62a 100644 --- a/Dockerfile.chronos-slim +++ b/Dockerfile.chronos-slim @@ -1,12 +1,12 @@ # syntax=docker/dockerfile:1 # -# Dockerfile.chronos-slim — scratch image for minimal production deployments. +# Dockerfile.chronos-slim — Alpine image for minimal production deployments. # # Key differences from Dockerfile.chronos ("fat" / glibc image): -# - Uses Alpine + musl to produce a fully static binary (zero runtime OS deps) +# - Uses Alpine + musl to keep runtime dependencies small # - Unit tests are executed during the build stage; the image build fails if # any test fails -# - The final stage is FROM scratch — no OS, shell, or package manager +# - The final stage keeps only the small runtime packages needed by rdkafka # # To use this image in docker-compose, change the chronos service to: # build: @@ -16,19 +16,21 @@ # ───────────────────────────────────────────────────────────────────────────── # Build stage # Alpine's musl toolchain is used throughout. rdkafka compiles librdkafka from -# source (cmake). 
The SASL feature requires Cyrus SASL, which Alpine packages -# include as static libs so the final binary has no shared-library dependencies. +# source (cmake). The SASL feature requires Cyrus SASL. Alpine's static SASL +# archive pulls in GSSAPI, GDBM, and SQLite plugin symbols, so this image links +# SASL dynamically and carries the small runtime libraries in the final stage. +# librdkafka's configure script is run by rdkafka-sys and requires bash. # ───────────────────────────────────────────────────────────────────────────── FROM rust:1.94-alpine AS builder RUN apk add --no-cache \ + bash \ musl-dev \ cmake \ make \ g++ \ cyrus-sasl-dev \ openssl-dev \ - openssl-libs-static \ pkgconfig \ perl @@ -39,45 +41,38 @@ COPY ./ . # Library unit tests run without external services (no Kafka or Postgres). # Building the test binary also verifies that the release code compiles cleanly # under musl. The image build is aborted here if any test fails. -RUN PKG_CONFIG_ALL_STATIC=1 \ - RUSTFLAGS="-C target-feature=+crt-static" \ - cargo test --lib -p chronos_bin \ - --target x86_64-unknown-linux-musl +RUN RUSTFLAGS="-C target-feature=-crt-static" \ + cargo test --lib -p chronos_bin -# ── Build static release binary ───────────────────────────────────────────── -# PKG_CONFIG_ALL_STATIC=1 → pkg-config prefers static (.a) variants of all -# C libraries (sasl2, openssl, …) -# +crt-static → embed the musl C runtime; no libc.so at runtime -# +# ── Build release binary ─────────────────────────────────────────────────── # The release compile is fast here because the test stage above already built -# all library crates under the same flags and target. -RUN PKG_CONFIG_ALL_STATIC=1 \ - RUSTFLAGS="-C target-feature=+crt-static" \ - cargo build --release -p chronos_bin \ - --target x86_64-unknown-linux-musl +# all library crates under the same target. 
+# +RUN RUSTFLAGS="-C target-feature=-crt-static" \ + cargo build --release -p chronos_bin -# Minimal passwd/group entries for the non-root user in the scratch image +# Minimal passwd/group entries for the non-root user in the runtime image RUN printf 'chronos:x:1000:1000::/nonexistent:/sbin/nologin\n' > /tmp/passwd && \ printf 'chronos:x:1000:\n' > /tmp/group # ───────────────────────────────────────────────────────────────────────────── -# Runtime stage — FROM scratch -# The binary is the entire filesystem contents (plus certs and user files). +# Runtime stage. # ───────────────────────────────────────────────────────────────────────────── -FROM scratch +FROM alpine:3.23 -# TLS root certificates required for: -# - OTLP trace exporters (HTTPS) -# - Kafka with TLS listeners -COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ +RUN apk add --no-cache \ + ca-certificates \ + cyrus-sasl \ + cyrus-sasl-scram \ + libgcc \ + openssl # User and group files so the container runs as non-root (uid 1000) COPY --from=builder /tmp/passwd /etc/passwd COPY --from=builder /tmp/group /etc/group -# The statically compiled binary — the only executable in this image COPY --from=builder \ - /build/target/x86_64-unknown-linux-musl/release/chronos \ + /build/target/release/chronos \ /chronos USER 1000:1000 From b44c874c7cc5e9245b5a478f2b9ea17a07281d39 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 13:34:06 +1000 Subject: [PATCH 19/36] chore: reorganize dev compose and make targets Move Docker Compose files under dev/docker-compose, split infra and observability backends, and make make up start Chronos with PostgreSQL, Kafka, and Jaeger/OpenTelemetry by default. Add the LGTM backend as an alternate make up lgtm path and move LGTM configuration under dev/lgtm. Split the root Makefile into logical includes under dev/makefiles while keeping legacy target aliases where useful. 
Replace scripts/pre-commit-checks.sh with the make pre-commit target and update CI and agent documentation references. Verification: - make setup - make help - make docker.config - make docker.config BACKEND=lgtm - make pre-commit - make build - make lgtm.validate Model-version: GPT-5 --- .github/workflows/pre-commit.yml | 2 +- .github/workflows/rust_build.yml | 2 +- AGENTS.md | 19 +- Dockerfile.chronos-slim | 2 +- How-to.md | 15 +- Makefile | 260 +----------------- dev/docker-compose-lgtm.yaml | 48 ---- dev/docker-compose/compose.yaml | 58 ++++ dev/docker-compose/infra.yaml | 49 ++++ dev/docker-compose/jaeger.yaml | 34 +++ dev/docker-compose/lgtm.yaml | 43 +++ dev/{ => lgtm}/dashboards.yaml | 0 .../healthcheck.sh} | 0 dev/{lgtm-logging.sh => lgtm/logging.sh} | 0 dev/{ => lgtm}/otelcol-contrib.yaml | 0 dev/{ => lgtm}/prometheus.yaml | 0 dev/makefiles/act.mk | 8 +- dev/makefiles/ci.mk | 8 + dev/makefiles/common.mk | 16 ++ dev/makefiles/dev.mk | 53 ++++ dev/makefiles/docker.mk | 36 +++ dev/makefiles/integration.mk | 12 + dev/makefiles/rust.mk | 117 ++++++++ dev/makefiles/telemetry.mk | 24 ++ {infra => dev/otel}/otelcol-config.yml | 0 docker-compose.yml | 148 ---------- scripts/integration.sh | 9 +- scripts/pre-commit-checks.sh | 7 - 28 files changed, 488 insertions(+), 482 deletions(-) delete mode 100644 dev/docker-compose-lgtm.yaml create mode 100644 dev/docker-compose/compose.yaml create mode 100644 dev/docker-compose/infra.yaml create mode 100644 dev/docker-compose/jaeger.yaml create mode 100644 dev/docker-compose/lgtm.yaml rename dev/{ => lgtm}/dashboards.yaml (100%) rename dev/{lgtm-healthcheck.sh => lgtm/healthcheck.sh} (100%) rename dev/{lgtm-logging.sh => lgtm/logging.sh} (100%) rename dev/{ => lgtm}/otelcol-contrib.yaml (100%) rename dev/{ => lgtm}/prometheus.yaml (100%) create mode 100644 dev/makefiles/ci.mk create mode 100644 dev/makefiles/common.mk create mode 100644 dev/makefiles/dev.mk create mode 100644 dev/makefiles/docker.mk create mode 100644 
dev/makefiles/integration.mk create mode 100644 dev/makefiles/rust.mk create mode 100644 dev/makefiles/telemetry.mk rename {infra => dev/otel}/otelcol-config.yml (100%) delete mode 100644 docker-compose.yml delete mode 100755 scripts/pre-commit-checks.sh diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index dc63593..88f71ed 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -28,4 +28,4 @@ jobs: run: scripts/ubuntu-setup.sh - name: Run pre-commit checks - run: sh scripts/pre-commit-checks.sh + run: make pre-commit diff --git a/.github/workflows/rust_build.yml b/.github/workflows/rust_build.yml index a8b0651..6f3a8e7 100644 --- a/.github/workflows/rust_build.yml +++ b/.github/workflows/rust_build.yml @@ -18,4 +18,4 @@ jobs: run: | cargo clean cargo build --release - - run: scripts/pre-commit-checks.sh + - run: make pre-commit diff --git a/AGENTS.md b/AGENTS.md index b8c46c8..a7bdfd1 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -36,20 +36,13 @@ Use the repository's Make targets and scripts as the source of truth. - Default pre-commit verification: ```sh - sh scripts/pre-commit-checks.sh - ``` - - This runs: - - ```sh - make withenv RECIPE=lint - make withenv RECIPE=test.unit + make pre-commit ``` - Lint-only check: ```sh - make withenv RECIPE=lint + make lint ``` This runs `cargo check`, `cargo fmt -- --check`, and `cargo clippy --all-targets`. @@ -57,7 +50,7 @@ Use the repository's Make targets and scripts as the source of truth. - Unit tests: ```sh - make withenv RECIPE=test.unit + make test ``` This runs `cargo test`. @@ -101,7 +94,7 @@ Document Chronos project conventions, verification commands, and agent handoff expectations. Verification: -- sh scripts/pre-commit-checks.sh +- make pre-commit Model-version: GPT-5 ``` @@ -125,5 +118,5 @@ When making tradeoffs, record the chosen path and the reason. 
Avoid relying on c - Chronos treats Kafka message bodies opaquely and forwards messages after delay; avoid adding application-level assumptions about payload shape. - The README describes at-least-once delivery semantics. Preserve behavior that supports persistence, recovery from suspected node failure, and duplicate-safe processing. - Metrics work on the `feat/prom_metrics` branch currently includes a Prometheus endpoint and metric-family checks in the integration script. Changes to metrics should preserve unit tests for registry output and integration checks for expected metric families. -- Local development commonly uses `.env` copied from [.env.example](.env.example) through `make withenv`. -- Docker Compose is used for local PostgreSQL, Kafka, Jaeger, and OpenTelemetry dependencies. +- Local development commonly uses `.env` copied from [.env.example](.env.example) through `make setup` or `make withenv`. +- Docker Compose files live in `dev/docker-compose`. `make up` starts Chronos with PostgreSQL, Kafka, Jaeger, and the OpenTelemetry Collector by default; `make up lgtm` uses the LGTM backend. diff --git a/Dockerfile.chronos-slim b/Dockerfile.chronos-slim index 637f62a..5456e3e 100644 --- a/Dockerfile.chronos-slim +++ b/Dockerfile.chronos-slim @@ -8,7 +8,7 @@ # any test fails # - The final stage keeps only the small runtime packages needed by rdkafka # -# To use this image in docker-compose, change the chronos service to: +# To use this image in Docker Compose, change the chronos service to: # build: # context: . # dockerfile: Dockerfile.chronos-slim diff --git a/How-to.md b/How-to.md index 26d6b10..f3e1c3f 100644 --- a/How-to.md +++ b/How-to.md @@ -17,9 +17,13 @@ Input messages with headers 2. Delete any existing .env file, use `make withenv RECIPE=run` ## Run Chronos docker image -Using [docker-compose](./docker-compose.yml) docker conatiner can host Chronos image with mentioned env variables for Kafka, PG and Chronos configuration variables. 
+Using [Docker Compose](./dev/docker-compose/compose.yaml), containers can host Chronos, PostgreSQL, Kafka, and observability backends with the environment variables mentioned below. -Use `make withenv RECIPE=docker.up` +Use `make up` to build and start Chronos with PostgreSQL, Kafka, Jaeger, and the OpenTelemetry Collector. + +Use `make up lgtm` or `make up BACKEND=lgtm` to start the same Chronos stack with the Grafana LGTM backend instead of Jaeger. + +Use `make down` to stop the running stack. ## ENV vars All the required configurations for Chronos can be passed in environment variables mentioned below @@ -65,13 +69,13 @@ At this time Chronos supports Http protocol based connectivity to the Otel colle | OTEL_EXPORTER_OTLP_PROTOCOL|"http/json" ### Local Grafana LGTM stack -Use the Grafana LGTM compose overlay with the main Docker Compose file to run Grafana, Loki, Tempo, Prometheus, Pyroscope, and the OpenTelemetry Collector in one container: +Use the Grafana LGTM compose overlay to run Grafana, Loki, Tempo, Prometheus, Pyroscope, and the OpenTelemetry Collector in one container: ```sh -make lgtm.up +make up lgtm ``` -The overlay mounts local override files for Prometheus, the OpenTelemetry Collector, and Grafana dashboard provisioning. Chronos exposes its Prometheus metrics endpoint with `OTEL_EXPORTER_PROMETHEUS_HOST` and `OTEL_EXPORTER_PROMETHEUS_PORT`; when run from `docker-compose.yml` the endpoint is `chronos:9091`. +The overlay mounts local override files from `dev/lgtm` for Prometheus, the OpenTelemetry Collector, and Grafana dashboard provisioning. Chronos exposes its Prometheus metrics endpoint with `OTEL_EXPORTER_PROMETHEUS_HOST` and `OTEL_EXPORTER_PROMETHEUS_PORT`; when run from Docker Compose the endpoint is `chronos:9091`. Chronos metrics are generated from the OpenTelemetry Weaver registry in `examples/weaver/registry/chronos/metrics.yaml` into `chronos_bin/src/metrics/generated`. 
`OTEL_METRICS_EXPORTER=prometheus` is the default and exposes `/metrics` with the `chronos_` Prometheus namespace, for example `chronos_msg_jitter`. `OTEL_METRICS_EXPORTER=otlp` records the same generated metric IDs through the OTLP gRPC metrics exporter. @@ -89,4 +93,3 @@ Two images are published for each [RELEASE]( `https://github.com/kindredgroup/ch - chornos image - diff --git a/Makefile b/Makefile index 9940b78..d9e74e6 100644 --- a/Makefile +++ b/Makefile @@ -1,253 +1,17 @@ -#!make -SHELL:=/bin/bash +SHELL := /usr/bin/env bash -RUST_VERSION := $(shell grep 'channel' rust-toolchain.toml | sed 's/.*"\(.*\)"/\1/') -EXPORTER ?= prom -LGTM_IMAGE ?= grafana/otel-lgtm:0.24.1 -WEAVER_VERSION ?= 0.23.0 -WEAVER_IMAGE ?= otel/weaver:v$(WEAVER_VERSION) -WEAVER_REGISTRY ?= examples/weaver/registry -WEAVER_TEMPLATES ?= examples/weaver/templates -WEAVER_GENERATED_DIR ?= chronos_bin/src/metrics/generated -WEAVER_LIVE_CHECK_PORT ?= 4319 -WEAVER_LIVE_CHECK_ADMIN_PORT ?= 4320 -WEAVER_LIVE_CHECK_OUT ?= /tmp/chronos-weaver-live-check -GITHUB_CONFIG ?= .github/config.json -ACT_WORKFLOW ?= .github/workflows/pre-commit.yml -ACT_EVENT ?= push -ACT_JOB ?= pre-commit +MAKEFILES_DIR := dev/makefiles +COMMON_MAKEFILE := $(MAKEFILES_DIR)/common.mk +MAKEFILE_PARTS := $(filter-out $(COMMON_MAKEFILE),$(sort $(wildcard $(MAKEFILES_DIR)/*.mk))) -# pp - pretty print function -yellow := $(shell tput setaf 3 2>/dev/null || true) -normal := $(shell tput sgr0 2>/dev/null || true) -define pp - @printf '$(yellow)$(1)$(normal)\n' -endef +# The included fragments declare targets before `help` is defined; pin the default goal so a bare `make` still prints help. +.DEFAULT_GOAL := help +include $(COMMON_MAKEFILE) +include $(MAKEFILE_PARTS) +## help: Print available make targets +help: + @echo "Choose a command to run:" + @awk '/^## / { help=substr($$0, 4); sub(/^[^:]+: /, "", help); next } /^[A-Za-z0-9_.-]+:/ { if (help != "") { split($$0, target, ":"); printf " %-28s %s\n", target[1], help; help="" } }' Makefile $(COMMON_MAKEFILE) $(MAKEFILE_PARTS) | sort -help: Makefile - @echo " Choose a command to run:" - @sed -n 's/^##//p' $< | column -t -s ':'
| sed -e 's/^/ /' - - -# DEV ############################################################################################# - -## withenv: 😭 CALL TARGETS LIKE THIS `make withenv RECIPE=dev.init` -withenv: -# NB: IT APPEARS THAT LOADING ENVIRONMENT VARIABLES INTO make SUUUUCKS. -# NB: THIS RECIPE IS A HACK TO MAKE IT WORK. -# NB: THAT'S WHY THIS MAKEFILE NEEDS TO BE CALLED LIKE `make withenv RECIPE=dev.init` - test -e .env || cp .env.example .env - bash -c 'set -o allexport; source .env; set +o allexport; make "$$RECIPE"' - -## dev.init: 🌏 Initialize local dev environment -# If rdkafka compilation fails with SSL error then install openssl@1.1 or later and export: -# export LDFLAGS=-L/opt/homebrew/opt/openssl@1.1/lib -# export CPPFLAGS=-I/opt/homebrew/opt/openssl@1.1/include -dev.init: install - $(call pp,install git hooks...) - cargo test - -## dev.kafka_init: 🥁 Init kafka topic -# dev.kafka_init: -# $(call pp,creating kafka topic...) -# cargo run --example kafka_create_topic - -dev.chronos_ex: - $(call pp,creating kafka topic...) - cargo run --example chronos_ex - -## pg.create: 🥁 Create database -pg.create: - $(call pp,creating database...) - cargo run --example pg_create_database - -## pg.migrate: 🥁 Run migrations on database -pg.migrate: - $(call pp,running migrations on database...) - cargo run --package pg_mig --bin chronos-pg-migrations - -# TEST / DEPLOY ################################################################################### - -## install: 🧹 Installs dependencies -install: - $(call pp,pull rust dependencies...) - rustup install "${RUST_VERSION}" - rustup component add rust-src clippy llvm-tools-preview - rustup toolchain install nightly - rustup override set "${RUST_VERSION}" - cargo install cargo2junit grcov - cargo fetch - -## build: 🧪 Compiles rust -build: weaver.generate - $(call pp,build rust...) - cargo build - - -## dev.run: 🧪 Runs rust app in watch mode -dev.run: - $(call pp,run app...) 
- cargo watch -q -c -x 'run --package chronos_bin --bin chronos' - -## run: 🧪 Runs rust app -run: - $(call pp,run app...) - cargo run --package chronos_bin --bin chronos - -## run: 🧪 Runs rust app in release mode -run.release: - $(call pp,run app...) - cargo run --package chronos_bin -r --bin chronos - - -## lint: 🧹 Checks for lint failures on rust -lint: - $(call pp,lint rust...) - RUSTFLAGS="-D warnings" cargo check - cargo fmt -- --check - RUSTFLAGS="-D warnings" cargo clippy --all-targets -- -D warnings - -## test.unit: 🧪 Runs unit tests -test.unit: - $(call pp,rust unit tests...) - RUSTFLAGS="-D warnings" cargo test - -## integration: 🧪 Start deps, migrate, run Chronos, publish test message, verify metrics -integration: build - $(call pp,running integration test...) - @bash scripts/integration.sh - -## integration.down: 🛑 Stop docker services started by make integration -integration.down: - $(call pp,stopping integration services...) - docker compose stop postgres kafka jaeger-all-in-one otel-collector 2>/dev/null || true - docker compose rm -f postgres kafka jaeger-all-in-one otel-collector 2>/dev/null || true - -## metrics.check: 🔍 Verify /metrics endpoint responds (requires running app) -metrics.check: - $(call pp,check metrics endpoint...) - curl -sf "http://localhost:$${OTEL_EXPORTER_PROMETHEUS_PORT:-$${METRICS_PORT:-9090}}/metrics" | head -20 - -## metrics.mock: 🔍 Run Prometheus/OTLP metrics mock example with EXPORTER=prom|otlp -metrics.mock: - $(call pp,run metrics mock example with exporter $(EXPORTER)...) 
- @case "$(EXPORTER)" in \ - prom|prometheus) OTEL_METRICS_EXPORTER=prometheus OTEL_EXPORTER_PROMETHEUS_HOST=$${OTEL_EXPORTER_PROMETHEUS_HOST:-127.0.0.1} OTEL_EXPORTER_PROMETHEUS_PORT=$${OTEL_EXPORTER_PROMETHEUS_PORT:-9092} cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ - otlp) OTEL_SERVICE_NAME=chronos-metrics-mock OTEL_RESOURCE_ATTRIBUTES=service.instance.id=chronos-metrics-mock-local OTEL_METRICS_EXPORTER=otlp OTEL_EXPORTER_OTLP_PROTOCOL=grpc OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=$${OTEL_EXPORTER_OTLP_METRICS_ENDPOINT:-http://127.0.0.1:4317} OTEL_METRIC_EXPORT_INTERVAL=$${OTEL_METRIC_EXPORT_INTERVAL:-1000} cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ - *) echo "unsupported EXPORTER=$(EXPORTER); use EXPORTER=prom or EXPORTER=otlp" >&2; exit 2 ;; \ - esac - -## weaver.check: 🔍 Validate the Chronos Weaver registry -weaver.check: - $(call pp,check Weaver registry with $(WEAVER_IMAGE)...) - docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry check -r $(WEAVER_REGISTRY) - -## repo.config.apply: 🔐 Apply GitHub repository and branch settings from .github/config.json -repo.config.apply: - $(call pp,apply GitHub repository config from $(GITHUB_CONFIG)...) - scripts/apply-github-config.sh "$(GITHUB_CONFIG)" - -## workflow.pre-commit.act: 🎬 Run the pre-commit GitHub Actions workflow locally with act -workflow.pre-commit.act: - $(call pp,run $(ACT_WORKFLOW) locally with act...) - act "$(ACT_EVENT)" -W "$(ACT_WORKFLOW)" -j "$(ACT_JOB)" - -## weaver.generate.rust: 🧵 Generate Rust metric definitions with Weaver -weaver.generate.rust: - $(call pp,generate Rust metric definitions with $(WEAVER_IMAGE)...) 
- docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_REGISTRY) --templates $(WEAVER_TEMPLATES) rust $(WEAVER_GENERATED_DIR) - rustfmt --config-path rustfmt.toml $(WEAVER_GENERATED_DIR)/chronos_metric_definitions.rs - -## weaver.generate.markdown: 🧵 Generate Chronos metrics markdown docs with Weaver -weaver.generate.markdown: - $(call pp,generate metrics markdown docs with $(WEAVER_IMAGE)...) - docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_REGISTRY) --templates $(WEAVER_TEMPLATES) markdown $(WEAVER_GENERATED_DIR) - -## weaver.generate.json-schema: 🧵 Generate Weaver resolved-registry JSON schema -weaver.generate.json-schema: - $(call pp,generate Weaver JSON schema with $(WEAVER_IMAGE)...) - mkdir -p $(WEAVER_GENERATED_DIR) - docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry json-schema -o $(WEAVER_GENERATED_DIR)/resolved-registry.schema.json - -## weaver.generate: 🧵 Generate all Weaver artifacts -weaver.generate: weaver.generate.rust weaver.generate.markdown weaver.generate.json-schema - -## weaver.live-check: 🔍 Run Weaver live-check against the OTLP metrics mock -weaver.live-check: - $(call pp,run Weaver live-check against metrics mock...) 
- @set -euo pipefail; \ - cargo build --package prom_otlp_mock_runner; \ - rm -rf "$(WEAVER_LIVE_CHECK_OUT)"; \ - mkdir -p "$(WEAVER_LIVE_CHECK_OUT)"; \ - docker run --rm --network host \ - -v "$(PWD):/work" \ - -v "$(WEAVER_LIVE_CHECK_OUT):/out" \ - -w /work \ - $(WEAVER_IMAGE) registry live-check \ - -r $(WEAVER_REGISTRY) \ - --input-source otlp \ - --otlp-grpc-address 127.0.0.1 \ - --otlp-grpc-port $(WEAVER_LIVE_CHECK_PORT) \ - --admin-port $(WEAVER_LIVE_CHECK_ADMIN_PORT) \ - --inactivity-timeout 5 \ - --no-stream \ - --format json \ - -o /out & \ - live_check_pid=$$!; \ - trap 'kill "$$live_check_pid" 2>/dev/null || true' EXIT; \ - sleep 2; \ - OTEL_SERVICE_NAME=chronos-metrics-mock \ - OTEL_RESOURCE_ATTRIBUTES=service.instance.id=chronos-metrics-mock-live-check \ - OTEL_METRICS_EXPORTER=otlp \ - OTEL_EXPORTER_OTLP_PROTOCOL=grpc \ - OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=http://127.0.0.1:$(WEAVER_LIVE_CHECK_PORT) \ - OTEL_METRIC_EXPORT_INTERVAL=500 \ - timeout -s INT 10 cargo run --quiet --package prom_otlp_mock_runner --bin prom_otlp_mock || test "$$?" -eq 124; \ - wait "$$live_check_pid"; \ - find "$(WEAVER_LIVE_CHECK_OUT)" -maxdepth 1 -type f -print - -## lgtm.validate: 🔍 Validate LGTM Prometheus and OpenTelemetry Collector configs -lgtm.validate: - $(call pp,validate LGTM Prometheus config with $(LGTM_IMAGE)...) - docker run --rm \ - -v "$(PWD)/dev/prometheus.yaml:/otel-lgtm/prometheus.yaml:ro" \ - --entrypoint /otel-lgtm/prometheus/promtool \ - $(LGTM_IMAGE) check config /otel-lgtm/prometheus.yaml - $(call pp,validate LGTM OpenTelemetry Collector config with $(LGTM_IMAGE)...) - docker run --rm \ - -v "$(PWD)/dev/otelcol-contrib.yaml:/otel-lgtm/otelcol-config.yaml:ro" \ - --entrypoint /otel-lgtm/otelcol-contrib/otelcol-contrib \ - $(LGTM_IMAGE) validate --config=file:/otel-lgtm/otelcol-config.yaml --feature-gates=service.profilesSupport - -## lgtm.up: 📈 Start standalone Grafana LGTM stack -lgtm.up: - $(call pp,start standalone LGTM stack...) 
- docker compose -f docker-compose.yml -f dev/docker-compose-lgtm.yaml up -d lgtm - -## lgtm.down: 🛑 Stop standalone Grafana LGTM stack -lgtm.down: - $(call pp,stop standalone LGTM stack...) - docker compose -f docker-compose.yml -f dev/docker-compose-lgtm.yaml stop lgtm 2>/dev/null || true - docker compose -f docker-compose.yml -f dev/docker-compose-lgtm.yaml rm -f lgtm 2>/dev/null || true - -## test.unit.coverage: 🧪 Runs rust unit tests with coverage 'cobertura' and 'junit' reports -test.unit.coverage: - $(call pp,rust unit tests...) - sh scripts/coverage-report.sh - -## docker.up: 🧪 Runs rust app in docker container along with kafka and postgres -docker.up: - $(call pp,run app...) - docker-compose --env-file /dev/null up -d - -## docker.down: bring down the docker containers -docker.down: - $(call pp,run app...) - docker-compose down -# PHONY ########################################################################################### - -# To force rebuild of not-file-related targets, make the targets "phony". -# A phony target is one that is not really the name of a file; -# Rather it is just a name for a recipe to be executed when you make an explicit request. 
-.PHONY: build +.PHONY: help diff --git a/dev/docker-compose-lgtm.yaml b/dev/docker-compose-lgtm.yaml deleted file mode 100644 index f67ae08..0000000 --- a/dev/docker-compose-lgtm.yaml +++ /dev/null @@ -1,48 +0,0 @@ -services: - chronos: - environment: - OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: http://lgtm:4318/v1/traces - - jaeger-all-in-one: - profiles: - - legacy-otel - - otel-collector: - profiles: - - legacy-otel - - lgtm: - image: grafana/otel-lgtm:0.24.1 - container_name: lgtm - environment: - ENABLE_LOGS_ALL: "true" - GF_LOG_CONSOLE_FORMAT: json - GF_LOG_FORMAT: json - GF_LOG_MODE: console - LGTM_LOG_DIR: /data/lgtm/logs - LOKI_EXTRA_ARGS: -log.format=json - PROMETHEUS_EXTRA_ARGS: --log.format=json - PYROSCOPE_EXTRA_ARGS: -log.format=json - ports: - - "3000:3000" # Grafana - - "3100:3100" # Loki - - "3200:3200" # Tempo - - "4040:4040" # Pyroscope - - "4317:4317" # OTLP gRPC - - "4318:4318" # OTLP HTTP - - "9090:9090" # Prometheus - healthcheck: - test: ["CMD-SHELL", "sh /otel-lgtm/chronos-healthcheck.sh"] - interval: 30s - timeout: 10s - retries: 5 - start_period: 30s - volumes: - - ./dev/prometheus.yaml:/otel-lgtm/prometheus.yaml:ro - - ./dev/otelcol-contrib.yaml:/otel-lgtm/otelcol-config.yaml:ro - - ./dev/lgtm-healthcheck.sh:/otel-lgtm/chronos-healthcheck.sh:ro - - ./dev/lgtm-logging.sh:/otel-lgtm/logging.sh:ro - - ./dev/dashboards.yaml:/otel-lgtm/grafana/conf/provisioning/dashboards/chronos.yaml:ro - - ./dev/dashboards:/otel-lgtm/grafana/conf/provisioning/dashboards/chronos:ro - networks: - - chronos diff --git a/dev/docker-compose/compose.yaml b/dev/docker-compose/compose.yaml new file mode 100644 index 0000000..4675fac --- /dev/null +++ b/dev/docker-compose/compose.yaml @@ -0,0 +1,58 @@ +include: + - infra.yaml + +services: + chronos-pg-migrations: + build: + context: ../.. 
+ dockerfile: Dockerfile.chronos-pg-migrations + environment: + PG_HOST: postgres + PG_PORT: "5432" + PG_USER: admin + PG_PASSWORD: admin + PG_DATABASE: chronos_db + depends_on: + postgres: + condition: service_healthy + restart: "no" + networks: + - chronos + + chronos: + build: + context: ../.. + dockerfile: Dockerfile.chronos + ports: + - "9091:9091" + environment: + KAFKA_HOST: kafka + KAFKA_PORT: "9092" + KAFKA_CLIENT_ID: chronos + KAFKA_GROUP_ID: chronos + KAFKA_IN_TOPIC: chronos.in + KAFKA_OUT_TOPIC: chronos.out + KAFKA_USERNAME: "" + KAFKA_PASSWORD: "" + PG_HOST: postgres + PG_PORT: "5432" + PG_USER: admin + PG_PASSWORD: admin + PG_DATABASE: chronos_db + PG_POOL_SIZE: "10" + RUST_LOG: info + OTEL_EXPORTER_PROMETHEUS_HOST: "0.0.0.0" + OTEL_EXPORTER_PROMETHEUS_PORT: "9091" + MONITOR_DB_POLL: "5" + PROCESSOR_DB_POLL: "5" + TIMING_ADVANCE: "0" + FAIL_DETECT_INTERVAL: "10" + depends_on: + postgres: + condition: service_healthy + kafka: + condition: service_healthy + chronos-pg-migrations: + condition: service_completed_successfully + networks: + - chronos diff --git a/dev/docker-compose/infra.yaml b/dev/docker-compose/infra.yaml new file mode 100644 index 0000000..0e72813 --- /dev/null +++ b/dev/docker-compose/infra.yaml @@ -0,0 +1,49 @@ +services: + postgres: + image: postgres:16 + ports: + - "5432:5432" + environment: + POSTGRES_USER: admin + POSTGRES_PASSWORD: admin + POSTGRES_DB: chronos_db + volumes: + - postgres:/var/lib/postgresql/data/ + healthcheck: + test: ["CMD-SHELL", "pg_isready -U admin -d chronos_db"] + interval: 5s + timeout: 5s + retries: 10 + networks: + - chronos + + kafka: + image: bitnami/kafka:latest + ports: + - "9094:9094" + environment: + KAFKA_CFG_NODE_ID: "0" + KAFKA_CFG_PROCESS_ROLES: controller,broker + KAFKA_CFG_CONTROLLER_QUORUM_VOTERS: 0@kafka:9093 + KAFKA_CFG_LISTENERS: PLAINTEXT://:9092,CONTROLLER://:9093,EXTERNAL://:9094 + KAFKA_CFG_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,EXTERNAL://localhost:9094 + 
KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,EXTERNAL:PLAINTEXT + KAFKA_CFG_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_CFG_INTER_BROKER_LISTENER_NAME: PLAINTEXT + KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE: "true" + healthcheck: + test: ["CMD-SHELL", "kafka-topics.sh --bootstrap-server localhost:9092 --list"] + interval: 10s + timeout: 10s + retries: 15 + start_period: 30s + networks: + - chronos + +networks: + chronos: + name: chronos + +volumes: + postgres: + driver: local diff --git a/dev/docker-compose/jaeger.yaml b/dev/docker-compose/jaeger.yaml new file mode 100644 index 0000000..0bd9ad0 --- /dev/null +++ b/dev/docker-compose/jaeger.yaml @@ -0,0 +1,34 @@ +services: + chronos: + environment: + OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: http://otel-collector:4318/v1/traces + depends_on: + otel-collector: + condition: service_started + + jaeger-all-in-one: + image: jaegertracing/all-in-one:latest + container_name: Jaeger + ports: + - "16686:16686" + environment: + COLLECTOR_OTLP_ENABLED: "true" + networks: + - chronos + + otel-collector: + image: otel/opentelemetry-collector:latest + container_name: otelcol + restart: unless-stopped + command: ["--config=/etc/otelcol-config.yml"] + volumes: + - ../otel/otelcol-config.yml:/etc/otelcol-config.yml:ro + ports: + - "1888:1888" + - "13133:13133" + - "4317:4317" + - "4318:4318" + depends_on: + - jaeger-all-in-one + networks: + - chronos diff --git a/dev/docker-compose/lgtm.yaml b/dev/docker-compose/lgtm.yaml new file mode 100644 index 0000000..e1c8b43 --- /dev/null +++ b/dev/docker-compose/lgtm.yaml @@ -0,0 +1,43 @@ +services: + chronos: + environment: + OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: http://lgtm:4318/v1/traces + depends_on: + lgtm: + condition: service_healthy + + lgtm: + image: grafana/otel-lgtm:0.24.1 + container_name: lgtm + environment: + ENABLE_LOGS_ALL: "true" + GF_LOG_CONSOLE_FORMAT: json + GF_LOG_FORMAT: json + GF_LOG_MODE: console + LGTM_LOG_DIR: /data/lgtm/logs + 
LOKI_EXTRA_ARGS: -log.format=json + PROMETHEUS_EXTRA_ARGS: --log.format=json + PYROSCOPE_EXTRA_ARGS: -log.format=json + ports: + - "3000:3000" + - "3100:3100" + - "3200:3200" + - "4040:4040" + - "4317:4317" + - "4318:4318" + - "9090:9090" + healthcheck: + test: ["CMD-SHELL", "sh /otel-lgtm/chronos-healthcheck.sh"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + volumes: + - ../lgtm/prometheus.yaml:/otel-lgtm/prometheus.yaml:ro + - ../lgtm/otelcol-contrib.yaml:/otel-lgtm/otelcol-config.yaml:ro + - ../lgtm/healthcheck.sh:/otel-lgtm/chronos-healthcheck.sh:ro + - ../lgtm/logging.sh:/otel-lgtm/logging.sh:ro + - ../lgtm/dashboards.yaml:/otel-lgtm/grafana/conf/provisioning/dashboards/chronos.yaml:ro + - ../dashboards:/otel-lgtm/grafana/conf/provisioning/dashboards/chronos:ro + networks: + - chronos diff --git a/dev/dashboards.yaml b/dev/lgtm/dashboards.yaml similarity index 100% rename from dev/dashboards.yaml rename to dev/lgtm/dashboards.yaml diff --git a/dev/lgtm-healthcheck.sh b/dev/lgtm/healthcheck.sh similarity index 100% rename from dev/lgtm-healthcheck.sh rename to dev/lgtm/healthcheck.sh diff --git a/dev/lgtm-logging.sh b/dev/lgtm/logging.sh similarity index 100% rename from dev/lgtm-logging.sh rename to dev/lgtm/logging.sh diff --git a/dev/otelcol-contrib.yaml b/dev/lgtm/otelcol-contrib.yaml similarity index 100% rename from dev/otelcol-contrib.yaml rename to dev/lgtm/otelcol-contrib.yaml diff --git a/dev/prometheus.yaml b/dev/lgtm/prometheus.yaml similarity index 100% rename from dev/prometheus.yaml rename to dev/lgtm/prometheus.yaml diff --git a/dev/makefiles/act.mk b/dev/makefiles/act.mk index bbabc79..2bd845e 100644 --- a/dev/makefiles/act.mk +++ b/dev/makefiles/act.mk @@ -1,6 +1,3 @@ -#!make -SHELL := /bin/bash - ACT_EVENT ?= push ACT_JOB ?= pre-commit ACT_RUNNER_IMAGE ?= catthehacker/ubuntu:act-latest @@ -19,7 +16,7 @@ SBOM_WORKFLOW ?= .github/workflows/sbom.yml SBOM_TARGET_TYPE ?= release SBOM_TARGET_REF ?= . 
-.PHONY: act.ci act.ci.job act.pre-commit act.test act.scan act.build-binary act.build-container act.sbom act.sbom.container act.sbom.release +.PHONY: act.ci act.ci.job act.pre-commit workflow.pre-commit.act act.test act.scan act.build-binary act.build-container act.sbom act.sbom.container act.sbom.release act.ci: mkdir -p "$(ACT_ARTIFACT_DIR)" @@ -32,6 +29,9 @@ act.ci.job: act.pre-commit: act workflow_dispatch -W "$(PRE_COMMIT_WORKFLOW)" $(ACT_FLAGS) +## workflow.pre-commit.act: Run the pre-commit GitHub Actions workflow locally with act +workflow.pre-commit.act: act.pre-commit + act.test: act workflow_dispatch -W "$(TEST_WORKFLOW)" $(ACT_FLAGS) diff --git a/dev/makefiles/ci.mk b/dev/makefiles/ci.mk new file mode 100644 index 0000000..24c6629 --- /dev/null +++ b/dev/makefiles/ci.mk @@ -0,0 +1,8 @@ +GITHUB_CONFIG ?= .github/config.json + +## repo.config.apply: Apply GitHub repository and branch settings from .github/config.json +repo.config.apply: + $(call pp,apply GitHub repository config from $(GITHUB_CONFIG)...) 
+ scripts/apply-github-config.sh "$(GITHUB_CONFIG)" + +.PHONY: repo.config.apply diff --git a/dev/makefiles/common.mk b/dev/makefiles/common.mk new file mode 100644 index 0000000..995cae9 --- /dev/null +++ b/dev/makefiles/common.mk @@ -0,0 +1,16 @@ +RUST_VERSION := $(shell grep 'channel' rust-toolchain.toml | sed 's/.*"\(.*\)"/\1/') + +yellow := $(shell tput setaf 3 2>/dev/null || true) +normal := $(shell tput sgr0 2>/dev/null || true) + +define pp + @printf '$(yellow)$(1)$(normal)\n' +endef + +define require_cmd + @command -v $(1) >/dev/null 2>&1 || { \ + printf 'Missing required command: %s\n' '$(1)' >&2; \ + printf 'Install it with your system package manager, then run make setup again.\n' >&2; \ + exit 1; \ + } +endef diff --git a/dev/makefiles/dev.mk b/dev/makefiles/dev.mk new file mode 100644 index 0000000..7d730fd --- /dev/null +++ b/dev/makefiles/dev.mk @@ -0,0 +1,54 @@ +## setup: Check local development dependencies and prepare .env +setup: + $(call pp,checking development dependencies...) + $(call require_cmd,cargo) + $(call require_cmd,rustup) + $(call require_cmd,docker) + $(call require_cmd,curl) + $(call require_cmd,awk) + @test -e .env || cp .env.example .env + @rustup component list --installed | grep -q '^rustfmt' || { echo 'Missing Rust component: rustfmt. Install with: rustup component add rustfmt' >&2; exit 1; } + @rustup component list --installed | grep -q '^clippy' || { echo 'Missing Rust component: clippy. Install with: rustup component add clippy' >&2; exit 1; } + @printf 'Development dependencies look ready.\n' + +# The sub-make is invoked through the MAKE variable so it uses the same make binary and inherits flags and the jobserver. +## withenv: Run a make recipe with variables loaded from .env, for example make withenv RECIPE=run +withenv: + test -e .env || cp .env.example .env + bash -c 'set -o allexport; source .env; set +o allexport; $(MAKE) "$$RECIPE"' + +## dev.init: Initialize local dev environment +dev.init: setup + $(call pp,checking rust tests...) + cargo test + +dev.chronos_ex: + $(call pp,creating kafka topic...)
+ cargo run --example chronos_ex + +## pg.create: Create database +pg.create: + $(call pp,creating database...) + cargo run --example pg_create_database + +## pg.migrate: Run migrations on database +pg.migrate: + $(call pp,running migrations on database...) + cargo run --package pg_mig --bin chronos-pg-migrations + +## run: Run Chronos locally +run: + $(call pp,run app...) + cargo run --package chronos_bin --bin chronos + +## run.release: Run Chronos locally in release mode +run.release: + $(call pp,run app...) + cargo run --package chronos_bin -r --bin chronos + +## dev.run: Run Chronos in cargo-watch mode +dev.run: + $(call pp,run app...) + cargo watch -q -c -x 'run --package chronos_bin --bin chronos' + +.PHONY: setup withenv dev.init dev.chronos_ex pg.create pg.migrate run run.release dev.run diff --git a/dev/makefiles/docker.mk b/dev/makefiles/docker.mk new file mode 100644 index 0000000..1f548a9 --- /dev/null +++ b/dev/makefiles/docker.mk @@ -0,0 +1,36 @@ +COMPOSE_PROJECT_NAME ?= chronos +COMPOSE_FILE_BASE := dev/docker-compose/compose.yaml +COMPOSE_FILE_JAEGER := dev/docker-compose/jaeger.yaml +COMPOSE_FILE_LGTM := dev/docker-compose/lgtm.yaml +BACKEND_ARG := $(firstword $(filter jaeger lgtm,$(MAKECMDGOALS))) +BACKEND ?= $(if $(BACKEND_ARG),$(BACKEND_ARG),jaeger) +COMPOSE_BACKEND_FILE := $(if $(filter lgtm,$(BACKEND)),$(COMPOSE_FILE_LGTM),$(COMPOSE_FILE_JAEGER)) +DOCKER_COMPOSE := docker compose --project-name $(COMPOSE_PROJECT_NAME) -f $(COMPOSE_FILE_BASE) -f $(COMPOSE_BACKEND_FILE) +DOCKER_COMPOSE_JAEGER := docker compose --project-name $(COMPOSE_PROJECT_NAME) -f $(COMPOSE_FILE_BASE) -f $(COMPOSE_FILE_JAEGER) +DOCKER_COMPOSE_LGTM := docker compose --project-name $(COMPOSE_PROJECT_NAME) -f $(COMPOSE_FILE_BASE) -f $(COMPOSE_FILE_LGTM) + +## up: Build and start Chronos, dependencies, and observability. Use make up lgtm or BACKEND=lgtm for LGTM +up: + $(call pp,starting docker compose stack with $(BACKEND) observability...) 
+ $(DOCKER_COMPOSE) up -d --build + +## down: Stop the docker compose stack +down: + $(call pp,stopping docker compose stack...) + $(DOCKER_COMPOSE_LGTM) down 2>/dev/null || true + $(DOCKER_COMPOSE_JAEGER) down 2>/dev/null || true + +## docker.config: Render the docker compose configuration +docker.config: + $(DOCKER_COMPOSE) config + +## docker.up: Legacy alias for make up +docker.up: up + +## docker.down: Legacy alias for make down +docker.down: down + +jaeger lgtm: + @: + +.PHONY: up down docker.config docker.up docker.down jaeger lgtm diff --git a/dev/makefiles/integration.mk b/dev/makefiles/integration.mk new file mode 100644 index 0000000..3e58e2d --- /dev/null +++ b/dev/makefiles/integration.mk @@ -0,0 +1,12 @@ +## integration: Start deps, migrate, run Chronos, publish test message, verify metrics +integration: build + $(call pp,running integration test...) + @bash scripts/integration.sh + +## integration.down: Stop docker services started by make integration +integration.down: + $(call pp,stopping integration services...) + docker compose --project-name chronos -f dev/docker-compose/compose.yaml stop postgres kafka 2>/dev/null || true + docker compose --project-name chronos -f dev/docker-compose/compose.yaml rm -f postgres kafka 2>/dev/null || true + +.PHONY: integration integration.down diff --git a/dev/makefiles/rust.mk b/dev/makefiles/rust.mk new file mode 100644 index 0000000..fdce9a1 --- /dev/null +++ b/dev/makefiles/rust.mk @@ -0,0 +1,117 @@ +EXPORTER ?= prom +WEAVER_VERSION ?= 0.23.0 +WEAVER_IMAGE ?= otel/weaver:v$(WEAVER_VERSION) +WEAVER_REGISTRY ?= examples/weaver/registry +WEAVER_TEMPLATES ?= examples/weaver/templates +WEAVER_GENERATED_DIR ?= chronos_bin/src/metrics/generated +WEAVER_LIVE_CHECK_PORT ?= 4319 +WEAVER_LIVE_CHECK_ADMIN_PORT ?= 4320 +WEAVER_LIVE_CHECK_OUT ?= /tmp/chronos-weaver-live-check + +## build: Build Rust binaries +build: weaver.generate + $(call pp,build rust...) 
+ cargo build + +## fmt: Format Rust code +fmt: + $(call pp,format rust...) + cargo fmt + +## lint: Check Rust formatting, clippy, and cargo check +lint: + $(call pp,lint rust...) + RUSTFLAGS="-D warnings" cargo check + cargo fmt -- --check + RUSTFLAGS="-D warnings" cargo clippy --all-targets -- -D warnings + +## test: Run Rust unit tests +test: test.unit + +## test.unit: Run Rust unit tests +test.unit: + $(call pp,rust unit tests...) + RUSTFLAGS="-D warnings" cargo test + +## pre-commit: Run pre-commit checks +pre-commit: lint test.unit + +## test.unit.coverage: Run Rust unit tests with coverage reports +test.unit.coverage: + $(call pp,rust unit tests...) + sh scripts/coverage-report.sh + +## metrics.check: Verify /metrics endpoint responds +metrics.check: + $(call pp,check metrics endpoint...) + curl -sf "http://localhost:$${OTEL_EXPORTER_PROMETHEUS_PORT:-$${METRICS_PORT:-9090}}/metrics" | head -20 + +## metrics.mock: Run Prometheus/OTLP metrics mock example with EXPORTER=prom|otlp +metrics.mock: + $(call pp,run metrics mock example with exporter $(EXPORTER)...) 
+ @case "$(EXPORTER)" in \ + prom|prometheus) OTEL_METRICS_EXPORTER=prometheus OTEL_EXPORTER_PROMETHEUS_HOST=$${OTEL_EXPORTER_PROMETHEUS_HOST:-127.0.0.1} OTEL_EXPORTER_PROMETHEUS_PORT=$${OTEL_EXPORTER_PROMETHEUS_PORT:-9092} cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ + otlp) OTEL_SERVICE_NAME=chronos-metrics-mock OTEL_RESOURCE_ATTRIBUTES=service.instance.id=chronos-metrics-mock-local OTEL_METRICS_EXPORTER=otlp OTEL_EXPORTER_OTLP_PROTOCOL=grpc OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=$${OTEL_EXPORTER_OTLP_METRICS_ENDPOINT:-http://127.0.0.1:4317} OTEL_METRIC_EXPORT_INTERVAL=$${OTEL_METRIC_EXPORT_INTERVAL:-1000} cargo run --package prom_otlp_mock_runner --bin prom_otlp_mock ;; \ + *) echo "unsupported EXPORTER=$(EXPORTER); use EXPORTER=prom or EXPORTER=otlp" >&2; exit 2 ;; \ + esac + +## weaver.check: Validate the Chronos Weaver registry +weaver.check: + $(call pp,check Weaver registry with $(WEAVER_IMAGE)...) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry check -r $(WEAVER_REGISTRY) + +## weaver.generate.rust: Generate Rust metric definitions with Weaver +weaver.generate.rust: + $(call pp,generate Rust metric definitions with $(WEAVER_IMAGE)...) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_REGISTRY) --templates $(WEAVER_TEMPLATES) rust $(WEAVER_GENERATED_DIR) + rustfmt --config-path rustfmt.toml $(WEAVER_GENERATED_DIR)/chronos_metric_definitions.rs + +## weaver.generate.markdown: Generate Chronos metrics markdown docs with Weaver +weaver.generate.markdown: + $(call pp,generate metrics markdown docs with $(WEAVER_IMAGE)...) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_REGISTRY) --templates $(WEAVER_TEMPLATES) markdown $(WEAVER_GENERATED_DIR) + +## weaver.generate.json-schema: Generate Weaver resolved-registry JSON schema +weaver.generate.json-schema: + $(call pp,generate Weaver JSON schema with $(WEAVER_IMAGE)...) 
+ mkdir -p $(WEAVER_GENERATED_DIR) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry json-schema -o $(WEAVER_GENERATED_DIR)/resolved-registry.schema.json + +## weaver.generate: Generate all Weaver artifacts +weaver.generate: weaver.generate.rust weaver.generate.markdown weaver.generate.json-schema + +## weaver.live-check: Run Weaver live-check against the OTLP metrics mock +weaver.live-check: + $(call pp,run Weaver live-check against metrics mock...) + @set -euo pipefail; \ + cargo build --package prom_otlp_mock_runner; \ + rm -rf "$(WEAVER_LIVE_CHECK_OUT)"; \ + mkdir -p "$(WEAVER_LIVE_CHECK_OUT)"; \ + docker run --rm --network host \ + -v "$(PWD):/work" \ + -v "$(WEAVER_LIVE_CHECK_OUT):/out" \ + -w /work \ + $(WEAVER_IMAGE) registry live-check \ + -r $(WEAVER_REGISTRY) \ + --input-source otlp \ + --otlp-grpc-address 127.0.0.1 \ + --otlp-grpc-port $(WEAVER_LIVE_CHECK_PORT) \ + --admin-port $(WEAVER_LIVE_CHECK_ADMIN_PORT) \ + --inactivity-timeout 5 \ + --no-stream \ + --format json \ + -o /out & \ + live_check_pid=$$!; \ + trap 'kill "$$live_check_pid" 2>/dev/null || true' EXIT; \ + sleep 2; \ + OTEL_SERVICE_NAME=chronos-metrics-mock \ + OTEL_RESOURCE_ATTRIBUTES=service.instance.id=chronos-metrics-mock-live-check \ + OTEL_METRICS_EXPORTER=otlp \ + OTEL_EXPORTER_OTLP_PROTOCOL=grpc \ + OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=http://127.0.0.1:$(WEAVER_LIVE_CHECK_PORT) \ + OTEL_METRIC_EXPORT_INTERVAL=500 \ + timeout -s INT 10 cargo run --quiet --package prom_otlp_mock_runner --bin prom_otlp_mock || test "$$?" 
-eq 124; \ + wait "$$live_check_pid"; \ + find "$(WEAVER_LIVE_CHECK_OUT)" -maxdepth 1 -type f -print + +.PHONY: build fmt lint test test.unit pre-commit test.unit.coverage metrics.check metrics.mock weaver.check weaver.generate.rust weaver.generate.markdown weaver.generate.json-schema weaver.generate weaver.live-check diff --git a/dev/makefiles/telemetry.mk b/dev/makefiles/telemetry.mk new file mode 100644 index 0000000..59bea7b --- /dev/null +++ b/dev/makefiles/telemetry.mk @@ -0,0 +1,24 @@ +LGTM_IMAGE ?= grafana/otel-lgtm:0.24.1 + +## lgtm.validate: Validate LGTM Prometheus and OpenTelemetry Collector configs +lgtm.validate: + $(call pp,validate LGTM Prometheus config with $(LGTM_IMAGE)...) + docker run --rm \ + -v "$(PWD)/dev/lgtm/prometheus.yaml:/otel-lgtm/prometheus.yaml:ro" \ + --entrypoint /otel-lgtm/prometheus/promtool \ + $(LGTM_IMAGE) check config /otel-lgtm/prometheus.yaml + $(call pp,validate LGTM OpenTelemetry Collector config with $(LGTM_IMAGE)...) + docker run --rm \ + -v "$(PWD)/dev/lgtm/otelcol-contrib.yaml:/otel-lgtm/otelcol-config.yaml:ro" \ + --entrypoint /otel-lgtm/otelcol-contrib/otelcol-contrib \ + $(LGTM_IMAGE) validate --config=file:/otel-lgtm/otelcol-config.yaml --feature-gates=service.profilesSupport + +## lgtm.up: Start the docker compose stack with Grafana LGTM +lgtm.up: + $(MAKE) up BACKEND=lgtm + +## lgtm.down: Stop the docker compose stack with Grafana LGTM +lgtm.down: + $(MAKE) down BACKEND=lgtm + +.PHONY: lgtm.validate lgtm.up lgtm.down diff --git a/infra/otelcol-config.yml b/dev/otel/otelcol-config.yml similarity index 100% rename from infra/otelcol-config.yml rename to dev/otel/otelcol-config.yml diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index 2ee777f..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,148 +0,0 @@ -version: '3.8' - -# ───────────────────────────────────────────────────────────────────────────── -# Core infrastructure -# 
───────────────────────────────────────────────────────────────────────────── -services: - - postgres: - image: postgres:16 - ports: - - "5432:5432" - environment: - POSTGRES_USER: admin - POSTGRES_PASSWORD: admin - POSTGRES_DB: chronos_db - volumes: - - postgres:/var/lib/postgresql/data/ - healthcheck: - test: ["CMD-SHELL", "pg_isready -U admin -d chronos_db"] - interval: 5s - timeout: 5s - retries: 10 - networks: - - chronos - - kafka: - image: bitnami/kafka:latest - ports: - - "9094:9094" # External port for host-machine access - environment: - # KRaft (no ZooKeeper required) - KAFKA_CFG_NODE_ID: "0" - KAFKA_CFG_PROCESS_ROLES: controller,broker - KAFKA_CFG_CONTROLLER_QUORUM_VOTERS: 0@kafka:9093 - # Listeners - KAFKA_CFG_LISTENERS: PLAINTEXT://:9092,CONTROLLER://:9093,EXTERNAL://:9094 - KAFKA_CFG_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,EXTERNAL://localhost:9094 - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,EXTERNAL:PLAINTEXT - KAFKA_CFG_CONTROLLER_LISTENER_NAMES: CONTROLLER - KAFKA_CFG_INTER_BROKER_LISTENER_NAME: PLAINTEXT - KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE: "true" - healthcheck: - test: ["CMD-SHELL", "kafka-topics.sh --bootstrap-server localhost:9092 --list"] - interval: 10s - timeout: 10s - retries: 15 - start_period: 30s - networks: - - chronos - -# ───────────────────────────────────────────────────────────────────────────── -# Database migrations (one-shot init container) -# ───────────────────────────────────────────────────────────────────────────── - chronos-pg-migrations: - build: - context: . 
- dockerfile: Dockerfile.chronos-pg-migrations - environment: - PG_HOST: postgres - PG_PORT: "5432" - PG_USER: admin - PG_PASSWORD: admin - PG_DATABASE: chronos_db - depends_on: - postgres: - condition: service_healthy - restart: "no" - networks: - - chronos - -# ───────────────────────────────────────────────────────────────────────────── -# Chronos application -# ───────────────────────────────────────────────────────────────────────────── - chronos: - build: - context: . - dockerfile: Dockerfile.chronos - ports: - - "9091:9091" # Prometheus /metrics endpoint - environment: - KAFKA_HOST: kafka - KAFKA_PORT: "9092" - KAFKA_CLIENT_ID: chronos - KAFKA_GROUP_ID: chronos - KAFKA_IN_TOPIC: chronos.in - KAFKA_OUT_TOPIC: chronos.out - KAFKA_USERNAME: "" - KAFKA_PASSWORD: "" - PG_HOST: postgres - PG_PORT: "5432" - PG_USER: admin - PG_PASSWORD: admin - PG_DATABASE: chronos_db - PG_POOL_SIZE: "10" - RUST_LOG: info - OTEL_EXPORTER_PROMETHEUS_HOST: "0.0.0.0" - OTEL_EXPORTER_PROMETHEUS_PORT: "9091" - MONITOR_DB_POLL: "5" - PROCESSOR_DB_POLL: "5" - TIMING_ADVANCE: "0" - FAIL_DETECT_INTERVAL: "10" - depends_on: - postgres: - condition: service_healthy - kafka: - condition: service_healthy - chronos-pg-migrations: - condition: service_completed_successfully - networks: - - chronos - -# ───────────────────────────────────────────────────────────────────────────── -# Telemetry (optional – used for distributed tracing) -# ───────────────────────────────────────────────────────────────────────────── - jaeger-all-in-one: - image: jaegertracing/all-in-one:latest - ports: - - "16686:16686" - container_name: Jaeger - environment: - - COLLECTOR_OTLP_ENABLED=true - networks: - - chronos - - otel-collector: - image: otel/opentelemetry-collector:latest - container_name: otelcol - restart: unless-stopped - command: ["--config=/etc/otelcol-config.yml"] - volumes: - - ./infra/otelcol-config.yml:/etc/otelcol-config.yml - ports: - - "1888:1888" # pprof extension - - "13133:13133" # health_check 
extension - - "4317:4317" # OTLP gRPC receiver - - "4318:4318" # OTLP HTTP receiver - depends_on: - - jaeger-all-in-one - networks: - - chronos - -networks: - chronos: - name: chronos - -volumes: - postgres: - driver: local diff --git a/scripts/integration.sh b/scripts/integration.sh index 7ab3a2d..806e2fd 100755 --- a/scripts/integration.sh +++ b/scripts/integration.sh @@ -9,6 +9,7 @@ set -euo pipefail KAFKA_EXT_PORT="${KAFKA_EXT_PORT:-9094}" PG_PORT="${PG_PORT:-5432}" METRICS_PORT="${OTEL_EXPORTER_PROMETHEUS_PORT:-${METRICS_PORT:-9090}}" +COMPOSE="docker compose --project-name chronos -f dev/docker-compose/compose.yaml" CHRONOS_PID_FILE="/tmp/chronos_integration.pid" CHRONOS_LOG="/tmp/chronos_integration.log" MAX_WAIT=120 # seconds to wait for each readiness check @@ -53,15 +54,15 @@ wait_for() { # ─── 1. start infrastructure ────────────────────────────────────────────────── log "Starting infrastructure (postgres + kafka)..." -docker compose up -d postgres kafka +${COMPOSE} up -d postgres kafka # ─── 2. wait for postgres ───────────────────────────────────────────────────── wait_for "postgres" \ - docker compose exec -T postgres pg_isready -U admin -d chronos_db + ${COMPOSE} exec -T postgres pg_isready -U admin -d chronos_db # ─── 3. wait for kafka ──────────────────────────────────────────────────────── wait_for "kafka" \ - docker compose exec -T kafka \ + ${COMPOSE} exec -T kafka \ /opt/bitnami/kafka/bin/kafka-topics.sh --bootstrap-server localhost:9092 --list # ─── 4. run migrations ──────────────────────────────────────────────────────── @@ -129,7 +130,7 @@ ok "Message published" # (timeout reached) which is normal when the topic drains before max-messages. log "Waiting for message ${MSG_ID} on chronos.out (up to 30s)..." 
FIRED_OUTPUT=$( - docker compose exec -T kafka \ + ${COMPOSE} exec -T kafka \ /opt/bitnami/kafka/bin/kafka-console-consumer.sh \ --bootstrap-server localhost:9092 \ --topic chronos.out \ diff --git a/scripts/pre-commit-checks.sh b/scripts/pre-commit-checks.sh deleted file mode 100755 index 4b7d15c..0000000 --- a/scripts/pre-commit-checks.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -set -e - -make withenv RECIPE=lint -make withenv RECIPE=test.unit - - From 60598c603cad77b3c1d0729052f94f3040decf17 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 13:45:26 +1000 Subject: [PATCH 20/36] chore: split production and example weaver assets Move production Weaver registry and templates under dev/weaver/production so application generation no longer depends on examples/weaver. Keep examples/weaver as the explicit example input set. Route production Weaver docs to docs/chronos_metrics.md and the resolved registry schema to docs/schema/resolved-registry.schema.json. Make build depend on weaver.production.generate; example artifacts now require an explicit make weaver.example.generate call. 
Verification: - make build - make weaver.example.generate - make pre-commit Model-version: GPT-5 --- AGENTS.md | 2 +- How-to.md | 5 +- .../generated/chronos_metric_definitions.rs | 2 +- dev/makefiles/rust.mk | 79 +++++++--- .../production/registry/chronos/metrics.yaml | 139 ++++++++++++++++++ .../templates/registry/markdown/metrics.md.j2 | 9 ++ .../templates/registry/markdown/weaver.yaml | 17 +++ .../templates/registry/rust/registry.rs.j2 | 58 ++++++++ .../templates/registry/rust/weaver.yaml | 22 +++ .../generated => docs}/chronos_metrics.md | 2 +- .../schema}/resolved-registry.schema.json | 0 .../generated/chronos_metric_definitions.rs | 50 ++++--- examples/weaver/generated/chronos_metrics.md | 11 +- 13 files changed, 340 insertions(+), 56 deletions(-) create mode 100644 dev/weaver/production/registry/chronos/metrics.yaml create mode 100644 dev/weaver/production/templates/registry/markdown/metrics.md.j2 create mode 100644 dev/weaver/production/templates/registry/markdown/weaver.yaml create mode 100644 dev/weaver/production/templates/registry/rust/registry.rs.j2 create mode 100644 dev/weaver/production/templates/registry/rust/weaver.yaml rename {chronos_bin/src/metrics/generated => docs}/chronos_metrics.md (90%) rename {chronos_bin/src/metrics/generated => docs/schema}/resolved-registry.schema.json (100%) diff --git a/AGENTS.md b/AGENTS.md index a7bdfd1..10e86ce 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -117,6 +117,6 @@ When making tradeoffs, record the chosen path and the reason. Avoid relying on c - Chronos treats Kafka message bodies opaquely and forwards messages after delay; avoid adding application-level assumptions about payload shape. - The README describes at-least-once delivery semantics. Preserve behavior that supports persistence, recovery from suspected node failure, and duplicate-safe processing. -- Metrics work on the `feat/prom_metrics` branch currently includes a Prometheus endpoint and metric-family checks in the integration script. 
Changes to metrics should preserve unit tests for registry output and integration checks for expected metric families. +- Metrics work on the `feat/prom_metrics` branch currently includes a Prometheus endpoint and metric-family checks in the integration script. Production Weaver inputs live under `dev/weaver/production`; example Weaver inputs live under `examples/weaver` and are generated only by explicit example recipes. Changes to metrics should preserve unit tests for registry output and integration checks for expected metric families. - Local development commonly uses `.env` copied from [.env.example](.env.example) through `make setup` or `make withenv`. - Docker Compose files live in `dev/docker-compose`. `make up` starts Chronos with PostgreSQL, Kafka, Jaeger, and the OpenTelemetry Collector by default; `make up lgtm` uses the LGTM backend. diff --git a/How-to.md b/How-to.md index f3e1c3f..8fd19ac 100644 --- a/How-to.md +++ b/How-to.md @@ -77,9 +77,9 @@ make up lgtm The overlay mounts local override files from `dev/lgtm` for Prometheus, the OpenTelemetry Collector, and Grafana dashboard provisioning. Chronos exposes its Prometheus metrics endpoint with `OTEL_EXPORTER_PROMETHEUS_HOST` and `OTEL_EXPORTER_PROMETHEUS_PORT`; when run from Docker Compose the endpoint is `chronos:9091`. -Chronos metrics are generated from the OpenTelemetry Weaver registry in `examples/weaver/registry/chronos/metrics.yaml` into `chronos_bin/src/metrics/generated`. `OTEL_METRICS_EXPORTER=prometheus` is the default and exposes `/metrics` with the `chronos_` Prometheus namespace, for example `chronos_msg_jitter`. `OTEL_METRICS_EXPORTER=otlp` records the same generated metric IDs through the OTLP gRPC metrics exporter. +Chronos production metrics are generated from the OpenTelemetry Weaver registry in `dev/weaver/production/registry/chronos/metrics.yaml`. 
Rust definitions are generated into `chronos_bin/src/metrics/generated`, Markdown docs into `docs/chronos_metrics.md`, and the resolved registry schema into `docs/schema/resolved-registry.schema.json`. `OTEL_METRICS_EXPORTER=prometheus` is the default and exposes `/metrics` with the `chronos_` Prometheus namespace, for example `chronos_msg_jitter`. `OTEL_METRICS_EXPORTER=otlp` records the same generated metric IDs through the OTLP gRPC metrics exporter. -`make build` runs `make weaver.generate` before compiling, which refreshes the generated Rust definitions, Markdown metric docs, and resolved registry JSON schema. +`make build` runs `make weaver.production.generate` before compiling, which refreshes the production Rust definitions, Markdown metric docs, and resolved registry JSON schema. Example Weaver artifacts are generated only when explicitly requested with `make weaver.example.generate`. Validate the LGTM configuration files with: @@ -92,4 +92,3 @@ Two images are published for each [RELEASE]( `https://github.com/kindredgroup/ch - migrations image - chornos image - diff --git a/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs b/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs index f849fb9..9fdb216 100644 --- a/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs +++ b/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs @@ -1,4 +1,4 @@ -// Generated from examples/weaver/registry/chronos/metrics.yaml by OpenTelemetry Weaver. +// Generated from dev/weaver/production/registry/chronos/metrics.yaml by OpenTelemetry Weaver. // Do not edit by hand. 
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] diff --git a/dev/makefiles/rust.mk b/dev/makefiles/rust.mk index fdce9a1..70368c0 100644 --- a/dev/makefiles/rust.mk +++ b/dev/makefiles/rust.mk @@ -1,15 +1,21 @@ EXPORTER ?= prom WEAVER_VERSION ?= 0.23.0 WEAVER_IMAGE ?= otel/weaver:v$(WEAVER_VERSION) -WEAVER_REGISTRY ?= examples/weaver/registry -WEAVER_TEMPLATES ?= examples/weaver/templates -WEAVER_GENERATED_DIR ?= chronos_bin/src/metrics/generated +WEAVER_PRODUCTION_REGISTRY ?= dev/weaver/production/registry +WEAVER_PRODUCTION_TEMPLATES ?= dev/weaver/production/templates +WEAVER_PRODUCTION_RUST_OUT ?= chronos_bin/src/metrics/generated +WEAVER_PRODUCTION_DOCS_OUT ?= docs +WEAVER_SCHEMA_OUT ?= docs/schema +WEAVER_EXAMPLE_REGISTRY ?= examples/weaver/registry +WEAVER_EXAMPLE_TEMPLATES ?= examples/weaver/templates +WEAVER_EXAMPLE_OUT ?= examples/weaver/generated +WEAVER_REGISTRY ?= $(WEAVER_PRODUCTION_REGISTRY) WEAVER_LIVE_CHECK_PORT ?= 4319 WEAVER_LIVE_CHECK_ADMIN_PORT ?= 4320 WEAVER_LIVE_CHECK_OUT ?= /tmp/chronos-weaver-live-check ## build: Build Rust binaries -build: weaver.generate +build: weaver.production.generate $(call pp,build rust...) cargo build @@ -55,30 +61,61 @@ metrics.mock: *) echo "unsupported EXPORTER=$(EXPORTER); use EXPORTER=prom or EXPORTER=otlp" >&2; exit 2 ;; \ esac -## weaver.check: Validate the Chronos Weaver registry -weaver.check: +## weaver.production.check: Validate the production Chronos Weaver registry +weaver.production.check: $(call pp,check Weaver registry with $(WEAVER_IMAGE)...) 
- docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry check -r $(WEAVER_REGISTRY) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry check -r $(WEAVER_PRODUCTION_REGISTRY) -## weaver.generate.rust: Generate Rust metric definitions with Weaver -weaver.generate.rust: +## weaver.production.generate.rust: Generate production Rust metric definitions with Weaver +weaver.production.generate.rust: $(call pp,generate Rust metric definitions with $(WEAVER_IMAGE)...) - docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_REGISTRY) --templates $(WEAVER_TEMPLATES) rust $(WEAVER_GENERATED_DIR) - rustfmt --config-path rustfmt.toml $(WEAVER_GENERATED_DIR)/chronos_metric_definitions.rs + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_PRODUCTION_REGISTRY) --templates $(WEAVER_PRODUCTION_TEMPLATES) rust $(WEAVER_PRODUCTION_RUST_OUT) + rustfmt --config-path rustfmt.toml $(WEAVER_PRODUCTION_RUST_OUT)/chronos_metric_definitions.rs -## weaver.generate.markdown: Generate Chronos metrics markdown docs with Weaver -weaver.generate.markdown: +## weaver.production.generate.docs: Generate production Chronos metrics docs with Weaver +weaver.production.generate.docs: $(call pp,generate metrics markdown docs with $(WEAVER_IMAGE)...) - docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_REGISTRY) --templates $(WEAVER_TEMPLATES) markdown $(WEAVER_GENERATED_DIR) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_PRODUCTION_REGISTRY) --templates $(WEAVER_PRODUCTION_TEMPLATES) markdown $(WEAVER_PRODUCTION_DOCS_OUT) -## weaver.generate.json-schema: Generate Weaver resolved-registry JSON schema -weaver.generate.json-schema: +## weaver.production.generate.schema: Generate production Weaver resolved-registry JSON schema +weaver.production.generate.schema: $(call pp,generate Weaver JSON schema with $(WEAVER_IMAGE)...) 
- mkdir -p $(WEAVER_GENERATED_DIR) - docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry json-schema -o $(WEAVER_GENERATED_DIR)/resolved-registry.schema.json + mkdir -p $(WEAVER_SCHEMA_OUT) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry json-schema -o $(WEAVER_SCHEMA_OUT)/resolved-registry.schema.json -## weaver.generate: Generate all Weaver artifacts -weaver.generate: weaver.generate.rust weaver.generate.markdown weaver.generate.json-schema +## weaver.production.generate: Generate production Weaver Rust, docs, and schema artifacts +weaver.production.generate: weaver.production.generate.rust weaver.production.generate.docs weaver.production.generate.schema + +## weaver.example.check: Validate the example Chronos Weaver registry +weaver.example.check: + $(call pp,check example Weaver registry with $(WEAVER_IMAGE)...) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry check -r $(WEAVER_EXAMPLE_REGISTRY) + +## weaver.example.generate.rust: Generate example Rust metric definitions with Weaver +weaver.example.generate.rust: + $(call pp,generate example Rust metric definitions with $(WEAVER_IMAGE)...) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_EXAMPLE_REGISTRY) --templates $(WEAVER_EXAMPLE_TEMPLATES) rust $(WEAVER_EXAMPLE_OUT) + rustfmt --config-path rustfmt.toml $(WEAVER_EXAMPLE_OUT)/chronos_metric_definitions.rs + +## weaver.example.generate.docs: Generate example Chronos metrics docs with Weaver +weaver.example.generate.docs: + $(call pp,generate example metrics markdown docs with $(WEAVER_IMAGE)...) 
+ docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_EXAMPLE_REGISTRY) --templates $(WEAVER_EXAMPLE_TEMPLATES) markdown $(WEAVER_EXAMPLE_OUT) + +## weaver.example.generate.schema: Generate example Weaver resolved-registry JSON schema +weaver.example.generate.schema: + $(call pp,generate example Weaver JSON schema with $(WEAVER_IMAGE)...) + mkdir -p $(WEAVER_EXAMPLE_OUT) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry json-schema -o $(WEAVER_EXAMPLE_OUT)/resolved-registry.schema.json + +## weaver.example.generate: Explicitly generate example Weaver Rust, docs, and schema artifacts +weaver.example.generate: weaver.example.generate.rust weaver.example.generate.docs weaver.example.generate.schema + +## weaver.check: Validate the production Chronos Weaver registry +weaver.check: weaver.production.check + +## weaver.generate: Generate production Weaver artifacts +weaver.generate: weaver.production.generate ## weaver.live-check: Run Weaver live-check against the OTLP metrics mock weaver.live-check: @@ -114,4 +151,4 @@ weaver.live-check: wait "$$live_check_pid"; \ find "$(WEAVER_LIVE_CHECK_OUT)" -maxdepth 1 -type f -print -.PHONY: build fmt lint test test.unit pre-commit test.unit.coverage metrics.check metrics.mock weaver.check weaver.generate.rust weaver.generate.markdown weaver.generate.json-schema weaver.generate weaver.live-check +.PHONY: build fmt lint test test.unit pre-commit test.unit.coverage metrics.check metrics.mock weaver.production.check weaver.production.generate.rust weaver.production.generate.docs weaver.production.generate.schema weaver.production.generate weaver.example.check weaver.example.generate.rust weaver.example.generate.docs weaver.example.generate.schema weaver.example.generate weaver.check weaver.generate weaver.live-check diff --git a/dev/weaver/production/registry/chronos/metrics.yaml b/dev/weaver/production/registry/chronos/metrics.yaml new file mode 100644 index 0000000..e2eb38d 
--- /dev/null +++ b/dev/weaver/production/registry/chronos/metrics.yaml @@ -0,0 +1,139 @@ +groups: + - id: resource.chronos.service + type: attribute_group + stability: development + brief: Resource attributes emitted by the Chronos metrics mock. + attributes: + - id: service.name + type: string + stability: stable + brief: Logical name of the service. + examples: ["chronos-metrics-mock"] + requirement_level: required + - id: service.instance.id + type: string + stability: stable + brief: The string ID of the service instance. + examples: ["chronos-metrics-mock-live-check"] + requirement_level: required + + - id: metric_attributes.chronos.consume_result + type: attribute_group + stability: development + brief: Attributes for Chronos input message handling outcomes. + attributes: + - id: destination + type: string + stability: development + brief: Downstream selected by message_receiver::handle_message. + examples: ["kafka", "postgres"] + requirement_level: required + - id: chronos.consume.status + type: string + stability: development + brief: Whether the consume path completed successfully. + examples: ["pass", "fail"] + requirement_level: required + + - id: metric_attributes.chronos.process_result + type: attribute_group + stability: development + brief: Attributes for Chronos ready-message processor loop outcomes. + attributes: + - id: returned + type: string + stability: development + brief: Whether the processor loop returned early because no rows were ready. + examples: ["true", "false"] + requirement_level: required + - id: chronos.process.status + type: string + stability: development + brief: Whether the processor loop completed successfully. + examples: ["pass", "fail"] + requirement_level: required + + - id: metric.chronos.message.consume.duration + type: metric + metric_name: chronos.message.consume.duration + stability: development + brief: Duration of handle_message() in message_receiver. 
+ instrument: histogram + unit: s + extends: metric_attributes.chronos.consume_result + annotations: + code_generation: + rust_name: msg_consume_latency + metric_value_type: double + prometheus_name: msg_consume_latency + label_names: [destination, status] + prometheus_label_names: [destination, status] + buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] + prewarm_label_values: + - [kafka, pass] + - [kafka, fail] + - [postgres, pass] + - [postgres, fail] + + - id: metric.chronos.message.process.duration + type: metric + metric_name: chronos.message.process.duration + stability: development + brief: Duration of processor_message_ready() loop in message_processor. + instrument: histogram + unit: s + extends: metric_attributes.chronos.process_result + annotations: + code_generation: + rust_name: msg_process_latency + metric_value_type: double + prometheus_name: msg_process_latency + label_names: [returned, status] + prometheus_label_names: [returned, status] + buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] + prewarm_label_values: + - ["true", pass] + - ["true", fail] + - ["false", pass] + - ["false", fail] + + - id: metric.chronos.message.wait.duration + type: metric + metric_name: chronos.message.wait.duration + stability: development + brief: Time a message spent in the Kafka input queue before processing. + instrument: histogram + unit: s + annotations: + code_generation: + rust_name: msg_wait_time + metric_value_type: double + prometheus_name: msg_wait_time + buckets: [0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2] + + - id: metric.chronos.message.jitter + type: metric + metric_name: chronos.message.jitter + stability: development + brief: Difference between actual publish time and client-requested deadline. 
+ instrument: histogram + unit: s + annotations: + code_generation: + rust_name: msg_jitter + metric_value_type: double + prometheus_name: msg_jitter + buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0] + + - id: metric.chronos.message.reset + type: metric + metric_name: chronos.message.reset + stability: development + brief: Number of records reset by reset_to_init_db() in the monitor task. + instrument: counter + unit: "{message}" + annotations: + code_generation: + rust_name: msg_reset + metric_value_type: int + prometheus_name: msg_reset diff --git a/dev/weaver/production/templates/registry/markdown/metrics.md.j2 b/dev/weaver/production/templates/registry/markdown/metrics.md.j2 new file mode 100644 index 0000000..7df98ee --- /dev/null +++ b/dev/weaver/production/templates/registry/markdown/metrics.md.j2 @@ -0,0 +1,9 @@ +# Chronos Metrics + +Generated from `dev/weaver/production/registry/chronos/metrics.yaml` by OpenTelemetry Weaver. + +| Metric | Prometheus Name | Instrument | Unit | Attributes | Description | +| --- | --- | --- | --- | --- | --- | +{% for metric in ctx.metrics -%} +| `{{ metric.metric_name }}` | `{{ metric.prometheus_name }}` | `{{ metric.instrument }}` | `{{ metric.unit }}` | {% if metric.attributes %}{% for attribute in metric.attributes %}`{{ attribute }}`{% if not loop.last %}, {% endif %}{% endfor %}{% else %}-{% endif %} | {{ metric.brief }} | +{% endfor -%} diff --git a/dev/weaver/production/templates/registry/markdown/weaver.yaml b/dev/weaver/production/templates/registry/markdown/weaver.yaml new file mode 100644 index 0000000..61b5edf --- /dev/null +++ b/dev/weaver/production/templates/registry/markdown/weaver.yaml @@ -0,0 +1,17 @@ +templates: + - pattern: metrics.md.j2 + filter: > + { + metrics: (.groups + | map(select(.type == "metric")) + | map({ + metric_name, + prometheus_name: .annotations.code_generation.prometheus_name, + brief, + instrument, + unit, + attributes: (.annotations.code_generation.label_names // 
(.attributes // [] | map(.name // .id // .ref))) + })) + } + application_mode: single + file_name: chronos_metrics.md diff --git a/dev/weaver/production/templates/registry/rust/registry.rs.j2 b/dev/weaver/production/templates/registry/rust/registry.rs.j2 new file mode 100644 index 0000000..c246744 --- /dev/null +++ b/dev/weaver/production/templates/registry/rust/registry.rs.j2 @@ -0,0 +1,58 @@ +// Generated from dev/weaver/production/registry/chronos/metrics.yaml by OpenTelemetry Weaver. +// Do not edit by hand. + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub enum MetricId { +{%- for metric in ctx.metrics %} + {{ metric.rust_name | pascal_case }}, +{%- endfor %} +} + +#[derive(Clone, Copy, Debug)] +pub enum MetricKind { + Counter, + Histogram, +} + +impl MetricKind { + pub fn is_counter(self) -> bool { + matches!(self, Self::Counter) + } + + pub fn is_histogram(self) -> bool { + matches!(self, Self::Histogram) + } +} + +#[derive(Clone, Copy, Debug)] +pub struct MetricDefinition { + pub id: MetricId, + pub otel_name: &'static str, + pub prometheus_name: &'static str, + pub description: &'static str, + pub unit: Option<&'static str>, + pub label_names: &'static [&'static str], + pub prometheus_label_names: &'static [&'static str], + pub kind: MetricKind, + pub buckets: Option<&'static [f64]>, + pub prewarm_label_values: &'static [&'static [&'static str]], +} + +pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ +{%- for metric in ctx.metrics %} + MetricDefinition { + id: MetricId::{{ metric.rust_name | pascal_case }}, + otel_name: "{{ metric.metric_name }}", + prometheus_name: "{{ metric.prometheus_name }}", + description: "{{ metric.brief }}", + unit: {% if metric.unit %}Some("{{ metric.unit }}"){% else %}None{% endif %}, + label_names: &[{% for attribute in metric.attributes %}"{{ attribute }}"{% if not loop.last %}, {% endif %}{% endfor %}], + prometheus_label_names: &[{% for label in metric.prometheus_labels %}"{{ label }}"{% if not loop.last %}, 
{% endif %}{% endfor %}], + kind: MetricKind::{{ metric.instrument | pascal_case }}, + buckets: {% if metric.buckets %}{% if metric.buckets | length > 10 %}Some(&[ + {{ metric.buckets | join(", ") }}, + ]){% else %}Some(&[{{ metric.buckets | join(", ") }}]){% endif %}{% else %}None{% endif %}, + prewarm_label_values: &[{% for values in metric.prewarm_label_values %}&[{% for value in values %}"{{ value }}"{% if not loop.last %}, {% endif %}{% endfor %}]{% if not loop.last %}, {% endif %}{% endfor %}], + }, +{%- endfor %} +]; diff --git a/dev/weaver/production/templates/registry/rust/weaver.yaml b/dev/weaver/production/templates/registry/rust/weaver.yaml new file mode 100644 index 0000000..ed7f9fb --- /dev/null +++ b/dev/weaver/production/templates/registry/rust/weaver.yaml @@ -0,0 +1,22 @@ +templates: + - pattern: registry.rs.j2 + filter: > + { + metrics: (.groups + | map(select(.type == "metric")) + | map({ + id, + metric_name, + rust_name: .annotations.code_generation.rust_name, + prometheus_name: .annotations.code_generation.prometheus_name, + brief, + instrument, + unit, + attributes: (.annotations.code_generation.label_names // (.attributes // [] | map(.name // .id // .ref))), + prometheus_labels: (.annotations.code_generation.prometheus_label_names // (.attributes // [] | map((.name // .id // .ref) | gsub("\\."; "_")))), + buckets: .annotations.code_generation.buckets, + prewarm_label_values: (.annotations.code_generation.prewarm_label_values // []) + })) + } + application_mode: single + file_name: chronos_metric_definitions.rs diff --git a/chronos_bin/src/metrics/generated/chronos_metrics.md b/docs/chronos_metrics.md similarity index 90% rename from chronos_bin/src/metrics/generated/chronos_metrics.md rename to docs/chronos_metrics.md index 123cbe5..1fa0dd6 100644 --- a/chronos_bin/src/metrics/generated/chronos_metrics.md +++ b/docs/chronos_metrics.md @@ -1,6 +1,6 @@ # Chronos Metrics -Generated from `examples/weaver/registry/chronos/metrics.yaml` by 
OpenTelemetry Weaver. +Generated from `dev/weaver/production/registry/chronos/metrics.yaml` by OpenTelemetry Weaver. | Metric | Prometheus Name | Instrument | Unit | Attributes | Description | | --- | --- | --- | --- | --- | --- | diff --git a/chronos_bin/src/metrics/generated/resolved-registry.schema.json b/docs/schema/resolved-registry.schema.json similarity index 100% rename from chronos_bin/src/metrics/generated/resolved-registry.schema.json rename to docs/schema/resolved-registry.schema.json diff --git a/examples/weaver/generated/chronos_metric_definitions.rs b/examples/weaver/generated/chronos_metric_definitions.rs index 38ed8c8..f849fb9 100644 --- a/examples/weaver/generated/chronos_metric_definitions.rs +++ b/examples/weaver/generated/chronos_metric_definitions.rs @@ -4,7 +4,6 @@ #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub enum MetricId { MsgConsumeLatency, - MsgConsumed, MsgJitter, MsgProcessLatency, MsgReset, @@ -17,6 +16,16 @@ pub enum MetricKind { Histogram, } +impl MetricKind { + pub fn is_counter(self) -> bool { + matches!(self, Self::Counter) + } + + pub fn is_histogram(self) -> bool { + matches!(self, Self::Histogram) + } +} + #[derive(Clone, Copy, Debug)] pub struct MetricDefinition { pub id: MetricId, @@ -28,73 +37,68 @@ pub struct MetricDefinition { pub prometheus_label_names: &'static [&'static str], pub kind: MetricKind, pub buckets: Option<&'static [f64]>, + pub prewarm_label_values: &'static [&'static [&'static str]], } pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ MetricDefinition { id: MetricId::MsgConsumeLatency, - otel_name: "messaging.client.operation.duration", - prometheus_name: "messaging_client_operation_duration_seconds", + otel_name: "chronos.message.consume.duration", + prometheus_name: "msg_consume_latency", description: "Duration of handle_message() in message_receiver.", unit: Some("s"), - label_names: &["messaging.destination.name", "messaging.operation.name", "messaging.system"], - prometheus_label_names: 
&["messaging_destination_name", "messaging_operation_name", "messaging_system"], + label_names: &["destination", "status"], + prometheus_label_names: &["destination", "status"], kind: MetricKind::Histogram, buckets: Some(&[0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048]), - }, - MetricDefinition { - id: MetricId::MsgConsumed, - otel_name: "messaging.client.consumed.messages", - prometheus_name: "messaging_client_consumed_messages_total", - description: "Total number of Chronos input messages consumed.", - unit: Some("{message}"), - label_names: &["messaging.destination.name", "messaging.operation.name", "messaging.system"], - prometheus_label_names: &["messaging_destination_name", "messaging_operation_name", "messaging_system"], - kind: MetricKind::Counter, - buckets: None, + prewarm_label_values: &[&["kafka", "pass"], &["kafka", "fail"], &["postgres", "pass"], &["postgres", "fail"]], }, MetricDefinition { id: MetricId::MsgJitter, otel_name: "chronos.message.jitter", - prometheus_name: "chronos_message_jitter_seconds", + prometheus_name: "msg_jitter", description: "Difference between actual publish time and client-requested deadline.", unit: Some("s"), label_names: &[], prometheus_label_names: &[], kind: MetricKind::Histogram, buckets: Some(&[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]), + prewarm_label_values: &[], }, MetricDefinition { id: MetricId::MsgProcessLatency, - otel_name: "messaging.process.duration", - prometheus_name: "messaging_process_duration_seconds", + otel_name: "chronos.message.process.duration", + prometheus_name: "msg_process_latency", description: "Duration of processor_message_ready() loop in message_processor.", unit: Some("s"), - label_names: &["messaging.destination.name", "messaging.operation.name", "messaging.system"], - prometheus_label_names: &["messaging_destination_name", "messaging_operation_name", "messaging_system"], + label_names: &["returned", "status"], + prometheus_label_names: 
&["returned", "status"], kind: MetricKind::Histogram, buckets: Some(&[0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048]), + prewarm_label_values: &[&["true", "pass"], &["true", "fail"], &["false", "pass"], &["false", "fail"]], }, MetricDefinition { id: MetricId::MsgReset, otel_name: "chronos.message.reset", - prometheus_name: "chronos_messages_reset_total", + prometheus_name: "msg_reset", description: "Number of records reset by reset_to_init_db() in the monitor task.", unit: Some("{message}"), label_names: &[], prometheus_label_names: &[], kind: MetricKind::Counter, buckets: None, + prewarm_label_values: &[], }, MetricDefinition { id: MetricId::MsgWaitTime, otel_name: "chronos.message.wait.duration", - prometheus_name: "chronos_message_wait_duration_seconds", + prometheus_name: "msg_wait_time", description: "Time a message spent in the Kafka input queue before processing.", unit: Some("s"), label_names: &[], prometheus_label_names: &[], kind: MetricKind::Histogram, buckets: Some(&[0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2]), + prewarm_label_values: &[], }, ]; diff --git a/examples/weaver/generated/chronos_metrics.md b/examples/weaver/generated/chronos_metrics.md index 81797f4..123cbe5 100644 --- a/examples/weaver/generated/chronos_metrics.md +++ b/examples/weaver/generated/chronos_metrics.md @@ -4,9 +4,8 @@ Generated from `examples/weaver/registry/chronos/metrics.yaml` by OpenTelemetry | Metric | Prometheus Name | Instrument | Unit | Attributes | Description | | --- | --- | --- | --- | --- | --- | -| `messaging.client.operation.duration` | `messaging_client_operation_duration_seconds` | `histogram` | `s` | `messaging.destination.name`, `messaging.operation.name`, `messaging.system` | Duration of handle_message() in message_receiver. 
| -| `messaging.client.consumed.messages` | `messaging_client_consumed_messages_total` | `counter` | `{message}` | `messaging.destination.name`, `messaging.operation.name`, `messaging.system` | Total number of Chronos input messages consumed. | -| `chronos.message.jitter` | `chronos_message_jitter_seconds` | `histogram` | `s` | - | Difference between actual publish time and client-requested deadline. | -| `messaging.process.duration` | `messaging_process_duration_seconds` | `histogram` | `s` | `messaging.destination.name`, `messaging.operation.name`, `messaging.system` | Duration of processor_message_ready() loop in message_processor. | -| `chronos.message.reset` | `chronos_messages_reset_total` | `counter` | `{message}` | - | Number of records reset by reset_to_init_db() in the monitor task. | -| `chronos.message.wait.duration` | `chronos_message_wait_duration_seconds` | `histogram` | `s` | - | Time a message spent in the Kafka input queue before processing. | +| `chronos.message.consume.duration` | `msg_consume_latency` | `histogram` | `s` | `destination`, `status` | Duration of handle_message() in message_receiver. | +| `chronos.message.jitter` | `msg_jitter` | `histogram` | `s` | - | Difference between actual publish time and client-requested deadline. | +| `chronos.message.process.duration` | `msg_process_latency` | `histogram` | `s` | `returned`, `status` | Duration of processor_message_ready() loop in message_processor. | +| `chronos.message.reset` | `msg_reset` | `counter` | `{message}` | - | Number of records reset by reset_to_init_db() in the monitor task. | +| `chronos.message.wait.duration` | `msg_wait_time` | `histogram` | `s` | - | Time a message spent in the Kafka input queue before processing. 
| From 2b98ec57225f7e669c2e79cb13db970c2bab88ba Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 13:58:56 +1000 Subject: [PATCH 21/36] chore: add container build recipes Move Dockerfiles into docker/ and update Docker Compose build paths to reference the new location. Add make docker.build plus per-image recipes for the Chronos and PostgreSQL migration images. Fix the Docker runtime stage user reference while moving the files so the images build without Dockerfile warnings for the stage names or undefined USER. Verification: - make docker.config - make docker.config BACKEND=lgtm - make docker.build - make pre-commit Model-version: GPT-5 --- How-to.md | 3 ++- dev/docker-compose/compose.yaml | 4 ++-- dev/makefiles/docker.mk | 17 ++++++++++++++++- Dockerfile.chronos => docker/Dockerfile.chronos | 12 ++++++------ .../Dockerfile.chronos-pg-migrations | 12 ++++++------ .../Dockerfile.chronos-slim | 2 +- 6 files changed, 33 insertions(+), 17 deletions(-) rename Dockerfile.chronos => docker/Dockerfile.chronos (76%) rename Dockerfile.chronos-pg-migrations => docker/Dockerfile.chronos-pg-migrations (75%) rename Dockerfile.chronos-slim => docker/Dockerfile.chronos-slim (98%) diff --git a/How-to.md b/How-to.md index 8fd19ac..3086f12 100644 --- a/How-to.md +++ b/How-to.md @@ -23,6 +23,8 @@ Use `make up` to build and start Chronos with PostgreSQL, Kafka, Jaeger, and the Use `make up lgtm` or `make up BACKEND=lgtm` to start the same Chronos stack with the Grafana LGTM backend instead of Jaeger. +Use `make docker.build` to build the Chronos and PostgreSQL migration container images without starting the Compose stack. + Use `make down` to stop the running stack. 
## ENV vars @@ -91,4 +93,3 @@ make lgtm.validate Two images are published for each [RELEASE]( `https://github.com/kindredgroup/chronos/pkgs/container/chronos`) - migrations image - chornos image - diff --git a/dev/docker-compose/compose.yaml b/dev/docker-compose/compose.yaml index 4675fac..122aefa 100644 --- a/dev/docker-compose/compose.yaml +++ b/dev/docker-compose/compose.yaml @@ -5,7 +5,7 @@ services: chronos-pg-migrations: build: context: ../.. - dockerfile: Dockerfile.chronos-pg-migrations + dockerfile: docker/Dockerfile.chronos-pg-migrations environment: PG_HOST: postgres PG_PORT: "5432" @@ -22,7 +22,7 @@ services: chronos: build: context: ../.. - dockerfile: Dockerfile.chronos + dockerfile: docker/Dockerfile.chronos ports: - "9091:9091" environment: diff --git a/dev/makefiles/docker.mk b/dev/makefiles/docker.mk index 1f548a9..876c79b 100644 --- a/dev/makefiles/docker.mk +++ b/dev/makefiles/docker.mk @@ -8,6 +8,8 @@ COMPOSE_BACKEND_FILE := $(if $(filter lgtm,$(BACKEND)),$(COMPOSE_FILE_LGTM),$(CO DOCKER_COMPOSE := docker compose --project-name $(COMPOSE_PROJECT_NAME) -f $(COMPOSE_FILE_BASE) -f $(COMPOSE_BACKEND_FILE) DOCKER_COMPOSE_JAEGER := docker compose --project-name $(COMPOSE_PROJECT_NAME) -f $(COMPOSE_FILE_BASE) -f $(COMPOSE_FILE_JAEGER) DOCKER_COMPOSE_LGTM := docker compose --project-name $(COMPOSE_PROJECT_NAME) -f $(COMPOSE_FILE_BASE) -f $(COMPOSE_FILE_LGTM) +CHRONOS_IMAGE ?= chronos:local +CHRONOS_MIGRATIONS_IMAGE ?= chronos-pg-migrations:local ## up: Build and start Chronos, dependencies, and observability. Use make up lgtm or BACKEND=lgtm for LGTM up: @@ -24,6 +26,19 @@ down: docker.config: $(DOCKER_COMPOSE) config +## docker.build: Build the Chronos and PostgreSQL migration container images +docker.build: docker.build.chronos docker.build.migrations + +## docker.build.chronos: Build the Chronos container image +docker.build.chronos: + $(call pp,building Chronos container image $(CHRONOS_IMAGE)...) 
+ docker build -f docker/Dockerfile.chronos -t $(CHRONOS_IMAGE) . + +## docker.build.migrations: Build the PostgreSQL migration container image +docker.build.migrations: + $(call pp,building Chronos PostgreSQL migration container image $(CHRONOS_MIGRATIONS_IMAGE)...) + docker build -f docker/Dockerfile.chronos-pg-migrations -t $(CHRONOS_MIGRATIONS_IMAGE) . + ## docker.up: Legacy alias for make up docker.up: up @@ -33,4 +48,4 @@ docker.down: down jaeger lgtm: @: -.PHONY: up down docker.config docker.up docker.down jaeger lgtm +.PHONY: up down docker.config docker.build docker.build.chronos docker.build.migrations docker.up docker.down jaeger lgtm diff --git a/Dockerfile.chronos b/docker/Dockerfile.chronos similarity index 76% rename from Dockerfile.chronos rename to docker/Dockerfile.chronos index 2e90a53..244ff86 100644 --- a/Dockerfile.chronos +++ b/docker/Dockerfile.chronos @@ -1,4 +1,4 @@ -FROM rust:1.94-bookworm AS BUILD +FROM rust:1.94-bookworm AS build # Install software RUN update-ca-certificates && apt-get update && apt-get install -y libsasl2-dev # Create appuser @@ -21,16 +21,16 @@ RUN cargo build -p chronos_bin --release # # Run image based on bookworm-slim to reduce image size while still using glibc # -FROM debian:bookworm-slim AS RUN +FROM debian:bookworm-slim AS run # SASL supports RUN apt-get update && apt-get install -y libsasl2-dev WORKDIR /opt/build # Import users from build -COPY --from=BUILD /etc/passwd /etc/passwd -COPY --from=BUILD /etc/group /etc/group +COPY --from=build /etc/passwd /etc/passwd +COPY --from=build /etc/group /etc/group # Copy binary from build -COPY --from=BUILD /tmp/target/release/chronos ./ +COPY --from=build /tmp/target/release/chronos ./ # Use an unprivileged user -USER ${USER}:${USER} +USER chronos:chronos # Entry point CMD ["/opt/build/chronos"] diff --git a/Dockerfile.chronos-pg-migrations b/docker/Dockerfile.chronos-pg-migrations similarity index 75% rename from Dockerfile.chronos-pg-migrations rename to 
docker/Dockerfile.chronos-pg-migrations index d4a4f85..3a2971d 100644 --- a/Dockerfile.chronos-pg-migrations +++ b/docker/Dockerfile.chronos-pg-migrations @@ -1,4 +1,4 @@ -FROM rust:1.94-bookworm AS BUILD +FROM rust:1.94-bookworm AS build # Install software RUN update-ca-certificates && apt-get update && apt-get install -y libsasl2-dev # Create appuser @@ -21,16 +21,16 @@ RUN cargo build -p pg_mig --release # # Run image based on bookworm-slim to reduce image size while still using glibc # -FROM debian:bookworm-slim AS RUN +FROM debian:bookworm-slim AS run # SASL supports RUN apt-get update && apt-get install -y libsasl2-dev WORKDIR /opt/build # Import users from build -COPY --from=BUILD /etc/passwd /etc/passwd -COPY --from=BUILD /etc/group /etc/group +COPY --from=build /etc/passwd /etc/passwd +COPY --from=build /etc/group /etc/group # Copy binary from build -COPY --from=BUILD /tmp/target/release/chronos-pg-migrations ./ +COPY --from=build /tmp/target/release/chronos-pg-migrations ./ # Use an unprivileged user -USER ${USER}:${USER} +USER chronos:chronos # Entry point CMD ["/opt/build/chronos-pg-migrations"] diff --git a/Dockerfile.chronos-slim b/docker/Dockerfile.chronos-slim similarity index 98% rename from Dockerfile.chronos-slim rename to docker/Dockerfile.chronos-slim index 5456e3e..0e9da01 100644 --- a/Dockerfile.chronos-slim +++ b/docker/Dockerfile.chronos-slim @@ -11,7 +11,7 @@ # To use this image in Docker Compose, change the chronos service to: # build: # context: . -# dockerfile: Dockerfile.chronos-slim +# dockerfile: docker/Dockerfile.chronos-slim # ───────────────────────────────────────────────────────────────────────────── # Build stage From 2fb688f4b322e93d5151aa8cda9fa25d8b6371de Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 14:05:02 +1000 Subject: [PATCH 22/36] chore: simplify weaver make targets Replace separate production and example Weaver recipes with shared weaver.check and weaver.generate targets selected by WEAVER_TARGET. 
The default target remains production, while example artifacts are generated with WEAVER_TARGET=example. Keep make build pinned to production generation by invoking make weaver.generate WEAVER_TARGET=production before cargo build. Verification: - make -n build - make -n weaver.generate - make -n weaver.generate WEAVER_TARGET=example - make weaver.generate - make weaver.generate WEAVER_TARGET=example - make build - make pre-commit Model-version: GPT-5 --- AGENTS.md | 2 +- How-to.md | 2 +- dev/makefiles/rust.mk | 104 +++++++++++++++++------------------------- 3 files changed, 43 insertions(+), 65 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 10e86ce..727d31d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -117,6 +117,6 @@ When making tradeoffs, record the chosen path and the reason. Avoid relying on c - Chronos treats Kafka message bodies opaquely and forwards messages after delay; avoid adding application-level assumptions about payload shape. - The README describes at-least-once delivery semantics. Preserve behavior that supports persistence, recovery from suspected node failure, and duplicate-safe processing. -- Metrics work on the `feat/prom_metrics` branch currently includes a Prometheus endpoint and metric-family checks in the integration script. Production Weaver inputs live under `dev/weaver/production`; example Weaver inputs live under `examples/weaver` and are generated only by explicit example recipes. Changes to metrics should preserve unit tests for registry output and integration checks for expected metric families. +- Metrics work on the `feat/prom_metrics` branch currently includes a Prometheus endpoint and metric-family checks in the integration script. Production Weaver inputs live under `dev/weaver/production`; example Weaver inputs live under `examples/weaver`. `WEAVER_TARGET` defaults to `production`, and example artifacts are generated explicitly with `make weaver.generate WEAVER_TARGET=example`. 
Changes to metrics should preserve unit tests for registry output and integration checks for expected metric families. - Local development commonly uses `.env` copied from [.env.example](.env.example) through `make setup` or `make withenv`. - Docker Compose files live in `dev/docker-compose`. `make up` starts Chronos with PostgreSQL, Kafka, Jaeger, and the OpenTelemetry Collector by default; `make up lgtm` uses the LGTM backend. diff --git a/How-to.md b/How-to.md index 3086f12..548dd43 100644 --- a/How-to.md +++ b/How-to.md @@ -81,7 +81,7 @@ The overlay mounts local override files from `dev/lgtm` for Prometheus, the Open Chronos production metrics are generated from the OpenTelemetry Weaver registry in `dev/weaver/production/registry/chronos/metrics.yaml`. Rust definitions are generated into `chronos_bin/src/metrics/generated`, Markdown docs into `docs/chronos_metrics.md`, and the resolved registry schema into `docs/schema/resolved-registry.schema.json`. `OTEL_METRICS_EXPORTER=prometheus` is the default and exposes `/metrics` with the `chronos_` Prometheus namespace, for example `chronos_msg_jitter`. `OTEL_METRICS_EXPORTER=otlp` records the same generated metric IDs through the OTLP gRPC metrics exporter. -`make build` runs `make weaver.production.generate` before compiling, which refreshes the production Rust definitions, Markdown metric docs, and resolved registry JSON schema. Example Weaver artifacts are generated only when explicitly requested with `make weaver.example.generate`. +`make build` runs `make weaver.generate WEAVER_TARGET=production` before compiling, which refreshes the production Rust definitions, Markdown metric docs, and resolved registry JSON schema. `WEAVER_TARGET` defaults to `production`; generate example Weaver artifacts explicitly with `make weaver.generate WEAVER_TARGET=example`. 
Validate the LGTM configuration files with: diff --git a/dev/makefiles/rust.mk b/dev/makefiles/rust.mk index 70368c0..a27e51a 100644 --- a/dev/makefiles/rust.mk +++ b/dev/makefiles/rust.mk @@ -1,21 +1,30 @@ EXPORTER ?= prom WEAVER_VERSION ?= 0.23.0 WEAVER_IMAGE ?= otel/weaver:v$(WEAVER_VERSION) -WEAVER_PRODUCTION_REGISTRY ?= dev/weaver/production/registry -WEAVER_PRODUCTION_TEMPLATES ?= dev/weaver/production/templates -WEAVER_PRODUCTION_RUST_OUT ?= chronos_bin/src/metrics/generated -WEAVER_PRODUCTION_DOCS_OUT ?= docs -WEAVER_SCHEMA_OUT ?= docs/schema -WEAVER_EXAMPLE_REGISTRY ?= examples/weaver/registry -WEAVER_EXAMPLE_TEMPLATES ?= examples/weaver/templates -WEAVER_EXAMPLE_OUT ?= examples/weaver/generated -WEAVER_REGISTRY ?= $(WEAVER_PRODUCTION_REGISTRY) +WEAVER_TARGET ?= production WEAVER_LIVE_CHECK_PORT ?= 4319 WEAVER_LIVE_CHECK_ADMIN_PORT ?= 4320 WEAVER_LIVE_CHECK_OUT ?= /tmp/chronos-weaver-live-check +ifeq ($(WEAVER_TARGET),production) +WEAVER_REGISTRY ?= dev/weaver/production/registry +WEAVER_TEMPLATES ?= dev/weaver/production/templates +WEAVER_RUST_OUT ?= chronos_bin/src/metrics/generated +WEAVER_DOCS_OUT ?= docs +WEAVER_SCHEMA_OUT ?= docs/schema +else ifeq ($(WEAVER_TARGET),example) +WEAVER_REGISTRY ?= examples/weaver/registry +WEAVER_TEMPLATES ?= examples/weaver/templates +WEAVER_RUST_OUT ?= examples/weaver/generated +WEAVER_DOCS_OUT ?= examples/weaver/generated +WEAVER_SCHEMA_OUT ?= examples/weaver/generated +else +$(error Unsupported WEAVER_TARGET=$(WEAVER_TARGET); use production or example) +endif + ## build: Build Rust binaries -build: weaver.production.generate +build: + $(MAKE) weaver.generate WEAVER_TARGET=production $(call pp,build rust...) cargo build @@ -61,61 +70,30 @@ metrics.mock: *) echo "unsupported EXPORTER=$(EXPORTER); use EXPORTER=prom or EXPORTER=otlp" >&2; exit 2 ;; \ esac -## weaver.production.check: Validate the production Chronos Weaver registry -weaver.production.check: - $(call pp,check Weaver registry with $(WEAVER_IMAGE)...) 
- docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry check -r $(WEAVER_PRODUCTION_REGISTRY) - -## weaver.production.generate.rust: Generate production Rust metric definitions with Weaver -weaver.production.generate.rust: - $(call pp,generate Rust metric definitions with $(WEAVER_IMAGE)...) - docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_PRODUCTION_REGISTRY) --templates $(WEAVER_PRODUCTION_TEMPLATES) rust $(WEAVER_PRODUCTION_RUST_OUT) - rustfmt --config-path rustfmt.toml $(WEAVER_PRODUCTION_RUST_OUT)/chronos_metric_definitions.rs - -## weaver.production.generate.docs: Generate production Chronos metrics docs with Weaver -weaver.production.generate.docs: - $(call pp,generate metrics markdown docs with $(WEAVER_IMAGE)...) - docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_PRODUCTION_REGISTRY) --templates $(WEAVER_PRODUCTION_TEMPLATES) markdown $(WEAVER_PRODUCTION_DOCS_OUT) - -## weaver.production.generate.schema: Generate production Weaver resolved-registry JSON schema -weaver.production.generate.schema: - $(call pp,generate Weaver JSON schema with $(WEAVER_IMAGE)...) +## weaver.check: Validate the selected Chronos Weaver registry with WEAVER_TARGET=production|example +weaver.check: + $(call pp,check $(WEAVER_TARGET) Weaver registry with $(WEAVER_IMAGE)...) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry check -r $(WEAVER_REGISTRY) + +## weaver.generate.rust: Generate selected Rust metric definitions with WEAVER_TARGET=production|example +weaver.generate.rust: + $(call pp,generate $(WEAVER_TARGET) Rust metric definitions with $(WEAVER_IMAGE)...) 
+ docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_REGISTRY) --templates $(WEAVER_TEMPLATES) rust $(WEAVER_RUST_OUT) + rustfmt --config-path rustfmt.toml $(WEAVER_RUST_OUT)/chronos_metric_definitions.rs + +## weaver.generate.docs: Generate selected Chronos metrics docs with WEAVER_TARGET=production|example +weaver.generate.docs: + $(call pp,generate $(WEAVER_TARGET) metrics markdown docs with $(WEAVER_IMAGE)...) + docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_REGISTRY) --templates $(WEAVER_TEMPLATES) markdown $(WEAVER_DOCS_OUT) + +## weaver.generate.schema: Generate selected Weaver resolved-registry JSON schema with WEAVER_TARGET=production|example +weaver.generate.schema: + $(call pp,generate $(WEAVER_TARGET) Weaver JSON schema with $(WEAVER_IMAGE)...) mkdir -p $(WEAVER_SCHEMA_OUT) docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry json-schema -o $(WEAVER_SCHEMA_OUT)/resolved-registry.schema.json -## weaver.production.generate: Generate production Weaver Rust, docs, and schema artifacts -weaver.production.generate: weaver.production.generate.rust weaver.production.generate.docs weaver.production.generate.schema - -## weaver.example.check: Validate the example Chronos Weaver registry -weaver.example.check: - $(call pp,check example Weaver registry with $(WEAVER_IMAGE)...) - docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry check -r $(WEAVER_EXAMPLE_REGISTRY) - -## weaver.example.generate.rust: Generate example Rust metric definitions with Weaver -weaver.example.generate.rust: - $(call pp,generate example Rust metric definitions with $(WEAVER_IMAGE)...) 
- docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_EXAMPLE_REGISTRY) --templates $(WEAVER_EXAMPLE_TEMPLATES) rust $(WEAVER_EXAMPLE_OUT) - rustfmt --config-path rustfmt.toml $(WEAVER_EXAMPLE_OUT)/chronos_metric_definitions.rs - -## weaver.example.generate.docs: Generate example Chronos metrics docs with Weaver -weaver.example.generate.docs: - $(call pp,generate example metrics markdown docs with $(WEAVER_IMAGE)...) - docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry generate -r $(WEAVER_EXAMPLE_REGISTRY) --templates $(WEAVER_EXAMPLE_TEMPLATES) markdown $(WEAVER_EXAMPLE_OUT) - -## weaver.example.generate.schema: Generate example Weaver resolved-registry JSON schema -weaver.example.generate.schema: - $(call pp,generate example Weaver JSON schema with $(WEAVER_IMAGE)...) - mkdir -p $(WEAVER_EXAMPLE_OUT) - docker run --rm -v "$(PWD):/work" -w /work $(WEAVER_IMAGE) registry json-schema -o $(WEAVER_EXAMPLE_OUT)/resolved-registry.schema.json - -## weaver.example.generate: Explicitly generate example Weaver Rust, docs, and schema artifacts -weaver.example.generate: weaver.example.generate.rust weaver.example.generate.docs weaver.example.generate.schema - -## weaver.check: Validate the production Chronos Weaver registry -weaver.check: weaver.production.check - -## weaver.generate: Generate production Weaver artifacts -weaver.generate: weaver.production.generate +## weaver.generate: Generate selected Weaver Rust, docs, and schema artifacts with WEAVER_TARGET=production|example +weaver.generate: weaver.generate.rust weaver.generate.docs weaver.generate.schema ## weaver.live-check: Run Weaver live-check against the OTLP metrics mock weaver.live-check: @@ -151,4 +129,4 @@ weaver.live-check: wait "$$live_check_pid"; \ find "$(WEAVER_LIVE_CHECK_OUT)" -maxdepth 1 -type f -print -.PHONY: build fmt lint test test.unit pre-commit test.unit.coverage metrics.check metrics.mock weaver.production.check 
weaver.production.generate.rust weaver.production.generate.docs weaver.production.generate.schema weaver.production.generate weaver.example.check weaver.example.generate.rust weaver.example.generate.docs weaver.example.generate.schema weaver.example.generate weaver.check weaver.generate weaver.live-check +.PHONY: build fmt lint test test.unit pre-commit test.unit.coverage metrics.check metrics.mock weaver.check weaver.generate.rust weaver.generate.docs weaver.generate.schema weaver.generate weaver.live-check From f4325587e7365fbef2c0eb38f81b50b9358fe13c Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 14:54:17 +1000 Subject: [PATCH 23/36] test: add k6 integration workloads Add a custom k6 image with xk6-kafka, Make targets for contract and load workloads, and k6 scripts that publish results through OTLP. The contract test exercises immediate publish, delayed DB publish, invalid delayed payload, and missing-key immediate failure paths. The load test models the README throughput and jitter SLOs with configurable rate/duration inputs. Route Chronos logs to the LGTM filelog directory when the LGTM compose overlay is active, and send Chronos metrics to LGTM over OTLP while leaving traces on the existing HTTP OTLP path. 
Verification: - make k6.build - docker run --rm chronos-k6:1.7.1 version - k6 inspect for dev/k6/contract.js and dev/k6/load.js through the custom image - make up lgtm - make k6.contract - K6_LOAD_RATE=10 K6_LOAD_DURATION=5s K6_LOAD_CONSUME_DURATION=15s K6_LOAD_EXPECTED_MESSAGES=50 make k6.load (expected SLO failure: p99.9 jitter exceeded 500ms on the local stack) - make docker.config BACKEND=lgtm - make pre-commit Model-version: GPT-5 --- .dockerignore | 1 + .gitignore | 1 + dev/docker-compose/lgtm.yaml | 10 +++ dev/k6/README.md | 14 +++ dev/k6/contract.js | 139 +++++++++++++++++++++++++++++ dev/k6/load.js | 164 +++++++++++++++++++++++++++++++++++ dev/makefiles/docker.mk | 4 +- dev/makefiles/k6.mk | 47 ++++++++++ docker/Dockerfile.k6 | 23 +++++ 9 files changed, 401 insertions(+), 2 deletions(-) create mode 100644 dev/k6/README.md create mode 100644 dev/k6/contract.js create mode 100644 dev/k6/load.js create mode 100644 dev/makefiles/k6.mk create mode 100644 docker/Dockerfile.k6 diff --git a/.dockerignore b/.dockerignore index 7e8270d..bbc0032 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,6 +5,7 @@ target/ .env healthcheck/ +dev/lgtm/runtime-logs/ .DS_Store *.iml diff --git a/.gitignore b/.gitignore index fb16bbd..ee5d169 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ /target .env /healthcheck +dev/lgtm/runtime-logs/*.jsonl ### Linux ### diff --git a/dev/docker-compose/lgtm.yaml b/dev/docker-compose/lgtm.yaml index e1c8b43..f317bb4 100644 --- a/dev/docker-compose/lgtm.yaml +++ b/dev/docker-compose/lgtm.yaml @@ -1,10 +1,19 @@ services: chronos: + command: + - sh + - -c + - /opt/build/chronos >> /data/lgtm/logs/chronos.jsonl 2>&1 environment: + OTEL_METRICS_EXPORTER: otlp + OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: grpc + OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: http://lgtm:4317 OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: http://lgtm:4318/v1/traces depends_on: lgtm: condition: service_healthy + volumes: + - ../lgtm/runtime-logs:/data/lgtm/logs lgtm: image: 
grafana/otel-lgtm:0.24.1 @@ -37,6 +46,7 @@ services: - ../lgtm/otelcol-contrib.yaml:/otel-lgtm/otelcol-config.yaml:ro - ../lgtm/healthcheck.sh:/otel-lgtm/chronos-healthcheck.sh:ro - ../lgtm/logging.sh:/otel-lgtm/logging.sh:ro + - ../lgtm/runtime-logs:/data/lgtm/logs - ../lgtm/dashboards.yaml:/otel-lgtm/grafana/conf/provisioning/dashboards/chronos.yaml:ro - ../dashboards:/otel-lgtm/grafana/conf/provisioning/dashboards/chronos:ro networks: diff --git a/dev/k6/README.md b/dev/k6/README.md new file mode 100644 index 0000000..fa8b833 --- /dev/null +++ b/dev/k6/README.md @@ -0,0 +1,14 @@ +# Chronos k6 Integration Tests + +The k6 image is built with `xk6-kafka` so tests can publish to and consume from the Chronos Kafka topics. k6 run metrics are exported with the built-in OpenTelemetry output. + +## Targets + +- `make k6.build` builds the custom k6 image. +- `make k6.contract` runs one pass through the important Chronos processing paths. +- `make k6.load` runs a constant-arrival-rate producer load test. +- `make k6.test` runs contract and load tests. + +By default the recipes use the LGTM compose network and send k6 OTLP metrics to `lgtm:4317`. In GitHub Actions outside `act`, set `K6_CI_OTLP_ENDPOINT`; the default is `host.docker.internal:4317`. When running under `act`, the recipes keep using the LGTM container. + +Logs from k6 are appended to `dev/lgtm/runtime-logs/*.jsonl`, which is mounted into the LGTM collector filelog receiver. 
diff --git a/dev/k6/contract.js b/dev/k6/contract.js new file mode 100644 index 0000000..5dc1c3a --- /dev/null +++ b/dev/k6/contract.js @@ -0,0 +1,139 @@ +import { check, sleep } from "k6"; +import encoding from "k6/encoding"; +import { Counter } from "k6/metrics"; +import { Producer, Consumer } from "k6/x/kafka"; + +const brokers = (__ENV.KAFKA_BROKERS || "kafka:9092").split(","); +const inputTopic = __ENV.KAFKA_IN_TOPIC || "chronos.in"; +const outputTopic = __ENV.KAFKA_OUT_TOPIC || "chronos.out"; +const runId = __ENV.K6_RUN_ID || `contract-${Date.now()}`; +const outputTimeoutMs = Number(__ENV.K6_CONTRACT_OUTPUT_TIMEOUT_MS || 15000); + +const exercisedPaths = new Counter("chronos_contract_paths_exercised"); + +export const options = { + scenarios: { + contract: { + executor: "shared-iterations", + vus: 1, + iterations: 1, + maxDuration: "30s", + }, + }, + thresholds: { + checks: ["rate==1"], + chronos_contract_paths_exercised: ["count>=4"], + }, +}; + +const producer = new Producer({ + brokers, + topic: inputTopic, + autoCreateTopic: true, + requiredAcks: 1, +}); + +const consumer = new Consumer({ + brokers, + topic: outputTopic, + groupId: `${runId}-out`, + startOffset: "start_offsets_first_offset", + maxWait: "500ms", +}); + +function deadline(offsetMs) { + return new Date(Date.now() + offsetMs).toISOString(); +} + +function payload(id, extra = {}) { + return JSON.stringify({ + source: "k6-contract", + run_id: runId, + message_id: id, + sent_at_ms: Date.now(), + ...extra, + }); +} + +function chronosHeaders(id, deadlineValue) { + return { + chronosMessageId: id, + chronosDeadline: deadlineValue, + }; +} + +function bytesToString(value) { + if (typeof value === "string") { + return value; + } + return String.fromCharCode.apply(null, Array.from(value || [])); +} + +function produceMessage({ id, key = id, value = payload(id), deadlineMs = -1000, headers = null }) { + const message = { + value: encoding.b64encode(value), + headers: headers || chronosHeaders(id, 
deadline(deadlineMs)), + }; + if (key !== null) { + message.key = encoding.b64encode(key); + } + producer.produce({ messages: [message] }); +} + +function consumeUntil(id, timeoutMs) { + const expiresAt = Date.now() + timeoutMs; + while (Date.now() < expiresAt) { + const messages = consumer.consume({ maxMessages: 25, expectTimeout: true }); + for (const message of messages) { + const value = bytesToString(message.value); + if (value.includes(id)) { + return value; + } + } + sleep(0.1); + } + return ""; +} + +export default function () { + const immediatePassId = `${runId}-immediate-pass`; + produceMessage({ id: immediatePassId, deadlineMs: -1000 }); + const immediateOutput = consumeUntil(immediatePassId, outputTimeoutMs); + check(immediateOutput, { + "immediate kafka path publishes output": (value) => value.includes(immediatePassId), + }); + exercisedPaths.add(1, { chronos_destination: "kafka", chronos_status: "pass" }); + + const delayedPassId = `${runId}-delayed-pass`; + produceMessage({ id: delayedPassId, deadlineMs: 750 }); + const delayedOutput = consumeUntil(delayedPassId, outputTimeoutMs); + check(delayedOutput, { + "postgres delay path publishes output": (value) => value.includes(delayedPassId), + }); + exercisedPaths.add(1, { chronos_destination: "postgres", chronos_status: "pass" }); + + const postgresFailId = `${runId}-postgres-fail`; + produceMessage({ id: postgresFailId, value: `not-json-${postgresFailId}`, deadlineMs: 60_000 }); + sleep(1); + const postgresFailOutput = consumeUntil(postgresFailId, 1000); + check(postgresFailOutput, { + "invalid future payload is not published": (value) => value === "", + }); + exercisedPaths.add(1, { chronos_destination: "postgres", chronos_status: "fail" }); + + const kafkaFailId = `${runId}-kafka-fail`; + produceMessage({ id: kafkaFailId, key: null, deadlineMs: -1000 }); + sleep(1); + const kafkaFailOutput = consumeUntil(kafkaFailId, 1000); + check(kafkaFailOutput, { + "missing key immediate payload is not published": (value) => 
value === "", + }); + exercisedPaths.add(1, { chronos_destination: "kafka", chronos_status: "fail" }); + + sleep(1); +} + +export function teardown() { + producer.close(); + consumer.close(); +} diff --git a/dev/k6/load.js b/dev/k6/load.js new file mode 100644 index 0000000..68decf1 --- /dev/null +++ b/dev/k6/load.js @@ -0,0 +1,164 @@ +import { check, sleep } from "k6"; +import encoding from "k6/encoding"; +import { Counter, Trend } from "k6/metrics"; +import { Producer, Consumer } from "k6/x/kafka"; + +const brokers = (__ENV.KAFKA_BROKERS || "kafka:9092").split(","); +const inputTopic = __ENV.KAFKA_IN_TOPIC || "chronos.in"; +const outputTopic = __ENV.KAFKA_OUT_TOPIC || "chronos.out"; +const rate = Number(__ENV.K6_LOAD_RATE || 1000); +const duration = __ENV.K6_LOAD_DURATION || "1m"; +const consumeDuration = __ENV.K6_LOAD_CONSUME_DURATION || "2m"; +const delayMs = Number(__ENV.K6_LOAD_DELAY_MS || 1000); +const runId = __ENV.K6_RUN_ID || `load-${Date.now()}`; +const expectedMessages = Number(__ENV.K6_LOAD_EXPECTED_MESSAGES || Math.floor(rate * durationSeconds(duration))); + +const published = new Counter("chronos_messages_published"); +const consumed = new Counter("chronos_messages_consumed"); +const schedulingJitter = new Trend("chronos_scheduling_jitter", true); + +export const options = { + scenarios: { + queueing_load: { + executor: "constant-arrival-rate", + exec: "produceInput", + rate, + timeUnit: "1s", + duration, + preAllocatedVUs: Number(__ENV.K6_LOAD_PREALLOCATED_VUS || 100), + maxVUs: Number(__ENV.K6_LOAD_MAX_VUS || 500), + }, + output_drain: { + executor: "constant-vus", + exec: "consumeOutput", + vus: 1, + duration: consumeDuration, + gracefulStop: "5s", + }, + }, + summaryTrendStats: ["avg", "min", "med", "p(95)", "p(99)", "p(99.9)", "max"], + thresholds: { + checks: ["rate>=0.999"], + dropped_iterations: ["count==0"], + chronos_messages_published: [`count>=${expectedMessages}`], + chronos_messages_consumed: [`count>=${expectedMessages}`], + 
chronos_scheduling_jitter: ["p(99.9)<500"], + }, +}; + +let producer; +let consumer; +const seen = {}; + +function getProducer() { + if (!producer) { + producer = new Producer({ + brokers, + topic: inputTopic, + autoCreateTopic: true, + requiredAcks: 1, + }); + } + return producer; +} + +function getConsumer(data) { + if (!consumer) { + consumer = new Consumer({ + brokers, + topic: outputTopic, + groupId: `${data.runId}-out`, + startOffset: "start_offsets_first_offset", + maxWait: "500ms", + }); + } + return consumer; +} + +function durationSeconds(value) { + const match = String(value).match(/^(\d+)(ms|s|m|h)$/); + if (!match) { + return 60; + } + const amount = Number(match[1]); + switch (match[2]) { + case "ms": + return amount / 1000; + case "s": + return amount; + case "m": + return amount * 60; + case "h": + return amount * 3600; + default: + return 60; + } +} + +function bytesToString(value) { + if (typeof value === "string") { + return value; + } + return String.fromCharCode.apply(null, Array.from(value || [])); +} + +export function setup() { + return { runId, expectedMessages }; +} + +export function produceInput(data) { + const now = Date.now(); + const id = `${data.runId}-${__VU}-${__ITER}-${now}`; + const deadlineMs = now + delayMs; + const message = { + key: encoding.b64encode(id), + value: encoding.b64encode(JSON.stringify({ + source: "k6-load", + run_id: data.runId, + message_id: id, + sent_at_ms: now, + deadline_ms: deadlineMs, + })), + headers: { + chronosMessageId: id, + chronosDeadline: new Date(deadlineMs).toISOString(), + }, + }; + getProducer().produce({ messages: [message] }); + published.add(1); +} + +export function consumeOutput(data) { + const messages = getConsumer(data).consume({ maxMessages: 500, expectTimeout: true }); + let matched = 0; + for (const message of messages) { + const value = bytesToString(message.value); + if (!value.includes(data.runId)) { + continue; + } + const parsed = JSON.parse(value); + if 
(seen[parsed.message_id]) { + continue; + } + seen[parsed.message_id] = true; + consumed.add(1); + schedulingJitter.add(Math.max(0, Date.now() - Number(parsed.deadline_ms))); + matched += 1; + } + if (matched === 0) { + sleep(0.1); + } +} + +export function teardown() { + if (producer) { + producer.flush(); + producer.close(); + } + if (consumer) { + consumer.close(); + } + check(true, { + "load test completed": (value) => value === true, + }); +} diff --git a/dev/makefiles/docker.mk b/dev/makefiles/docker.mk index 876c79b..c51cafb 100644 --- a/dev/makefiles/docker.mk +++ b/dev/makefiles/docker.mk @@ -26,8 +26,8 @@ down: docker.config: $(DOCKER_COMPOSE) config -## docker.build: Build the Chronos and PostgreSQL migration container images -docker.build: docker.build.chronos docker.build.migrations +## docker.build: Build the Chronos, PostgreSQL migration, and k6 container images +docker.build: docker.build.chronos docker.build.migrations k6.build ## docker.build.chronos: Build the Chronos container image docker.build.chronos: diff --git a/dev/makefiles/k6.mk b/dev/makefiles/k6.mk new file mode 100644 index 0000000..7ad1ed9 --- /dev/null +++ b/dev/makefiles/k6.mk @@ -0,0 +1,47 @@ +K6_VERSION ?= 1.7.1 +XK6_KAFKA_VERSION ?= latest +K6_IMAGE ?= chronos-k6:$(K6_VERSION) +K6_LOG_DIR ?= $(PWD)/dev/lgtm/runtime-logs +K6_RUN_ID ?= chronos-k6-$(shell date +%Y%m%d%H%M%S) +K6_CI_OTLP_ENDPOINT ?= host.docker.internal:4317 +K6_DEFAULT_OTEL_ENDPOINT := $(if $(and $(GITHUB_ACTIONS),$(if $(ACT),,1)),$(K6_CI_OTLP_ENDPOINT),lgtm:4317) +K6_OTEL_GRPC_EXPORTER_ENDPOINT ?= $(K6_DEFAULT_OTEL_ENDPOINT) +K6_DOCKER_NETWORK ?= $(if $(and $(GITHUB_ACTIONS),$(if $(ACT),,1)),bridge,chronos) +K6_COMMON_ENV := \ + -e KAFKA_BROKERS=$${KAFKA_BROKERS:-kafka:9092} \ + -e KAFKA_IN_TOPIC=$${KAFKA_IN_TOPIC:-chronos.in} \ + -e KAFKA_OUT_TOPIC=$${KAFKA_OUT_TOPIC:-chronos.out} \ + -e K6_OTEL_SERVICE_NAME=$${K6_OTEL_SERVICE_NAME:-k6-chronos} \ + -e K6_OTEL_METRIC_PREFIX=$${K6_OTEL_METRIC_PREFIX:-k6_} \ + -e 
K6_OTEL_GRPC_EXPORTER_INSECURE=$${K6_OTEL_GRPC_EXPORTER_INSECURE:-true} \ + -e K6_OTEL_GRPC_EXPORTER_ENDPOINT=$(K6_OTEL_GRPC_EXPORTER_ENDPOINT) \ + -e K6_RUN_ID=$(K6_RUN_ID) +K6_DOCKER_RUN := docker run --rm --network $(K6_DOCKER_NETWORK) --add-host=host.docker.internal:host-gateway -v "$(PWD)/dev/k6:/scripts:ro" -v "$(K6_LOG_DIR):/data/lgtm/logs" $(K6_COMMON_ENV) + +## k6.build: Build the custom k6 image with xk6-kafka +k6.build: + $(call pp,building k6 image $(K6_IMAGE) with k6 $(K6_VERSION) and xk6-kafka $(XK6_KAFKA_VERSION)...) + docker build -f docker/Dockerfile.k6 --build-arg K6_VERSION=$(K6_VERSION) --build-arg XK6_KAFKA_VERSION=$(XK6_KAFKA_VERSION) -t $(K6_IMAGE) . + +## k6.contract: Run the k6 Chronos contract integration test with OTLP output +k6.contract: + $(call pp,running k6 contract test with OTLP endpoint $(K6_OTEL_GRPC_EXPORTER_ENDPOINT)...) + mkdir -p "$(K6_LOG_DIR)" + $(K6_DOCKER_RUN) --entrypoint bash $(K6_IMAGE) -lc 'k6 run --out opentelemetry /scripts/contract.js 2>&1 | tee -a /data/lgtm/logs/k6-contract.jsonl; exit $${PIPESTATUS[0]}' + +## k6.load: Run the k6 Chronos load test with OTLP output +k6.load: + $(call pp,running k6 load test with OTLP endpoint $(K6_OTEL_GRPC_EXPORTER_ENDPOINT)...) 
+ mkdir -p "$(K6_LOG_DIR)" + $(K6_DOCKER_RUN) \ + -e K6_LOAD_RATE=$${K6_LOAD_RATE:-1000} \ + -e K6_LOAD_DURATION=$${K6_LOAD_DURATION:-1m} \ + -e K6_LOAD_CONSUME_DURATION=$${K6_LOAD_CONSUME_DURATION:-2m} \ + -e K6_LOAD_DELAY_MS=$${K6_LOAD_DELAY_MS:-1000} \ + -e K6_LOAD_EXPECTED_MESSAGES=$${K6_LOAD_EXPECTED_MESSAGES:-} \ + --entrypoint bash $(K6_IMAGE) -lc 'k6 run --out opentelemetry /scripts/load.js 2>&1 | tee -a /data/lgtm/logs/k6-load.jsonl; exit $${PIPESTATUS[0]}' + +## k6.test: Run k6 contract and load integration tests +k6.test: k6.contract k6.load + +.PHONY: k6.build k6.contract k6.load k6.test diff --git a/docker/Dockerfile.k6 b/docker/Dockerfile.k6 new file mode 100644 index 0000000..e1c251a --- /dev/null +++ b/docker/Dockerfile.k6 @@ -0,0 +1,23 @@ +FROM golang:1.26-bookworm AS build + +ARG K6_VERSION=1.7.1 +ARG XK6_KAFKA_VERSION=latest + +RUN apt-get update \ + && apt-get install -y --no-install-recommends build-essential ca-certificates git pkg-config \ + && rm -rf /var/lib/apt/lists/* + +RUN go install go.k6.io/xk6/cmd/xk6@latest +RUN CGO_ENABLED=1 xk6 build --k6-version "v${K6_VERSION}" \ + --with "github.com/mostafa/xk6-kafka/v2@${XK6_KAFKA_VERSION}" \ + --output /tmp/k6 + +FROM debian:bookworm-slim + +RUN apt-get update \ + && apt-get install -y --no-install-recommends bash ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=build /tmp/k6 /usr/local/bin/k6 + +ENTRYPOINT ["k6"] From f8659a44cfb8f92fcfcc45a34c8c2c27d0f69f8f Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 15:03:30 +1000 Subject: [PATCH 24/36] fix: default chronos otel service name Set OTEL_SERVICE_NAME to chronos during Chronos startup when callers have not provided a value, before tracing and metrics exporters initialize. Also make the Jaeger helper use the effective service name instead of formatting the Result returned by env lookup. 
Verification: - make pre-commit Model-version: GPT-5 --- chronos_bin/src/bin/chronos.rs | 4 ++++ chronos_bin/src/telemetry/jaegar_backend.rs | 13 ++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/chronos_bin/src/bin/chronos.rs b/chronos_bin/src/bin/chronos.rs index b146037..c07c5af 100644 --- a/chronos_bin/src/bin/chronos.rs +++ b/chronos_bin/src/bin/chronos.rs @@ -14,6 +14,10 @@ use std::time::Duration; async fn main() { env_logger::init(); dotenvy::dotenv().ok(); + std::env::set_var( + "OTEL_SERVICE_NAME", + std::env::var("OTEL_SERVICE_NAME").unwrap_or_else(|_| "chronos".to_string()), + ); let protocol = std::env::var("OTEL_EXPORTER_OTLP_PROTOCOL").unwrap_or_else(|_| "http/json".to_string()); diff --git a/chronos_bin/src/telemetry/jaegar_backend.rs b/chronos_bin/src/telemetry/jaegar_backend.rs index 887f9fa..92f674e 100644 --- a/chronos_bin/src/telemetry/jaegar_backend.rs +++ b/chronos_bin/src/telemetry/jaegar_backend.rs @@ -2,11 +2,10 @@ use opentelemetry_api::trace::TraceError; use opentelemetry_sdk::trace::Tracer; pub fn instrument_jaegar_pipleline() -> Result { - let service_name = std::env::var("OTEL_SERVICE_NAME"); - if service_name.is_err() { - std::env::set_var("OTEL_SERVICE_NAME", "chronos"); - } - opentelemetry_jaeger::new_agent_pipeline() - .with_service_name(format!("{:?}", service_name)) - .install_simple() + let service_name = std::env::var("OTEL_SERVICE_NAME").unwrap_or_else(|_| { + let service_name = "chronos".to_string(); + std::env::set_var("OTEL_SERVICE_NAME", &service_name); + service_name + }); + opentelemetry_jaeger::new_agent_pipeline().with_service_name(service_name).install_simple() } From 8f75edc68c2a4e3f6bb19dcced1d97ae0d6914af Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 15:11:02 +1000 Subject: [PATCH 25/36] test: make k6 full load opt in Change the default k6 load profile to 100 messages/sec while keeping the production-scale 1,000 messages/sec profile available through K6_FULL_LOAD=true 
make k6.load. Document that the full load profile depends on k6, Docker host, Kafka, PostgreSQL, and Chronos capacity and may require production-like infrastructure to pass. Verification: - node --check dev/k6/load.js - make -n k6.load - make -n k6.load K6_FULL_LOAD=true - k6 inspect for dev/k6/load.js through the custom image - make pre-commit Model-version: GPT-5 --- dev/k6/README.md | 5 ++++- dev/k6/load.js | 4 ++-- dev/makefiles/k6.mk | 15 ++++++++++----- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/dev/k6/README.md b/dev/k6/README.md index fa8b833..6008056 100644 --- a/dev/k6/README.md +++ b/dev/k6/README.md @@ -6,9 +6,12 @@ The k6 image is built with `xk6-kafka` so tests can publish to and consume from - `make k6.build` builds the custom k6 image. - `make k6.contract` runs one pass through the important Chronos processing paths. -- `make k6.load` runs a constant-arrival-rate producer load test. +- `make k6.load` runs a constant-arrival-rate producer load test. The default profile tops out at 100 messages/sec. +- `K6_FULL_LOAD=true make k6.load` runs the full load profile at 1,000 messages/sec for one minute. - `make k6.test` runs contract and load tests. +The full load profile is a production-scale signal, not a guaranteed local-dev pass. It depends on k6 producer speed, k6 consumer drain speed, Docker host capacity, Kafka throughput, PostgreSQL throughput, and Chronos capacity. It may require production-like infrastructure to satisfy the 1,000 messages/sec throughput target and the 500 ms p99.9 observed scheduling jitter threshold. + By default the recipes use the LGTM compose network and send k6 OTLP metrics to `lgtm:4317`. In GitHub Actions outside `act`, set `K6_CI_OTLP_ENDPOINT`; the default is `host.docker.internal:4317`. When running under `act`, the recipes keep using the LGTM container. Logs from k6 are appended to `dev/lgtm/runtime-logs/*.jsonl`, which is mounted into the LGTM collector filelog receiver. 
diff --git a/dev/k6/load.js b/dev/k6/load.js index 68decf1..99983b9 100644 --- a/dev/k6/load.js +++ b/dev/k6/load.js @@ -6,9 +6,9 @@ import { Producer, Consumer } from "k6/x/kafka"; const brokers = (__ENV.KAFKA_BROKERS || "kafka:9092").split(","); const inputTopic = __ENV.KAFKA_IN_TOPIC || "chronos.in"; const outputTopic = __ENV.KAFKA_OUT_TOPIC || "chronos.out"; -const rate = Number(__ENV.K6_LOAD_RATE || 1000); +const rate = Number(__ENV.K6_LOAD_RATE || 100); const duration = __ENV.K6_LOAD_DURATION || "1m"; -const consumeDuration = __ENV.K6_LOAD_CONSUME_DURATION || "2m"; +const consumeDuration = __ENV.K6_LOAD_CONSUME_DURATION || "30s"; const delayMs = Number(__ENV.K6_LOAD_DELAY_MS || 1000); const runId = __ENV.K6_RUN_ID || `load-${Date.now()}`; const expectedMessages = Number(__ENV.K6_LOAD_EXPECTED_MESSAGES || Math.floor(rate * durationSeconds(duration))); diff --git a/dev/makefiles/k6.mk b/dev/makefiles/k6.mk index 7ad1ed9..979692d 100644 --- a/dev/makefiles/k6.mk +++ b/dev/makefiles/k6.mk @@ -7,6 +7,11 @@ K6_CI_OTLP_ENDPOINT ?= host.docker.internal:4317 K6_DEFAULT_OTEL_ENDPOINT := $(if $(and $(GITHUB_ACTIONS),$(if $(ACT),,1)),$(K6_CI_OTLP_ENDPOINT),lgtm:4317) K6_OTEL_GRPC_EXPORTER_ENDPOINT ?= $(K6_DEFAULT_OTEL_ENDPOINT) K6_DOCKER_NETWORK ?= $(if $(and $(GITHUB_ACTIONS),$(if $(ACT),,1)),bridge,chronos) +K6_FULL_LOAD ?= false +K6_LOAD_DEFAULT_RATE := $(if $(filter true 1 yes,$(K6_FULL_LOAD)),1000,100) +K6_LOAD_DEFAULT_DURATION := $(if $(filter true 1 yes,$(K6_FULL_LOAD)),1m,1m) +K6_LOAD_DEFAULT_CONSUME_DURATION := $(if $(filter true 1 yes,$(K6_FULL_LOAD)),2m,30s) +K6_LOAD_PROFILE := $(if $(filter true 1 yes,$(K6_FULL_LOAD)),full load,load) K6_COMMON_ENV := \ -e KAFKA_BROKERS=$${KAFKA_BROKERS:-kafka:9092} \ -e KAFKA_IN_TOPIC=$${KAFKA_IN_TOPIC:-chronos.in} \ @@ -29,14 +34,14 @@ k6.contract: mkdir -p "$(K6_LOG_DIR)" $(K6_DOCKER_RUN) --entrypoint bash $(K6_IMAGE) -lc 'k6 run --out opentelemetry /scripts/contract.js 2>&1 | tee -a /data/lgtm/logs/k6-contract.jsonl; exit 
$${PIPESTATUS[0]}' -## k6.load: Run the k6 Chronos load test with OTLP output +## k6.load: Run the k6 Chronos load test with OTLP output. Use K6_FULL_LOAD=true for the 1,000 rps full load profile k6.load: - $(call pp,running k6 load test with OTLP endpoint $(K6_OTEL_GRPC_EXPORTER_ENDPOINT)...) + $(call pp,running k6 $(K6_LOAD_PROFILE) test with OTLP endpoint $(K6_OTEL_GRPC_EXPORTER_ENDPOINT)...) mkdir -p "$(K6_LOG_DIR)" $(K6_DOCKER_RUN) \ - -e K6_LOAD_RATE=$${K6_LOAD_RATE:-1000} \ - -e K6_LOAD_DURATION=$${K6_LOAD_DURATION:-1m} \ - -e K6_LOAD_CONSUME_DURATION=$${K6_LOAD_CONSUME_DURATION:-2m} \ + -e K6_LOAD_RATE=$${K6_LOAD_RATE:-$(K6_LOAD_DEFAULT_RATE)} \ + -e K6_LOAD_DURATION=$${K6_LOAD_DURATION:-$(K6_LOAD_DEFAULT_DURATION)} \ + -e K6_LOAD_CONSUME_DURATION=$${K6_LOAD_CONSUME_DURATION:-$(K6_LOAD_DEFAULT_CONSUME_DURATION)} \ -e K6_LOAD_DELAY_MS=$${K6_LOAD_DELAY_MS:-1000} \ -e K6_LOAD_EXPECTED_MESSAGES=$${K6_LOAD_EXPECTED_MESSAGES:-} \ --entrypoint bash $(K6_IMAGE) -lc 'k6 run --out opentelemetry /scripts/load.js 2>&1 | tee -a /data/lgtm/logs/k6-load.jsonl; exit $${PIPESTATUS[0]}' From 9ec2119c2ddbf7529326edc629dd2a50c697e644 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 15:24:46 +1000 Subject: [PATCH 26/36] fix: measure k6 load jitter from output timestamp Change the k6 load test to calculate scheduling jitter from the Kafka output record timestamp minus the requested scheduled timestamp. This removes k6 drain/consume timing from the jitter measurement and keeps the original input publish timestamp as separate payload data. Add a timestamp error counter threshold so missing or unparsable output record timestamps fail visibly. 
Verification: - node --check dev/k6/load.js - k6 inspect for dev/k6/load.js through the custom image - make pre-commit Model-version: GPT-5 --- dev/k6/README.md | 2 ++ dev/k6/load.js | 23 +++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/dev/k6/README.md b/dev/k6/README.md index 6008056..282c063 100644 --- a/dev/k6/README.md +++ b/dev/k6/README.md @@ -12,6 +12,8 @@ The k6 image is built with `xk6-kafka` so tests can publish to and consume from The full load profile is a production-scale signal, not a guaranteed local-dev pass. It depends on k6 producer speed, k6 consumer drain speed, Docker host capacity, Kafka throughput, PostgreSQL throughput, and Chronos capacity. It may require production-like infrastructure to satisfy the 1,000 messages/sec throughput target and the 500 ms p99.9 observed scheduling jitter threshold. +The load test records `chronos_scheduling_jitter` from the Kafka output record timestamp minus the requested scheduled timestamp. It does not use the time k6 consumes or drains the output topic. + By default the recipes use the LGTM compose network and send k6 OTLP metrics to `lgtm:4317`. In GitHub Actions outside `act`, set `K6_CI_OTLP_ENDPOINT`; the default is `host.docker.internal:4317`. When running under `act`, the recipes keep using the LGTM container. Logs from k6 are appended to `dev/lgtm/runtime-logs/*.jsonl`, which is mounted into the LGTM collector filelog receiver. 
diff --git a/dev/k6/load.js b/dev/k6/load.js index 99983b9..922845d 100644 --- a/dev/k6/load.js +++ b/dev/k6/load.js @@ -9,12 +9,13 @@ const outputTopic = __ENV.KAFKA_OUT_TOPIC || "chronos.out"; const rate = Number(__ENV.K6_LOAD_RATE || 100); const duration = __ENV.K6_LOAD_DURATION || "1m"; const consumeDuration = __ENV.K6_LOAD_CONSUME_DURATION || "30s"; -const delayMs = Number(__ENV.K6_LOAD_DELAY_MS || 1000); +const scheduleDelayMs = Number(__ENV.K6_LOAD_DELAY_MS || 1000); const runId = __ENV.K6_RUN_ID || `load-${Date.now()}`; const expectedMessages = Number(__ENV.K6_LOAD_EXPECTED_MESSAGES || Math.floor(rate * durationSeconds(duration))); const published = new Counter("chronos_messages_published"); const consumed = new Counter("chronos_messages_consumed"); +const timestampErrors = new Counter("chronos_output_timestamp_errors"); const schedulingJitter = new Trend("chronos_scheduling_jitter", true); export const options = { @@ -42,6 +43,7 @@ export const options = { dropped_iterations: ["count==0"], chronos_messages_published: [`count>=${expectedMessages}`], chronos_messages_consumed: [`count>=${expectedMessages}`], + chronos_output_timestamp_errors: ["count==0"], chronos_scheduling_jitter: ["p(99.9)<500"], }, }; @@ -107,21 +109,21 @@ export function setup() { } export function produceInput(data) { - const now = Date.now(); - const id = `${data.runId}-${__VU}-${__ITER}-${now}`; - const deadlineMs = now + delayMs; + const publishedAtMs = Date.now(); + const id = `${data.runId}-${__VU}-${__ITER}-${publishedAtMs}`; + const scheduledAtMs = publishedAtMs + scheduleDelayMs; const message = { key: encoding.b64encode(id), value: encoding.b64encode(JSON.stringify({ source: "k6-load", run_id: data.runId, message_id: id, - sent_at_ms: now, - deadline_ms: deadlineMs, + published_at_ms: publishedAtMs, + scheduled_at_ms: scheduledAtMs, })), headers: { chronosMessageId: id, - chronosDeadline: new Date(deadlineMs).toISOString(), + chronosDeadline: new 
Date(scheduledAtMs).toISOString(), }, }; getProducer().produce({ messages: [message] }); @@ -141,8 +143,13 @@ export function consumeOutput(data) { continue; } seen[parsed.message_id] = true; + const outputPublishedAtMs = Date.parse(message.time); + if (Number.isNaN(outputPublishedAtMs)) { + timestampErrors.add(1); + continue; + } consumed.add(1); - schedulingJitter.add(Math.max(0, Date.now() - Number(parsed.deadline_ms))); + schedulingJitter.add(Math.max(0, outputPublishedAtMs - Number(parsed.scheduled_at_ms))); matched += 1; } if (matched === 0) { From d878b0a79e376628f1c02dc6fc0aa1878bb2b63d Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 15:43:04 +1000 Subject: [PATCH 27/36] fix: measure jitter from output record timestamp Record an explicit Kafka output record timestamp when Chronos publishes a delayed message and use that timestamp for chronos.message.jitter instead of measuring after the Kafka delivery future completes. This removes broker acknowledgement/backpressure wait time from the jitter observation. Add focused tests proving jitter milliseconds are converted to seconds and clock skew floors at zero. 
Verification: - make pre-commit Model-version: GPT-5 --- chronos_bin/src/kafka/producer.rs | 17 +++++++++-- chronos_bin/src/message_processor.rs | 28 ++++++++++++++++--- .../examples/publish_test_message.rs | 2 +- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/chronos_bin/src/kafka/producer.rs b/chronos_bin/src/kafka/producer.rs index 0192df3..dd83d15 100644 --- a/chronos_bin/src/kafka/producer.rs +++ b/chronos_bin/src/kafka/producer.rs @@ -3,6 +3,7 @@ use std::time::Duration; use crate::utils::util::into_headers; use crate::{kafka::errors::KafkaAdapterError, utils::util::CHRONOS_ID}; +use chrono::{DateTime, Utc}; use rdkafka::producer::{FutureProducer, FutureRecord}; use super::config::KafkaConfig; @@ -16,6 +17,11 @@ pub struct KafkaProducer { topic: String, } +pub struct PublishedMessage { + pub id: String, + pub timestamp: DateTime, +} + impl KafkaProducer { pub fn new(config: &KafkaConfig) -> Self { // rdlibkafka goes infinitely trying to connect to kafka broker @@ -25,11 +31,12 @@ impl KafkaProducer { Self { producer, topic } } #[instrument(skip_all, fields(topic = %self.topic))] - pub async fn kafka_publish(&self, message: String, headers: Option>, key: String) -> Result { + pub async fn kafka_publish(&self, message: String, headers: Option>, key: String) -> Result { // Only because never expecting wrong headers to reach here let unwrap_header = &headers.unwrap_or_default(); let o_header = into_headers(unwrap_header); + let published_at = Utc::now(); // println!("headers {:?}", o_header); // println!("headers {:?} headers--{:?}", &headers["chronosId)"].to_string(), &headers["chronosDeadline)"].to_string()); @@ -39,11 +46,15 @@ impl KafkaProducer { FutureRecord::to(self.topic.as_str()) .payload(message.as_str()) .key(key.as_str()) - .headers(o_header), + .headers(o_header) + .timestamp(published_at.timestamp_millis()), Duration::from_secs(0), ) .await .map_err(|(kafka_error, _record)| KafkaAdapterError::PublishMessage(kafka_error, "message 
publishing failed".to_string()))?; - Ok(unwrap_header[CHRONOS_ID].to_string()) + Ok(PublishedMessage { + id: unwrap_header[CHRONOS_ID].to_string(), + timestamp: published_at, + }) } } diff --git a/chronos_bin/src/message_processor.rs b/chronos_bin/src/message_processor.rs index ab52eb7..c85d4af 100644 --- a/chronos_bin/src/message_processor.rs +++ b/chronos_bin/src/message_processor.rs @@ -60,16 +60,15 @@ impl MessageProcessor { match readied_by_column { Some(id) => { headers.insert("readied_by".to_string(), id); - if let Ok(id) = self + if let Ok(published) = self .producer .kafka_publish(updated_row.message_value.to_string(), Some(headers), updated_row.message_key.to_string()) .await { // msg_jitter: difference between actual publish time and client-requested deadline. // Floored at 0 to guard against clock skew producing negative jitter. - let jitter_secs = (Utc::now() - deadline).num_milliseconds().max(0) as f64 / 1000.0; - self.metrics.observe_jitter(jitter_secs); - Ok(id) + self.metrics.observe_jitter(jitter_seconds(published.timestamp, deadline)); + Ok(published.id) } else { Err("error occurred while publishing".to_string()) } @@ -174,8 +173,13 @@ impl MessageProcessor { } } +fn jitter_seconds(published_at: chrono::DateTime, deadline: chrono::DateTime) -> f64 { + (published_at - deadline).num_milliseconds().max(0) as f64 / 1000.0 +} + #[cfg(test)] mod tests { + use super::jitter_seconds; use crate::metrics::ChronosMetrics; #[test] @@ -186,6 +190,22 @@ mod tests { assert!(jitter_ms >= 300, "jitter should be at least 300ms when deadline was 300ms ago"); } + #[test] + fn test_jitter_seconds_converts_milliseconds_to_seconds() { + use chrono::{Duration, Utc}; + let deadline = Utc::now(); + let published_at = deadline + Duration::milliseconds(300); + assert!((jitter_seconds(published_at, deadline) - 0.3).abs() < f64::EPSILON); + } + + #[test] + fn test_jitter_seconds_floors_clock_skew_at_zero() { + use chrono::{Duration, Utc}; + let deadline = Utc::now(); + let 
published_at = deadline - Duration::milliseconds(300); + assert_eq!(jitter_seconds(published_at, deadline), 0.0); + } + #[test] fn test_jitter_below_500ms_within_sla() { let metrics = ChronosMetrics::new().unwrap(); diff --git a/examples/chronos_ex/examples/publish_test_message.rs b/examples/chronos_ex/examples/publish_test_message.rs index 3cf6998..8343ca1 100644 --- a/examples/chronos_ex/examples/publish_test_message.rs +++ b/examples/chronos_ex/examples/publish_test_message.rs @@ -47,7 +47,7 @@ async fn main() { let producer = KafkaProducer::new(&kafka_config); match producer.kafka_publish(payload, Some(headers), msg_id.clone()).await { - Ok(id) => println!("✓ Published successfully (returned id: {})", id), + Ok(published) => println!("✓ Published successfully (returned id: {})", published.id), Err(e) => { eprintln!("✗ Failed to publish: {}", e); std::process::exit(1); From 29a6aa6c0d70ae007966cf2925c1a6e48deae088 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 16:01:21 +1000 Subject: [PATCH 28/36] fix: use chronos histogram buckets for otlp Apply Chronos' generated histogram bucket boundaries to the OTLP metrics exporter so LGTM does not fall back to the SDK default histogram buckets. This prevents low-latency Chronos histograms from quantiling into the broad default 0..5s bucket and reporting p95 values around 4.75s. Update the k6 load workload to publish a default mix of immediate and delayed messages. Immediate messages exercise the receiver-to-Kafka path, while delayed messages enter PostgreSQL and exercise the processor path. Keep the scheduling jitter trend scoped to delayed messages and record immediate delivery delay separately. 
Verification: - node --check dev/k6/load.js - make -n k6.load - make -n k6.load K6_FULL_LOAD=true - cargo test -p chronos_bin metrics::registry::tests::otlp_histograms_use_generated_second_boundaries - make pre-commit - docker run --rm -v /home/ah34/work/opensource/chronos/dev/k6:/scripts:ro chronos-k6:1.7.1 inspect /scripts/load.js Model-version: GPT-5 --- chronos_bin/src/metrics/registry.rs | 40 ++++++++++++++++++++++++++++ dev/k6/README.md | 4 ++- dev/k6/load.js | 41 +++++++++++++++++++++++++---- dev/makefiles/k6.mk | 4 ++- 4 files changed, 82 insertions(+), 7 deletions(-) diff --git a/chronos_bin/src/metrics/registry.rs b/chronos_bin/src/metrics/registry.rs index 2b8750c..59fe2d2 100644 --- a/chronos_bin/src/metrics/registry.rs +++ b/chronos_bin/src/metrics/registry.rs @@ -5,6 +5,8 @@ use opentelemetry::global; use opentelemetry::metrics::{Counter as OtlpCounter, Histogram as OtlpHistogram, Unit}; use opentelemetry::KeyValue; use opentelemetry_otlp::WithExportConfig; +use opentelemetry_sdk::metrics::reader::{AggregationSelector, DefaultAggregationSelector}; +use opentelemetry_sdk::metrics::{Aggregation, InstrumentKind}; use prometheus::{histogram_opts, opts, CounterVec as PromCounterVec, HistogramVec as PromHistogramVec, Registry}; use crate::metrics::generated::{MetricDefinition, MetricId, MetricKind, METRIC_DEFINITIONS}; @@ -223,6 +225,33 @@ impl MetricsBackend for PrometheusMetricsBackend { fn shutdown(&self) {} } +struct ChronosAggregationSelector; + +impl AggregationSelector for ChronosAggregationSelector { + fn aggregation(&self, kind: InstrumentKind) -> Aggregation { + if kind != InstrumentKind::Histogram { + return DefaultAggregationSelector::new().aggregation(kind); + } + + Aggregation::ExplicitBucketHistogram { + boundaries: otlp_histogram_boundaries(), + record_min_max: true, + } + } +} + +fn otlp_histogram_boundaries() -> Vec { + let mut boundaries = METRIC_DEFINITIONS + .iter() + .filter(|definition| definition.kind.is_histogram()) + 
.filter_map(|definition| definition.buckets) + .flat_map(|buckets| buckets.iter().copied()) + .collect::>(); + boundaries.sort_by(f64::total_cmp); + boundaries.dedup(); + boundaries +} + struct OtlpMetricsBackend { provider: opentelemetry_sdk::metrics::MeterProvider, counters: HashMap>, @@ -238,6 +267,7 @@ impl OtlpMetricsBackend { let provider = opentelemetry_otlp::new_pipeline() .metrics(opentelemetry::runtime::Tokio) .with_exporter(exporter) + .with_aggregation_selector(ChronosAggregationSelector) .build()?; global::set_meter_provider(provider.clone()); @@ -400,6 +430,16 @@ mod tests { assert!(output.contains("chronos_msg_jitter_bucket{le=\"0.5\"} 1")); } + #[test] + fn otlp_histograms_use_generated_second_boundaries() { + let boundaries = otlp_histogram_boundaries(); + + assert!(boundaries.contains(&0.5)); + assert!(boundaries.contains(&2.048)); + assert!(boundaries.contains(&5.0)); + assert!(boundaries.windows(2).all(|window| window[0] < window[1])); + } + #[test] #[serial] fn msg_reset_increments_correctly() { diff --git a/dev/k6/README.md b/dev/k6/README.md index 282c063..439783f 100644 --- a/dev/k6/README.md +++ b/dev/k6/README.md @@ -12,7 +12,9 @@ The k6 image is built with `xk6-kafka` so tests can publish to and consume from The full load profile is a production-scale signal, not a guaranteed local-dev pass. It depends on k6 producer speed, k6 consumer drain speed, Docker host capacity, Kafka throughput, PostgreSQL throughput, and Chronos capacity. It may require production-like infrastructure to satisfy the 1,000 messages/sec throughput target and the 500 ms p99.9 observed scheduling jitter threshold. -The load test records `chronos_scheduling_jitter` from the Kafka output record timestamp minus the requested scheduled timestamp. It does not use the time k6 consumes or drains the output topic. +The load test publishes a default mix of immediate and delayed messages. 
Immediate messages use an already-expired deadline and exercise the receiver-to-Kafka path. Delayed messages use a future deadline, are inserted into PostgreSQL, and exercise the processor-to-Kafka path. Set `K6_LOAD_IMMEDIATE_RATIO` to change the immediate-message fraction, `K6_LOAD_IMMEDIATE_DELAY_MS` to change the immediate deadline offset, and `K6_LOAD_DELAY_MS` to change the delayed deadline offset. + +The load test records `chronos_scheduling_jitter` only for delayed messages, using the Kafka output record timestamp minus the requested scheduled timestamp. It does not use the time k6 consumes or drains the output topic. Immediate messages record `chronos_immediate_output_delay` from the Kafka output record timestamp minus the input publish timestamp. By default the recipes use the LGTM compose network and send k6 OTLP metrics to `lgtm:4317`. In GitHub Actions outside `act`, set `K6_CI_OTLP_ENDPOINT`; the default is `host.docker.internal:4317`. When running under `act`, the recipes keep using the LGTM container. 
diff --git a/dev/k6/load.js b/dev/k6/load.js index 922845d..023659b 100644 --- a/dev/k6/load.js +++ b/dev/k6/load.js @@ -8,8 +8,10 @@ const inputTopic = __ENV.KAFKA_IN_TOPIC || "chronos.in"; const outputTopic = __ENV.KAFKA_OUT_TOPIC || "chronos.out"; const rate = Number(__ENV.K6_LOAD_RATE || 100); const duration = __ENV.K6_LOAD_DURATION || "1m"; -const consumeDuration = __ENV.K6_LOAD_CONSUME_DURATION || "30s"; -const scheduleDelayMs = Number(__ENV.K6_LOAD_DELAY_MS || 1000); +const consumeDuration = __ENV.K6_LOAD_CONSUME_DURATION || "90s"; +const delayedScheduleDelayMs = Number(__ENV.K6_LOAD_DELAY_MS || 1000); +const immediateScheduleDelayMs = Number(__ENV.K6_LOAD_IMMEDIATE_DELAY_MS || -1000); +const immediateRatio = clampRatio(Number(__ENV.K6_LOAD_IMMEDIATE_RATIO || 0.5)); const runId = __ENV.K6_RUN_ID || `load-${Date.now()}`; const expectedMessages = Number(__ENV.K6_LOAD_EXPECTED_MESSAGES || Math.floor(rate * durationSeconds(duration))); @@ -17,6 +19,7 @@ const published = new Counter("chronos_messages_published"); const consumed = new Counter("chronos_messages_consumed"); const timestampErrors = new Counter("chronos_output_timestamp_errors"); const schedulingJitter = new Trend("chronos_scheduling_jitter", true); +const immediateOutputDelay = new Trend("chronos_immediate_output_delay", true); export const options = { scenarios: { @@ -97,6 +100,17 @@ function durationSeconds(value) { } } +function clampRatio(value) { + if (Number.isNaN(value)) { + return 0.5; + } + return Math.min(1, Math.max(0, value)); +} + +function shouldPublishImmediate() { + return ((__ITER % 100) / 100) < immediateRatio; +} + function bytesToString(value) { if (typeof value === "string") { return value; @@ -111,6 +125,8 @@ export function setup() { export function produceInput(data) { const publishedAtMs = Date.now(); const id = `${data.runId}-${__VU}-${__ITER}-${publishedAtMs}`; + const chronosPath = shouldPublishImmediate() ? 
"immediate" : "delayed"; + const scheduleDelayMs = chronosPath === "immediate" ? immediateScheduleDelayMs : delayedScheduleDelayMs; const scheduledAtMs = publishedAtMs + scheduleDelayMs; const message = { key: encoding.b64encode(id), @@ -118,6 +134,7 @@ export function produceInput(data) { source: "k6-load", run_id: data.runId, message_id: id, + chronos_path: chronosPath, published_at_ms: publishedAtMs, scheduled_at_ms: scheduledAtMs, })), @@ -127,7 +144,7 @@ export function produceInput(data) { }, }; getProducer().produce({ messages: [message] }); - published.add(1); + published.add(1, { chronos_path: chronosPath }); } export function consumeOutput(data) { @@ -148,8 +165,22 @@ export function consumeOutput(data) { timestampErrors.add(1); continue; } - consumed.add(1); - schedulingJitter.add(Math.max(0, outputPublishedAtMs - Number(parsed.scheduled_at_ms))); + consumed.add(1, { chronos_path: parsed.chronos_path || "unknown" }); + if (parsed.chronos_path === "delayed") { + const scheduledAtMs = Number(parsed.scheduled_at_ms); + if (Number.isNaN(scheduledAtMs)) { + timestampErrors.add(1); + continue; + } + schedulingJitter.add(Math.max(0, outputPublishedAtMs - scheduledAtMs), { chronos_path: "delayed" }); + } else { + const publishedAtMs = Number(parsed.published_at_ms); + if (Number.isNaN(publishedAtMs)) { + timestampErrors.add(1); + continue; + } + immediateOutputDelay.add(Math.max(0, outputPublishedAtMs - publishedAtMs), { chronos_path: "immediate" }); + } matched += 1; } if (matched === 0) { diff --git a/dev/makefiles/k6.mk b/dev/makefiles/k6.mk index 979692d..a2f3b19 100644 --- a/dev/makefiles/k6.mk +++ b/dev/makefiles/k6.mk @@ -10,7 +10,7 @@ K6_DOCKER_NETWORK ?= $(if $(and $(GITHUB_ACTIONS),$(if $(ACT),,1)),bridge,chrono K6_FULL_LOAD ?= false K6_LOAD_DEFAULT_RATE := $(if $(filter true 1 yes,$(K6_FULL_LOAD)),1000,100) K6_LOAD_DEFAULT_DURATION := $(if $(filter true 1 yes,$(K6_FULL_LOAD)),1m,1m) -K6_LOAD_DEFAULT_CONSUME_DURATION := $(if $(filter true 1 
yes,$(K6_FULL_LOAD)),2m,30s) +K6_LOAD_DEFAULT_CONSUME_DURATION := $(if $(filter true 1 yes,$(K6_FULL_LOAD)),2m,90s) K6_LOAD_PROFILE := $(if $(filter true 1 yes,$(K6_FULL_LOAD)),full load,load) K6_COMMON_ENV := \ -e KAFKA_BROKERS=$${KAFKA_BROKERS:-kafka:9092} \ @@ -43,6 +43,8 @@ k6.load: -e K6_LOAD_DURATION=$${K6_LOAD_DURATION:-$(K6_LOAD_DEFAULT_DURATION)} \ -e K6_LOAD_CONSUME_DURATION=$${K6_LOAD_CONSUME_DURATION:-$(K6_LOAD_DEFAULT_CONSUME_DURATION)} \ -e K6_LOAD_DELAY_MS=$${K6_LOAD_DELAY_MS:-1000} \ + -e K6_LOAD_IMMEDIATE_DELAY_MS=$${K6_LOAD_IMMEDIATE_DELAY_MS:--1000} \ + -e K6_LOAD_IMMEDIATE_RATIO=$${K6_LOAD_IMMEDIATE_RATIO:-0.5} \ -e K6_LOAD_EXPECTED_MESSAGES=$${K6_LOAD_EXPECTED_MESSAGES:-} \ --entrypoint bash $(K6_IMAGE) -lc 'k6 run --out opentelemetry /scripts/load.js 2>&1 | tee -a /data/lgtm/logs/k6-load.jsonl; exit $${PIPESTATUS[0]}' From 339de1944e1896950970146019aa473d9d6baca2 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 16:14:01 +1000 Subject: [PATCH 29/36] test: make k6 load use 10 percent immediate mix Change the k6 load workload default to send 10% of messages with already-expired deadlines and 90% with future deadlines. Use the global k6 scenario iteration counter to spread immediate messages through the run instead of grouping them by VU-local iteration. Add tagged published and consumed thresholds for immediate and delayed paths so the load test verifies that both receiver-to-Kafka and PostgreSQL-backed processor paths are exercised. 
Verification: - node --check dev/k6/load.js - docker run --rm -v /home/ah34/work/opensource/chronos/dev/k6:/scripts:ro chronos-k6:1.7.1 inspect /scripts/load.js - make -n k6.load - make -n k6.load K6_FULL_LOAD=true - make pre-commit Model-version: GPT-5 --- dev/k6/README.md | 2 +- dev/k6/load.js | 24 +++++++++++++++++++++--- dev/makefiles/k6.mk | 2 +- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/dev/k6/README.md b/dev/k6/README.md index 439783f..4c453ce 100644 --- a/dev/k6/README.md +++ b/dev/k6/README.md @@ -12,7 +12,7 @@ The k6 image is built with `xk6-kafka` so tests can publish to and consume from The full load profile is a production-scale signal, not a guaranteed local-dev pass. It depends on k6 producer speed, k6 consumer drain speed, Docker host capacity, Kafka throughput, PostgreSQL throughput, and Chronos capacity. It may require production-like infrastructure to satisfy the 1,000 messages/sec throughput target and the 500 ms p99.9 observed scheduling jitter threshold. -The load test publishes a default mix of immediate and delayed messages. Immediate messages use an already-expired deadline and exercise the receiver-to-Kafka path. Delayed messages use a future deadline, are inserted into PostgreSQL, and exercise the processor-to-Kafka path. Set `K6_LOAD_IMMEDIATE_RATIO` to change the immediate-message fraction, `K6_LOAD_IMMEDIATE_DELAY_MS` to change the immediate deadline offset, and `K6_LOAD_DELAY_MS` to change the delayed deadline offset. +The load test publishes a default mix of 10% immediate messages and 90% delayed messages. Immediate messages use an already-expired deadline and exercise the receiver-to-Kafka path. Delayed messages use a future deadline, are inserted into PostgreSQL, and exercise the processor-to-Kafka path. Set `K6_LOAD_IMMEDIATE_RATIO` to change the immediate-message fraction, `K6_LOAD_IMMEDIATE_DELAY_MS` to change the immediate deadline offset, and `K6_LOAD_DELAY_MS` to change the delayed deadline offset. 
The load test records `chronos_scheduling_jitter` only for delayed messages, using the Kafka output record timestamp minus the requested scheduled timestamp. It does not use the time k6 consumes or drains the output topic. Immediate messages record `chronos_immediate_output_delay` from the Kafka output record timestamp minus the input publish timestamp. diff --git a/dev/k6/load.js b/dev/k6/load.js index 023659b..c45f495 100644 --- a/dev/k6/load.js +++ b/dev/k6/load.js @@ -1,5 +1,6 @@ import { check, sleep } from "k6"; import encoding from "k6/encoding"; +import exec from "k6/execution"; import { Counter, Trend } from "k6/metrics"; import { Producer, Consumer } from "k6/x/kafka"; @@ -11,9 +12,11 @@ const duration = __ENV.K6_LOAD_DURATION || "1m"; const consumeDuration = __ENV.K6_LOAD_CONSUME_DURATION || "90s"; const delayedScheduleDelayMs = Number(__ENV.K6_LOAD_DELAY_MS || 1000); const immediateScheduleDelayMs = Number(__ENV.K6_LOAD_IMMEDIATE_DELAY_MS || -1000); -const immediateRatio = clampRatio(Number(__ENV.K6_LOAD_IMMEDIATE_RATIO || 0.5)); +const immediateRatio = clampRatio(Number(__ENV.K6_LOAD_IMMEDIATE_RATIO || 0.1)); const runId = __ENV.K6_RUN_ID || `load-${Date.now()}`; const expectedMessages = Number(__ENV.K6_LOAD_EXPECTED_MESSAGES || Math.floor(rate * durationSeconds(duration))); +const expectedImmediateMessages = Math.floor(expectedMessages * immediateRatio); +const expectedDelayedMessages = expectedMessages - expectedImmediateMessages; const published = new Counter("chronos_messages_published"); const consumed = new Counter("chronos_messages_consumed"); @@ -48,6 +51,7 @@ export const options = { chronos_messages_consumed: [`count>=${expectedMessages}`], chronos_output_timestamp_errors: ["count==0"], chronos_scheduling_jitter: ["p(99.9)<500"], + ...pathThresholds(), }, }; @@ -102,13 +106,27 @@ function durationSeconds(value) { function clampRatio(value) { if (Number.isNaN(value)) { - return 0.5; + return 0.1; } return Math.min(1, Math.max(0, value)); } 
function shouldPublishImmediate() { - return ((__ITER % 100) / 100) < immediateRatio; + const spreadBucket = ((exec.scenario.iterationInTest * 9973) % 100) / 100; + return spreadBucket < immediateRatio; +} + +function pathThresholds() { + const thresholds = {}; + if (expectedImmediateMessages > 0) { + thresholds["chronos_messages_published{chronos_path:immediate}"] = [`count>=${expectedImmediateMessages}`]; + thresholds["chronos_messages_consumed{chronos_path:immediate}"] = [`count>=${expectedImmediateMessages}`]; + } + if (expectedDelayedMessages > 0) { + thresholds["chronos_messages_published{chronos_path:delayed}"] = [`count>=${expectedDelayedMessages}`]; + thresholds["chronos_messages_consumed{chronos_path:delayed}"] = [`count>=${expectedDelayedMessages}`]; + } + return thresholds; } function bytesToString(value) { diff --git a/dev/makefiles/k6.mk b/dev/makefiles/k6.mk index a2f3b19..ad639a3 100644 --- a/dev/makefiles/k6.mk +++ b/dev/makefiles/k6.mk @@ -44,7 +44,7 @@ k6.load: -e K6_LOAD_CONSUME_DURATION=$${K6_LOAD_CONSUME_DURATION:-$(K6_LOAD_DEFAULT_CONSUME_DURATION)} \ -e K6_LOAD_DELAY_MS=$${K6_LOAD_DELAY_MS:-1000} \ -e K6_LOAD_IMMEDIATE_DELAY_MS=$${K6_LOAD_IMMEDIATE_DELAY_MS:--1000} \ - -e K6_LOAD_IMMEDIATE_RATIO=$${K6_LOAD_IMMEDIATE_RATIO:-0.5} \ + -e K6_LOAD_IMMEDIATE_RATIO=$${K6_LOAD_IMMEDIATE_RATIO:-0.1} \ -e K6_LOAD_EXPECTED_MESSAGES=$${K6_LOAD_EXPECTED_MESSAGES:-} \ --entrypoint bash $(K6_IMAGE) -lc 'k6 run --out opentelemetry /scripts/load.js 2>&1 | tee -a /data/lgtm/logs/k6-load.jsonl; exit $${PIPESTATUS[0]}' From 871b8e9371581a9c100b89cef33e3b756fc4b75c Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 16:34:06 +1000 Subject: [PATCH 30/36] fix: derive prometheus metrics from otel spec Make the production Weaver registry use one canonical metric name and label set for both OTEL and Prometheus. 
Remove duplicated prometheus_name and prometheus_label_names annotations, derive Prometheus names and labels by normalizing canonical identifiers, and regenerate the Rust definitions and metrics docs. Update the runtime registry to use canonical metric names for OTLP, normalized names for Prometheus exposition, and cumulative temporality for histograms. Remove the older unused chronos_bin/src/metrics/spec.yaml so the production Weaver registry is the single metric definition source. Verification: - make weaver.generate - cargo test -p chronos_bin metrics::registry - make weaver.check - make pre-commit Model-version: GPT-5 --- chronos_bin/src/message_processor.rs | 6 +- chronos_bin/src/message_receiver.rs | 4 +- .../generated/chronos_metric_definitions.rs | 43 ++++--- chronos_bin/src/metrics/registry.rs | 95 ++++++++++++---- chronos_bin/src/metrics/spec.yaml | 107 ------------------ chronos_bin/src/monitor.rs | 2 +- .../production/registry/chronos/metrics.yaml | 33 +++--- .../templates/registry/markdown/weaver.yaml | 4 +- .../templates/registry/rust/registry.rs.j2 | 15 ++- .../templates/registry/rust/weaver.yaml | 5 +- docs/chronos_metrics.md | 10 +- 11 files changed, 129 insertions(+), 195 deletions(-) delete mode 100644 chronos_bin/src/metrics/spec.yaml diff --git a/chronos_bin/src/message_processor.rs b/chronos_bin/src/message_processor.rs index c85d4af..3350353 100644 --- a/chronos_bin/src/message_processor.rs +++ b/chronos_bin/src/message_processor.rs @@ -65,7 +65,7 @@ impl MessageProcessor { .kafka_publish(updated_row.message_value.to_string(), Some(headers), updated_row.message_key.to_string()) .await { - // msg_jitter: difference between actual publish time and client-requested deadline. + // chronos.message.jitter: difference between actual publish time and client-requested deadline. // Floored at 0 to guard against clock skew producing negative jitter. 
self.metrics.observe_jitter(jitter_seconds(published.timestamp, deadline)); Ok(published.id) @@ -162,7 +162,7 @@ impl MessageProcessor { log::debug!("MessageProcessor loop"); tokio::time::sleep(Duration::from_millis(10)).await; - // msg_process_latency: time the full processor_message_ready() call. + // chronos.message.process.duration: time the full processor_message_ready() call. let timer = std::time::Instant::now(); let (returned, status) = self.processor_message_ready(node_id).await; let elapsed = timer.elapsed().as_secs_f64(); @@ -213,7 +213,7 @@ mod tests { metrics.observe_jitter(0.3); let output = metrics.render_prometheus().unwrap(); assert!( - output.contains("chronos_msg_jitter_bucket{le=\"0.5\"} 1"), + output.contains("chronos_message_jitter_bucket{le=\"0.5\"} 1"), "300ms jitter must be counted in the <=500ms bucket" ); } diff --git a/chronos_bin/src/message_receiver.rs b/chronos_bin/src/message_receiver.rs index 6b420c1..01ac4cc 100644 --- a/chronos_bin/src/message_receiver.rs +++ b/chronos_bin/src/message_receiver.rs @@ -84,7 +84,7 @@ impl MessageReceiver { #[tracing::instrument(name = "receiver_handle_message", skip_all, fields(correlationId, error))] pub async fn handle_message(&self, message: &BorrowedMessage<'_>) { - // msg_wait_time: record how long the message waited in the Kafka input queue. + // chronos.message.wait.duration: record how long the message waited in the Kafka input queue. // Uses the Kafka-assigned message timestamp; guards against clock skew with max(0). if let Some(kafka_ts_ms) = message.timestamp().to_millis() { let wait_secs = (Utc::now().timestamp_millis() - kafka_ts_ms).max(0) as f64 / 1000.0; @@ -117,7 +117,7 @@ impl MessageReceiver { } } - // msg_consume_latency: only record when destination was determined (valid message headers). + // chronos.message.consume.duration: only record when destination was determined (valid message headers). 
if destination != "unknown" { let elapsed = timer.elapsed().as_secs_f64(); self.metrics.observe_consume_latency(elapsed, destination, status); diff --git a/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs b/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs index 9fdb216..3ee2792 100644 --- a/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs +++ b/chronos_bin/src/metrics/generated/chronos_metric_definitions.rs @@ -26,16 +26,20 @@ impl MetricKind { } } +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MetricTemporality { + Cumulative, +} + #[derive(Clone, Copy, Debug)] pub struct MetricDefinition { pub id: MetricId, - pub otel_name: &'static str, - pub prometheus_name: &'static str, + pub name: &'static str, pub description: &'static str, pub unit: Option<&'static str>, pub label_names: &'static [&'static str], - pub prometheus_label_names: &'static [&'static str], pub kind: MetricKind, + pub temporality: Option, pub buckets: Option<&'static [f64]>, pub prewarm_label_values: &'static [&'static [&'static str]], } @@ -43,61 +47,56 @@ pub struct MetricDefinition { pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ MetricDefinition { id: MetricId::MsgConsumeLatency, - otel_name: "chronos.message.consume.duration", - prometheus_name: "msg_consume_latency", + name: "chronos.message.consume.duration", description: "Duration of handle_message() in message_receiver.", unit: Some("s"), - label_names: &["destination", "status"], - prometheus_label_names: &["destination", "status"], + label_names: &["chronos.consume.status", "chronos.destination"], kind: MetricKind::Histogram, + temporality: Some(MetricTemporality::Cumulative), buckets: Some(&[0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048]), - prewarm_label_values: &[&["kafka", "pass"], &["kafka", "fail"], &["postgres", "pass"], &["postgres", "fail"]], + prewarm_label_values: &[&["pass", "kafka"], &["fail", "kafka"], &["pass", "postgres"], 
&["fail", "postgres"]], }, MetricDefinition { id: MetricId::MsgJitter, - otel_name: "chronos.message.jitter", - prometheus_name: "msg_jitter", + name: "chronos.message.jitter", description: "Difference between actual publish time and client-requested deadline.", unit: Some("s"), label_names: &[], - prometheus_label_names: &[], kind: MetricKind::Histogram, + temporality: Some(MetricTemporality::Cumulative), buckets: Some(&[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]), prewarm_label_values: &[], }, MetricDefinition { id: MetricId::MsgProcessLatency, - otel_name: "chronos.message.process.duration", - prometheus_name: "msg_process_latency", + name: "chronos.message.process.duration", description: "Duration of processor_message_ready() loop in message_processor.", unit: Some("s"), - label_names: &["returned", "status"], - prometheus_label_names: &["returned", "status"], + label_names: &["chronos.process.status", "chronos.processor.returned"], kind: MetricKind::Histogram, + temporality: Some(MetricTemporality::Cumulative), buckets: Some(&[0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048]), - prewarm_label_values: &[&["true", "pass"], &["true", "fail"], &["false", "pass"], &["false", "fail"]], + prewarm_label_values: &[&["pass", "true"], &["fail", "true"], &["pass", "false"], &["fail", "false"]], }, MetricDefinition { id: MetricId::MsgReset, - otel_name: "chronos.message.reset", - prometheus_name: "msg_reset", + name: "chronos.message.reset", description: "Number of records reset by reset_to_init_db() in the monitor task.", unit: Some("{message}"), label_names: &[], - prometheus_label_names: &[], kind: MetricKind::Counter, + temporality: None, buckets: None, prewarm_label_values: &[], }, MetricDefinition { id: MetricId::MsgWaitTime, - otel_name: "chronos.message.wait.duration", - prometheus_name: "msg_wait_time", + name: "chronos.message.wait.duration", description: "Time a message spent in the Kafka input queue before processing.", 
unit: Some("s"), label_names: &[], - prometheus_label_names: &[], kind: MetricKind::Histogram, + temporality: Some(MetricTemporality::Cumulative), buckets: Some(&[0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2]), prewarm_label_values: &[], }, diff --git a/chronos_bin/src/metrics/registry.rs b/chronos_bin/src/metrics/registry.rs index 59fe2d2..10a64d7 100644 --- a/chronos_bin/src/metrics/registry.rs +++ b/chronos_bin/src/metrics/registry.rs @@ -5,19 +5,18 @@ use opentelemetry::global; use opentelemetry::metrics::{Counter as OtlpCounter, Histogram as OtlpHistogram, Unit}; use opentelemetry::KeyValue; use opentelemetry_otlp::WithExportConfig; -use opentelemetry_sdk::metrics::reader::{AggregationSelector, DefaultAggregationSelector}; +use opentelemetry_sdk::metrics::data::Temporality; +use opentelemetry_sdk::metrics::reader::{AggregationSelector, DefaultAggregationSelector, TemporalitySelector}; use opentelemetry_sdk::metrics::{Aggregation, InstrumentKind}; use prometheus::{histogram_opts, opts, CounterVec as PromCounterVec, HistogramVec as PromHistogramVec, Registry}; -use crate::metrics::generated::{MetricDefinition, MetricId, MetricKind, METRIC_DEFINITIONS}; +use crate::metrics::generated::{MetricDefinition, MetricId, MetricKind, MetricTemporality, METRIC_DEFINITIONS}; const OTEL_METRICS_EXPORTER: &str = "OTEL_METRICS_EXPORTER"; const OTEL_EXPORTER_OTLP_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_ENDPOINT"; const OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"; const OTEL_EXPORTER_OTLP_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_PROTOCOL"; const OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"; -const PROMETHEUS_NAMESPACE: &str = "chronos"; - type MetricLabels<'a> = &'a [(&'static str, String)]; trait MetricsBackend: Send + Sync { @@ -133,24 +132,27 @@ struct PrometheusMetricsBackend { impl PrometheusMetricsBackend { fn new() -> Result { - let registry = 
Registry::new_custom(Some(PROMETHEUS_NAMESPACE.to_string()), None)?; + let registry = Registry::new(); let mut counters = HashMap::new(); let mut histograms = HashMap::new(); for definition in METRIC_DEFINITIONS { + let prometheus_name = prometheus_metric_name(definition.name); + let prometheus_label_names = prometheus_label_names(definition.label_names); + let prometheus_label_refs = prometheus_label_names.iter().map(String::as_str).collect::>(); match definition.kind { MetricKind::Counter => { - let metric = PromCounterVec::new(opts!(definition.prometheus_name, definition.description), definition.prometheus_label_names)?; + let metric = PromCounterVec::new(opts!(prometheus_name, definition.description), &prometheus_label_refs)?; registry.register(Box::new(metric.clone()))?; prewarm_counter(definition, &metric)?; counters.insert(definition.id, metric); } MetricKind::Histogram => { let opts = match definition.buckets { - Some(buckets) => histogram_opts!(definition.prometheus_name, definition.description, buckets.to_vec()), - None => histogram_opts!(definition.prometheus_name, definition.description), + Some(buckets) => histogram_opts!(prometheus_name, definition.description, buckets.to_vec()), + None => histogram_opts!(prometheus_name, definition.description), }; - let metric = PromHistogramVec::new(opts, definition.prometheus_label_names)?; + let metric = PromHistogramVec::new(opts, &prometheus_label_refs)?; registry.register(Box::new(metric.clone()))?; prewarm_histogram(definition, &metric)?; histograms.insert(definition.id, metric); @@ -167,7 +169,7 @@ impl PrometheusMetricsBackend { } fn prewarm_counter(definition: &MetricDefinition, metric: &PromCounterVec) -> Result<(), prometheus::Error> { - if definition.prometheus_label_names.is_empty() { + if definition.label_names.is_empty() { metric.get_metric_with_label_values(&[])?; return Ok(()); } @@ -180,7 +182,7 @@ fn prewarm_counter(definition: &MetricDefinition, metric: &PromCounterVec) -> Re } fn 
prewarm_histogram(definition: &MetricDefinition, metric: &PromHistogramVec) -> Result<(), prometheus::Error> { - if definition.prometheus_label_names.is_empty() { + if definition.label_names.is_empty() { metric.get_metric_with_label_values(&[])?; return Ok(()); } @@ -240,10 +242,23 @@ impl AggregationSelector for ChronosAggregationSelector { } } +struct ChronosTemporalitySelector; + +impl TemporalitySelector for ChronosTemporalitySelector { + fn temporality(&self, kind: InstrumentKind) -> Temporality { + if kind == InstrumentKind::Histogram { + Temporality::Cumulative + } else { + opentelemetry_sdk::metrics::reader::DefaultTemporalitySelector::new().temporality(kind) + } + } +} + fn otlp_histogram_boundaries() -> Vec { let mut boundaries = METRIC_DEFINITIONS .iter() .filter(|definition| definition.kind.is_histogram()) + .filter(|definition| definition.temporality == Some(MetricTemporality::Cumulative)) .filter_map(|definition| definition.buckets) .flat_map(|buckets| buckets.iter().copied()) .collect::>(); @@ -268,6 +283,7 @@ impl OtlpMetricsBackend { .metrics(opentelemetry::runtime::Tokio) .with_exporter(exporter) .with_aggregation_selector(ChronosAggregationSelector) + .with_temporality_selector(ChronosTemporalitySelector) .build()?; global::set_meter_provider(provider.clone()); @@ -279,14 +295,14 @@ impl OtlpMetricsBackend { for definition in METRIC_DEFINITIONS { match definition.kind { MetricKind::Counter => { - let mut builder = meter.u64_counter(definition.otel_name).with_description(definition.description); + let mut builder = meter.u64_counter(definition.name).with_description(definition.description); if let Some(unit) = definition.unit { builder = builder.with_unit(Unit::new(unit)); } counters.insert(definition.id, builder.init()); } MetricKind::Histogram => { - let mut builder = meter.f64_histogram(definition.otel_name).with_description(definition.description); + let mut builder = 
meter.f64_histogram(definition.name).with_description(definition.description); if let Some(unit) = definition.unit { builder = builder.with_unit(Unit::new(unit)); } @@ -343,11 +359,14 @@ fn require_grpc_protocol() -> Result<(), Box Vec<(&'static str, String)> { - vec![("destination", destination.to_string()), ("status", status.to_string())] + vec![("chronos.destination", destination.to_string()), ("chronos.consume.status", status.to_string())] } fn process_labels(returned: bool, status: &'static str) -> Vec<(&'static str, String)> { - vec![("returned", returned.to_string()), ("status", status.to_string())] + vec![ + ("chronos.processor.returned", returned.to_string()), + ("chronos.process.status", status.to_string()), + ] } fn metric_definition(id: MetricId) -> Option<&'static MetricDefinition> { @@ -376,6 +395,31 @@ fn labels_to_key_values(labels: MetricLabels<'_>) -> Vec { labels.iter().map(|(key, value)| KeyValue::new(*key, value.clone())).collect() } +fn prometheus_metric_name(name: &str) -> String { + normalize_prometheus_identifier(name, true) +} + +fn prometheus_label_names(names: &[&str]) -> Vec { + names.iter().map(|name| normalize_prometheus_identifier(name, false)).collect() +} + +fn normalize_prometheus_identifier(name: &str, allow_colon: bool) -> String { + let mut output = String::with_capacity(name.len()); + + for (index, character) in name.chars().enumerate() { + let is_allowed = character.is_ascii_alphanumeric() || character == '_' || (allow_colon && character == ':'); + let is_valid_first = character.is_ascii_alphabetic() || character == '_' || (allow_colon && character == ':'); + + if (index == 0 && !is_valid_first) || (index > 0 && !is_allowed) { + output.push('_'); + } else { + output.push(character); + } + } + + output +} + #[cfg(test)] mod tests { use super::*; @@ -401,23 +445,24 @@ mod tests { let output = metrics.render_prometheus().unwrap(); for definition in METRIC_DEFINITIONS { + let prometheus_name = 
prometheus_metric_name(definition.name); assert!( - output.contains(&format!("# HELP {PROMETHEUS_NAMESPACE}_{}", definition.prometheus_name)), + output.contains(&format!("# HELP {prometheus_name}")), "metric {} must be registered from generated definitions", - definition.prometheus_name + definition.name ); } } #[test] #[serial] - fn prometheus_metrics_use_chronos_namespace() { + fn prometheus_metrics_normalize_otel_names() { let metrics = prometheus_metrics(); metrics.observe_jitter(0.499); let output = metrics.render_prometheus().unwrap(); - assert!(output.contains("# HELP chronos_msg_jitter")); - assert!(!output.contains("# HELP msg_jitter")); + assert!(output.contains("# HELP chronos_message_jitter")); + assert!(!output.contains("# HELP chronos.message.jitter")); } #[test] @@ -427,7 +472,7 @@ mod tests { metrics.observe_jitter(0.499); let output = metrics.render_prometheus().unwrap(); - assert!(output.contains("chronos_msg_jitter_bucket{le=\"0.5\"} 1")); + assert!(output.contains("chronos_message_jitter_bucket{le=\"0.5\"} 1")); } #[test] @@ -448,7 +493,7 @@ mod tests { metrics.messages_reset(2); let output = metrics.render_prometheus().unwrap(); - assert!(output.contains("chronos_msg_reset 5")); + assert!(output.contains("chronos_message_reset 5")); } #[test] @@ -458,7 +503,7 @@ mod tests { metrics.observe_wait_time(1.5); let output = metrics.render_prometheus().unwrap(); - assert!(output.contains("chronos_msg_wait_time_count 1")); + assert!(output.contains("chronos_message_wait_duration_count 1")); } #[test] @@ -469,7 +514,7 @@ mod tests { metrics.observe_process_latency(0.01, false, "fail"); let output = metrics.render_prometheus().unwrap(); - assert!(output.contains("chronos_msg_consume_latency_count{destination=\"postgres\",status=\"pass\"} 1")); - assert!(output.contains("chronos_msg_process_latency_count{returned=\"false\",status=\"fail\"} 1")); + 
assert!(output.contains("chronos_message_consume_duration_count{chronos_consume_status=\"pass\",chronos_destination=\"postgres\"} 1")); + assert!(output.contains("chronos_message_process_duration_count{chronos_process_status=\"fail\",chronos_processor_returned=\"false\"} 1")); } } diff --git a/chronos_bin/src/metrics/spec.yaml b/chronos_bin/src/metrics/spec.yaml deleted file mode 100644 index 1e0728e..0000000 --- a/chronos_bin/src/metrics/spec.yaml +++ /dev/null @@ -1,107 +0,0 @@ -# Chronos metrics specification. -# -# This file is intentionally not wired into the runtime yet. It captures the -# current Prometheus metric surface from registry.rs while carrying the -# OpenTelemetry-style names needed by a future generated registry. -# -# Sources: -# - https://github.com/kindredgroup/chronos/issues/12 -# - examples/prom_otlp_mock.rs - -schema_version: 1 -service: chronos -stability: development -source_issue: https://github.com/kindredgroup/chronos/issues/12 -notes: - - >- - Metrics operations must stay outside the critical message path. Failures to - record metrics must not block consuming, storing, publishing, or deleting messages. - - >- - Chronos exposes Prometheus pull metrics today; OTLP names are included so the - same definitions can generate an OTLP backend later. - - >- - The message jitter histogram keeps an explicit 0.5 second bucket for the - 500ms scheduling SLA discussed in issue #12. - -label_sets: - consume_result: - labels: - - name: destination - otel_name: chronos.destination - description: Downstream selected by message_receiver::handle_message. - values: [kafka, postgres] - - name: status - otel_name: chronos.status - description: Whether the consume path completed successfully. 
- values: [pass, fail] - prewarm: - - [kafka, pass] - - [kafka, fail] - - [postgres, pass] - - [postgres, fail] - process_result: - labels: - - name: returned - otel_name: chronos.processor.returned - description: Whether the processor loop returned early because no rows were ready. - values: ["true", "false"] - - name: status - otel_name: chronos.status - description: Whether the processor loop completed successfully. - values: [pass, fail] - prewarm: - - ["true", pass] - - ["true", fail] - - ["false", pass] - - ["false", fail] - -metrics: - - id: msg_consume_latency - kind: histogram - prometheus_name: msg_consume_latency - otel_name: chronos.message.consume.duration - description: Duration of message_receiver::MessageReceiver::handle_message(). - unit: s - labels: consume_result - buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] - issue_12_signal: latency - - - id: msg_process_latency - kind: histogram - prometheus_name: msg_process_latency - otel_name: chronos.message.process.duration - description: Duration of message_processor::MessageProcessor::processor_message_ready(). - unit: s - labels: process_result - buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] - issue_12_signal: latency - - - id: msg_wait_time - kind: histogram - prometheus_name: msg_wait_time - otel_name: chronos.message.wait.duration - description: Time a message spent in the Kafka input queue before processing. - unit: s - labels: [] - buckets: [0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2] - issue_12_signal: latency - - - id: msg_jitter - kind: histogram - prometheus_name: msg_jitter - otel_name: chronos.message.jitter - description: Difference between actual publish time and client-requested deadline. 
- unit: s - labels: [] - buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0] - issue_12_signal: latency - sla_bucket_seconds: 0.5 - - - id: msg_reset - kind: counter - prometheus_name: msg_reset - otel_name: chronos.message.reset - description: Number of records reset by postgres::pg::Pg::reset_to_init_db() in the monitor task. - unit: "{message}" - labels: [] - issue_12_signal: errors diff --git a/chronos_bin/src/monitor.rs b/chronos_bin/src/monitor.rs index 30405a3..1825d93 100644 --- a/chronos_bin/src/monitor.rs +++ b/chronos_bin/src/monitor.rs @@ -26,7 +26,7 @@ impl FailureDetector { if !fetched_rows.is_empty() { match &self.data_store.reset_to_init_db(fetched_rows).await { Ok(reset_ids) => { - // msg_reset: count the number of messages reset by the monitor task. + // chronos.message.reset: count the number of messages reset by the monitor task. self.metrics.messages_reset(reset_ids.len() as u64); log::debug!("reset_to_init_db success for {:?}", fetched_rows) } diff --git a/dev/weaver/production/registry/chronos/metrics.yaml b/dev/weaver/production/registry/chronos/metrics.yaml index e2eb38d..0b37840 100644 --- a/dev/weaver/production/registry/chronos/metrics.yaml +++ b/dev/weaver/production/registry/chronos/metrics.yaml @@ -22,7 +22,7 @@ groups: stability: development brief: Attributes for Chronos input message handling outcomes. attributes: - - id: destination + - id: chronos.destination type: string stability: development brief: Downstream selected by message_receiver::handle_message. @@ -40,7 +40,7 @@ groups: stability: development brief: Attributes for Chronos ready-message processor loop outcomes. attributes: - - id: returned + - id: chronos.processor.returned type: string stability: development brief: Whether the processor loop returned early because no rows were ready. 
@@ -65,15 +65,13 @@ groups: code_generation: rust_name: msg_consume_latency metric_value_type: double - prometheus_name: msg_consume_latency - label_names: [destination, status] - prometheus_label_names: [destination, status] + temporality: cumulative buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] prewarm_label_values: - - [kafka, pass] - - [kafka, fail] - - [postgres, pass] - - [postgres, fail] + - [pass, kafka] + - [fail, kafka] + - [pass, postgres] + - [fail, postgres] - id: metric.chronos.message.process.duration type: metric @@ -87,15 +85,13 @@ groups: code_generation: rust_name: msg_process_latency metric_value_type: double - prometheus_name: msg_process_latency - label_names: [returned, status] - prometheus_label_names: [returned, status] + temporality: cumulative buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] prewarm_label_values: - - ["true", pass] - - ["true", fail] - - ["false", pass] - - ["false", fail] + - [pass, "true"] + - [fail, "true"] + - [pass, "false"] + - [fail, "false"] - id: metric.chronos.message.wait.duration type: metric @@ -108,7 +104,7 @@ groups: code_generation: rust_name: msg_wait_time metric_value_type: double - prometheus_name: msg_wait_time + temporality: cumulative buckets: [0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2] - id: metric.chronos.message.jitter @@ -122,7 +118,7 @@ groups: code_generation: rust_name: msg_jitter metric_value_type: double - prometheus_name: msg_jitter + temporality: cumulative buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0] - id: metric.chronos.message.reset @@ -136,4 +132,3 @@ groups: code_generation: rust_name: msg_reset metric_value_type: int - prometheus_name: msg_reset diff --git a/dev/weaver/production/templates/registry/markdown/weaver.yaml b/dev/weaver/production/templates/registry/markdown/weaver.yaml index 61b5edf..ae68a0e 100644 --- 
a/dev/weaver/production/templates/registry/markdown/weaver.yaml +++ b/dev/weaver/production/templates/registry/markdown/weaver.yaml @@ -6,11 +6,11 @@ templates: | map(select(.type == "metric")) | map({ metric_name, - prometheus_name: .annotations.code_generation.prometheus_name, + prometheus_name: (.metric_name | split(".") | join("_")), brief, instrument, unit, - attributes: (.annotations.code_generation.label_names // (.attributes // [] | map(.name // .id // .ref))) + attributes: (.attributes // [] | map(.name // .id // .ref)) })) } application_mode: single diff --git a/dev/weaver/production/templates/registry/rust/registry.rs.j2 b/dev/weaver/production/templates/registry/rust/registry.rs.j2 index c246744..4f667b3 100644 --- a/dev/weaver/production/templates/registry/rust/registry.rs.j2 +++ b/dev/weaver/production/templates/registry/rust/registry.rs.j2 @@ -24,16 +24,20 @@ impl MetricKind { } } +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MetricTemporality { + Cumulative, +} + #[derive(Clone, Copy, Debug)] pub struct MetricDefinition { pub id: MetricId, - pub otel_name: &'static str, - pub prometheus_name: &'static str, + pub name: &'static str, pub description: &'static str, pub unit: Option<&'static str>, pub label_names: &'static [&'static str], - pub prometheus_label_names: &'static [&'static str], pub kind: MetricKind, + pub temporality: Option, pub buckets: Option<&'static [f64]>, pub prewarm_label_values: &'static [&'static [&'static str]], } @@ -42,13 +46,12 @@ pub const METRIC_DEFINITIONS: &[MetricDefinition] = &[ {%- for metric in ctx.metrics %} MetricDefinition { id: MetricId::{{ metric.rust_name | pascal_case }}, - otel_name: "{{ metric.metric_name }}", - prometheus_name: "{{ metric.prometheus_name }}", + name: "{{ metric.metric_name }}", description: "{{ metric.brief }}", unit: {% if metric.unit %}Some("{{ metric.unit }}"){% else %}None{% endif %}, label_names: &[{% for attribute in metric.attributes %}"{{ attribute }}"{% if not loop.last 
%}, {% endif %}{% endfor %}], - prometheus_label_names: &[{% for label in metric.prometheus_labels %}"{{ label }}"{% if not loop.last %}, {% endif %}{% endfor %}], kind: MetricKind::{{ metric.instrument | pascal_case }}, + temporality: {% if metric.temporality == "cumulative" %}Some(MetricTemporality::Cumulative){% else %}None{% endif %}, buckets: {% if metric.buckets %}{% if metric.buckets | length > 10 %}Some(&[ {{ metric.buckets | join(", ") }}, ]){% else %}Some(&[{{ metric.buckets | join(", ") }}]){% endif %}{% else %}None{% endif %}, diff --git a/dev/weaver/production/templates/registry/rust/weaver.yaml b/dev/weaver/production/templates/registry/rust/weaver.yaml index ed7f9fb..829701e 100644 --- a/dev/weaver/production/templates/registry/rust/weaver.yaml +++ b/dev/weaver/production/templates/registry/rust/weaver.yaml @@ -8,12 +8,11 @@ templates: id, metric_name, rust_name: .annotations.code_generation.rust_name, - prometheus_name: .annotations.code_generation.prometheus_name, brief, instrument, unit, - attributes: (.annotations.code_generation.label_names // (.attributes // [] | map(.name // .id // .ref))), - prometheus_labels: (.annotations.code_generation.prometheus_label_names // (.attributes // [] | map((.name // .id // .ref) | gsub("\\."; "_")))), + attributes: (.attributes // [] | map(.name // .id // .ref)), + temporality: .annotations.code_generation.temporality, buckets: .annotations.code_generation.buckets, prewarm_label_values: (.annotations.code_generation.prewarm_label_values // []) })) diff --git a/docs/chronos_metrics.md b/docs/chronos_metrics.md index 1fa0dd6..8a5d111 100644 --- a/docs/chronos_metrics.md +++ b/docs/chronos_metrics.md @@ -4,8 +4,8 @@ Generated from `dev/weaver/production/registry/chronos/metrics.yaml` by OpenTele | Metric | Prometheus Name | Instrument | Unit | Attributes | Description | | --- | --- | --- | --- | --- | --- | -| `chronos.message.consume.duration` | `msg_consume_latency` | `histogram` | `s` | `destination`, 
`status` | Duration of handle_message() in message_receiver. | -| `chronos.message.jitter` | `msg_jitter` | `histogram` | `s` | - | Difference between actual publish time and client-requested deadline. | -| `chronos.message.process.duration` | `msg_process_latency` | `histogram` | `s` | `returned`, `status` | Duration of processor_message_ready() loop in message_processor. | -| `chronos.message.reset` | `msg_reset` | `counter` | `{message}` | - | Number of records reset by reset_to_init_db() in the monitor task. | -| `chronos.message.wait.duration` | `msg_wait_time` | `histogram` | `s` | - | Time a message spent in the Kafka input queue before processing. | +| `chronos.message.consume.duration` | `chronos_message_consume_duration` | `histogram` | `s` | `chronos.consume.status`, `chronos.destination` | Duration of handle_message() in message_receiver. | +| `chronos.message.jitter` | `chronos_message_jitter` | `histogram` | `s` | - | Difference between actual publish time and client-requested deadline. | +| `chronos.message.process.duration` | `chronos_message_process_duration` | `histogram` | `s` | `chronos.process.status`, `chronos.processor.returned` | Duration of processor_message_ready() loop in message_processor. | +| `chronos.message.reset` | `chronos_message_reset` | `counter` | `{message}` | - | Number of records reset by reset_to_init_db() in the monitor task. | +| `chronos.message.wait.duration` | `chronos_message_wait_duration` | `histogram` | `s` | - | Time a message spent in the Kafka input queue before processing. 
| From f3b3ecf0b6ec6318b0c8f60f54af658aa042191c Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 16:44:15 +1000 Subject: [PATCH 31/36] feat(dashboard): add chronos dashboard v1 --- dev/dashboards/chronos.json | 948 ++++++++++++++++++++++++++++++++ dev/docker-compose/compose.yaml | 2 + docs/weaver-metrics-proposal.md | 63 --- 3 files changed, 950 insertions(+), 63 deletions(-) create mode 100644 dev/dashboards/chronos.json delete mode 100644 docs/weaver-metrics-proposal.md diff --git a/dev/dashboards/chronos.json b/dev/dashboards/chronos.json new file mode 100644 index 0000000..5f6c9c4 --- /dev/null +++ b/dev/dashboards/chronos.json @@ -0,0 +1,948 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "A dashboard for monitoring Chronos\nhttps://github.com/kindredgroup/chronos", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(increase(chronos_message_reset_total{}[$__range]))", + "legendFormat": "__auto", + 
"range": true, + "refId": "A" + } + ], + "title": "msgs reset", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Over time range", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "msg/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "value_and_name", + "wideLayout": true + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(\n increase(\n chronos_message_consume_duration_seconds_count[$__range]\n )\n) by (status, destination)", + "legendFormat": "{{ destination }}:{{ status }}", + "range": true, + "refId": "A" + } + ], + "title": "Messages consumed", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 0.5, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "dashed" 
+ } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(chronos_message_jitter_seconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "p99 message jitter", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "msg/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true 
+ }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(\n rate(\n chronos_message_consume_duration_seconds_count[$__rate_interval]\n )\n) by (status, destination)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Messages consumed p/s", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "p99 time spent in queue", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.99,\n sum(\n rate(\n chronos_message_wait_duration_seconds_bucket[$__rate_interval]\n )\n ) by (le)\n)", + "legendFormat": "__auto", + "range": true, + 
"refId": "A" + } + ], + "title": "Messages wait time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The % of messages routed to either Postgres or Kafka", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(\n rate(\n chronos_message_consume_duration_seconds_count[$__rate_interval]\n )\n) by (status, destination) / on() group_left() sum(\n rate(\n chronos_message_consume_duration_seconds_count[$__rate_interval]\n )\n) by (status)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Consumed msg destination %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The percentage of time spent running the 
consumption or processing of messages", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n rate(\n chronos_message_process_duration_seconds_sum[$__rate_interval]\n )\n)", + "instant": false, + "legendFormat": "processing", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n rate(\n chronos_message_consume_duration_seconds_sum[$__rate_interval]\n )\n)", + "instant": false, + "legendFormat": "consumption", + "range": true, + "refId": "C" + } + ], + "title": "Running time p/s", + "transformations": [ + { + "id": "calculateField", + "options": {} + } + ], + "type": "timeseries" + 
}, + { + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "description": "Use the TRACEID variables to see a trace", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "footer": { + "reducers": [] + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 17, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 10, + "options": { + "cellHeight": "sm", + "showHeader": true + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "limit": 20, + "metricsQueryType": "range", + "query": "{resource.service.name=\"chronos\"}", + "queryType": "traceql", + "refId": "A", + "serviceMapUseNativeHistograms": false, + "spss": 1, + "tableType": "traces" + } + ], + "title": "Traces", + "transformations": [ + { + "id": "calculateField", + "options": {} + } + ], + "type": "table" + }, + { + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "description": "Use the TRACEID variables to see a trace", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 17, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 9, + "options": { + "spanFilters": { + "adhocFilters": [], + "criticalPathOnly": false, + "matchesOnly": false + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "limit": 20, + "metricsQueryType": "range", + "query": "${TRACEID}", + "queryType": "traceql", + "refId": "A", + "serviceMapUseNativeHistograms": false, + "tableType": "traces" + } + ], + "title": "Traces", + "transformations": [ + { + "id": "calculateField", + "options": {} + } + ], + "type": "traces" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + 
"fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": false, + "enableLogDetails": true, + "showControls": false, + "showTime": false, + "sortOrder": "Descending", + "syntaxHighlighting": true, + "unwrappedColumns": false, + "wrapLogMessage": true + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "direction": "backward", + "editorMode": "code", + "expr": "{service_name=\"chronos\"} | log_file_name=\"chronos.jsonl\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "logs", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 24, + "x": 0, + "y": 57 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": 
"12.4.2", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "direction": "backward", + "editorMode": "code", + "expr": "sum(count_over_time({service_name=\"chronos\"} | log_file_name=\"chronos.jsonl\"[$__auto])) by (detected_level)", + "queryType": "range", + "refId": "A" + } + ], + "title": "logs", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "5s", + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "", + "value": "" + }, + "description": "A trace ID to lookup", + "label": "TRACEID", + "name": "TRACEID", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "type": "textbox" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Chronos", + "uid": "gk65ns", + "version": 3, + "weekStart": "" +} \ No newline at end of file diff --git a/dev/docker-compose/compose.yaml b/dev/docker-compose/compose.yaml index 122aefa..aae8b4a 100644 --- a/dev/docker-compose/compose.yaml +++ b/dev/docker-compose/compose.yaml @@ -41,8 +41,10 @@ services: PG_DATABASE: chronos_db PG_POOL_SIZE: "10" RUST_LOG: info + OTEL_METRICS_EXPORTER: "prometheus" OTEL_EXPORTER_PROMETHEUS_HOST: "0.0.0.0" OTEL_EXPORTER_PROMETHEUS_PORT: "9091" + OTEL_METRIC_EXPORT_INTERVAL: "1500" MONITOR_DB_POLL: "5" PROCESSOR_DB_POLL: "5" TIMING_ADVANCE: "0" diff --git a/docs/weaver-metrics-proposal.md b/docs/weaver-metrics-proposal.md deleted file mode 100644 index 4d5524f..0000000 --- a/docs/weaver-metrics-proposal.md +++ /dev/null @@ -1,63 +0,0 @@ -# Managing Chronos Metrics with OpenTelemetry Weaver - -This proposal keeps the model from `examples/prom_otlp_mock.rs`: metric definitions live once, then the Prometheus and OTLP backends register instruments from that shared definition set. Weaver becomes the source of truth for the shared definition set. 
- -## Example Spec - -The example registry is in `examples/weaver/registry/chronos/metrics.yaml`. It defines Chronos metrics using OpenTelemetry-style names: - -| OpenTelemetry metric | Prometheus output name | Instrument | -| --- | --- | --- | -| `messaging.client.consumed.messages` | `messaging_client_consumed_messages_total` | counter | -| `messaging.client.operation.duration` | `messaging_client_operation_duration_seconds` | histogram | -| `messaging.process.duration` | `messaging_process_duration_seconds` | histogram | -| `chronos.message.wait.duration` | `chronos_message_wait_duration_seconds` | histogram | -| `chronos.message.jitter` | `chronos_message_jitter_seconds` | histogram | -| `chronos.message.reset` | `chronos_messages_reset_total` | counter | - -The checked-in generated example is `examples/weaver/generated/chronos_metric_definitions.rs`. It mirrors the `MetricDefinition` table in `examples/prom_otlp_mock.rs`, with both `otel_name` and `prometheus_name` so each exporter can use the native naming convention it expects. The messaging metrics and attributes use OpenTelemetry semantic convention names; Chronos-specific timing and recovery metrics remain under the `chronos.*` namespace. 
- -## Suggested Workflow - -Pin Weaver to the version used by the branch and make the generated file reproducible: - -```sh -WEAVER_VERSION=0.23.0 -docker run --rm \ - -v "$(pwd):/work" \ - -w /work \ - "otel/weaver:v${WEAVER_VERSION}" \ - registry check -r examples/weaver/registry -docker run --rm \ - -v "$(pwd):/work" \ - -w /work \ - "otel/weaver:v${WEAVER_VERSION}" \ - registry generate \ - -r examples/weaver/registry \ - --templates examples/weaver/templates \ - rust chronos_bin/src/metrics/generated -rustfmt chronos_bin/src/metrics/generated/chronos_metric_definitions.rs -``` - -The repository now has Make targets for the main Weaver workflows: - -```sh -make weaver.check -make weaver.generate -make weaver.generate.rust -make weaver.generate.markdown -make weaver.generate.json-schema -make weaver.live-check -``` - -`make weaver.live-check` starts Weaver's OTLP live-check receiver with Docker, runs the mock with `OTEL_METRICS_EXPORTER=otlp`, and writes the report to `/tmp/chronos-weaver-live-check/live_check.json`. - -## Implementation Path - -1. Keep the current Prometheus registry working while introducing generated definitions behind a small module such as `chronos_bin/src/metrics/generated/definitions.rs`. -2. Replace the hand-written metric creation in `chronos_bin/src/metrics/registry.rs` with a loop over generated `METRIC_DEFINITIONS`, following the backend loop already sketched in `examples/prom_otlp_mock.rs`. -3. Preserve compatibility temporarily by either exporting the current `msg_*` Prometheus names or by dual-registering old and new names for one release. The example spec prefers OpenTelemetry names and Prometheus-conventional rendered names. -4. Use generated attribute constants for label names so call sites record attributes by typed identifiers instead of string literals. -5. After the generated table is in use, add a test that gathers the registry and asserts every generated `prometheus_name` appears in the text output. 
- -Weaver can generate all of the static definition layer: metric IDs, names, descriptions, units, label names, bucket boundaries, and eventually attribute constants. Runtime behavior should remain hand-written because it contains Chronos-specific decisions: which events record which metric, pre-warming label combinations, exporter selection, and shutdown behavior. From 4f38abe3ed60a6959551f67a2b3bb8735b13378e Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Fri, 1 May 2026 20:33:47 +1000 Subject: [PATCH 32/36] feat(lgtm): add infrastructure exporters Add cAdvisor, postgres_exporter, KMinion, and sql_exporter to the LGTM compose overlay, with health checks and dependency ordering. Configure Prometheus scrape jobs for those exporters and add sql_exporter configuration for the chronos_rows hanger table row-count metric. Limit the local Chronos compose container to 2 CPUs and 2 GiB of memory, and limit k6 runner containers launched from Make targets to 1 CPU and 1 GiB of memory. Document the new local LGTM exporters and limits in How-to.md. 
Verification: - docker compose --project-name chronos -f dev/docker-compose/compose.yaml -f dev/docker-compose/lgtm.yaml config - make lgtm.validate - docker run --rm -v /home/ah34/work/opensource/chronos/dev/lgtm/sql_exporter.yaml:/etc/sql_exporter/sql_exporter.yaml:ro burningalchemist/sql_exporter:0.18.3 --config.file=/etc/sql_exporter/sql_exporter.yaml --config.check - docker compose --project-name chronos -f dev/docker-compose/compose.yaml -f dev/docker-compose/lgtm.yaml up -d --build postgres kafka chronos-pg-migrations postgres-exporter sql-exporter kminion cadvisor lgtm - docker exec lgtm curl -sf 'http://127.0.0.1:9090/api/v1/query?query=chronos_rows' - docker compose --project-name chronos -f dev/docker-compose/compose.yaml -f dev/docker-compose/lgtm.yaml down - make pre-commit Model-version: GPT-5 --- How-to.md | 4 ++ dev/docker-compose/compose.yaml | 2 + dev/docker-compose/lgtm.yaml | 93 +++++++++++++++++++++++++++++++++ dev/lgtm/kminion.yaml | 25 +++++++++ dev/lgtm/prometheus.yaml | 16 ++++++ dev/lgtm/sql_exporter.yaml | 24 +++++++++ dev/makefiles/k6.mk | 2 +- 7 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 dev/lgtm/kminion.yaml create mode 100644 dev/lgtm/sql_exporter.yaml diff --git a/How-to.md b/How-to.md index 548dd43..99c8470 100644 --- a/How-to.md +++ b/How-to.md @@ -79,6 +79,10 @@ make up lgtm The overlay mounts local override files from `dev/lgtm` for Prometheus, the OpenTelemetry Collector, and Grafana dashboard provisioning. Chronos exposes its Prometheus metrics endpoint with `OTEL_EXPORTER_PROMETHEUS_HOST` and `OTEL_EXPORTER_PROMETHEUS_PORT`; when run from Docker Compose the endpoint is `chronos:9091`. +The LGTM overlay also starts local infrastructure exporters for container, PostgreSQL, Kafka, and SQL-derived database metrics. 
Prometheus scrapes cAdvisor, postgres_exporter, KMinion, and sql_exporter from `dev/lgtm/prometheus.yaml`; the SQL exporter emits `chronos_rows`, the current row count of the Chronos `hanger` table. The exporter-specific configuration lives in `dev/lgtm/kminion.yaml` and `dev/lgtm/sql_exporter.yaml`. + +The local Compose stack limits the Chronos container to 2 CPUs and 2 GiB of memory. k6 runner containers launched by `make k6.contract` and `make k6.load` are limited to 1 CPU and 1 GiB of memory. + Chronos production metrics are generated from the OpenTelemetry Weaver registry in `dev/weaver/production/registry/chronos/metrics.yaml`. Rust definitions are generated into `chronos_bin/src/metrics/generated`, Markdown docs into `docs/chronos_metrics.md`, and the resolved registry schema into `docs/schema/resolved-registry.schema.json`. `OTEL_METRICS_EXPORTER=prometheus` is the default and exposes `/metrics` with the `chronos_` Prometheus namespace, for example `chronos_msg_jitter`. `OTEL_METRICS_EXPORTER=otlp` records the same generated metric IDs through the OTLP gRPC metrics exporter. `make build` runs `make weaver.generate WEAVER_TARGET=production` before compiling, which refreshes the production Rust definitions, Markdown metric docs, and resolved registry JSON schema. `WEAVER_TARGET` defaults to `production`; generate example Weaver artifacts explicitly with `make weaver.generate WEAVER_TARGET=example`. diff --git a/dev/docker-compose/compose.yaml b/dev/docker-compose/compose.yaml index aae8b4a..682dfcc 100644 --- a/dev/docker-compose/compose.yaml +++ b/dev/docker-compose/compose.yaml @@ -23,6 +23,8 @@ services: build: context: ../.. 
dockerfile: docker/Dockerfile.chronos + cpus: 2 + mem_limit: 2g ports: - "9091:9091" environment: diff --git a/dev/docker-compose/lgtm.yaml b/dev/docker-compose/lgtm.yaml index f317bb4..88ac73e 100644 --- a/dev/docker-compose/lgtm.yaml +++ b/dev/docker-compose/lgtm.yaml @@ -41,6 +41,15 @@ services: timeout: 10s retries: 5 start_period: 30s + depends_on: + cadvisor: + condition: service_healthy + postgres-exporter: + condition: service_healthy + kminion: + condition: service_healthy + sql-exporter: + condition: service_healthy volumes: - ../lgtm/prometheus.yaml:/otel-lgtm/prometheus.yaml:ro - ../lgtm/otelcol-contrib.yaml:/otel-lgtm/otelcol-config.yaml:ro @@ -51,3 +60,87 @@ services: - ../dashboards:/otel-lgtm/grafana/conf/provisioning/dashboards/chronos:ro networks: - chronos + + cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.52.1 + container_name: cadvisor + command: + - --docker_only=true + - --housekeeping_interval=10s + - --store_container_labels=false + privileged: true + devices: + - /dev/kmsg:/dev/kmsg + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker:/var/lib/docker:ro + - /dev/disk:/dev/disk:ro + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://127.0.0.1:8080/healthz >/dev/null"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 15s + networks: + - chronos + + postgres-exporter: + image: quay.io/prometheuscommunity/postgres-exporter:v0.19.1 + container_name: postgres-exporter + environment: + DATA_SOURCE_URI: postgres:5432/chronos_db?sslmode=disable + DATA_SOURCE_USER: admin + DATA_SOURCE_PASS: admin + depends_on: + postgres: + condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://127.0.0.1:9187/metrics >/dev/null"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 15s + networks: + - chronos + + kminion: + image: redpandadata/kminion:v2.2.14 + container_name: kminion + environment: + CONFIG_FILEPATH: /etc/kminion/kminion.yaml + depends_on: + kafka: + condition: 
service_healthy + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://127.0.0.1:8080/metrics >/dev/null"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 30s + volumes: + - ../lgtm/kminion.yaml:/etc/kminion/kminion.yaml:ro + networks: + - chronos + + sql-exporter: + image: burningalchemist/sql_exporter:0.18.3 + container_name: sql-exporter + command: + - --config.file=/etc/sql_exporter/sql_exporter.yaml + depends_on: + postgres: + condition: service_healthy + chronos-pg-migrations: + condition: service_completed_successfully + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://127.0.0.1:9399/metrics >/dev/null"] + interval: 15s + timeout: 5s + retries: 10 + start_period: 15s + volumes: + - ../lgtm/sql_exporter.yaml:/etc/sql_exporter/sql_exporter.yaml:ro + networks: + - chronos diff --git a/dev/lgtm/kminion.yaml b/dev/lgtm/kminion.yaml new file mode 100644 index 0000000..bc293ee --- /dev/null +++ b/dev/lgtm/kminion.yaml @@ -0,0 +1,25 @@ +logger: + level: info + +kafka: + brokers: + - kafka:9092 + clientId: chronos-kminion + +minion: + consumerGroups: + enabled: true + scrapeMode: adminApi + granularity: topic + topics: + enabled: true + granularity: topic + logDirs: + enabled: true + endToEnd: + enabled: false + +exporter: + namespace: kminion + host: "" + port: 8080 diff --git a/dev/lgtm/prometheus.yaml b/dev/lgtm/prometheus.yaml index 16b2260..683294c 100644 --- a/dev/lgtm/prometheus.yaml +++ b/dev/lgtm/prometheus.yaml @@ -43,6 +43,22 @@ scrape_configs: static_configs: - targets: ["chronos:9091"] + - job_name: cadvisor + static_configs: + - targets: ["cadvisor:8080"] + + - job_name: postgres-exporter + static_configs: + - targets: ["postgres-exporter:9187"] + + - job_name: kminion + static_configs: + - targets: ["kminion:8080"] + + - job_name: sql-exporter + static_configs: + - targets: ["sql-exporter:9399"] + - job_name: grafana static_configs: - targets: ["127.0.0.1:3000"] diff --git a/dev/lgtm/sql_exporter.yaml b/dev/lgtm/sql_exporter.yaml new 
file mode 100644 index 0000000..ba77c2c --- /dev/null +++ b/dev/lgtm/sql_exporter.yaml @@ -0,0 +1,24 @@ +global: + scrape_timeout_offset: 500ms + min_interval: 15s + max_connections: 2 + max_idle_connections: 2 + +target: + name: chronos_db + data_source_name: "postgresql://admin:admin@postgres:5432/chronos_db?sslmode=disable" + collectors: + - chronos + enable_ping: true + +collectors: + - collector_name: chronos + metrics: + - metric_name: chronos_rows + type: gauge + help: "Number of rows stored in the Chronos hanger table." + values: + - rows + query: | + SELECT count(*)::double precision AS rows + FROM hanger; diff --git a/dev/makefiles/k6.mk b/dev/makefiles/k6.mk index ad639a3..e549cba 100644 --- a/dev/makefiles/k6.mk +++ b/dev/makefiles/k6.mk @@ -21,7 +21,7 @@ K6_COMMON_ENV := \ -e K6_OTEL_GRPC_EXPORTER_INSECURE=$${K6_OTEL_GRPC_EXPORTER_INSECURE:-true} \ -e K6_OTEL_GRPC_EXPORTER_ENDPOINT=$(K6_OTEL_GRPC_EXPORTER_ENDPOINT) \ -e K6_RUN_ID=$(K6_RUN_ID) -K6_DOCKER_RUN := docker run --rm --network $(K6_DOCKER_NETWORK) --add-host=host.docker.internal:host-gateway -v "$(PWD)/dev/k6:/scripts:ro" -v "$(K6_LOG_DIR):/data/lgtm/logs" $(K6_COMMON_ENV) +K6_DOCKER_RUN := docker run --rm --cpus 1 --memory 1g --network $(K6_DOCKER_NETWORK) --add-host=host.docker.internal:host-gateway -v "$(PWD)/dev/k6:/scripts:ro" -v "$(K6_LOG_DIR):/data/lgtm/logs" $(K6_COMMON_ENV) ## k6.build: Build the custom k6 image with xk6-kafka k6.build: From 459494582482b749f3f4c526bc840b01ad5626bf Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Sat, 2 May 2026 01:20:11 +1000 Subject: [PATCH 33/36] chore(make): harden smoke-test targets Smoke test every Make recipe with a 10 second timeout and fix immediate target-level failures. Set the default Make goal to help, make withenv deterministic without RECIPE, load .env for local app/database recipes, and let dev.run fall back to a single cargo run when cargo-watch is unavailable. 
Repair supporting targets found by the smoke pass: add the missing coverage-report script with a cargo-llvm-cov path and raw LLVM coverage fallback, correct the example OTLP endpoint wiring, fix the act artifact server address, use the repository's master branch in .github/config.json, and reference Trivy actions with the v-prefixed tag. Verification: - timeout 10s make ; results recorded under /tmp/chronos-make-smoke and /tmp/chronos-make-smoke-escalated - timeout -k 2s 10s make withenv - timeout -k 2s 10s make repo.config.apply - timeout -k 2s 10s make act.scan - timeout -k 2s 10s make act.sbom - make lgtm.validate - make pre-commit Model-version: GPT-5 --- .env.example | 4 ++-- .github/config.json | 2 +- .github/workflows/sbom.yml | 4 ++-- .github/workflows/scan.yml | 2 +- Makefile | 1 + dev/makefiles/act.mk | 3 ++- dev/makefiles/dev.mk | 27 ++++++++++++++++------ examples/chronos_ex/examples/chronos_ex.rs | 4 ++-- scripts/coverage-report.sh | 17 ++++++++++++++ 9 files changed, 48 insertions(+), 16 deletions(-) create mode 100755 scripts/coverage-report.sh diff --git a/.env.example b/.env.example index 68def42..a8539c5 100644 --- a/.env.example +++ b/.env.example @@ -1,11 +1,11 @@ # WHEN DEVELOPING LOCALLY, WE NEED TO ACCESS THE HOST NETWORK FROM K8S (FOR POSTGRES/KAFKA/ELASTIC/ETC) -LOCAL_HOST_IP=${LOCAL_HOST_IP:-$(hostname -I 2>/dev/null | awk '{print $1}')} +LOCAL_HOST_IP=127.0.0.1 # RUST version RUST_VERSION=stable # KAFKA -KAFKA_HOST="localhost\,$LOCAL_HOST_IP" +KAFKA_HOST="localhost,$LOCAL_HOST_IP" KAFKA_PORT="9094" KAFKA_CLIENT_ID="chronos" KAFKA_GROUP_ID="chronos" diff --git a/.github/config.json b/.github/config.json index 1a17ae0..93f5fa1 100644 --- a/.github/config.json +++ b/.github/config.json @@ -14,7 +14,7 @@ "can_approve_pull_request_reviews": false }, "branches": { - "main": { + "master": { "protection": { "required_status_checks": { "strict": true, diff --git a/.github/workflows/sbom.yml b/.github/workflows/sbom.yml index 537d954..e11b530 
100644 --- a/.github/workflows/sbom.yml +++ b/.github/workflows/sbom.yml @@ -48,7 +48,7 @@ jobs: - name: Generate container SBOM if: inputs.target-type == 'container' - uses: aquasecurity/trivy-action@0.32.0 + uses: aquasecurity/trivy-action@v0.32.0 with: scan-type: image scan-ref: ${{ inputs.target-ref }} @@ -57,7 +57,7 @@ jobs: - name: Generate release SBOM if: inputs.target-type == 'release' - uses: aquasecurity/trivy-action@0.32.0 + uses: aquasecurity/trivy-action@v0.32.0 with: scan-type: fs scan-ref: ${{ inputs.target-ref }} diff --git a/.github/workflows/scan.yml b/.github/workflows/scan.yml index 440395a..640948b 100644 --- a/.github/workflows/scan.yml +++ b/.github/workflows/scan.yml @@ -31,7 +31,7 @@ jobs: run: cargo build --release -p chronos_bin - name: Scan Rust build output - uses: aquasecurity/trivy-action@0.32.0 + uses: aquasecurity/trivy-action@v0.32.0 with: scan-type: fs scan-ref: target/release/chronos diff --git a/Makefile b/Makefile index d9e74e6..8a045df 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ SHELL := /usr/bin/env bash +.DEFAULT_GOAL := help MAKEFILES_DIR := dev/makefiles COMMON_MAKEFILE := $(MAKEFILES_DIR)/common.mk diff --git a/dev/makefiles/act.mk b/dev/makefiles/act.mk index 2bd845e..12211ba 100644 --- a/dev/makefiles/act.mk +++ b/dev/makefiles/act.mk @@ -3,7 +3,8 @@ ACT_JOB ?= pre-commit ACT_RUNNER_IMAGE ?= catthehacker/ubuntu:act-latest ACT_ARTIFACT_DIR ?= /tmp/chronos-act-artifacts ACT_EVENT_DIR ?= /tmp/chronos-act-events -ACT_FLAGS ?= -P ubuntu-latest=$(ACT_RUNNER_IMAGE) --artifact-server-path $(ACT_ARTIFACT_DIR) +ACT_ARTIFACT_ADDR ?= 127.0.0.1 +ACT_FLAGS ?= -P ubuntu-latest=$(ACT_RUNNER_IMAGE) --artifact-server-path $(ACT_ARTIFACT_DIR) --artifact-server-addr $(ACT_ARTIFACT_ADDR) CI_WORKFLOW ?= .github/workflows/CI.yaml PRE_COMMIT_WORKFLOW ?= .github/workflows/pre-commit.yml diff --git a/dev/makefiles/dev.mk b/dev/makefiles/dev.mk index 7d730fd..6800e7a 100644 --- a/dev/makefiles/dev.mk +++ b/dev/makefiles/dev.mk @@ -1,3 
+1,5 @@ +RECIPE ?= help + ## setup: Check local development dependencies and prepare .env setup: $(call pp,checking development dependencies...) @@ -14,7 +16,7 @@ setup: ## withenv: Run a make recipe with variables loaded from .env, for example make withenv RECIPE=run withenv: test -e .env || cp .env.example .env - bash -c 'set -o allexport; source .env; set +o allexport; make "$$RECIPE"' + bash -c 'set -o allexport; source .env; set +o allexport; make "$(RECIPE)"' ## dev.init: Initialize local dev environment dev.init: setup @@ -23,31 +25,42 @@ dev.init: setup dev.chronos_ex: $(call pp,creating kafka topic...) - cargo run --example chronos_ex + test -e .env || cp .env.example .env + bash -c 'set -o allexport; source .env; set +o allexport; cargo run --example chronos_ex' ## pg.create: Create database pg.create: $(call pp,creating database...) - cargo run --example pg_create_database + test -e .env || cp .env.example .env + bash -c 'set -o allexport; source .env; set +o allexport; cargo run --example pg_create_database' ## pg.migrate: Run migrations on database pg.migrate: $(call pp,running migrations on database...) - cargo run --package pg_mig --bin chronos-pg-migrations + test -e .env || cp .env.example .env + bash -c 'set -o allexport; source .env; set +o allexport; cargo run --package pg_mig --bin chronos-pg-migrations' ## run: Run Chronos locally run: $(call pp,run app...) - cargo run --package chronos_bin --bin chronos + test -e .env || cp .env.example .env + bash -c 'set -o allexport; source .env; set +o allexport; cargo run --package chronos_bin --bin chronos' ## run.release: Run Chronos locally in release mode run.release: $(call pp,run app...) - cargo run --package chronos_bin -r --bin chronos + test -e .env || cp .env.example .env + bash -c 'set -o allexport; source .env; set +o allexport; cargo run --package chronos_bin -r --bin chronos' ## dev.run: Run Chronos in cargo-watch mode dev.run: $(call pp,run app...) 
- cargo watch -q -c -x 'run --package chronos_bin --bin chronos' + test -e .env || cp .env.example .env + @if cargo watch --version >/dev/null 2>&1; then \ + bash -c 'set -o allexport; source .env; set +o allexport; cargo watch -q -c -x "run --package chronos_bin --bin chronos"'; \ + else \ + printf 'cargo-watch not installed; falling back to one cargo run invocation.\n' >&2; \ + bash -c 'set -o allexport; source .env; set +o allexport; cargo run --package chronos_bin --bin chronos'; \ + fi .PHONY: setup withenv dev.init dev.chronos_ex pg.create pg.migrate run run.release dev.run diff --git a/examples/chronos_ex/examples/chronos_ex.rs b/examples/chronos_ex/examples/chronos_ex.rs index 4afd691..3b0ffdf 100644 --- a/examples/chronos_ex/examples/chronos_ex.rs +++ b/examples/chronos_ex/examples/chronos_ex.rs @@ -31,7 +31,7 @@ fn init_tracer() -> Result { if service_name.is_err() { std::env::set_var("OTEL_SERVICE_NAME", "chronos"); } - if trace_exporter.is_ok() { + if let Ok(trace_exporter) = trace_exporter { global::set_text_map_propagator(TraceContextPropagator::new()); let os_resource = OsResourceDetector.detect(Duration::from_secs(0)); let process_resource = ProcessResourceDetector.detect(Duration::from_secs(0)); @@ -40,7 +40,7 @@ fn init_tracer() -> Result { let telemetry_resource = TelemetryResourceDetector.detect(Duration::from_secs(0)); opentelemetry_otlp::new_pipeline() .tracing() - .with_exporter(opentelemetry_otlp::new_exporter().http().with_endpoint(format!("{:?}", service_name))) + .with_exporter(opentelemetry_otlp::new_exporter().http().with_endpoint(trace_exporter)) .with_trace_config( sdktrace::config().with_resource( os_resource diff --git a/scripts/coverage-report.sh b/scripts/coverage-report.sh new file mode 100755 index 0000000..7d08e1e --- /dev/null +++ b/scripts/coverage-report.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env sh + +set -eu + +if ! 
cargo llvm-cov --version >/dev/null 2>&1; then + printf 'cargo-llvm-cov not installed; writing raw LLVM coverage profiles under target/coverage.\n' >&2 + coverage_dir="$(pwd)/target/coverage" + mkdir -p "${coverage_dir}" + CARGO_INCREMENTAL=0 \ + CARGO_HUSKY_DONT_INSTALL_HOOKS=true \ + RUSTFLAGS="${RUSTFLAGS:-} -Cinstrument-coverage" \ + LLVM_PROFILE_FILE="${coverage_dir}/chronos-%p-%m.profraw" \ + cargo test + exit 0 +fi + +cargo llvm-cov --workspace --all-targets From 73e1ddb05baa3ca37355ab62f961f0cf9886dbea Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Sat, 2 May 2026 01:39:53 +1000 Subject: [PATCH 34/36] ci: add Weaver live-check workflow Add a reusable GitHub Actions workflow that installs the pinned Rust toolchain and system dependencies, then runs make weaver.live-check. Include the workflow in the top-level CI fan-out so live Weaver validation is part of CI tests. Update the OTLP metrics mock to use the production ChronosMetrics facade instead of a stale standalone metrics list. This keeps live-check samples aligned with the production Weaver registry and removes duplicate mock metric definitions. 
Verification: - make weaver.live-check - make weaver.check - rg '"highest_advice_level": "violation"|"level": "violation"' /tmp/chronos-weaver-live-check/live_check.json || true - make pre-commit Model-version: GPT-5 --- .github/workflows/CI.yaml | 3 + .github/workflows/weaver-live-check.yml | 30 ++ Cargo.lock | 5 +- examples/prom_otlp_mock.rs | 358 ++-------------------- examples/prom_otlp_mock_runner/Cargo.toml | 5 +- 5 files changed, 64 insertions(+), 337 deletions(-) create mode 100644 .github/workflows/weaver-live-check.yml diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml index 21ce7d2..f8b5168 100644 --- a/.github/workflows/CI.yaml +++ b/.github/workflows/CI.yaml @@ -20,6 +20,9 @@ jobs: test: uses: ./.github/workflows/test.yml + weaver-live-check: + uses: ./.github/workflows/weaver-live-check.yml + scan: uses: ./.github/workflows/scan.yml diff --git a/.github/workflows/weaver-live-check.yml b/.github/workflows/weaver-live-check.yml new file mode 100644 index 0000000..f04a864 --- /dev/null +++ b/.github/workflows/weaver-live-check.yml @@ -0,0 +1,30 @@ +name: weaver-live-check + +on: + workflow_call: + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-weaver-live-check-${{ github.ref }} + cancel-in-progress: true + +jobs: + weaver-live-check: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@master + with: + toolchain: 1.94 + + - name: Install system dependencies + run: scripts/ubuntu-setup.sh + + - name: Run Weaver live check + run: make weaver.live-check diff --git a/Cargo.lock b/Cargo.lock index 2ca93fe..0f3fcc2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1639,10 +1639,7 @@ dependencies = [ name = "prom_otlp_mock_runner" version = "0.0.0" dependencies = [ - "opentelemetry", - "opentelemetry-otlp", - "opentelemetry_sdk", - "prometheus", + "chronos_bin", "tokio", ] diff --git 
a/examples/prom_otlp_mock.rs b/examples/prom_otlp_mock.rs index 621d8c2..426edb5 100644 --- a/examples/prom_otlp_mock.rs +++ b/examples/prom_otlp_mock.rs @@ -1,144 +1,33 @@ -//! Mock design for a Chronos metrics abstraction that can export through either -//! the Prometheus client library or OpenTelemetry OTLP metrics. +//! Mock runner for exercising Chronos metrics through Prometheus or OTLP. //! -//! Selection is intentionally driven by the standard OpenTelemetry metric -//! exporter variable: -//! -//! - `OTEL_METRICS_EXPORTER=prometheus` uses the `prometheus` crate registry. -//! - `OTEL_METRICS_EXPORTER=otlp` uses the OTLP gRPC exporter. -//! - unset defaults to Prometheus for local compatibility. -//! -//! This file is a design sketch for the Chronos rewrite, not wired into the -//! runtime yet. The important shape is that metric definitions live once in -//! `MetricDefinition`, while the backend-specific registrations stay behind the -//! `MetricsBackend` interface. +//! This intentionally uses the production `ChronosMetrics` facade so Weaver +//! live checks validate the same generated metric definitions as the runtime. 
-use std::collections::HashMap; use std::env; use std::sync::Arc; use std::time::Duration; -use opentelemetry::global; -use opentelemetry::metrics::{Counter as OtlpCounter, Histogram as OtlpHistogram, Unit}; -use opentelemetry::KeyValue; -use opentelemetry_otlp::WithExportConfig; -use prometheus::{histogram_opts, opts, CounterVec as PromCounterVec, HistogramVec as PromHistogramVec, Registry}; +use chronos_bin::metrics::ChronosMetrics; const OTEL_METRICS_EXPORTER: &str = "OTEL_METRICS_EXPORTER"; -const OTEL_EXPORTER_OTLP_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_ENDPOINT"; -const OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"; -const OTEL_EXPORTER_OTLP_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_PROTOCOL"; -const OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: &str = "OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"; const OTEL_METRIC_EXPORT_INTERVAL: &str = "OTEL_METRIC_EXPORT_INTERVAL"; const OTEL_EXPORTER_PROMETHEUS_HOST: &str = "OTEL_EXPORTER_PROMETHEUS_HOST"; const OTEL_EXPORTER_PROMETHEUS_PORT: &str = "OTEL_EXPORTER_PROMETHEUS_PORT"; -#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] -enum MetricId { - MsgConsumed, - MsgConsumeLatency, -} - -#[derive(Clone, Copy, Debug)] -enum MetricKind { - Counter, - Histogram, -} - -#[derive(Clone, Copy, Debug)] -struct MetricDefinition { - id: MetricId, - otel_name: &'static str, - prometheus_name: &'static str, - description: &'static str, - unit: Option<&'static str>, - attribute_names: &'static [&'static str], - prometheus_label_names: &'static [&'static str], - kind: MetricKind, -} - -const METRIC_DEFINITIONS: &[MetricDefinition] = &[ - MetricDefinition { - id: MetricId::MsgConsumed, - otel_name: "messaging.client.consumed.messages", - prometheus_name: "messaging_client_consumed_messages", - description: "Total number of Chronos input messages consumed", - unit: Some("{message}"), - attribute_names: &["messaging.system", "messaging.operation.name", "messaging.destination.name"], - prometheus_label_names: 
&["messaging_system", "messaging_operation_name", "messaging_destination_name"], - kind: MetricKind::Counter, - }, - MetricDefinition { - id: MetricId::MsgConsumeLatency, - otel_name: "messaging.process.duration", - prometheus_name: "messaging_process_duration_seconds", - description: "Time spent handling a consumed Chronos message", - unit: Some("s"), - attribute_names: &["messaging.system", "messaging.operation.name", "messaging.destination.name"], - prometheus_label_names: &["messaging_system", "messaging_operation_name", "messaging_destination_name"], - kind: MetricKind::Histogram, - }, -]; - -trait MetricsBackend: Send + Sync { - fn inc_counter(&self, id: MetricId, value: u64, labels: &[(&'static str, String)]); - fn observe_histogram(&self, id: MetricId, value: f64, labels: &[(&'static str, String)]); - fn render_prometheus(&self) -> Option; - fn shutdown(&self); -} - -struct ChronosMetrics { - backend: Box, +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum MetricsExporter { + Prometheus, + Otlp, } -impl ChronosMetrics { +impl MetricsExporter { fn from_env() -> Result> { - let backend: Box = match MetricsExporter::from_env()? 
{ - MetricsExporter::Prometheus => Box::new(PrometheusMetricsBackend::new()?), - MetricsExporter::Otlp => Box::new(OtlpMetricsBackend::new()?), - }; - - Ok(Self { backend }) - } - - fn message_consumed(&self, destination: &'static str) { - self.backend.inc_counter( - MetricId::MsgConsumed, - 1, - &[ - ("messaging.system", "kafka".to_string()), - ("messaging.operation.name", "receive".to_string()), - ("messaging.destination.name", destination.to_string()), - ], - ); - } - - fn consume_latency(&self, seconds: f64, destination: &'static str) { - self.backend.observe_histogram( - MetricId::MsgConsumeLatency, - seconds, - &[ - ("messaging.system", "kafka".to_string()), - ("messaging.operation.name", "process".to_string()), - ("messaging.destination.name", destination.to_string()), - ], - ); - } - - fn record_cycle(&self, cycle: u64) { - let destination = if cycle.is_multiple_of(2) { "chronos-input" } else { "chronos-retry" }; - let latency_seconds = 0.005 + ((cycle % 20) as f64 * 0.0025); - - self.message_consumed(destination); - self.consume_latency(latency_seconds, destination); - } - - fn prometheus_text(&self) -> Option { - self.backend.render_prometheus() - } - - fn shutdown(&self) { - self.backend.shutdown(); + match env::var(OTEL_METRICS_EXPORTER).unwrap_or_else(|_| "prometheus".to_string()).as_str() { + "prometheus" => Ok(Self::Prometheus), + "otlp" => Ok(Self::Otlp), + "none" => Err("metrics exporter disabled by OTEL_METRICS_EXPORTER=none".into()), + other => Err(format!("unsupported {OTEL_METRICS_EXPORTER} value: {other}").into()), + } } } @@ -169,206 +58,6 @@ fn env_duration_ms(name: &'static str, default_ms: u64) -> Result Result> { - match env::var(OTEL_METRICS_EXPORTER).unwrap_or_else(|_| "prometheus".to_string()).as_str() { - "prometheus" => Ok(Self::Prometheus), - "otlp" => { - require_grpc_protocol()?; - Ok(Self::Otlp) - } - "none" => Err("metrics exporter disabled by OTEL_METRICS_EXPORTER=none".into()), - other => Err(format!("unsupported 
{OTEL_METRICS_EXPORTER} value: {other}").into()), - } - } -} - -fn require_grpc_protocol() -> Result<(), Box> { - let protocol = env::var(OTEL_EXPORTER_OTLP_METRICS_PROTOCOL) - .or_else(|_| env::var(OTEL_EXPORTER_OTLP_PROTOCOL)) - .unwrap_or_else(|_| "grpc".to_string()); - - if protocol == "grpc" { - Ok(()) - } else { - Err(format!("unsupported OTLP metrics protocol {protocol:?}; use grpc for this design").into()) - } -} - -struct PrometheusMetricsBackend { - registry: Registry, - counters: HashMap, - histograms: HashMap, -} - -impl PrometheusMetricsBackend { - fn new() -> Result { - let registry = Registry::new(); - let mut counters = HashMap::new(); - let mut histograms = HashMap::new(); - - for definition in METRIC_DEFINITIONS { - match definition.kind { - MetricKind::Counter => { - let metric = PromCounterVec::new(opts!(definition.prometheus_name, definition.description), definition.prometheus_label_names)?; - registry.register(Box::new(metric.clone()))?; - counters.insert(definition.id, metric); - } - MetricKind::Histogram => { - let metric = PromHistogramVec::new( - histogram_opts!(definition.prometheus_name, definition.description), - definition.prometheus_label_names, - )?; - registry.register(Box::new(metric.clone()))?; - histograms.insert(definition.id, metric); - } - } - } - - Ok(Self { - registry, - counters, - histograms, - }) - } -} - -impl MetricsBackend for PrometheusMetricsBackend { - fn inc_counter(&self, id: MetricId, value: u64, labels: &[(&'static str, String)]) { - if let Some(counter) = self.counters.get(&id) { - let label_values = prometheus_label_values(id, labels); - if let Ok(metric) = counter.get_metric_with_label_values(&label_values) { - metric.inc_by(value as f64); - } - } - } - - fn observe_histogram(&self, id: MetricId, value: f64, labels: &[(&'static str, String)]) { - if let Some(histogram) = self.histograms.get(&id) { - let label_values = prometheus_label_values(id, labels); - if let Ok(metric) = 
histogram.get_metric_with_label_values(&label_values) { - metric.observe(value); - } - } - } - - fn render_prometheus(&self) -> Option { - use prometheus::{Encoder, TextEncoder}; - - let encoder = TextEncoder::new(); - let mut buffer = Vec::new(); - encoder.encode(&self.registry.gather(), &mut buffer).ok()?; - String::from_utf8(buffer).ok() - } - - fn shutdown(&self) {} -} - -struct OtlpMetricsBackend { - provider: opentelemetry_sdk::metrics::MeterProvider, - counters: HashMap>, - histograms: HashMap>, -} - -impl OtlpMetricsBackend { - fn new() -> Result> { - let endpoint = env::var(OTEL_EXPORTER_OTLP_METRICS_ENDPOINT) - .or_else(|_| env::var(OTEL_EXPORTER_OTLP_ENDPOINT)) - .unwrap_or_else(|_| "http://127.0.0.1:4317".to_string()); - let exporter = opentelemetry_otlp::new_exporter().tonic().with_env().with_endpoint(endpoint); - let provider = opentelemetry_otlp::new_pipeline() - .metrics(opentelemetry::runtime::Tokio) - .with_exporter(exporter) - .build()?; - - global::set_meter_provider(provider.clone()); - let meter = global::meter("chronos"); - - let mut counters = HashMap::new(); - let mut histograms = HashMap::new(); - - for definition in METRIC_DEFINITIONS { - match definition.kind { - MetricKind::Counter => { - let mut builder = meter.u64_counter(definition.otel_name).with_description(definition.description); - if let Some(unit) = definition.unit { - builder = builder.with_unit(Unit::new(unit)); - } - counters.insert(definition.id, builder.init()); - } - MetricKind::Histogram => { - let mut builder = meter.f64_histogram(definition.otel_name).with_description(definition.description); - if let Some(unit) = definition.unit { - builder = builder.with_unit(Unit::new(unit)); - } - histograms.insert(definition.id, builder.init()); - } - } - } - - Ok(Self { - provider, - counters, - histograms, - }) - } -} - -impl MetricsBackend for OtlpMetricsBackend { - fn inc_counter(&self, id: MetricId, value: u64, labels: &[(&'static str, String)]) { - if let Some(counter) = 
self.counters.get(&id) { - counter.add(value, &labels_to_key_values(labels)); - } - } - - fn observe_histogram(&self, id: MetricId, value: f64, labels: &[(&'static str, String)]) { - if let Some(histogram) = self.histograms.get(&id) { - histogram.record(value, &labels_to_key_values(labels)); - } - } - - fn render_prometheus(&self) -> Option { - None - } - - fn shutdown(&self) { - if let Err(err) = self.provider.force_flush(&opentelemetry::Context::current()) { - eprintln!("failed to flush OTLP metrics: {err}"); - } - if let Err(err) = self.provider.shutdown() { - eprintln!("failed to shut down OTLP metrics provider: {err}"); - } - } -} - -fn labels_to_key_values(labels: &[(&'static str, String)]) -> Vec { - labels.iter().map(|(key, value)| KeyValue::new(*key, value.clone())).collect() -} - -fn prometheus_label_values<'a>(id: MetricId, labels: &'a [(&'static str, String)]) -> Vec<&'a str> { - let Some(definition) = METRIC_DEFINITIONS.iter().find(|definition| definition.id == id) else { - return Vec::new(); - }; - - definition - .attribute_names - .iter() - .map(|name| { - labels - .iter() - .find(|(label_name, _)| label_name == name) - .map(|(_, value)| value.as_str()) - .unwrap_or("unknown") - }) - .collect() -} - async fn spawn_prometheus_server( metrics: Arc, host: String, @@ -390,7 +79,7 @@ async fn spawn_prometheus_server( let bytes_read = stream.read(&mut request).await.unwrap_or(0); let request_line = String::from_utf8_lossy(&request[..bytes_read]); let (status, body) = if request_line.starts_with("GET /metrics ") { - ("200 OK", metrics.prometheus_text().unwrap_or_default()) + ("200 OK", metrics.render_prometheus().unwrap_or_default()) } else { ("404 Not Found", "not found\n".to_string()) }; @@ -408,7 +97,18 @@ async fn run_workload(metrics: Arc, config: &MockRuntimeConfig) let mut cycle = 0_u64; loop { cycle += 1; - metrics.record_cycle(cycle); + + let consume_destination = if cycle.is_multiple_of(2) { "kafka" } else { "postgres" }; + let consume_status = if 
cycle.is_multiple_of(5) { "fail" } else { "pass" }; + let process_returned = cycle.is_multiple_of(3); + let process_status = if cycle.is_multiple_of(7) { "fail" } else { "pass" }; + let duration_seconds = 0.005 + ((cycle % 20) as f64 * 0.0025); + + metrics.observe_consume_latency(duration_seconds, consume_destination, consume_status); + metrics.observe_process_latency(duration_seconds * 1.5, process_returned, process_status); + metrics.observe_wait_time(0.1 + ((cycle % 10) as f64 * 0.05)); + metrics.observe_jitter(0.01 + ((cycle % 10) as f64 * 0.025)); + metrics.messages_reset(1); tokio::time::sleep(config.interval).await; } @@ -437,7 +137,7 @@ async fn main() -> Result<(), Box> { } if exporter == MetricsExporter::Otlp { - tokio::time::sleep(std::time::Duration::from_secs(2)).await; + tokio::time::sleep(Duration::from_secs(2)).await; } metrics.shutdown(); diff --git a/examples/prom_otlp_mock_runner/Cargo.toml b/examples/prom_otlp_mock_runner/Cargo.toml index 404e90b..53717b7 100644 --- a/examples/prom_otlp_mock_runner/Cargo.toml +++ b/examples/prom_otlp_mock_runner/Cargo.toml @@ -8,8 +8,5 @@ name = "prom_otlp_mock" path = "../prom_otlp_mock.rs" [dependencies] -opentelemetry.workspace = true -opentelemetry_sdk.workspace = true -opentelemetry-otlp.workspace = true -prometheus.workspace = true +chronos_bin = { path = "../../chronos_bin" } tokio.workspace = true From 772d459952a761804f5591e7b25ca48a03004b47 Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Sat, 2 May 2026 03:28:58 +1000 Subject: [PATCH 35/36] ci: fix workflow smoke failures Point container build workflows at the Dockerfiles under docker/, make the Weaver live-check output directory writable for the container on hosted runners, and run Trivy through its pinned container image to avoid the failing action-side installer. Verification: - make weaver.live-check - docker build --target builder -f docker/Dockerfile.chronos-slim -t chronos-binary-builder:ci-smoke . 
- docker build -f docker/Dockerfile.chronos -t chronos:ci-smoke . - docker build -f docker/Dockerfile.chronos-slim -t chronos-scratch:ci-smoke . - make pre-commit - cargo build --release -p chronos_bin - docker run --rm -v "/home/ah34/work/opensource/chronos:/work:ro" aquasec/trivy:0.64.1 fs --scanners vuln --severity CRITICAL,HIGH --ignore-unfixed --exit-code 1 /work/target/release/chronos Model-version: GPT-5 --- .github/workflows/build-binary.yml | 2 +- .github/workflows/build-container.yml | 4 ++-- .github/workflows/scan.yml | 18 ++++++++++-------- dev/makefiles/rust.mk | 1 + 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build-binary.yml b/.github/workflows/build-binary.yml index 0581f7e..819e341 100644 --- a/.github/workflows/build-binary.yml +++ b/.github/workflows/build-binary.yml @@ -19,7 +19,7 @@ jobs: uses: actions/checkout@v4 - name: Build binary image stage - run: docker build --target builder -f Dockerfile.chronos-slim -t chronos-binary-builder:${{ github.sha }} . + run: docker build --target builder -f docker/Dockerfile.chronos-slim -t chronos-binary-builder:${{ github.sha }} . - name: Extract binary run: | diff --git a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml index 5b48e92..186418b 100644 --- a/.github/workflows/build-container.yml +++ b/.github/workflows/build-container.yml @@ -19,7 +19,7 @@ jobs: uses: actions/checkout@v4 - name: Build regular container - run: docker build -f Dockerfile.chronos -t chronos:${{ github.sha }} . + run: docker build -f docker/Dockerfile.chronos -t chronos:${{ github.sha }} . - name: Build scratch container - run: docker build -f Dockerfile.chronos-slim -t chronos-scratch:${{ github.sha }} . + run: docker build -f docker/Dockerfile.chronos-slim -t chronos-scratch:${{ github.sha }} . 
diff --git a/.github/workflows/scan.yml b/.github/workflows/scan.yml index 640948b..9077382 100644 --- a/.github/workflows/scan.yml +++ b/.github/workflows/scan.yml @@ -31,11 +31,13 @@ jobs: run: cargo build --release -p chronos_bin - name: Scan Rust build output - uses: aquasecurity/trivy-action@v0.32.0 - with: - scan-type: fs - scan-ref: target/release/chronos - scanners: vuln - severity: CRITICAL,HIGH - exit-code: "1" - ignore-unfixed: true + run: | + docker run --rm \ + -v "$PWD:/work:ro" \ + aquasec/trivy:0.64.1 \ + fs \ + --scanners vuln \ + --severity CRITICAL,HIGH \ + --ignore-unfixed \ + --exit-code 1 \ + /work/target/release/chronos diff --git a/dev/makefiles/rust.mk b/dev/makefiles/rust.mk index a27e51a..f0ec866 100644 --- a/dev/makefiles/rust.mk +++ b/dev/makefiles/rust.mk @@ -102,6 +102,7 @@ weaver.live-check: cargo build --package prom_otlp_mock_runner; \ rm -rf "$(WEAVER_LIVE_CHECK_OUT)"; \ mkdir -p "$(WEAVER_LIVE_CHECK_OUT)"; \ + chmod 0777 "$(WEAVER_LIVE_CHECK_OUT)"; \ docker run --rm --network host \ -v "$(PWD):/work" \ -v "$(WEAVER_LIVE_CHECK_OUT):/out" \ From 9cbf390c290c8404ab51a32d6fc657b99b27dd2a Mon Sep 17 00:00:00 2001 From: aidanhall34 Date: Tue, 5 May 2026 08:48:25 +1000 Subject: [PATCH 36/36] feat: new d board --- dev/dashboards/chronos.json | 20 +- dev/dashboards/testing.json | 2927 +++++++++++++++++ examples/weaver/registry/chronos/metrics.yaml | 88 +- 3 files changed, 3004 insertions(+), 31 deletions(-) create mode 100644 dev/dashboards/testing.json diff --git a/dev/dashboards/chronos.json b/dev/dashboards/chronos.json index 5f6c9c4..20c005e 100644 --- a/dev/dashboards/chronos.json +++ b/dev/dashboards/chronos.json @@ -134,8 +134,8 @@ "targets": [ { "editorMode": "code", - "expr": "sum(\n increase(\n chronos_message_consume_duration_seconds_count[$__range]\n )\n) by (status, destination)", - "legendFormat": "{{ destination }}:{{ status }}", + "expr": "sum(\n increase(\n chronos_message_consume_duration_seconds_count[$__range]\n )\n) 
by (chronos_consume_status, chronos_destination)", + "legendFormat": "{{ chronos_consume_status }}:{{ chronos_destination }}", "range": true, "refId": "A" } @@ -160,6 +160,7 @@ "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 0.5, + "axisSoftMin": 0, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", @@ -244,6 +245,7 @@ "type": "prometheus", "uid": "prometheus" }, + "description": "{{ chronos_destination }}:{{ chronos_consume_status }}", "fieldConfig": { "defaults": { "color": { @@ -325,8 +327,8 @@ "targets": [ { "editorMode": "code", - "expr": "sum(\n rate(\n chronos_message_consume_duration_seconds_count[$__rate_interval]\n )\n) by (status, destination)", - "legendFormat": "__auto", + "expr": "sum(\n rate(\n chronos_message_consume_duration_seconds_count[$__rate_interval]\n )\n) by (chronos_consume_status, chronos_destination)", + "legendFormat": "{{ chronos_destination }}:{{ chronos_consume_status }}", "range": true, "refId": "A" } @@ -435,7 +437,7 @@ "type": "prometheus", "uid": "prometheus" }, - "description": "The % of messages routed to either Postgres or Kafka", + "description": "The % of messages routed to either Postgres or Kafka\n{{ chronos_destination }}:{{ chronos_consume_status }}", "fieldConfig": { "defaults": { "color": { @@ -513,8 +515,8 @@ "targets": [ { "editorMode": "code", - "expr": "sum(\n rate(\n chronos_message_consume_duration_seconds_count[$__rate_interval]\n )\n) by (status, destination) / on() group_left() sum(\n rate(\n chronos_message_consume_duration_seconds_count[$__rate_interval]\n )\n) by (status)", - "legendFormat": "__auto", + "expr": "sum(\n rate(\n chronos_message_consume_duration_seconds_count[$__rate_interval]\n )\n) by (chronos_consume_status, chronos_destination) / on(chronos_consume_status) group_left() sum(\n rate(\n chronos_message_consume_duration_seconds_count[$__rate_interval]\n )\n) by (chronos_consume_status)", + "legendFormat": "{{ chronos_destination }}:{{ chronos_consume_status }}", "range": true, "refId": "A" } @@ 
-936,13 +938,13 @@ ] }, "time": { - "from": "now-5m", + "from": "now-30m", "to": "now" }, "timepicker": {}, "timezone": "browser", "title": "Chronos", "uid": "gk65ns", - "version": 3, + "version": 1, "weekStart": "" } \ No newline at end of file diff --git a/dev/dashboards/testing.json b/dev/dashboards/testing.json new file mode 100644 index 0000000..51db6da --- /dev/null +++ b/dev/dashboards/testing.json @@ -0,0 +1,2927 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "A dashboard for working with the local testing infrastructure", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 5, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "count(up)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Targets", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(up{service_name!~\".+\"}) by (job, instance, target)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Target status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(scrape_samples_post_metric_relabeling) by (job)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Series per job", + "transformations": [ + { + "id": "calculateField", + "options": {} + } + ], + "type": "timeseries" + } + ], + "title": "exporters", + "type": "row" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 31, + "panels": [], + "title": "k6", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "{{ scenario }}:{{ topic }}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, 
+ "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 2 + }, + "id": 33, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.99,\n sum(\n rate(\n k6_iteration_duration_milliseconds_bucket[$__rate_interval]\n )\n ) by (le, scenario)\n)", + "legendFormat": "{{ scenario }}:99%", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5,\n sum(\n rate(\n k6_iteration_duration_milliseconds_bucket[$__rate_interval]\n )\n ) by (le, scenario)\n)", + "instant": false, + "legendFormat": "{{ scenario }}:50%", + "range": true, + "refId": "B" + } + ], + "title": "k6 iteration duration (p50 / p99)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + 
"showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 2 + }, + "id": 34, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(\n rate(\n k6_iteration_duration_milliseconds_count[$__rate_interval]\n )\n) by (scenario)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "iterations per second", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The path the message intends to exercise.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + 
}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 37, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(\n rate(\n k6_chronos_messages_published_total{job=\"k6-chronos\"}[$__rate_interval]\n )\n) by (chronos_path)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Publish path", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The path the message intends to exercise.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 38, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, 
+ "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(\n rate(\n k6_chronos_messages_published_total{job=\"k6-chronos\"}[$__rate_interval]\n )\n) by (chronos_path) / on() group_left() sum(\n rate(\n k6_chronos_messages_published_total{job=\"k6-chronos\"}[$__rate_interval]\n )\n)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Publish path %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "{{ scenario }}:{{ topic }}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 32, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.99,\n sum(\n rate(\n k6_kafka_writer_write_seconds_milliseconds_bucket{job=\"k6-chronos\"}[$__rate_interval]\n )\n ) by (le, 
scenario, topic)\n)", + "legendFormat": "{{ scenario }}:{{ topic }}", + "range": true, + "refId": "A" + } + ], + "title": "kafka p99 write seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The % of iterations dropped per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1, + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "dashed" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 0.05 + }, + { + "color": "red", + "value": 0.1 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 18 + }, + "id": 36, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(\n increase(\n k6_dropped_iterations_total{}[$__rate_interval]\n )\n) by (scenario) / sum(\n increase(\n k6_iterations_total[$__rate_interval]\n )\n ) by (scenario)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Dropped 
iterations %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "The % of iterations dropped over the range", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 0.05 + }, + { + "color": "red", + "value": 0.1 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 18 + }, + "id": 35, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(increase(k6_dropped_iterations_total{}[$__range])) / sum(increase(k6_iterations_total[$__range]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Dropped iterations %", + "type": "stat" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 16, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "{{ topic_name }}:{{ group_id }}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + 
"type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 76 + }, + "id": 27, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(kminion_kafka_consumer_group_topic_lag) by (group_id, topic_name)", + "legendFormat": "{{ topic_name }}:{{ group_id }}", + "range": true, + "refId": "A" + } + ], + "title": "Topic lag", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "{{ topic_name }}:{{ group_id }}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] 
+ }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 76 + }, + "id": 28, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(\n kminion_kafka_topic_log_dir_size_total_bytes{topic_name!~\"__.+\"}\n) by (topic_name)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "log size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 84 + }, + "id": 29, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": 
"12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(\n rate(\n kminion_kafka_received_bytes{}[$__rate_interval]\n )\n) * -1", + "legendFormat": "received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n rate(\n kminion_kafka_sent_bytes{}[$__rate_interval]\n )\n)", + "instant": false, + "legendFormat": "sent", + "range": true, + "refId": "B" + } + ], + "title": "Network I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 84 + }, + "id": 30, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(kminion_kafka_requests_received_total[$__rate_interval]))", + 
"legendFormat": "received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(kminion_kafka_requests_sent_total{}[$__rate_interval])) *-1", + "instant": false, + "legendFormat": "sent", + "range": true, + "refId": "B" + } + ], + "title": "Requests p/s", + "type": "timeseries" + } + ], + "title": "kafka", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 15, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Number of rows in the hangfire table", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "editorMode": "code", + "expr": "chronos_rows{job=\"sql-exporter\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Hangfire rows", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 28 + }, + "id": 21, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "sum(pg_database_size_bytes{datname!~\"postgres|template\\\\d\"}) by (datname)", + "format": "table", + "instant": false, + "legendFormat": "{{ name }}", + "range": true, + "refId": "A" + } + ], + "title": "Database size", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Number of entries in the hangfire table now", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 28 + }, + "id": 22, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + 
"textMode": "value", + "wideLayout": true + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(chronos_rows{job=\"sql-exporter\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Active hangfire rows", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Requires:\\\npg_settings_track_io_timing == 1", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 0, + "y": 36 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(pg_stat_database_blk_read_time{datname!~\"postgres|template\\\\d\"}[$__rate_interval])) by (datname) *-1", + "legendFormat": 
"{{ datname }}:read", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(pg_stat_database_blk_write_time{datname!~\"postgres|template\\\\d\"}[$__rate_interval])) by (datname)", + "instant": false, + "legendFormat": "{{ datname }}:write", + "range": true, + "refId": "B" + } + ], + "title": "PG blk I/O time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Per-database tuple operation rates (updated, returned, inserted, fetched, deleted)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 9, + "y": 36 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": false + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": 
"sum(rate(pg_stat_database_tup_updated{datname!~\"postgres|template\\\\d\"}[$__rate_interval])) by (datname)", + "instant": false, + "legendFormat": "{{ datname }}:updated", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(pg_stat_database_tup_returned{datname!~\"postgres|template\\\\d\"}[$__rate_interval])) by (datname)", + "instant": false, + "legendFormat": "{{ datname }}:returned", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(pg_stat_database_tup_inserted{datname!~\"postgres|template\\\\d\"}[$__rate_interval])) by (datname)", + "instant": false, + "legendFormat": "{{ datname }}:inserted", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(pg_stat_database_tup_fetched{datname!~\"postgres|template\\\\d\"}[$__rate_interval])) by (datname)", + "instant": false, + "legendFormat": "{{ datname }}:fetched", + "range": true, + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(pg_stat_database_tup_deleted{datname!~\"postgres|template\\\\d\"}[$__rate_interval])) by (datname)", + "instant": false, + "legendFormat": "{{ datname }}:deleted", + "range": true, + "refId": "F" + } + ], + "title": "Database operations", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Lookup time for sql-exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "left", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + 
"gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 6, + "x": 18, + "y": 36 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(scrape_duration_seconds{\n job=\"sql-exporter\", target=~\".+\"\n}) by (target)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Hangfire table count query latency", + "type": "timeseries" + } + ], + "title": "postgres", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 2, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": 
"linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 38 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(container_cpu_usage_seconds_total{image!=\"\"}[$__rate_interval])) by (image)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "CPU seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": 
"green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 38 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(container_memory_usage_bytes{image!=\"\"}) by (image)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Memory usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "CPU usage compared to the node total", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 46 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": 
"bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(container_cpu_usage_seconds_total{image!=\"\"}[$__rate_interval])) by (image) / on() group_left() sum(machine_cpu_cores)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "CPU usage %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Memory usage compared to node total", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 46 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": 
"sum(\n container_memory_usage_bytes{id!=\"/\"}\n) by (image) / on() group_left() sum(\n machine_memory_bytes{}\n)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Memory usage %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 54 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(container_cpu_cfs_throttled_periods_total{}[$__rate_interval])) by (image) / sum( rate(container_cpu_cfs_periods_total{}[$__rate_interval]) ) by (image)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Throttled periods %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 54 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(container_fs_reads_bytes_total{image!=\"\", name!=\"\"}[$__rate_interval])) by (image) * -1", + "legendFormat": "{{ image }}:read", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(container_fs_writes_bytes_total{image!=\"\", name!=\"\"}[$__rate_interval])) by (image)", + "instant": false, + "legendFormat": "{{ image }}:write", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "", + "instant": false, + 
"legendFormat": "__auto", + "range": true, + "refId": "C" + } + ], + "title": "Disk I/O bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 62 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(\n rate(\n container_network_receive_bytes_total{image!=\"\", name!=\"\"}[$__rate_interval]\n )\n) by (image)", + "legendFormat": "{{ image }}:rx", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n rate(\n container_network_transmit_bytes_total{image!=\"\", name!=\"\"}[$__rate_interval]\n )\n) by (image) * -1", + "instant": false, 
+ "legendFormat": "{{ image }}:tx", + "range": true, + "refId": "B" + } + ], + "title": "Network I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 62 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.4.2", + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(container_fs_read_seconds_total{image!=\"\", name!=\"\"}[$__rate_interval])) by (image) * -1", + "legendFormat": "{{ image }}:read", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(container_fs_write_seconds_total{image!=\"\", name!=\"\"}[$__rate_interval])) by (image)", + "instant": false, + "legendFormat": "{{ image 
}}:write", + "range": true, + "refId": "B" + } + ], + "title": "Disk I/O seconds", + "type": "timeseries" + } + ], + "title": "containers", + "type": "row" + } + ], + "preload": false, + "refresh": "5s", + "schemaVersion": 42, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Testing", + "uid": "gwvvwj", + "version": 4, + "weekStart": "" +} \ No newline at end of file diff --git a/examples/weaver/registry/chronos/metrics.yaml b/examples/weaver/registry/chronos/metrics.yaml index e2eb38d..1814c20 100644 --- a/examples/weaver/registry/chronos/metrics.yaml +++ b/examples/weaver/registry/chronos/metrics.yaml @@ -8,13 +8,13 @@ groups: type: string stability: stable brief: Logical name of the service. - examples: ["chronos-metrics-mock"] + examples: [ "chronos-metrics-mock" ] requirement_level: required - id: service.instance.id type: string stability: stable brief: The string ID of the service instance. - examples: ["chronos-metrics-mock-live-check"] + examples: [ "chronos-metrics-mock-live-check" ] requirement_level: required - id: metric_attributes.chronos.consume_result @@ -26,13 +26,13 @@ groups: type: string stability: development brief: Downstream selected by message_receiver::handle_message. - examples: ["kafka", "postgres"] + examples: [ "kafka", "postgres" ] requirement_level: required - id: chronos.consume.status type: string stability: development brief: Whether the consume path completed successfully. - examples: ["pass", "fail"] + examples: [ "pass", "fail" ] requirement_level: required - id: metric_attributes.chronos.process_result @@ -44,13 +44,13 @@ groups: type: string stability: development brief: Whether the processor loop returned early because no rows were ready. 
- examples: ["true", "false"] + examples: [ "true", "false" ] requirement_level: required - id: chronos.process.status type: string stability: development brief: Whether the processor loop completed successfully. - examples: ["pass", "fail"] + examples: [ "pass", "fail" ] requirement_level: required - id: metric.chronos.message.consume.duration @@ -66,14 +66,28 @@ groups: rust_name: msg_consume_latency metric_value_type: double prometheus_name: msg_consume_latency - label_names: [destination, status] - prometheus_label_names: [destination, status] - buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] + label_names: [ destination, status ] + prometheus_label_names: [ destination, status ] + buckets: + [ + 0.001, + 0.002, + 0.004, + 0.008, + 0.016, + 0.032, + 0.064, + 0.128, + 0.256, + 0.512, + 1.024, + 2.048, + ] prewarm_label_values: - - [kafka, pass] - - [kafka, fail] - - [postgres, pass] - - [postgres, fail] + - [ kafka, pass ] + - [ kafka, fail ] + - [ postgres, pass ] + - [ postgres, fail ] - id: metric.chronos.message.process.duration type: metric @@ -88,14 +102,28 @@ groups: rust_name: msg_process_latency metric_value_type: double prometheus_name: msg_process_latency - label_names: [returned, status] - prometheus_label_names: [returned, status] - buckets: [0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, 0.256, 0.512, 1.024, 2.048] + label_names: [ returned, status ] + prometheus_label_names: [ returned, status ] + buckets: + [ + 0.001, + 0.002, + 0.004, + 0.008, + 0.016, + 0.032, + 0.064, + 0.128, + 0.256, + 0.512, + 1.024, + 2.048, + ] prewarm_label_values: - - ["true", pass] - - ["true", fail] - - ["false", pass] - - ["false", fail] + - [ "true", pass ] + - [ "true", fail ] + - [ "false", pass ] + - [ "false", fail ] - id: metric.chronos.message.wait.duration type: metric @@ -109,7 +137,23 @@ groups: rust_name: msg_wait_time metric_value_type: double prometheus_name: msg_wait_time - buckets: [0.1, 0.2, 0.4, 
0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 204.8, 409.6, 819.2] + buckets: + [ + 0.1, + 0.2, + 0.4, + 0.8, + 1.6, + 3.2, + 6.4, + 12.8, + 25.6, + 51.2, + 102.4, + 204.8, + 409.6, + 819.2, + ] - id: metric.chronos.message.jitter type: metric @@ -123,7 +167,7 @@ groups: rust_name: msg_jitter metric_value_type: double prometheus_name: msg_jitter - buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0] + buckets: [ 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0 ] - id: metric.chronos.message.reset type: metric