From a8b14635df84c9db75800d0b3dd3b114c490a437 Mon Sep 17 00:00:00 2001 From: stxkxs Date: Tue, 23 Jun 2026 20:59:09 -0700 Subject: [PATCH] feat(observability): bridge OTLP metrics + logs to AMP/Loki and scrape hubble L7 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the grafana-agent (Alloy) telemetry pipeline so two previously dropped signals actually reach the backends. OTLP metrics + logs → AMP / Loki (#62). The otelcol.receiver.otlp component listened on :4317/:4318 but only routed traces to Tempo — every OTLP metric and log a tenant app pushed was silently discarded, and no Service exposed the receiver ports, so grafana-agent.monitoring.svc:4318 wasn't even reachable. This: - wires the receiver's metrics + logs outputs through new otelcol.exporter.prometheus and otelcol.exporter.loki components into the same AMP remote-write (SigV4) and Loki sinks the scrape + tail pipelines already use; - exposes :4317/:4318 on the agent Service via agent.extraPorts, so workloads in other namespaces can reach the OTLP endpoint. This is the shared prerequisite for the per-tenant o11y retrofits — their metrics half was blocked on it. Hubble L7 metrics → AMP (#63). The hubble_http_* L7 flow metrics are served per cilium-agent pod on :9965 but exposed only via the headless hubble-metrics Service; the annotation-gated pod scrape can't reach them (the agent pod's prometheus.io/port is already its own :9962). Adds an endpoints-discovery scrape of the hubble-metrics service, so the hubble-overview dashboard renders. Validated offline: grafana/alloy fmt parses the config; alloy validate output is identical to that of the deployed config except for line shifts in the pre-existing env() deprecation warnings (no new issues); helm template confirms the Service + container expose 4317/4318. End-to-end verification (metrics landing in AMP) takes place on a live cluster. Closes #62. Closes #63. 
--- .../observability/grafana-agent/values.yaml | 61 ++++++++++++++++++- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/addons/observability/grafana-agent/values.yaml b/addons/observability/grafana-agent/values.yaml index 32ad4f4..18e88b9 100644 --- a/addons/observability/grafana-agent/values.yaml +++ b/addons/observability/grafana-agent/values.yaml @@ -8,6 +8,17 @@ agent: mode: flow + # Expose the OTLP receiver ports on the agent Service so tenant apps in other + # namespaces can push to grafana-agent.monitoring.svc:4317/4318. + extraPorts: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + protocol: TCP + - name: otlp-http + port: 4318 + targetPort: 4318 + protocol: TCP configMap: create: true content: | @@ -83,6 +94,37 @@ agent: forward_to = [prometheus.remote_write.amp.receiver] } + // Hubble L7 metrics (:9965) are served per cilium-agent pod and exposed only + // via the headless hubble-metrics Service in kube-system. The annotation-gated + // pod scrape can't reach them (the agent pod's prometheus.io/port is its own + // :9962), so scrape the service endpoints directly. + discovery.kubernetes "hubble" { + role = "endpoints" + namespaces { + names = ["kube-system"] + } + } + discovery.relabel "hubble" { + targets = discovery.kubernetes.hubble.targets + rule { + source_labels = ["__meta_kubernetes_service_name"] + regex = "hubble-metrics" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_namespace"] + target_label = "namespace" + } + rule { + source_labels = ["__meta_kubernetes_pod_name"] + target_label = "pod" + } + } + prometheus.scrape "hubble" { + targets = discovery.relabel.hubble.output + forward_to = [prometheus.remote_write.amp.receiver] + } + // SigV4 signs each remote-write request with the IRSA-projected // AWS_ROLE_ARN / AWS_WEB_IDENTITY_TOKEN_FILE credentials. AMP // expects service "aps" and the workspace's region. 
@@ -109,7 +151,12 @@ agent: } } - // ────────────────────────────── Traces → Tempo ───────────────────────── + // ──────────────────── OTLP ingest (traces + metrics + logs) ───────────── + // Tenant apps push OTLP to this agent on :4317/:4318 (exposed via the agent + // Service + agent.extraPorts above). Traces go to Tempo; metrics and logs are + // converted and fed into the same AMP remote-write / Loki sinks the scrape and + // tail pipelines use — without these outputs the OTLP metrics and logs were + // silently dropped. otelcol.receiver.otlp "default" { grpc { endpoint = "0.0.0.0:4317" @@ -118,7 +165,9 @@ agent: endpoint = "0.0.0.0:4318" } output { - traces = [otelcol.exporter.otlp.tempo.input] + traces = [otelcol.exporter.otlp.tempo.input] + metrics = [otelcol.exporter.prometheus.otlp.input] + logs = [otelcol.exporter.loki.otlp.input] } } otelcol.exporter.otlp "tempo" { @@ -129,6 +178,14 @@ agent: } } } + // OTLP metrics → Prometheus → AMP (same SigV4 remote-write as the scrape path). + otelcol.exporter.prometheus "otlp" { + forward_to = [prometheus.remote_write.amp.receiver] + } + // OTLP logs → Loki (same sink as the kubernetes log tail). + otelcol.exporter.loki "otlp" { + forward_to = [loki.write.default.receiver] + } # IRSA is required for SigV4 to AMP. values-{env}.yaml sets the role-arn # annotation on the SA.