From 311babd1d007f30a16e816b3c709a571014ccf0e Mon Sep 17 00:00:00 2001 From: Josiah England Date: Wed, 27 May 2026 14:34:43 -0600 Subject: [PATCH] fix(tracing): direct OTel SDK setup for chain-coherent sampling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Knative's config-observability ConfigMap only exposes a flat tracing-sampling-rate, so at fractional rates each service in the chain rolls independently — PaC can drop a trace while Tekton keeps it, leaving execution spans whose parent_spanID points at nothing. Switching to the OTel SDK opens up OTEL_TRACES_SAMPLER's parentbased_* family, which honors the root span's sample decision in the W3C traceparent flag so the whole chain is kept or dropped together. Controller and watcher call tracing.New() at startup. Tracing is opt-in: both OTEL_EXPORTER_OTLP_ENDPOINT and OTEL_TRACES_SAMPLER must be set, otherwise PaC falls back to noop (matching the prior tracing-sampling-rate "0" default). Resource service.name is pipelines-as-code. Propagator is W3C TraceContext only; Baggage is intentionally not honored per Konflux-CI ADR 0061. otlptracegrpc and otlptracehttp promoted from indirect to direct dependencies. Assisted-by: Claude Code Signed-off-by: Josiah England --- config/305-config-observability.yaml | 14 --- docs/content/docs/operations/tracing.md | 33 ++--- go.mod | 4 +- pkg/adapter/adapter.go | 7 ++ pkg/reconciler/controller.go | 10 ++ pkg/tracing/provider.go | 136 +++++++++++++++++++++ pkg/tracing/provider_test.go | 156 ++++++++++++++++++++++++ 7 files changed, 322 insertions(+), 38 deletions(-) create mode 100644 pkg/tracing/provider.go create mode 100644 pkg/tracing/provider_test.go diff --git a/config/305-config-observability.yaml b/config/305-config-observability.yaml index bfa697624e..78e2b009a4 100644 --- a/config/305-config-observability.yaml +++ b/config/305-config-observability.yaml @@ -52,20 +52,6 @@ data: # Only applicable for grpc and http/protobuf protocols. # metrics-export-interval: "30s" - # tracing-protocol specifies the trace export protocol. - # Supported values: "grpc", "http/protobuf", "none". - # Default is "none" (tracing disabled). - # tracing-protocol: "none" - - # tracing-endpoint specifies the OTLP collector endpoint. - # Required when tracing-protocol is "grpc" or "http/protobuf". - # The OTEL_EXPORTER_OTLP_ENDPOINT env var takes precedence if set. - # tracing-endpoint: "http://otel-collector.observability.svc.cluster.local:4317" - - # tracing-sampling-rate controls the fraction of traces sampled. - # 0.0 = none, 1.0 = all. Default is 0 (none). - # tracing-sampling-rate: "1.0" - # runtime-profiling enables/disables the pprof profiling server on port 8008. # Supported values: "enabled", "disabled" (default). # runtime-profiling: "disabled" diff --git a/docs/content/docs/operations/tracing.md b/docs/content/docs/operations/tracing.md index a3e3aa3e5c..8e53b7a9ab 100644 --- a/docs/content/docs/operations/tracing.md +++ b/docs/content/docs/operations/tracing.md @@ -7,35 +7,24 @@ This page describes how to enable OpenTelemetry distributed tracing for Pipeline ## Enabling tracing -The ConfigMap `pipelines-as-code-config-observability` controls tracing configuration. It must exist in the same namespace as the Pipelines-as-Code controller and watcher deployments. See [config/305-config-observability.yaml](https://github.com/tektoncd/pipelines-as-code/blob/main/config/305-config-observability.yaml) for the full example. +Pipelines-as-Code reads tracing configuration from standard OpenTelemetry environment variables on the controller and watcher pods: -It contains the following tracing fields: +* `OTEL_EXPORTER_OTLP_ENDPOINT`: OTLP collector endpoint URL. Required to enable tracing. +* `OTEL_TRACES_SAMPLER`: Sampler family. Required to enable tracing. Supported values: `always_on`, `always_off`, `traceidratio`, `parentbased_always_on`, `parentbased_always_off`, `parentbased_traceidratio`. +* `OTEL_TRACES_SAMPLER_ARG`: Numeric argument for ratio-based samplers. Set to `0.1` with `OTEL_TRACES_SAMPLER=parentbased_traceidratio` to sample 10% of root traces while keeping the chain coherent on those that are kept. +* `OTEL_EXPORTER_OTLP_PROTOCOL` (or traces-specific `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL`): OTLP transport. Supported values: `grpc`, `http/protobuf`. Default: `grpc`. -* `tracing-protocol`: Export protocol. Supported values: `grpc`, `http/protobuf`, `none`. Default is `none` (tracing disabled). -* `tracing-endpoint`: OTLP collector endpoint. Required when protocol is not `none`. The `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable takes precedence if set. -* `tracing-sampling-rate`: Fraction of traces to sample. `0.0` = none, `1.0` = all. Default is `0`. +Both `OTEL_EXPORTER_OTLP_ENDPOINT` and `OTEL_TRACES_SAMPLER` must be set to opt in to tracing. If either is unset PaC falls back to a noop tracer that emits no spans and incurs no exporter cost. Changes to any of these env vars take effect on the next pod restart. -### Example +PaC honors inbound `traceparent` headers on incoming webhook requests via the W3C TraceContext propagator. OTel Baggage is intentionally not honored: every emission point in PaC already has the attributes it needs from the local PipelineRun and webhook event, so no cross-service attribute propagation channel is needed. -```yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: pipelines-as-code-config-observability - namespace: pipelines-as-code -data: - tracing-protocol: grpc - tracing-endpoint: "http://otel-collector.observability.svc.cluster.local:4317" - tracing-sampling-rate: "1.0" -``` +### Sampler choice and chain coherency -Changes to `tracing-protocol`, `tracing-endpoint`, and `tracing-sampling-rate` require restarting the controller and watcher pods. The trace exporter is created once at startup from the ConfigMap values at that time. Set `tracing-protocol` to `none` or remove the tracing keys to disable tracing. - -The controller and watcher locate this ConfigMap by name via the `CONFIG_OBSERVABILITY_NAME` environment variable set in their deployment manifests. Operator-based installations may manage this differently; consult the operator documentation for details. +The `parentbased_*` sampler family honors the parent span's sample decision carried in the W3C `traceparent` flag bit. When PaC, Tekton, integration-service, and release-service all use parent-based samplers, the root span's sampling decision propagates through the whole delivery chain: every service either keeps its spans or drops them based on what the root chose. Flat-rate samplers (`traceidratio` without parent-based) cause each service to roll independently, which at fractional sampling fragments the chain into orphaned spans whose `parent_spanID` references a span that was dropped. `parentbased_always_on` keeps everything; `parentbased_traceidratio` with a numeric argument samples a coherent fraction. ## Emitted spans -The controller emits a `PipelinesAsCode:ProcessEvent` span for each webhook event. The watcher emits `waitDuration` and `executeDuration` spans for completed PipelineRuns. +The controller emits a `PipelinesAsCode:ProcessEvent` span for each webhook event. The watcher emits `waitDuration` and `executeDuration` spans for completed PipelineRuns. The OTel resource attribute `service.name` on all emitted spans is `pipelines-as-code`. ### Webhook event span (`PipelinesAsCode:ProcessEvent`) @@ -109,7 +98,7 @@ If Tekton Pipelines is also configured with tracing pointing at the same collect ## Deploying a trace collector -Pipelines-as-Code exports traces using the standard OpenTelemetry Protocol (OTLP). You need a running OTLP-compatible collector for the `tracing-endpoint` to point to. Common options include: +Pipelines-as-Code exports traces using the standard OpenTelemetry Protocol (OTLP). You need a running OTLP-compatible collector for `OTEL_EXPORTER_OTLP_ENDPOINT` to point to. Common options include: * [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) -- the vendor-neutral reference collector * [Jaeger](https://www.jaegertracing.io/docs/latest/getting-started/) -- supports OTLP ingestion natively since v1.35 diff --git a/go.mod b/go.mod index 9a4613c9db..4bc837e607 100644 --- a/go.mod +++ b/go.mod @@ -30,6 +30,8 @@ require ( github.com/tektoncd/pipeline v1.11.1 gitlab.com/gitlab-org/api/client-go v1.46.0 go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 go.opentelemetry.io/otel/metric v1.43.0 go.opentelemetry.io/otel/sdk v1.43.0 go.opentelemetry.io/otel/sdk/metric v1.43.0 @@ -90,8 +92,6 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 // indirect go.opentelemetry.io/otel/exporters/prometheus v0.65.0 // indirect go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0 // indirect go.opentelemetry.io/proto/otlp v1.10.0 // indirect diff --git a/pkg/adapter/adapter.go b/pkg/adapter/adapter.go index a2cb2925d1..d7d93d56c3 100644 --- a/pkg/adapter/adapter.go +++ b/pkg/adapter/adapter.go @@ -79,6 +79,13 @@ func New(run *params.Run, k *kubeinteraction.Interaction) adapter.AdapterConstru } func (l *listener) Start(ctx context.Context) error { + tp := tracing.New(l.logger) + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = tp.Shutdown(shutdownCtx) + }() + adapterPort := globalAdapterPort envAdapterPort := os.Getenv("PAC_CONTROLLER_PORT") if envAdapterPort != "" { diff --git a/pkg/reconciler/controller.go b/pkg/reconciler/controller.go index 3221231e7a..53cff1ffa8 100644 --- a/pkg/reconciler/controller.go +++ b/pkg/reconciler/controller.go @@ -3,6 +3,7 @@ package reconciler import ( "context" "path" + "time" "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode" "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/keys" @@ -14,6 +15,7 @@ import ( "github.com/openshift-pipelines/pipelines-as-code/pkg/params/info" prmetrics "github.com/openshift-pipelines/pipelines-as-code/pkg/pipelinerunmetrics" queuepkg "github.com/openshift-pipelines/pipelines-as-code/pkg/queue" + "github.com/openshift-pipelines/pipelines-as-code/pkg/tracing" tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" tektonPipelineRunInformerv1 "github.com/tektoncd/pipeline/pkg/client/injection/informers/pipeline/v1/pipelinerun" tektonPipelineRunReconcilerv1 "github.com/tektoncd/pipeline/pkg/client/injection/reconciler/pipeline/v1/pipelinerun" @@ -30,6 +32,14 @@ func NewController() func(context.Context, configmap.Watcher) *controller.Impl { ctx = info.StoreNS(ctx, system.Namespace()) log := logging.FromContext(ctx) + tp := tracing.New(log) + go func() { + <-ctx.Done() + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = tp.Shutdown(shutdownCtx) + }() + run := params.New() err := run.Clients.NewClients(ctx, &run.Info) if err != nil { diff --git a/pkg/tracing/provider.go b/pkg/tracing/provider.go new file mode 100644 index 0000000000..9365e4f3e5 --- /dev/null +++ b/pkg/tracing/provider.go @@ -0,0 +1,136 @@ +package tracing + +import ( + "context" + "os" + "strconv" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.40.0" + "go.uber.org/zap" +) + +const ( + EnvOTLPEndpoint = "OTEL_EXPORTER_OTLP_ENDPOINT" + EnvOTLPProtocol = "OTEL_EXPORTER_OTLP_PROTOCOL" + EnvOTLPTracesProtocol = "OTEL_EXPORTER_OTLP_TRACES_PROTOCOL" + EnvTracesSampler = "OTEL_TRACES_SAMPLER" + EnvTracesSamplerArg = "OTEL_TRACES_SAMPLER_ARG" + + protocolGRPC = "grpc" + protocolHTTP = "http/protobuf" +) + +type TracerProvider struct { + shutdown func(context.Context) error +} + +func New(logger *zap.SugaredLogger) *TracerProvider { + if os.Getenv(EnvOTLPEndpoint) == "" { + logger.Info("OTLP endpoint not configured, using noop tracer provider") + return noopProvider() + } + if os.Getenv(EnvTracesSampler) == "" { + logger.Info("OTEL_TRACES_SAMPLER not set, tracing disabled (set explicitly to opt in)") + return noopProvider() + } + + exporter, err := newExporter(context.Background(), logger) + if err != nil { + logger.Errorw("failed to create OTLP exporter, using noop tracer provider", "error", err) + return noopProvider() + } + + res, err := resource.Merge( + resource.Default(), + resource.NewWithAttributes( + semconv.SchemaURL, + semconv.ServiceName(TracerName), + ), + ) + if err != nil { + logger.Errorw("failed to create resource", "error", err) + res = resource.Default() + } + + tp := sdktrace.NewTracerProvider( + sdktrace.WithBatcher(exporter), + sdktrace.WithResource(res), + sdktrace.WithSampler(samplerFromEnv(logger)), + ) + + otel.SetTracerProvider(tp) + otel.SetTextMapPropagator(propagation.TraceContext{}) + + logger.Infow("tracing initialized", "endpoint", os.Getenv(EnvOTLPEndpoint), "protocol", protocolFromEnv()) + + return &TracerProvider{shutdown: tp.Shutdown} +} + +func noopProvider() *TracerProvider { + return &TracerProvider{shutdown: func(context.Context) error { return nil }} +} + +func protocolFromEnv() string { + if v := os.Getenv(EnvOTLPTracesProtocol); v != "" { + return v + } + if v := os.Getenv(EnvOTLPProtocol); v != "" { + return v + } + return protocolGRPC +} + +func newExporter(ctx context.Context, logger *zap.SugaredLogger) (sdktrace.SpanExporter, error) { + endpoint := os.Getenv(EnvOTLPEndpoint) + proto := protocolFromEnv() + switch proto { + case protocolHTTP: + return otlptracehttp.New(ctx, otlptracehttp.WithEndpointURL(endpoint)) + case protocolGRPC: + return otlptracegrpc.New(ctx, otlptracegrpc.WithEndpointURL(endpoint)) + default: + logger.Errorw("unsupported OTLP protocol; falling back to grpc", "protocol", proto) + return otlptracegrpc.New(ctx, otlptracegrpc.WithEndpointURL(endpoint)) + } +} + +func (tp *TracerProvider) Shutdown(ctx context.Context) error { + if tp.shutdown != nil { + return tp.shutdown(ctx) + } + return nil +} + +func samplerFromEnv(logger *zap.SugaredLogger) sdktrace.Sampler { + name := os.Getenv(EnvTracesSampler) + argStr := os.Getenv(EnvTracesSamplerArg) + arg, err := strconv.ParseFloat(argStr, 64) + if err != nil && argStr != "" { + logger.Errorw("ignoring malformed sampler argument", "env", EnvTracesSamplerArg, "value", argStr) + } + if argStr == "" && (name == "traceidratio" || name == "parentbased_traceidratio") { + logger.Infow("ratio sampler selected without "+EnvTracesSamplerArg+"; defaulting to 0% sampling", "env", EnvTracesSampler, "value", name) + } + switch name { + case "always_on": + return sdktrace.AlwaysSample() + case "always_off": + return sdktrace.NeverSample() + case "traceidratio": + return sdktrace.TraceIDRatioBased(arg) + case "parentbased_always_on": + return sdktrace.ParentBased(sdktrace.AlwaysSample()) + case "parentbased_always_off": + return sdktrace.ParentBased(sdktrace.NeverSample()) + case "parentbased_traceidratio": + return sdktrace.ParentBased(sdktrace.TraceIDRatioBased(arg)) + } + logger.Warnw("unrecognized OTEL_TRACES_SAMPLER value; falling back to never sample", "value", name) + return sdktrace.NeverSample() +} diff --git a/pkg/tracing/provider_test.go b/pkg/tracing/provider_test.go new file mode 100644 index 0000000000..7604e30148 --- /dev/null +++ b/pkg/tracing/provider_test.go @@ -0,0 +1,156 @@ +package tracing + +import ( + "context" + "strings" + "testing" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/propagation" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + "go.uber.org/zap" + "gotest.tools/v3/assert" +) + +func nopLogger() *zap.SugaredLogger { + return zap.NewNop().Sugar() +} + +func saveGlobalProvider(t *testing.T) { + t.Helper() + prevTP := otel.GetTracerProvider() + prevProp := otel.GetTextMapPropagator() + t.Cleanup(func() { + otel.SetTracerProvider(prevTP) + otel.SetTextMapPropagator(prevProp) + }) +} + +func TestNewReturnsNoopWhenEndpointUnset(t *testing.T) { + saveGlobalProvider(t) + t.Setenv(EnvOTLPEndpoint, "") + t.Setenv(EnvTracesSampler, "always_on") + + tp := New(nopLogger()) + assert.Assert(t, tp != nil) + + _, isSDK := otel.GetTracerProvider().(*sdktrace.TracerProvider) + assert.Assert(t, !isSDK, "should not install SDK provider when endpoint unset") +} + +func TestNewReturnsNoopWhenSamplerUnset(t *testing.T) { + saveGlobalProvider(t) + t.Setenv(EnvOTLPEndpoint, "http://localhost:4317") + t.Setenv(EnvTracesSampler, "") + + tp := New(nopLogger()) + assert.Assert(t, tp != nil) + + _, isSDK := otel.GetTracerProvider().(*sdktrace.TracerProvider) + assert.Assert(t, !isSDK, "should not install SDK provider when sampler unset") +} + +func TestNewInstallsSDKAndW3CPropagatorOnGRPC(t *testing.T) { + saveGlobalProvider(t) + t.Setenv(EnvOTLPEndpoint, "http://localhost:4317") + t.Setenv(EnvTracesSampler, "parentbased_always_on") + t.Setenv(EnvOTLPProtocol, "grpc") + + tp := New(nopLogger()) + assert.Assert(t, tp != nil) + + _, isSDK := otel.GetTracerProvider().(*sdktrace.TracerProvider) + assert.Assert(t, isSDK, "should install SDK provider when endpoint and sampler are both set") + + _, isW3C := otel.GetTextMapPropagator().(propagation.TraceContext) + assert.Assert(t, isW3C, "should set W3C TraceContext as the global propagator") + + assert.NilError(t, tp.Shutdown(context.Background())) +} + +func TestNewInstallsSDKOnHTTPProtobuf(t *testing.T) { + saveGlobalProvider(t) + t.Setenv(EnvOTLPEndpoint, "http://localhost:4318") + t.Setenv(EnvTracesSampler, "parentbased_traceidratio") + t.Setenv(EnvTracesSamplerArg, "0.5") + t.Setenv(EnvOTLPProtocol, "http/protobuf") + + tp := New(nopLogger()) + + _, isSDK := otel.GetTracerProvider().(*sdktrace.TracerProvider) + assert.Assert(t, isSDK, "http/protobuf protocol should also install an SDK provider") + + assert.NilError(t, tp.Shutdown(context.Background())) +} + +func TestProtocolFromEnv(t *testing.T) { + tests := []struct { + name string + tracesProtocol string + otlpProtocol string + want string + }{ + {"defaults to grpc when neither is set", "", "", protocolGRPC}, + {"falls back to OTLPProtocol when the traces-specific var is unset", "", "http/protobuf", "http/protobuf"}, + {"traces-specific takes precedence over the generic var", "grpc", "http/protobuf", "grpc"}, + {"OTLPProtocol grpc applies", "", "grpc", "grpc"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv(EnvOTLPTracesProtocol, tt.tracesProtocol) + t.Setenv(EnvOTLPProtocol, tt.otlpProtocol) + assert.Equal(t, protocolFromEnv(), tt.want) + }) + } +} + +func TestSamplerFromEnv(t *testing.T) { + tests := []struct { + name string + sampler string + arg string + wantDescPrefix string + }{ + {"always_on", "always_on", "", "AlwaysOnSampler"}, + {"always_off", "always_off", "", "AlwaysOffSampler"}, + {"traceidratio half", "traceidratio", "0.5", "TraceIDRatioBased{0.5}"}, + {"parentbased_always_on", "parentbased_always_on", "", "ParentBased{root:AlwaysOnSampler"}, + {"parentbased_always_off", "parentbased_always_off", "", "ParentBased{root:AlwaysOffSampler"}, + {"parentbased_traceidratio one tenth", "parentbased_traceidratio", "0.1", "ParentBased{root:TraceIDRatioBased{0.1}"}, + {"unrecognized falls back to never sample", "some-future-otel-keyword", "", "AlwaysOffSampler"}, + {"empty value falls back to never sample", "", "", "AlwaysOffSampler"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv(EnvTracesSampler, tt.sampler) + t.Setenv(EnvTracesSamplerArg, tt.arg) + s := samplerFromEnv(nopLogger()) + assert.Assert(t, strings.Contains(s.Description(), tt.wantDescPrefix), + "expected sampler description containing %q, got %q", tt.wantDescPrefix, s.Description()) + }) + } +} + +func TestShutdownReturnsNilForNoopProvider(t *testing.T) { + saveGlobalProvider(t) + t.Setenv(EnvOTLPEndpoint, "") + t.Setenv(EnvTracesSampler, "") + + tp := New(nopLogger()) + assert.NilError(t, tp.Shutdown(context.Background())) +} + +func TestShutdownIsIdempotent(t *testing.T) { + saveGlobalProvider(t) + t.Setenv(EnvOTLPEndpoint, "http://localhost:4317") + t.Setenv(EnvTracesSampler, "always_on") + + tp := New(nopLogger()) + assert.NilError(t, tp.Shutdown(context.Background())) + assert.NilError(t, tp.Shutdown(context.Background()), "second shutdown must not panic or error") +} + +func TestShutdownOnProviderWithoutHookReturnsNil(t *testing.T) { + tp := &TracerProvider{} + assert.NilError(t, tp.Shutdown(context.Background())) +}