diff --git a/config/305-config-observability.yaml b/config/305-config-observability.yaml index bfa697624..78e2b009a 100644 --- a/config/305-config-observability.yaml +++ b/config/305-config-observability.yaml @@ -52,20 +52,6 @@ data: # Only applicable for grpc and http/protobuf protocols. # metrics-export-interval: "30s" - # tracing-protocol specifies the trace export protocol. - # Supported values: "grpc", "http/protobuf", "none". - # Default is "none" (tracing disabled). - # tracing-protocol: "none" - - # tracing-endpoint specifies the OTLP collector endpoint. - # Required when tracing-protocol is "grpc" or "http/protobuf". - # The OTEL_EXPORTER_OTLP_ENDPOINT env var takes precedence if set. - # tracing-endpoint: "http://otel-collector.observability.svc.cluster.local:4317" - - # tracing-sampling-rate controls the fraction of traces sampled. - # 0.0 = none, 1.0 = all. Default is 0 (none). - # tracing-sampling-rate: "1.0" - # runtime-profiling enables/disables the pprof profiling server on port 8008. # Supported values: "enabled", "disabled" (default). # runtime-profiling: "disabled" diff --git a/docs/content/docs/operations/tracing.md b/docs/content/docs/operations/tracing.md index a3e3aa3e5..8e53b7a9a 100644 --- a/docs/content/docs/operations/tracing.md +++ b/docs/content/docs/operations/tracing.md @@ -7,35 +7,24 @@ This page describes how to enable OpenTelemetry distributed tracing for Pipeline ## Enabling tracing -The ConfigMap `pipelines-as-code-config-observability` controls tracing configuration. It must exist in the same namespace as the Pipelines-as-Code controller and watcher deployments. See [config/305-config-observability.yaml](https://github.com/tektoncd/pipelines-as-code/blob/main/config/305-config-observability.yaml) for the full example. +Pipelines-as-Code reads tracing configuration from standard OpenTelemetry environment variables on the controller and watcher pods: -It contains the following tracing fields: +* `OTEL_EXPORTER_OTLP_ENDPOINT`: OTLP collector endpoint URL. Required to enable tracing. +* `OTEL_TRACES_SAMPLER`: Sampler family. Required to enable tracing. Supported values: `always_on`, `always_off`, `traceidratio`, `parentbased_always_on`, `parentbased_always_off`, `parentbased_traceidratio`. +* `OTEL_TRACES_SAMPLER_ARG`: Numeric argument for ratio-based samplers. Set to `0.1` with `OTEL_TRACES_SAMPLER=parentbased_traceidratio` to sample 10% of root traces while keeping the chain coherent on those that are kept. +* `OTEL_EXPORTER_OTLP_PROTOCOL` (or traces-specific `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL`): OTLP transport. Supported values: `grpc`, `http/protobuf`. Default: `grpc`. -* `tracing-protocol`: Export protocol. Supported values: `grpc`, `http/protobuf`, `none`. Default is `none` (tracing disabled). -* `tracing-endpoint`: OTLP collector endpoint. Required when protocol is not `none`. The `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable takes precedence if set. -* `tracing-sampling-rate`: Fraction of traces to sample. `0.0` = none, `1.0` = all. Default is `0`. +Both `OTEL_EXPORTER_OTLP_ENDPOINT` and `OTEL_TRACES_SAMPLER` must be set to opt in to tracing. If either is unset PaC falls back to a noop tracer that emits no spans and incurs no exporter cost. Changes to any of these env vars take effect on the next pod restart. -### Example +PaC honors inbound `traceparent` headers on incoming webhook requests via the W3C TraceContext propagator. OTel Baggage is intentionally not honored: every emission point in PaC already has the attributes it needs from the local PipelineRun and webhook event, so no cross-service attribute propagation channel is needed. -```yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: pipelines-as-code-config-observability - namespace: pipelines-as-code -data: - tracing-protocol: grpc - tracing-endpoint: "http://otel-collector.observability.svc.cluster.local:4317" - tracing-sampling-rate: "1.0" -``` +### Sampler choice and chain coherency -Changes to `tracing-protocol`, `tracing-endpoint`, and `tracing-sampling-rate` require restarting the controller and watcher pods. The trace exporter is created once at startup from the ConfigMap values at that time. Set `tracing-protocol` to `none` or remove the tracing keys to disable tracing. - -The controller and watcher locate this ConfigMap by name via the `CONFIG_OBSERVABILITY_NAME` environment variable set in their deployment manifests. Operator-based installations may manage this differently; consult the operator documentation for details. +The `parentbased_*` sampler family honors the parent span's sample decision carried in the W3C `traceparent` flag bit. When PaC, Tekton, integration-service, and release-service all use parent-based samplers, the root span's sampling decision propagates through the whole delivery chain: every service either keeps its spans or drops them based on what the root chose. Flat-rate samplers (`traceidratio` without parent-based) cause each service to roll independently, which at fractional sampling fragments the chain into orphaned spans whose `parent_spanID` references a span that was dropped. `parentbased_always_on` keeps everything; `parentbased_traceidratio` with a numeric argument samples a coherent fraction. ## Emitted spans -The controller emits a `PipelinesAsCode:ProcessEvent` span for each webhook event. The watcher emits `waitDuration` and `executeDuration` spans for completed PipelineRuns. +The controller emits a `PipelinesAsCode:ProcessEvent` span for each webhook event. The watcher emits `waitDuration` and `executeDuration` spans for completed PipelineRuns. The OTel resource attribute `service.name` on all emitted spans is `pipelines-as-code`. ### Webhook event span (`PipelinesAsCode:ProcessEvent`) @@ -109,7 +98,7 @@ If Tekton Pipelines is also configured with tracing pointing at the same collect ## Deploying a trace collector -Pipelines-as-Code exports traces using the standard OpenTelemetry Protocol (OTLP). You need a running OTLP-compatible collector for the `tracing-endpoint` to point to. Common options include: +Pipelines-as-Code exports traces using the standard OpenTelemetry Protocol (OTLP). You need a running OTLP-compatible collector for `OTEL_EXPORTER_OTLP_ENDPOINT` to point to. Common options include: * [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) -- the vendor-neutral reference collector * [Jaeger](https://www.jaegertracing.io/docs/latest/getting-started/) -- supports OTLP ingestion natively since v1.35 diff --git a/go.mod b/go.mod index 9a4613c9d..4bc837e60 100644 --- a/go.mod +++ b/go.mod @@ -30,6 +30,8 @@ require ( github.com/tektoncd/pipeline v1.11.1 gitlab.com/gitlab-org/api/client-go v1.46.0 go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 go.opentelemetry.io/otel/metric v1.43.0 go.opentelemetry.io/otel/sdk v1.43.0 go.opentelemetry.io/otel/sdk/metric v1.43.0 @@ -90,8 +92,6 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 // indirect go.opentelemetry.io/otel/exporters/prometheus v0.65.0 // indirect go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0 // indirect go.opentelemetry.io/proto/otlp v1.10.0 // indirect diff --git a/pkg/adapter/adapter.go b/pkg/adapter/adapter.go index a2cb2925d..d7d93d56c 100644 --- a/pkg/adapter/adapter.go +++ b/pkg/adapter/adapter.go @@ -79,6 +79,13 @@ func New(run *params.Run, k *kubeinteraction.Interaction) adapter.AdapterConstru } func (l *listener) Start(ctx context.Context) error { + tp := tracing.New(l.logger) + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = tp.Shutdown(shutdownCtx) + }() + adapterPort := globalAdapterPort envAdapterPort := os.Getenv("PAC_CONTROLLER_PORT") if envAdapterPort != "" { diff --git a/pkg/reconciler/controller.go b/pkg/reconciler/controller.go index 3221231e7..53cff1ffa 100644 --- a/pkg/reconciler/controller.go +++ b/pkg/reconciler/controller.go @@ -3,6 +3,7 @@ package reconciler import ( "context" "path" + "time" "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode" "github.com/openshift-pipelines/pipelines-as-code/pkg/apis/pipelinesascode/keys" @@ -14,6 +15,7 @@ import ( "github.com/openshift-pipelines/pipelines-as-code/pkg/params/info" prmetrics "github.com/openshift-pipelines/pipelines-as-code/pkg/pipelinerunmetrics" queuepkg "github.com/openshift-pipelines/pipelines-as-code/pkg/queue" + "github.com/openshift-pipelines/pipelines-as-code/pkg/tracing" tektonv1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1" tektonPipelineRunInformerv1 "github.com/tektoncd/pipeline/pkg/client/injection/informers/pipeline/v1/pipelinerun" tektonPipelineRunReconcilerv1 "github.com/tektoncd/pipeline/pkg/client/injection/reconciler/pipeline/v1/pipelinerun" @@ -30,6 +32,14 @@ func NewController() func(context.Context, configmap.Watcher) *controller.Impl { ctx = info.StoreNS(ctx, system.Namespace()) log := logging.FromContext(ctx) + tp := tracing.New(log) + go func() { + <-ctx.Done() + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = tp.Shutdown(shutdownCtx) + }() + run := params.New() err := run.Clients.NewClients(ctx, &run.Info) if err != nil { diff --git a/pkg/tracing/provider.go b/pkg/tracing/provider.go new file mode 100644 index 000000000..9365e4f3e --- /dev/null +++ b/pkg/tracing/provider.go @@ -0,0 +1,136 @@ +package tracing + +import ( + "context" + "os" + "strconv" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.40.0" + "go.uber.org/zap" +) + +const ( + EnvOTLPEndpoint = "OTEL_EXPORTER_OTLP_ENDPOINT" + EnvOTLPProtocol = "OTEL_EXPORTER_OTLP_PROTOCOL" + EnvOTLPTracesProtocol = "OTEL_EXPORTER_OTLP_TRACES_PROTOCOL" + EnvTracesSampler = "OTEL_TRACES_SAMPLER" + EnvTracesSamplerArg = "OTEL_TRACES_SAMPLER_ARG" + + protocolGRPC = "grpc" + protocolHTTP = "http/protobuf" +) + +type TracerProvider struct { + shutdown func(context.Context) error +} + +func New(logger *zap.SugaredLogger) *TracerProvider { + if os.Getenv(EnvOTLPEndpoint) == "" { + logger.Info("OTLP endpoint not configured, using noop tracer provider") + return noopProvider() + } + if os.Getenv(EnvTracesSampler) == "" { + logger.Info("OTEL_TRACES_SAMPLER not set, tracing disabled (set explicitly to opt in)") + return noopProvider() + } + + exporter, err := newExporter(context.Background(), logger) + if err != nil { + logger.Errorw("failed to create OTLP exporter, using noop tracer provider", "error", err) + return noopProvider() + } + + res, err := resource.Merge( + resource.Default(), + resource.NewWithAttributes( + semconv.SchemaURL, + semconv.ServiceName(TracerName), + ), + ) + if err != nil { + logger.Errorw("failed to create resource", "error", err) + res = resource.Default() + } + + tp := sdktrace.NewTracerProvider( + sdktrace.WithBatcher(exporter), + sdktrace.WithResource(res), + sdktrace.WithSampler(samplerFromEnv(logger)), + ) + + otel.SetTracerProvider(tp) + otel.SetTextMapPropagator(propagation.TraceContext{}) + + logger.Infow("tracing initialized", "endpoint", os.Getenv(EnvOTLPEndpoint), "protocol", protocolFromEnv()) + + return &TracerProvider{shutdown: tp.Shutdown} +} + +func noopProvider() *TracerProvider { + return &TracerProvider{shutdown: func(context.Context) error { return nil }} +} + +func protocolFromEnv() string { + if v := os.Getenv(EnvOTLPTracesProtocol); v != "" { + return v + } + if v := os.Getenv(EnvOTLPProtocol); v != "" { + return v + } + return protocolGRPC +} + +func newExporter(ctx context.Context, logger *zap.SugaredLogger) (sdktrace.SpanExporter, error) { + endpoint := os.Getenv(EnvOTLPEndpoint) + proto := protocolFromEnv() + switch proto { + case protocolHTTP: + return otlptracehttp.New(ctx, otlptracehttp.WithEndpointURL(endpoint)) + case protocolGRPC: + return otlptracegrpc.New(ctx, otlptracegrpc.WithEndpointURL(endpoint)) + default: + logger.Errorw("unsupported OTLP protocol; falling back to grpc", "protocol", proto) + return otlptracegrpc.New(ctx, otlptracegrpc.WithEndpointURL(endpoint)) + } +} + +func (tp *TracerProvider) Shutdown(ctx context.Context) error { + if tp.shutdown != nil { + return tp.shutdown(ctx) + } + return nil +} + +func samplerFromEnv(logger *zap.SugaredLogger) sdktrace.Sampler { + name := os.Getenv(EnvTracesSampler) + argStr := os.Getenv(EnvTracesSamplerArg) + arg, err := strconv.ParseFloat(argStr, 64) + if err != nil && argStr != "" { + logger.Errorw("ignoring malformed sampler argument", "env", EnvTracesSamplerArg, "value", argStr) + } + if argStr == "" && (name == "traceidratio" || name == "parentbased_traceidratio") { + logger.Infow("ratio sampler selected without "+EnvTracesSamplerArg+"; defaulting to 0% sampling", "env", EnvTracesSampler, "value", name) + } + switch name { + case "always_on": + return sdktrace.AlwaysSample() + case "always_off": + return sdktrace.NeverSample() + case "traceidratio": + return sdktrace.TraceIDRatioBased(arg) + case "parentbased_always_on": + return sdktrace.ParentBased(sdktrace.AlwaysSample()) + case "parentbased_always_off": + return sdktrace.ParentBased(sdktrace.NeverSample()) + case "parentbased_traceidratio": + return sdktrace.ParentBased(sdktrace.TraceIDRatioBased(arg)) + } + logger.Warnw("unrecognized OTEL_TRACES_SAMPLER value; falling back to never sample", "value", name) + return sdktrace.NeverSample() +} diff --git a/pkg/tracing/provider_test.go b/pkg/tracing/provider_test.go new file mode 100644 index 000000000..7604e3014 --- /dev/null +++ b/pkg/tracing/provider_test.go @@ -0,0 +1,156 @@ +package tracing + +import ( + "context" + "strings" + "testing" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/propagation" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + "go.uber.org/zap" + "gotest.tools/v3/assert" +) + +func nopLogger() *zap.SugaredLogger { + return zap.NewNop().Sugar() +} + +func saveGlobalProvider(t *testing.T) { + t.Helper() + prevTP := otel.GetTracerProvider() + prevProp := otel.GetTextMapPropagator() + t.Cleanup(func() { + otel.SetTracerProvider(prevTP) + otel.SetTextMapPropagator(prevProp) + }) +} + +func TestNewReturnsNoopWhenEndpointUnset(t *testing.T) { + saveGlobalProvider(t) + t.Setenv(EnvOTLPEndpoint, "") + t.Setenv(EnvTracesSampler, "always_on") + + tp := New(nopLogger()) + assert.Assert(t, tp != nil) + + _, isSDK := otel.GetTracerProvider().(*sdktrace.TracerProvider) + assert.Assert(t, !isSDK, "should not install SDK provider when endpoint unset") +} + +func TestNewReturnsNoopWhenSamplerUnset(t *testing.T) { + saveGlobalProvider(t) + t.Setenv(EnvOTLPEndpoint, "http://localhost:4317") + t.Setenv(EnvTracesSampler, "") + + tp := New(nopLogger()) + assert.Assert(t, tp != nil) + + _, isSDK := otel.GetTracerProvider().(*sdktrace.TracerProvider) + assert.Assert(t, !isSDK, "should not install SDK provider when sampler unset") +} + +func TestNewInstallsSDKAndW3CPropagatorOnGRPC(t *testing.T) { + saveGlobalProvider(t) + t.Setenv(EnvOTLPEndpoint, "http://localhost:4317") + t.Setenv(EnvTracesSampler, "parentbased_always_on") + t.Setenv(EnvOTLPProtocol, "grpc") + + tp := New(nopLogger()) + assert.Assert(t, tp != nil) + + _, isSDK := otel.GetTracerProvider().(*sdktrace.TracerProvider) + assert.Assert(t, isSDK, "should install SDK provider when endpoint and sampler are both set") + + _, isW3C := otel.GetTextMapPropagator().(propagation.TraceContext) + assert.Assert(t, isW3C, "should set W3C TraceContext as the global propagator") + + assert.NilError(t, tp.Shutdown(context.Background())) +} + +func TestNewInstallsSDKOnHTTPProtobuf(t *testing.T) { + saveGlobalProvider(t) + t.Setenv(EnvOTLPEndpoint, "http://localhost:4318") + t.Setenv(EnvTracesSampler, "parentbased_traceidratio") + t.Setenv(EnvTracesSamplerArg, "0.5") + t.Setenv(EnvOTLPProtocol, "http/protobuf") + + tp := New(nopLogger()) + + _, isSDK := otel.GetTracerProvider().(*sdktrace.TracerProvider) + assert.Assert(t, isSDK, "http/protobuf protocol should also install an SDK provider") + + assert.NilError(t, tp.Shutdown(context.Background())) +} + +func TestProtocolFromEnv(t *testing.T) { + tests := []struct { + name string + tracesProtocol string + otlpProtocol string + want string + }{ + {"defaults to grpc when neither is set", "", "", protocolGRPC}, + {"falls back to OTLPProtocol when the traces-specific var is unset", "", "http/protobuf", "http/protobuf"}, + {"traces-specific takes precedence over the generic var", "grpc", "http/protobuf", "grpc"}, + {"OTLPProtocol grpc applies", "", "grpc", "grpc"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv(EnvOTLPTracesProtocol, tt.tracesProtocol) + t.Setenv(EnvOTLPProtocol, tt.otlpProtocol) + assert.Equal(t, protocolFromEnv(), tt.want) + }) + } +} + +func TestSamplerFromEnv(t *testing.T) { + tests := []struct { + name string + sampler string + arg string + wantDescPrefix string + }{ + {"always_on", "always_on", "", "AlwaysOnSampler"}, + {"always_off", "always_off", "", "AlwaysOffSampler"}, + {"traceidratio half", "traceidratio", "0.5", "TraceIDRatioBased{0.5}"}, + {"parentbased_always_on", "parentbased_always_on", "", "ParentBased{root:AlwaysOnSampler"}, + {"parentbased_always_off", "parentbased_always_off", "", "ParentBased{root:AlwaysOffSampler"}, + {"parentbased_traceidratio one tenth", "parentbased_traceidratio", "0.1", "ParentBased{root:TraceIDRatioBased{0.1}"}, + {"unrecognized falls back to never sample", "some-future-otel-keyword", "", "AlwaysOffSampler"}, + {"empty value falls back to never sample", "", "", "AlwaysOffSampler"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv(EnvTracesSampler, tt.sampler) + t.Setenv(EnvTracesSamplerArg, tt.arg) + s := samplerFromEnv(nopLogger()) + assert.Assert(t, strings.Contains(s.Description(), tt.wantDescPrefix), + "expected sampler description containing %q, got %q", tt.wantDescPrefix, s.Description()) + }) + } +} + +func TestShutdownReturnsNilForNoopProvider(t *testing.T) { + saveGlobalProvider(t) + t.Setenv(EnvOTLPEndpoint, "") + t.Setenv(EnvTracesSampler, "") + + tp := New(nopLogger()) + assert.NilError(t, tp.Shutdown(context.Background())) +} + +func TestShutdownIsIdempotent(t *testing.T) { + saveGlobalProvider(t) + t.Setenv(EnvOTLPEndpoint, "http://localhost:4317") + t.Setenv(EnvTracesSampler, "always_on") + + tp := New(nopLogger()) + assert.NilError(t, tp.Shutdown(context.Background())) + assert.NilError(t, tp.Shutdown(context.Background()), "second shutdown must not panic or error") +} + +func TestShutdownOnProviderWithoutHookReturnsNil(t *testing.T) { + tp := &TracerProvider{} + assert.NilError(t, tp.Shutdown(context.Background())) +}