diff --git a/roadmap-planner/backend/cmd/server/main.go b/roadmap-planner/backend/cmd/server/main.go index a62bf5e7..69833785 100644 --- a/roadmap-planner/backend/cmd/server/main.go +++ b/roadmap-planner/backend/cmd/server/main.go @@ -80,24 +80,38 @@ func main() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - // Initialize metrics system if enabled + // Open the persistent store once and share it across both the + // metrics collector (DORA Lead Time needs PR data) and the + // team-analytics path. nil when storage is disabled — both consumers + // soft-fail in that mode. + var sharedStore storage.Store + if cfg.Storage.Enabled { + s, err := openStore(cfg.Storage) + if err != nil { + logger.Error("Failed to open store", zap.Error(err)) + } else if err := s.Migrate(ctx); err != nil { + logger.Error("Failed to migrate store", zap.Error(err)) + } else { + logger.Info("Persistent store ready", + zap.String("type", cfg.Storage.Type), + zap.Int("backfill_days", cfg.Storage.BackfillDays)) + sharedStore = s + } + } + // Initialize metrics system if enabled. if cfg.Metrics.Enabled { logger.Info("Initializing metrics system") - err = initMetrics(ctx, router, cfg) - if err != nil { + if err := initMetrics(ctx, router, cfg, sharedStore); err != nil { logger.Error("Failed to initialize metrics system", zap.Error(err)) } } - // Initialize team-analytics storage + GitHub sync if enabled. - // - // We open the store once and let it close on process exit; the - // GitHub syncer runs on a configurable interval in its own goroutine. - // Both are no-ops when the corresponding config blocks are off, so - // existing deployments stay unchanged. - if cfg.Storage.Enabled { - if err := initTeamAnalytics(ctx, router, cfg); err != nil { + // Initialize team-analytics + GitHub sync if both storage is open + // and the config block is enabled. The store handle is shared with + // the metrics collector above. + if cfg.Storage.Enabled && sharedStore != nil { + if err := initTeamAnalytics(ctx, router, cfg, sharedStore); err != nil { logger.Error("Failed to initialize team analytics", zap.Error(err)) } } @@ -136,8 +150,10 @@ func main() { logger.Info("Server exited") } -// initMetrics initializes the metrics system if enabled in config -func initMetrics(ctx context.Context, router *gin.Engine, cfg *config.Config) error { +// initMetrics initializes the metrics system if enabled in config. +// store may be nil (storage disabled) — in that case PR-backed metrics +// like DORA Lead Time stage attribution silently degrade to empty. +func initMetrics(ctx context.Context, router *gin.Engine, cfg *config.Config, store storage.Store) error { if cfg.Jira.BaseURL == "" || cfg.Jira.Username == "" || cfg.Jira.Password == "" { logger.Warn("Metrics enabled but Jira credentials not configured in config file") return nil @@ -153,8 +169,9 @@ func initMetrics(ctx context.Context, router *gin.Engine, cfg *config.Config) er logger.Error("Failed to create Jira client for metrics", zap.Error(err)) return err } - // Create collector and service - collector := metrics.NewCollector(jiraClient, &cfg.Metrics) + // Create collector and service. The store handle (may be nil) gives + // the collector access to pull_requests rows for DORA Lead Time. + collector := metrics.NewCollector(jiraClient, store, &cfg.Metrics) metricsService := metrics.NewService(&cfg.Metrics, collector) // Register calculators @@ -189,22 +206,11 @@ func initMetrics(ctx context.Context, router *gin.Engine, cfg *config.Config) er return nil } -// initTeamAnalytics opens the persistent store, runs migrations, mounts -// the contributions REST endpoints, and (if configured) starts the -// GitHub sync loop. Failures here are non-fatal — the rest of the app -// continues to serve roadmap + metrics. -func initTeamAnalytics(ctx context.Context, router *gin.Engine, cfg *config.Config) error { - store, err := openStore(cfg.Storage) - if err != nil { - return fmt.Errorf("open store: %w", err) - } - if err := store.Migrate(ctx); err != nil { - return fmt.Errorf("migrate: %w", err) - } - logger.Info("Team analytics store ready", - zap.String("type", cfg.Storage.Type), - zap.Int("backfill_days", cfg.Storage.BackfillDays)) - +// initTeamAnalytics mounts the contributions REST endpoints and (if +// configured) starts the GitHub sync loop, using a store opened by the +// caller. Failures here are non-fatal — the rest of the app continues +// to serve roadmap + metrics. +func initTeamAnalytics(ctx context.Context, router *gin.Engine, cfg *config.Config, store storage.Store) error { service := contributions.NewService(store) pillarMap := contributions.NewPillarMap(cfg.TeamAnalytics) service.SetPillarMap(pillarMap) diff --git a/roadmap-planner/backend/config.example.yaml b/roadmap-planner/backend/config.example.yaml index a4358f19..6748ef0a 100644 --- a/roadmap-planner/backend/config.example.yaml +++ b/roadmap-planner/backend/config.example.yaml @@ -49,6 +49,15 @@ metrics: path: "/metrics" namespace: "roadmap" # Metric prefix: roadmap_dora_* + # Components dropped from all component dimensions (D6 — v3-era + # plugins must not dilute v4 team metrics). Exact component names as + # parsed from version names (e.g. katanomi-v3.1.0 -> katanomi). + exclude_plugins: + - katanomi + - knative + - jenkins + - tekton-operator + filters: - name: "releases" enabled: true diff --git a/roadmap-planner/backend/internal/api/handlers/metrics.go b/roadmap-planner/backend/internal/api/handlers/metrics.go index 195f642b..9456456c 100644 --- a/roadmap-planner/backend/internal/api/handlers/metrics.go +++ b/roadmap-planner/backend/internal/api/handlers/metrics.go @@ -56,6 +56,10 @@ func (h *MetricsHandler) ListMetrics(c *gin.Context) { // GetMetric returns a specific metric with optional filters // GET /api/metrics/:name +// +// Recognised query parameters in addition to component/pillar/quarter: +// include_bots — bool, Lead Time only (default false) +// with_trend — bool, Lead Time only (default true) func (h *MetricsHandler) GetMetric(c *gin.Context) { name := c.Param("name") @@ -69,7 +73,11 @@ func (h *MetricsHandler) GetMetric(c *gin.Context) { // Parse time range timeRange := h.parseTimeRange(c) - results, err := h.service.CalculateMetric(c.Request.Context(), name, filters, timeRange) + // Per-request options (Lead Time uses these; other calculators + // ignore them silently). + opts := parseMetricOptions(c) + + results, err := h.service.CalculateMetric(c.Request.Context(), name, filters, timeRange, opts) if err != nil { h.logger.Error("Failed to calculate metric", zap.String("metric", name), @@ -141,7 +149,33 @@ func (h *MetricsHandler) GetCollectorStatus(c *gin.Context) { }) } -// parseTimeRange parses the from/to query parameters into a TimeRange +// parseMetricOptions extracts calculator-specific query flags (only the +// keys known today are surfaced; new flags are added as calculators +// learn to read them from data.Options). +func parseMetricOptions(c *gin.Context) map[string]interface{} { + opts := map[string]interface{}{} + if v := c.Query("include_bots"); v != "" { + opts["include_bots"] = v == "true" || v == "1" || v == "yes" + } + if v := c.Query("with_trend"); v != "" { + opts["with_trend"] = v == "true" || v == "1" || v == "yes" + } + if len(opts) == 0 { + return nil + } + return opts +} + +// parseTimeRange parses the from/to query parameters into a TimeRange. +// +// When `from` is omitted, the default Start mirrors +// metrics.historical_days so the API window matches what the collector +// actually has in memory. A previous implementation hard-coded one +// year, which caused under-reporting on deployments configured with a +// smaller historical window: the collector dropped PRs outside +// HistoricalDays + lookback, but the API still asked for a full year, +// so 270d–365d-old releases were classified as "no linked PRs" and +// Lead Time was distorted (DORA P2 review). func (h *MetricsHandler) parseTimeRange(c *gin.Context) models.TimeRange { var timeRange models.TimeRange @@ -169,9 +203,15 @@ func (h *MetricsHandler) parseTimeRange(c *gin.Context) models.TimeRange { timeRange.End = time.Now() } - // Default to 1 year ago if start is not specified + // Default Start = HistoricalDays back. Falls back to 365 if + // HistoricalDays is unset or non-positive (matches the historical + // default and the value in config.example.yaml). if timeRange.Start.IsZero() { - timeRange.Start = timeRange.End.AddDate(-1, 0, 0) + days := 365 + if h.config != nil && h.config.Metrics.HistoricalDays > 0 { + days = h.config.Metrics.HistoricalDays + } + timeRange.Start = timeRange.End.AddDate(0, 0, -days) } return timeRange diff --git a/roadmap-planner/backend/internal/api/handlers/metrics_test.go b/roadmap-planner/backend/internal/api/handlers/metrics_test.go new file mode 100644 index 00000000..07d4baa1 --- /dev/null +++ b/roadmap-planner/backend/internal/api/handlers/metrics_test.go @@ -0,0 +1,91 @@ +/* +Copyright 2024 The AlaudaDevops Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package handlers + +import ( + "net/http/httptest" + "testing" + "time" + + "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/config" + "github.com/gin-gonic/gin" +) + +// TestParseTimeRange_DefaultMatchesHistoricalDays guards the P2 fix: +// when `from` is omitted, the API default window must mirror +// metrics.historical_days, not a hard-coded 1 year. The collector +// loads PRs over a window sized to HistoricalDays + lookback, so a +// 1-year API default against HistoricalDays=90 silently under-reports +// Lead Time for releases 270d–365d ago whose PRs were never loaded. +func TestParseTimeRange_DefaultMatchesHistoricalDays(t *testing.T) { + gin.SetMode(gin.TestMode) + + cases := []struct { + name string + historicalDays int + wantDays int + }{ + {"90-day deployment", 90, 90}, + {"default 365-day deployment", 365, 365}, + {"180-day deployment", 180, 180}, + {"zero falls back to 365", 0, 365}, + {"negative falls back to 365", -1, 365}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + h := &MetricsHandler{ + config: &config.Config{ + Metrics: config.Metrics{HistoricalDays: tc.historicalDays}, + }, + } + ctx, _ := gin.CreateTestContext(httptest.NewRecorder()) + ctx.Request = httptest.NewRequest("GET", "/api/metrics/lead_time_to_release", nil) + + before := time.Now() + got := h.parseTimeRange(ctx) + after := time.Now() + + // End defaults to now — between `before` and `after`. + if got.End.Before(before) || got.End.After(after) { + t.Errorf("End = %v, want between %v and %v", got.End, before, after) + } + // Start = End - HistoricalDays. Use a small tolerance for + // the few microseconds between End and our `before` capture. + wantStart := got.End.AddDate(0, 0, -tc.wantDays) + delta := got.Start.Sub(wantStart) + if delta < -time.Second || delta > time.Second { + t.Errorf("Start = %v, want %v (HistoricalDays=%d → %d days back)", + got.Start, wantStart, tc.historicalDays, tc.wantDays) + } + }) + } +} + +// TestParseTimeRange_ExplicitFromOverridesHistoricalDays makes sure +// callers can still query outside HistoricalDays by passing `from` +// explicitly — the default only fires when the param is absent. +func TestParseTimeRange_ExplicitFromOverridesHistoricalDays(t *testing.T) { + gin.SetMode(gin.TestMode) + + h := &MetricsHandler{ + config: &config.Config{Metrics: config.Metrics{HistoricalDays: 30}}, + } + ctx, _ := gin.CreateTestContext(httptest.NewRecorder()) + ctx.Request = httptest.NewRequest("GET", "/x?from=2025-01-15&to=2025-06-15", nil) + + got := h.parseTimeRange(ctx) + wantStart := time.Date(2025, 1, 15, 0, 0, 0, 0, time.UTC) + wantEnd := time.Date(2025, 6, 15, 0, 0, 0, 0, time.UTC) + if !got.Start.Equal(wantStart) || !got.End.Equal(wantEnd) { + t.Errorf("explicit range = (%v, %v), want (%v, %v) — caller-supplied from/to must win over the HistoricalDays default", + got.Start, got.End, wantStart, wantEnd) + } +} diff --git a/roadmap-planner/backend/internal/config/config.go b/roadmap-planner/backend/internal/config/config.go index 4f868b76..04d8e533 100644 --- a/roadmap-planner/backend/internal/config/config.go +++ b/roadmap-planner/backend/internal/config/config.go @@ -274,6 +274,22 @@ type Metrics struct { Prometheus PrometheusConfig `mapstructure:"prometheus"` Filters []OptionsConfig `mapstructure:"filters"` Calculators []OptionsConfig `mapstructure:"calculators"` + // ExcludePlugins lists component names dropped from all component + // dimensions (releases and issue components) during collection. + // Implements plan D6 — v3-era plugins (katanomi, knative, jenkins, + // tekton-operator) must not dilute v4 team metrics. Exact match. + ExcludePlugins []string `mapstructure:"exclude_plugins"` +} + +// IsPluginExcluded reports whether a component name is in the +// exclude_plugins list. +func (c *Metrics) IsPluginExcluded(name string) bool { + for _, p := range c.ExcludePlugins { + if p == name { + return true + } + } + return false } // PrometheusConfig represents Prometheus exporter configuration diff --git a/roadmap-planner/backend/internal/github/client.go b/roadmap-planner/backend/internal/github/client.go index 0e99dd22..bb6fc70b 100644 --- a/roadmap-planner/backend/internal/github/client.go +++ b/roadmap-planner/backend/internal/github/client.go @@ -187,6 +187,37 @@ func (c *Client) ListReviews(ctx context.Context, owner, repo string, number int return out, nil } +// PRCommit is the slice of the PR-commits resource we use for Lead Time. +// We only persist the earliest commit's author date (see Lead Time +// calculator), so the struct stays narrow on purpose. +type PRCommit struct { + SHA string `json:"sha"` + Commit struct { + Author struct { + Date time.Time `json:"date"` + } `json:"author"` + } `json:"commit"` +} + +// ListPRCommits returns the commits associated with one PR. We use this +// to derive `pull_requests.first_commit_at` for the DORA Lead Time +// calculator — the commit author date is the only field needed, but the +// endpoint returns the full graph from base to head so we read it all +// and let the caller pick min(author.date). +// +// We fetch a single page of up to 100 commits. PRs with more than 100 +// commits are vanishingly rare; if we ever need to handle them, paginate +// here. GitHub's default per_page on this endpoint is 30, so the +// per_page=100 override matters. +func (c *Client) ListPRCommits(ctx context.Context, owner, repo string, number int) ([]PRCommit, error) { + path := fmt.Sprintf("/repos/%s/%s/pulls/%d/commits?per_page=100", owner, repo, number) + var out []PRCommit + if err := c.do(ctx, "GET", path, nil, &out); err != nil { + return nil, fmt.Errorf("list PR commits %s/%s#%d: %w", owner, repo, number, err) + } + return out, nil +} + // do is the request engine. It: // // 1. Sleeps before issuing if the rate-limit budget is near-exhausted. diff --git a/roadmap-planner/backend/internal/github/client_test.go b/roadmap-planner/backend/internal/github/client_test.go new file mode 100644 index 00000000..43b79d43 --- /dev/null +++ b/roadmap-planner/backend/internal/github/client_test.go @@ -0,0 +1,76 @@ +/* +Copyright 2024 The AlaudaDevops Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +*/ + +package github + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +// TestListPRCommits_ReturnsAuthorDates verifies that the PR-commits +// endpoint is decoded into PRCommit{} with the commit.author.date +// field preserved as time.Time. Callers (sync + Lead Time calculator) +// rely on this to compute pull_requests.first_commit_at as the +// minimum across all returned commits. +func TestListPRCommits_ReturnsAuthorDates(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if got, want := r.URL.Path, "/repos/AlaudaDevops/tektoncd-operator/pulls/42/commits"; got != want { + t.Errorf("path = %q, want %q", got, want) + } + if got := r.URL.Query().Get("per_page"); got != "100" { + t.Errorf("per_page = %q, want 100", got) + } + w.Header().Set("X-RateLimit-Remaining", "5000") + w.Header().Set("Content-Type", "application/json") + // Three commits in graph order (base → head). Middle commit + // has an *earlier* author.date than the first to exercise the + // "caller picks min" contract. + _, _ = w.Write([]byte(`[ + { + "sha": "aaaa", + "commit": {"author": {"date": "2026-04-10T09:00:00Z"}} + }, + { + "sha": "bbbb", + "commit": {"author": {"date": "2026-04-08T14:30:00Z"}} + }, + { + "sha": "cccc", + "commit": {"author": {"date": "2026-04-12T11:00:00Z"}} + } + ]`)) + })) + t.Cleanup(srv.Close) + + c := New(srv.URL, "tok", srv.Client()) + commits, err := c.ListPRCommits(context.Background(), "AlaudaDevops", "tektoncd-operator", 42) + if err != nil { + t.Fatalf("ListPRCommits: %v", err) + } + if len(commits) != 3 { + t.Fatalf("len(commits)=%d, want 3", len(commits)) + } + if commits[0].SHA != "aaaa" || commits[1].SHA != "bbbb" || commits[2].SHA != "cccc" { + t.Fatalf("commit SHAs out of order: %+v", commits) + } + + // Simulate the caller's min(author.date) logic. + var firstCommit time.Time + for _, cm := range commits { + if firstCommit.IsZero() || cm.Commit.Author.Date.Before(firstCommit) { + firstCommit = cm.Commit.Author.Date + } + } + want := time.Date(2026, 4, 8, 14, 30, 0, 0, time.UTC) + if !firstCommit.Equal(want) { + t.Fatalf("min(author.date) = %s, want %s", firstCommit, want) + } +} diff --git a/roadmap-planner/backend/internal/github/sync.go b/roadmap-planner/backend/internal/github/sync.go index fd0ea1d2..a52b6c67 100644 --- a/roadmap-planner/backend/internal/github/sync.go +++ b/roadmap-planner/backend/internal/github/sync.go @@ -11,6 +11,7 @@ import ( "context" "fmt" "regexp" + "strconv" "strings" "time" @@ -129,17 +130,39 @@ func NewSyncer(client *Client, store storage.Store, repos []RepoConfig, linker L } } +// firstCommitBackfillBatch caps how many historical PRs we rehydrate +// per Sync cycle. Migration 0009 added first_commit_at, but the +// incremental PR fetch only sees PRs whose `updated_at` lands inside +// the cycle window — so PRs merged before 0009 keep NULL forever +// unless we revisit them explicitly. Cap chosen so a 5k-row backlog +// drains in ≤25 cycles (~2h at the default 5-minute interval) without +// dominating the API budget against the regular incremental sync. +const firstCommitBackfillBatch = 200 + // Sync pulls PRs updated since the last successful run for each repo, // resolves authors against members, attempts epic linking, and persists. // // Reviews are fetched only for PRs we have not seen before (new) or that // merged since the previous sync — this caps the per-cycle review-API // fan-out to "new + recently-merged" rather than the entire history. +// +// Before the incremental pass, Sync runs a bounded backfill for PRs +// still missing first_commit_at (DORA Lead Time Dev-stage start). See +// backfillFirstCommit for the rationale — without it, history ingested +// before migration 0009 stays C3 in Lead Time and the Dev stage is +// under-attributed. func (s *Syncer) Sync(ctx context.Context) error { if len(s.repos) == 0 { return nil } + // Best-effort: a backfill failure must not block the regular sync, + // which still produces fresh data for everything outside the + // post-0009 backlog. + if err := s.backfillFirstCommit(ctx); err != nil { + s.logger.Warn("first_commit_at backfill failed (continuing with incremental sync)", zap.Error(err)) + } + // Resolve OWNER/* specs against the org-repos endpoint before we // start the per-repo loop. A wildcard-expansion failure is recorded // but does not abort the cycle: any explicit + already-resolved @@ -245,15 +268,38 @@ func (s *Syncer) Sync(ctx context.Context) error { // - PRs closed without merge >7d ago (rarely accrue new // reviews, and they dominate the call budget on noisy // repos during backfill) - // - // We always fetch reviews for merged PRs in window — those - // drive the review-latency rollup. - skipReviews := pr.Draft - if !skipReviews && pr.MergedAt == nil && pr.ClosedAt != nil && + skipPRAPIs := pr.Draft + if !skipPRAPIs && pr.MergedAt == nil && pr.ClosedAt != nil && time.Since(*pr.ClosedAt) > 7*24*time.Hour { - skipReviews = true + skipPRAPIs = true } - if !skipReviews { + if pr.MergedAt != nil { + // Commits → first_commit_at (DORA Lead Time Dev-stage start). + // Only merged PRs are read by the Lead Time calculator + // (filterPreReleasePRs drops MergedAt==nil), and + // ListPullRequestsSince filters merged_at IS NOT NULL — + // fetching commits for open / closed-unmerged PRs is + // pure API budget burn. Failures are logged but never + // block the PR upsert: COALESCE in UpsertPullRequests + // preserves any earlier value, and the post-0009 + // backfill (backfillFirstCommit) catches up on rows + // missed by transient errors. + commits, err := s.client.ListPRCommits(ctx, repo.Owner, repo.Name, pr.Number) + if err != nil { + s.logger.Warn("ListPRCommits failed", + zap.String("repo", full), zap.Int("pr", pr.Number), zap.Error(err)) + } else { + var firstCommit *time.Time + for _, cm := range commits { + d := cm.Commit.Author.Date + if firstCommit == nil || d.Before(*firstCommit) { + firstCommit = &d + } + } + rec.FirstCommitAt = firstCommit + } + } + if !skipPRAPIs { reviews, err := s.client.ListReviews(ctx, repo.Owner, repo.Name, pr.Number) if err != nil { s.logger.Warn("ListReviews failed", @@ -322,3 +368,89 @@ func (s *Syncer) Sync(ctx context.Context) error { zap.Duration("duration", time.Since(runStart))) return firstErr } + +// backfillFirstCommit fetches `first_commit_at` for at most +// firstCommitBackfillBatch already-merged PRs whose row predates +// migration 0009. The query condition is the state machine: a row +// stays in the result set until first_commit_at is populated, so +// running this every cycle is naturally idempotent and converges +// after a handful of syncs. +// +// API errors on individual rows are logged and skipped — leaving the +// column NULL lets the next cycle retry. Rows where the API returns +// no commits (e.g. a fork with rewritten squash history) also stay +// NULL; the Lead Time C3 fallback handles them. +func (s *Syncer) backfillFirstCommit(ctx context.Context) error { + since := time.Now().AddDate(0, 0, -s.backfillDays) + prs, err := s.store.ListMergedPRsMissingFirstCommit(ctx, "github", since, firstCommitBackfillBatch) + if err != nil { + return err + } + if len(prs) == 0 { + return nil + } + s.logger.Info("github backfill first_commit_at", + zap.Int("batch", len(prs)), + zap.Int("backfill_days", s.backfillDays)) + + updated, failed := 0, 0 + for _, p := range prs { + owner, name, number, ok := parsePRID(p.ID) + if !ok { + s.logger.Warn("backfill: unparseable PR id (skipping)", zap.String("pr_id", p.ID)) + failed++ + continue + } + commits, err := s.client.ListPRCommits(ctx, owner, name, number) + if err != nil { + s.logger.Warn("backfill: ListPRCommits failed", + zap.String("pr_id", p.ID), zap.Error(err)) + failed++ + continue + } + var firstCommit *time.Time + for _, cm := range commits { + d := cm.Commit.Author.Date + if firstCommit == nil || d.Before(*firstCommit) { + firstCommit = &d + } + } + // firstCommit may be nil here — API returned no commits. We + // still skip the UPDATE: writing NULL is a no-op, but a single + // write call against a 0-commit edge case earns nothing and + // keeps the row eligible for one cheap retry per cycle. + if firstCommit == nil { + continue + } + if err := s.store.UpdatePRFirstCommitAt(ctx, p.ID, firstCommit); err != nil { + s.logger.Warn("backfill: UpdatePRFirstCommitAt failed", + zap.String("pr_id", p.ID), zap.Error(err)) + failed++ + continue + } + updated++ + } + s.logger.Info("github backfill first_commit_at complete", + zap.Int("updated", updated), zap.Int("failed", failed)) + return nil +} + +// parsePRID splits a github PR id of the form "owner/name#number". +// Owner/name may contain dashes, dots, and underscores; number is the +// integer PR number. Returns ok=false on any structural mismatch. +func parsePRID(id string) (owner, name string, number int, ok bool) { + hash := strings.LastIndexByte(id, '#') + if hash <= 0 || hash == len(id)-1 { + return "", "", 0, false + } + fullRepo := id[:hash] + slash := strings.IndexByte(fullRepo, '/') + if slash <= 0 || slash == len(fullRepo)-1 { + return "", "", 0, false + } + n, err := strconv.Atoi(id[hash+1:]) + if err != nil || n <= 0 { + return "", "", 0, false + } + return fullRepo[:slash], fullRepo[slash+1:], n, true +} diff --git a/roadmap-planner/backend/internal/github/sync_backfill_test.go b/roadmap-planner/backend/internal/github/sync_backfill_test.go new file mode 100644 index 00000000..1f77bea2 --- /dev/null +++ b/roadmap-planner/backend/internal/github/sync_backfill_test.go @@ -0,0 +1,156 @@ +/* +Copyright 2024 The AlaudaDevops Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package github + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "path/filepath" + "testing" + "time" + + "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/storage" +) + +// TestParsePRID locks the PR id grammar (owner/name#number) the +// post-0009 backfill depends on. Whoever changes the id shape must +// update parsePRID too — otherwise backfillFirstCommit silently +// skips every row. +func TestParsePRID(t *testing.T) { + cases := []struct { + in string + wantOK bool + wantOwner string + wantName string + wantNum int + }{ + {"AlaudaDevops/toolbox#173", true, "AlaudaDevops", "toolbox", 173}, + {"alaudadevops/tektoncd-operator#9999", true, "alaudadevops", "tektoncd-operator", 9999}, + {"org/with.dots_and-dashes#1", true, "org", "with.dots_and-dashes", 1}, + {"missing-number-marker", false, "", "", 0}, + {"org/repo#", false, "", "", 0}, + {"#42", false, "", "", 0}, + {"org/repo#abc", false, "", "", 0}, + {"org/repo#-1", false, "", "", 0}, + {"#", false, "", "", 0}, + {"", false, "", "", 0}, + } + for _, tc := range cases { + t.Run(tc.in, func(t *testing.T) { + gotOwner, gotName, gotNum, gotOK := parsePRID(tc.in) + if gotOK != tc.wantOK { + t.Fatalf("ok = %v, want %v", gotOK, tc.wantOK) + } + if !tc.wantOK { + return + } + if gotOwner != tc.wantOwner || gotName != tc.wantName || gotNum != tc.wantNum { + t.Errorf("parsed = (%q, %q, %d), want (%q, %q, %d)", + gotOwner, gotName, gotNum, tc.wantOwner, tc.wantName, tc.wantNum) + } + }) + } +} + +// TestBackfillFirstCommit_EndToEnd exercises the full backfill path +// against a real SQLite store and a mocked PR-commits endpoint: +// - one PR with NULL first_commit_at in window → must be updated +// - one PR already populated → must stay untouched +// - one PR whose commits API errors → row stays NULL, +// other rows still get updated (single failure must not abort) +// This is the regression barrier for the P2 review: without the +// backfill the first row stays NULL and Lead Time misclassifies it +// as C3. +func TestBackfillFirstCommit_EndToEnd(t *testing.T) { + dir := t.TempDir() + store, err := storage.OpenSQLite(filepath.Join(dir, "test.db")) + if err != nil { + t.Fatalf("open store: %v", err) + } + t.Cleanup(func() { _ = store.Close() }) + + ctx := context.Background() + if err := store.Migrate(ctx); err != nil { + t.Fatalf("migrate: %v", err) + } + + now := time.Now().UTC().Truncate(time.Second) + merged := now.AddDate(0, 0, -10) + existing := merged.Add(-3 * 24 * time.Hour) + + if err := store.UpsertPullRequests(ctx, []storage.PullRequest{ + { // target — must get backfilled + ID: "org/repo#1", Source: "github", RepoID: "org/repo", Number: 1, + Title: "needs backfill", State: "merged", + CreatedAt: merged.Add(-time.Hour), MergedAt: &merged, FetchedAt: now, + }, + { // already populated — must be left alone + ID: "org/repo#2", Source: "github", RepoID: "org/repo", Number: 2, + Title: "already has commit", State: "merged", + CreatedAt: merged.Add(-time.Hour), + FirstCommitAt: &existing, + MergedAt: &merged, FetchedAt: now, + }, + { // commits API will error — must stay NULL but not abort batch + ID: "org/repo#3", Source: "github", RepoID: "org/repo", Number: 3, + Title: "api fails", State: "merged", + CreatedAt: merged.Add(-time.Hour), MergedAt: &merged, FetchedAt: now, + }, + }); err != nil { + t.Fatalf("seed PRs: %v", err) + } + + commit1Date := merged.Add(-5 * 24 * time.Hour) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("X-RateLimit-Remaining", "5000") + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/repos/org/repo/pulls/1/commits": + _, _ = fmt.Fprintf(w, `[{"sha":"a","commit":{"author":{"date":%q}}}]`, + commit1Date.Format(time.RFC3339)) + case "/repos/org/repo/pulls/3/commits": + http.Error(w, "boom", http.StatusInternalServerError) + default: + t.Errorf("unexpected commits request: %s", r.URL.Path) + http.Error(w, "not found", http.StatusNotFound) + } + })) + t.Cleanup(srv.Close) + + syncer := NewSyncer(New(srv.URL, "tok", srv.Client()), store, nil, nil, 30) + if err := syncer.backfillFirstCommit(ctx); err != nil { + t.Fatalf("backfillFirstCommit: %v", err) + } + + // Sanity-check: read the rows back via ListPullRequestsSince. + prs, err := store.ListPullRequestsSince(ctx, now.AddDate(0, 0, -60)) + if err != nil { + t.Fatalf("list PRs: %v", err) + } + got := map[string]*time.Time{} + for _, p := range prs { + got[p.ID] = p.FirstCommitAt + } + if got["org/repo#1"] == nil || !got["org/repo#1"].Equal(commit1Date) { + t.Errorf("org/repo#1 first_commit_at = %v, want %v (backfill failed)", + got["org/repo#1"], commit1Date) + } + if got["org/repo#2"] == nil || !got["org/repo#2"].Equal(existing) { + t.Errorf("org/repo#2 first_commit_at = %v, want %v (backfill overwrote existing value)", + got["org/repo#2"], existing) + } + if got["org/repo#3"] != nil { + t.Errorf("org/repo#3 first_commit_at = %v, want nil (API errored — row must stay eligible for retry)", + got["org/repo#3"]) + } +} diff --git a/roadmap-planner/backend/internal/github/sync_commits_gate_test.go b/roadmap-planner/backend/internal/github/sync_commits_gate_test.go new file mode 100644 index 00000000..40e97f70 --- /dev/null +++ b/roadmap-planner/backend/internal/github/sync_commits_gate_test.go @@ -0,0 +1,138 @@ +/* +Copyright 2024 The AlaudaDevops Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package github + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "path/filepath" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/storage" +) + +// TestSync_ListPRCommitsOnlyForMergedPRs guards the P2 fix: the +// commits endpoint must be called for merged PRs only, not for every +// non-draft / recently-closed PR. Reviews stay broader (same skip +// rules as before) since they accrue on open PRs too. +// +// The Lead Time calculator drops unmerged PRs (filterPreReleasePRs) +// and ListPullRequestsSince filters merged_at IS NOT NULL — fetching +// first_commit_at on open / closed-unmerged PRs is rate-budget waste. +func TestSync_ListPRCommitsOnlyForMergedPRs(t *testing.T) { + dir := t.TempDir() + store, err := storage.OpenSQLite(filepath.Join(dir, "test.db")) + if err != nil { + t.Fatalf("open store: %v", err) + } + t.Cleanup(func() { _ = store.Close() }) + + ctx := context.Background() + if err := store.Migrate(ctx); err != nil { + t.Fatalf("migrate: %v", err) + } + + now := time.Now().UTC() + merged := now.Add(-3 * 24 * time.Hour) + closedRecently := now.Add(-1 * 24 * time.Hour) // <7d → reviews still fetched + + // Four PRs covering the relevant skip permutations: + // #1 merged → commits + reviews + // #2 open non-draft → reviews only (no commits) + // #3 closed unmerged <7d → reviews only (no commits) + // #4 draft → neither (top-level skip) + prs := []PullRequest{ + {Number: 1, Title: "merged pr", State: "closed", Draft: false, + CreatedAt: now.Add(-5 * 24 * time.Hour), UpdatedAt: now, + MergedAt: &merged}, + {Number: 2, Title: "open pr", State: "open", Draft: false, + CreatedAt: now.Add(-2 * 24 * time.Hour), UpdatedAt: now}, + {Number: 3, Title: "closed unmerged recently", State: "closed", Draft: false, + CreatedAt: now.Add(-2 * 24 * time.Hour), UpdatedAt: now, + ClosedAt: &closedRecently}, + {Number: 4, Title: "draft pr", State: "open", Draft: true, + CreatedAt: now.Add(-1 * 24 * time.Hour), UpdatedAt: now}, + } + + var commitsHits, reviewsHits int32 + commitsSeen := make(map[int]bool) + reviewsSeen := make(map[int]bool) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("X-RateLimit-Remaining", "5000") + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/repos/org/repo/pulls": + page := r.URL.Query().Get("page") + if page != "1" { + _, _ = w.Write([]byte(`[]`)) + return + } + _ = json.NewEncoder(w).Encode(prs) + case strings.HasPrefix(r.URL.Path, "/repos/org/repo/pulls/") && + strings.HasSuffix(r.URL.Path, "/commits"): + atomic.AddInt32(&commitsHits, 1) + var n int + _, _ = fmt.Sscanf(r.URL.Path, "/repos/org/repo/pulls/%d/commits", &n) + commitsSeen[n] = true + _, _ = fmt.Fprintf(w, `[{"sha":"a","commit":{"author":{"date":%q}}}]`, + merged.Add(-2*24*time.Hour).Format(time.RFC3339)) + case strings.HasPrefix(r.URL.Path, "/repos/org/repo/pulls/") && + strings.HasSuffix(r.URL.Path, "/reviews"): + atomic.AddInt32(&reviewsHits, 1) + var n int + _, _ = fmt.Sscanf(r.URL.Path, "/repos/org/repo/pulls/%d/reviews", &n) + reviewsSeen[n] = true + _, _ = w.Write([]byte(`[]`)) + default: + t.Errorf("unexpected request: %s", r.URL.Path) + http.Error(w, "not found", http.StatusNotFound) + } + })) + t.Cleanup(srv.Close) + + syncer := NewSyncer( + New(srv.URL, "tok", srv.Client()), + store, + []RepoConfig{{Owner: "org", Name: "repo"}}, + nil, + 30, + ) + if err := syncer.Sync(ctx); err != nil { + t.Fatalf("Sync: %v", err) + } + + if got := atomic.LoadInt32(&commitsHits); got != 1 { + t.Errorf("commits endpoint hits = %d, want 1 (only merged PR #1)", got) + } + if !commitsSeen[1] { + t.Errorf("commits not called for merged PR #1: %+v", commitsSeen) + } + for _, n := range []int{2, 3, 4} { + if commitsSeen[n] { + t.Errorf("commits called for unmerged PR #%d — gate must skip non-merged PRs", n) + } + } + + // Reviews: PRs #1, #2, #3 (draft #4 still skipped). + if got := atomic.LoadInt32(&reviewsHits); got != 3 { + t.Errorf("reviews endpoint hits = %d, want 3 (skip only the draft #4)", got) + } + if reviewsSeen[4] { + t.Error("reviews fetched for draft PR #4 — draft skip regressed") + } +} diff --git a/roadmap-planner/backend/internal/gitlab/client.go b/roadmap-planner/backend/internal/gitlab/client.go index a31ba2b8..a6ef0395 100644 --- a/roadmap-planner/backend/internal/gitlab/client.go +++ b/roadmap-planner/backend/internal/gitlab/client.go @@ -377,6 +377,29 @@ func (c *Client) ListMRNotes(ctx context.Context, projectID int64, iid int) ([]N return out, nil } +// MRCommit is the slice of the MR-commits resource we use for Lead Time. +// We only need the earliest commit's authored_date (see Lead Time +// calculator), so the struct stays narrow on purpose. +type MRCommit struct { + SHA string `json:"id"` + AuthoredDate time.Time `json:"authored_date"` +} + +// ListMRCommits returns the commits associated with one MR. Mirrors the +// GitHub client's ListPRCommits: we read one page (up to 100 commits) +// and let the caller pick min(authored_date) for +// `pull_requests.first_commit_at` in the DORA Lead Time calculator. +// +// GitLab's default per_page is 20 — override matters. +func (c *Client) ListMRCommits(ctx context.Context, projectID int64, iid int) ([]MRCommit, error) { + path := fmt.Sprintf("/api/v4/projects/%d/merge_requests/%d/commits?per_page=100", projectID, iid) + var out []MRCommit + if err := c.do(ctx, "GET", path, nil, &out); err != nil { + return nil, fmt.Errorf("list MR commits %d!%d: %w", projectID, iid, err) + } + return out, nil +} + // SearchUsers does a /users?search= lookup. GitLab returns an // array even for an exact-email match, so callers should pick the entry // whose email matches case-insensitively (when their token has the diff --git a/roadmap-planner/backend/internal/gitlab/client_test.go b/roadmap-planner/backend/internal/gitlab/client_test.go new file mode 100644 index 00000000..19bfbaa2 --- /dev/null +++ b/roadmap-planner/backend/internal/gitlab/client_test.go @@ -0,0 +1,59 @@ +/* +Copyright 2024 The AlaudaDevops Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +*/ + +package gitlab + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +// TestListMRCommits_ReturnsAuthoredDates mirrors the GitHub PR-commits +// test: verifies the MR-commits endpoint is decoded into MRCommit{} +// with the authored_date field preserved as time.Time, and that +// callers can compute pull_requests.first_commit_at via min(). +func TestListMRCommits_ReturnsAuthoredDates(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if got, want := r.URL.Path, "/api/v4/projects/123/merge_requests/77/commits"; got != want { + t.Errorf("path = %q, want %q", got, want) + } + if got := r.URL.Query().Get("per_page"); got != "100" { + t.Errorf("per_page = %q, want 100", got) + } + w.Header().Set("Content-Type", "application/json") + // Three commits — middle one has the earliest authored_date. + _, _ = w.Write([]byte(`[ + {"id": "aaaa", "authored_date": "2026-04-10T09:00:00.000+00:00"}, + {"id": "bbbb", "authored_date": "2026-04-08T14:30:00.000+00:00"}, + {"id": "cccc", "authored_date": "2026-04-12T11:00:00.000+00:00"} + ]`)) + })) + t.Cleanup(srv.Close) + + c := New(srv.URL, "tok", srv.Client()) + commits, err := c.ListMRCommits(context.Background(), 123, 77) + if err != nil { + t.Fatalf("ListMRCommits: %v", err) + } + if len(commits) != 3 { + t.Fatalf("len(commits)=%d, want 3", len(commits)) + } + + var firstCommit time.Time + for _, cm := range commits { + if firstCommit.IsZero() || cm.AuthoredDate.Before(firstCommit) { + firstCommit = cm.AuthoredDate + } + } + want := time.Date(2026, 4, 8, 14, 30, 0, 0, time.UTC) + if !firstCommit.Equal(want) { + t.Fatalf("min(authored_date) = %s, want %s", firstCommit, want) + } +} diff --git a/roadmap-planner/backend/internal/gitlab/sync.go b/roadmap-planner/backend/internal/gitlab/sync.go index c0280584..d622f9d1 100644 --- a/roadmap-planner/backend/internal/gitlab/sync.go +++ b/roadmap-planner/backend/internal/gitlab/sync.go @@ -11,6 +11,7 @@ import ( "context" "fmt" "regexp" + "strconv" "strings" "sync" "time" @@ -178,17 +179,32 @@ func classifyNote(body string) (string, bool) { return "commented", true } +// firstCommitBackfillBatch caps how many historical MRs are rehydrated +// per Sync cycle. See github.firstCommitBackfillBatch for the rationale +// — same budget reasoning applies: keep the post-0009 catch-up bounded +// so it never starves the regular incremental sync. +const firstCommitBackfillBatch = 200 + // Sync runs one cycle: expand specs → list MRs updated since the // previous run → upsert PRs + notes-as-reviews → write a collection_run. // // The shape mirrors github.Syncer.Sync — see those comments for first- -// run vs. incremental behaviour. +// run vs. incremental behaviour. Sync also runs a bounded backfill of +// `first_commit_at` for already-merged MRs that the incremental window +// can never revisit (rows from before migration 0009). Without this the +// Lead Time calculator under-attributes the Dev stage on upgraded +// deployments — see backfillFirstCommit. func (s *Syncer) Sync(ctx context.Context) error { passBActive := s.MemberInstanceSweep && len(s.AllowedMemberIDs) > 0 if len(s.specs) == 0 && !passBActive { return nil } + // Best-effort: backfill errors must not block the regular sync. + if err := s.backfillFirstCommit(ctx); err != nil { + s.logger.Warn("first_commit_at backfill failed (continuing with incremental sync)", zap.Error(err)) + } + now := s.nowFn() var resolved []Project var resolveErr error @@ -299,14 +315,39 @@ func (s *Syncer) Sync(ctx context.Context) error { } } - // Notes → reviews. Same skip rules as the github side: drop - // drafts, drop closed-without-merge older than 7d. - skipNotes := mr.Draft || mr.WorkInProg - if !skipNotes && mr.MergedAt == nil && mr.ClosedAt != nil && + // Notes → reviews. Same skip rules as the github side: + // drop drafts, drop closed-without-merge older than 7d. + skipMRAPIs := mr.Draft || mr.WorkInProg + if !skipMRAPIs && mr.MergedAt == nil && mr.ClosedAt != nil && time.Since(*mr.ClosedAt) > 7*24*time.Hour { - skipNotes = true + skipMRAPIs = true } - if !skipNotes { + if mr.MergedAt != nil { + // Commits → first_commit_at. Only merged MRs are read + // by the Lead Time calculator (filterPreReleasePRs + // drops MergedAt==nil) and ListPullRequestsSince + // filters merged_at IS NOT NULL — fetching commits for + // open / closed-unmerged MRs is pure API budget burn, + // especially on Pass B sweeps over large GitLab + // instances. Failures are logged but never block the + // MR upsert; backfillFirstCommit catches up later. + commits, err := s.client.ListMRCommits(ctx, p.ID, mr.IID) + if err != nil { + s.logger.Warn("ListMRCommits failed", + zap.String("project", p.PathWithNamespace), + zap.Int("iid", mr.IID), zap.Error(err)) + } else { + var firstCommit *time.Time + for _, cm := range commits { + d := cm.AuthoredDate + if firstCommit == nil || d.Before(*firstCommit) { + firstCommit = &d + } + } + rec.FirstCommitAt = firstCommit + } + } + if !skipMRAPIs { notes, err := s.client.ListMRNotes(ctx, p.ID, mr.IID) if err != nil { s.logger.Warn("ListMRNotes failed", @@ -512,12 +553,32 @@ func (s *Syncer) sweepInstanceByMember( } } - skipNotes := mr.Draft || mr.WorkInProg - if !skipNotes && mr.MergedAt == nil && mr.ClosedAt != nil && + skipMRAPIs := mr.Draft || mr.WorkInProg + if !skipMRAPIs && mr.MergedAt == nil && mr.ClosedAt != nil && time.Since(*mr.ClosedAt) > 7*24*time.Hour { - skipNotes = true + skipMRAPIs = true } - if !skipNotes && mr.ProjectID > 0 { + if mr.MergedAt != nil && mr.ProjectID > 0 { + // Commits → first_commit_at. See Pass A comment: only + // merged MRs are read by Lead Time, so non-merged MRs + // would burn rate budget for data that is discarded. + commits, err := s.client.ListMRCommits(ctx, mr.ProjectID, mr.IID) + if err != nil { + s.logger.Warn("Pass B: ListMRCommits failed", + zap.String("project", projectPath), + zap.Int("iid", mr.IID), zap.Error(err)) + } else { + var firstCommit *time.Time + for _, cm := range commits { + d := cm.AuthoredDate + if firstCommit == nil || d.Before(*firstCommit) { + firstCommit = &d + } + } + rec.FirstCommitAt = firstCommit + } + } + if !skipMRAPIs && mr.ProjectID > 0 { notes, err := s.client.ListMRNotes(ctx, mr.ProjectID, mr.IID) if err != nil { s.logger.Warn("Pass B: ListMRNotes failed", @@ -604,3 +665,95 @@ func (s *Syncer) resolveProjects(ctx context.Context, now time.Time) ([]Project, } return out, firstErr } + +// backfillFirstCommit fetches `first_commit_at` for at most +// firstCommitBackfillBatch already-merged MRs whose row predates +// migration 0009. Mirrors github.Syncer.backfillFirstCommit; the only +// twist is GitLab's commits endpoint needs a numeric project_id while +// the storage row only carries the namespace path, so we cache +// path → projectID per cycle to avoid re-resolving the same project +// for every MR. +func (s *Syncer) backfillFirstCommit(ctx context.Context) error { + since := time.Now().AddDate(0, 0, -s.backfillDays) + prs, err := s.store.ListMergedPRsMissingFirstCommit(ctx, "gitlab", since, firstCommitBackfillBatch) + if err != nil { + return err + } + if len(prs) == 0 { + return nil + } + s.logger.Info("gitlab backfill first_commit_at", + zap.Int("batch", len(prs)), + zap.Int("backfill_days", s.backfillDays)) + + projectIDCache := make(map[string]int64, len(prs)) + updated, failed := 0, 0 + for _, p := range prs { + path, iid, ok := parseMRID(p.ID) + if !ok { + s.logger.Warn("backfill: unparseable MR id (skipping)", zap.String("mr_id", p.ID)) + failed++ + continue + } + // Prefer the stored RepoID — it's already the namespace path + // and stays correct even if the MR id changed shape across + // migrations. parseMRID is the fallback. + if p.RepoID != "" { + path = p.RepoID + } + projectID, ok := projectIDCache[path] + if !ok { + proj, err := s.client.GetProject(ctx, path) + if err != nil { + s.logger.Warn("backfill: GetProject failed", + zap.String("mr_id", p.ID), zap.String("path", path), zap.Error(err)) + failed++ + continue + } + projectID = proj.ID + projectIDCache[path] = projectID + } + commits, err := s.client.ListMRCommits(ctx, projectID, iid) + if err != nil { + s.logger.Warn("backfill: ListMRCommits failed", + zap.String("mr_id", p.ID), zap.Error(err)) + failed++ + continue + } + var firstCommit *time.Time + for _, cm := range commits { + d := cm.AuthoredDate + if firstCommit == nil || d.Before(*firstCommit) { + firstCommit = &d + } + } + if firstCommit == nil { + continue + } + if err := s.store.UpdatePRFirstCommitAt(ctx, p.ID, firstCommit); err != nil { + s.logger.Warn("backfill: UpdatePRFirstCommitAt failed", + zap.String("mr_id", p.ID), zap.Error(err)) + failed++ + continue + } + updated++ + } + s.logger.Info("gitlab backfill first_commit_at complete", + zap.Int("updated", updated), zap.Int("failed", failed)) + return nil +} + +// parseMRID splits a gitlab MR id of the form "group/sub/proj!iid". +// The project path may contain any number of "/" segments, so we split +// on the trailing "!" only. Returns ok=false on any structural mismatch. +func parseMRID(id string) (path string, iid int, ok bool) { + bang := strings.LastIndexByte(id, '!') + if bang <= 0 || bang == len(id)-1 { + return "", 0, false + } + n, err := strconv.Atoi(id[bang+1:]) + if err != nil || n <= 0 { + return "", 0, false + } + return id[:bang], n, true +} diff --git a/roadmap-planner/backend/internal/gitlab/sync_backfill_test.go b/roadmap-planner/backend/internal/gitlab/sync_backfill_test.go new file mode 100644 index 00000000..c35bc63d --- /dev/null +++ b/roadmap-planner/backend/internal/gitlab/sync_backfill_test.go @@ -0,0 +1,183 @@ +/* +Copyright 2026 The AlaudaDevops Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package gitlab + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/storage" +) + +// TestParseMRID locks the MR id grammar (group/.../proj!iid) the +// post-0009 backfill depends on. The project path is multi-segment +// (groups + subgroups), so we split on the trailing "!" only. +func TestParseMRID(t *testing.T) { + cases := []struct { + in string + wantOK bool + wantPath string + wantIID int + }{ + {"devops/edge!42", true, "devops/edge", 42}, + {"group/sub/sub/proj!9999", true, "group/sub/sub/proj", 9999}, + {"single!1", true, "single", 1}, + {"missing-bang", false, "", 0}, + {"trailing!", false, "", 0}, + {"!42", false, "", 0}, + {"path!abc", false, "", 0}, + {"path!-1", false, "", 0}, + {"path!0", false, "", 0}, + {"", false, "", 0}, + } + for _, tc := range cases { + t.Run(tc.in, func(t *testing.T) { + gotPath, gotIID, gotOK := parseMRID(tc.in) + if gotOK != tc.wantOK { + t.Fatalf("ok = %v, want %v", gotOK, tc.wantOK) + } + if !tc.wantOK { + return + } + if gotPath != tc.wantPath || gotIID != tc.wantIID { + t.Errorf("parsed = (%q, %d), want (%q, %d)", + gotPath, gotIID, tc.wantPath, tc.wantIID) + } + }) + } +} + +// TestBackfillFirstCommit_GitLab_EndToEnd mirrors the GitHub backfill +// end-to-end test against a real SQLite store and a mocked GitLab +// API. Verifies: +// - target row gets the min(authored_date) written, +// - already-populated row is untouched, +// - GetProject(path) is called once per project (cache hit on the +// second MR from the same project), +// - a commits-API error on one row does not abort the batch. +func TestBackfillFirstCommit_GitLab_EndToEnd(t *testing.T) { + dir := t.TempDir() + store, err := storage.OpenSQLite(filepath.Join(dir, "test.db")) + if err != nil { + t.Fatalf("open store: %v", err) + } + t.Cleanup(func() { _ = store.Close() }) + + ctx := context.Background() + if err := store.Migrate(ctx); err != nil { + t.Fatalf("migrate: %v", err) + } + + now := time.Now().UTC().Truncate(time.Second) + merged := now.AddDate(0, 0, -10) + existing := merged.Add(-3 * 24 * time.Hour) + + if err := store.UpsertPullRequests(ctx, []storage.PullRequest{ + { // target — must get backfilled + ID: "devops/edge!1", Source: "gitlab", RepoID: "devops/edge", Number: 1, + Title: "needs backfill", State: "merged", + CreatedAt: merged.Add(-time.Hour), MergedAt: &merged, FetchedAt: now, + }, + { // second MR same project → exercises projectID cache + ID: "devops/edge!2", Source: "gitlab", RepoID: "devops/edge", Number: 2, + Title: "same project", State: "merged", + CreatedAt: merged.Add(-30 * time.Minute), MergedAt: &merged, FetchedAt: now, + }, + { // already populated — must be left alone (and not show up in batch) + ID: "devops/edge!3", Source: "gitlab", RepoID: "devops/edge", Number: 3, + Title: "already has commit", State: "merged", + CreatedAt: merged.Add(-time.Hour), + FirstCommitAt: &existing, + MergedAt: &merged, FetchedAt: now, + }, + { // commits API will 500 — row stays NULL, batch must continue + ID: "devops/edge!4", Source: "gitlab", RepoID: "devops/edge", Number: 4, + Title: "api fails", State: "merged", + CreatedAt: merged.Add(-time.Hour), MergedAt: &merged, FetchedAt: now, + }, + }); err != nil { + t.Fatalf("seed PRs: %v", err) + } + + commit1Date := merged.Add(-5 * 24 * time.Hour) + commit2Date := merged.Add(-7 * 24 * time.Hour) + + var projectHits int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + // The client URL-escapes the project path, but Go's HTTP server + // decodes r.URL.Path. RawPath preserves the escaped form (when + // non-canonical) so we can match the wire format directly. + raw := r.URL.RawPath + if raw == "" { + raw = r.URL.Path + } + switch { + case raw == "/api/v4/projects/devops%2Fedge": + projectHits++ + _, _ = fmt.Fprintln(w, `{"id":42,"path_with_namespace":"devops/edge"}`) + case strings.HasPrefix(raw, "/api/v4/projects/42/merge_requests/"): + switch { + case strings.HasSuffix(raw, "/1/commits"): + _, _ = fmt.Fprintf(w, `[{"id":"a","authored_date":%q}]`, commit1Date.Format(time.RFC3339)) + case strings.HasSuffix(raw, "/2/commits"): + _, _ = fmt.Fprintf(w, `[{"id":"b","authored_date":%q}]`, commit2Date.Format(time.RFC3339)) + case strings.HasSuffix(raw, "/4/commits"): + http.Error(w, "boom", http.StatusInternalServerError) + default: + t.Errorf("unexpected commits path: %s", raw) + http.Error(w, "not found", http.StatusNotFound) + } + default: + t.Errorf("unexpected request: %s", raw) + http.Error(w, "not found", http.StatusNotFound) + } + })) + t.Cleanup(srv.Close) + + syncer := NewSyncer(New(srv.URL, "tok", srv.Client()), store, nil, nil, 30) + if err := syncer.backfillFirstCommit(ctx); err != nil { + t.Fatalf("backfillFirstCommit: %v", err) + } + + if projectHits != 1 { + t.Errorf("GetProject called %d times, want 1 (path→projectID cache must dedupe)", projectHits) + } + + prs, err := store.ListPullRequestsSince(ctx, now.AddDate(0, 0, -60)) + if err != nil { + t.Fatalf("list PRs: %v", err) + } + got := map[string]*time.Time{} + for _, p := range prs { + got[p.ID] = p.FirstCommitAt + } + if got["devops/edge!1"] == nil || !got["devops/edge!1"].Equal(commit1Date) { + t.Errorf("!1 first_commit_at = %v, want %v", got["devops/edge!1"], commit1Date) + } + if got["devops/edge!2"] == nil || !got["devops/edge!2"].Equal(commit2Date) { + t.Errorf("!2 first_commit_at = %v, want %v", got["devops/edge!2"], commit2Date) + } + if got["devops/edge!3"] == nil || !got["devops/edge!3"].Equal(existing) { + t.Errorf("!3 first_commit_at = %v, want %v (backfill overwrote existing value)", + got["devops/edge!3"], existing) + } + if got["devops/edge!4"] != nil { + t.Errorf("!4 first_commit_at = %v, want nil (API errored — row must stay eligible for retry)", + got["devops/edge!4"]) + } +} diff --git a/roadmap-planner/backend/internal/gitlab/sync_commits_gate_test.go b/roadmap-planner/backend/internal/gitlab/sync_commits_gate_test.go new file mode 100644 index 00000000..adee7c87 --- /dev/null +++ b/roadmap-planner/backend/internal/gitlab/sync_commits_gate_test.go @@ -0,0 +1,146 @@ +/* +Copyright 2026 The AlaudaDevops Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package gitlab + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "path/filepath" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/storage" +) + +// TestSync_ListMRCommitsOnlyForMergedMRs is the GitLab counterpart of +// TestSync_ListPRCommitsOnlyForMergedPRs. Pass A path only — Pass B +// is exercised in TestSyncPassBMemberSweep. Verifies: +// - commits endpoint hit exactly once (merged MR) +// - notes endpoint hit three times (everything except the draft) +// - draft skip still in place +func TestSync_ListMRCommitsOnlyForMergedMRs(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + store, err := storage.OpenSQLite(filepath.Join(dir, "gl.db")) + if err != nil { + t.Fatalf("open store: %v", err) + } + t.Cleanup(func() { _ = store.Close() }) + if err := store.Migrate(ctx); err != nil { + t.Fatalf("migrate: %v", err) + } + + now := time.Now().UTC().Truncate(time.Second) + merged := now.Add(-3 * 24 * time.Hour) + closedRecently := now.Add(-1 * 24 * time.Hour) + + const projectID = 42 + const projectPath = "devops/edge" + + // Four MRs covering the relevant skip permutations: + // 1 merged → commits + notes + // 2 open non-draft → notes only + // 3 closed unmerged <7d → notes only + // 4 draft → neither (top-level skip) + mrs := []MergeRequest{ + {IID: 1, ProjectID: projectID, Title: "merged mr", State: "merged", + CreatedAt: now.Add(-5 * 24 * time.Hour), UpdatedAt: now, + MergedAt: &merged}, + {IID: 2, ProjectID: projectID, Title: "open mr", State: "opened", + CreatedAt: now.Add(-2 * 24 * time.Hour), UpdatedAt: now}, + {IID: 3, ProjectID: projectID, Title: "closed unmerged recently", State: "closed", + CreatedAt: now.Add(-2 * 24 * time.Hour), UpdatedAt: now, + ClosedAt: &closedRecently}, + {IID: 4, ProjectID: projectID, Title: "draft", State: "opened", Draft: true, + CreatedAt: now.Add(-1 * 24 * time.Hour), UpdatedAt: now}, + } + + var commitsHits, notesHits int32 + commitsSeen := make(map[int]bool) + notesSeen := make(map[int]bool) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + raw := r.URL.RawPath + if raw == "" { + raw = r.URL.Path + } + switch { + // GetProject (URL-escaped path → "devops%2Fedge"). + case raw == "/api/v4/projects/devops%2Fedge": + _, _ = fmt.Fprintf(w, `{"id":%d,"path_with_namespace":%q}`, projectID, projectPath) + case r.URL.Path == fmt.Sprintf("/api/v4/projects/%d/merge_requests", projectID): + if r.URL.Query().Get("page") != "1" { + _, _ = w.Write([]byte(`[]`)) + return + } + _ = json.NewEncoder(w).Encode(mrs) + case strings.HasPrefix(r.URL.Path, fmt.Sprintf("/api/v4/projects/%d/merge_requests/", projectID)): + var iid int + tail := strings.TrimPrefix(r.URL.Path, + fmt.Sprintf("/api/v4/projects/%d/merge_requests/", projectID)) + switch { + case strings.HasSuffix(tail, "/commits"): + _, _ = fmt.Sscanf(tail, "%d/commits", &iid) + atomic.AddInt32(&commitsHits, 1) + commitsSeen[iid] = true + _, _ = fmt.Fprintf(w, `[{"id":"a","authored_date":%q}]`, + merged.Add(-2*24*time.Hour).Format(time.RFC3339)) + case strings.HasSuffix(tail, "/notes"): + _, _ = fmt.Sscanf(tail, "%d/notes", &iid) + atomic.AddInt32(¬esHits, 1) + notesSeen[iid] = true + _, _ = w.Write([]byte(`[]`)) + default: + t.Errorf("unexpected MR subpath: %s", r.URL.Path) + http.Error(w, "not found", http.StatusNotFound) + } + default: + t.Errorf("unexpected request: %s (raw %s)", r.URL.Path, raw) + http.Error(w, "not found", http.StatusNotFound) + } + })) + t.Cleanup(srv.Close) + + spec, ok := ParseGroupSpec(projectPath) + if !ok { + t.Fatalf("ParseGroupSpec failed") + } + syncer := NewSyncer(New(srv.URL, "tok", srv.Client()), store, []GroupSpec{spec}, nil, 30) + syncer.HydrateDiff = false // avoid an extra GetMergeRequest call we don't care about here + if err := syncer.Sync(ctx); err != nil { + t.Fatalf("Sync: %v", err) + } + + if got := atomic.LoadInt32(&commitsHits); got != 1 { + t.Errorf("commits endpoint hits = %d, want 1 (only merged MR #1)", got) + } + if !commitsSeen[1] { + t.Errorf("commits not called for merged MR #1: %+v", commitsSeen) + } + for _, n := range []int{2, 3, 4} { + if commitsSeen[n] { + t.Errorf("commits called for unmerged MR #%d — gate must skip non-merged MRs", n) + } + } + + if got := atomic.LoadInt32(¬esHits); got != 3 { + t.Errorf("notes endpoint hits = %d, want 3 (skip only the draft #4)", got) + } + if notesSeen[4] { + t.Error("notes fetched for draft MR #4 — draft skip regressed") + } +} diff --git a/roadmap-planner/backend/internal/metrics/calculators/interface.go b/roadmap-planner/backend/internal/metrics/calculators/interface.go index 106060fa..9cecf89a 100644 --- a/roadmap-planner/backend/internal/metrics/calculators/interface.go +++ b/roadmap-planner/backend/internal/metrics/calculators/interface.go @@ -134,6 +134,15 @@ func (b *BaseCalculator) GetStringOption(key string, defaultValue string) string return defaultValue } +// GetBoolOption retrieves a bool option with a default fallback. +func (b *BaseCalculator) GetBoolOption(key string, defaultValue bool) bool { + val := b.GetOption(key, defaultValue) + if v, ok := val.(bool); ok { + return v + } + return defaultValue +} + // GetStringSliceOption retrieves a string slice option with a default fallback func (b *BaseCalculator) GetStringSliceOption(key string, defaultValue []string) []string { val := b.GetOption(key, nil) diff --git a/roadmap-planner/backend/internal/metrics/calculators/lead_time.go b/roadmap-planner/backend/internal/metrics/calculators/lead_time.go index 195ff6c4..afb76315 100644 --- a/roadmap-planner/backend/internal/metrics/calculators/lead_time.go +++ b/roadmap-planner/backend/internal/metrics/calculators/lead_time.go @@ -14,124 +14,1011 @@ See the License for the specific language governing permissions and limitations under the License. */ +// Lead Time for Changes — DORA Phase 2 implementation. +// +// Algorithm reference: docs/plans/2026-05-21-dora-optimization-plan.md +// §Lead Time 3 段计算 · 详细设计 (§1–§11). All design decisions +// (D3, D14, D15, D16, D17) are honoured here. +// +// T0 = min(p.first_commit_at) ── earliest commit on linked PRs +// T1 = min(p.created_at) ── first PR opened +// T2 = max(p.merged_at) ── last PR merged +// T3 = issue.fix_version.releaseDate +// +// dev_s = T1 − T0 +// review_s = T2 − T1 +// release_s = T3 − T2 +// total_s = T3 − T0 +// +// Per-issue fallback (C1..C6) is documented in plan §4. Bottleneck +// rule (D15): stage_p50 / sum > 40% AND stage_p50 >= team baseline p75. +// worst_issues: each stage's top-1, three rows total. Trend (D17): +// month-bucket p50 + MoM/QoQ chips. + package calculators import ( "context" + "fmt" + "math" "sort" "time" "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/metrics/models" ) -// LeadTimeCalculator calculates the time from Epic creation to release +// Stage names — kept lowercase to match API contract (plan §8). +const ( + stageDev = "dev" + stageReview = "review" + stageRelease = "release" +) + +// Bottleneck rule thresholds (D15). +const ( + bottleneckShareThreshold = 0.40 +) + +// excludedLongDevThreshold ─ if T0 is earlier than (window.start - 180d) +// the issue is dropped (plan §9 E9). Keeps long-tail outliers from +// dragging Dev/Total p50. +const excludedLongDevThreshold = 180 * 24 * time.Hour + +// stagedIssue is the per-issue computation result. +type stagedIssue struct { + Key string + Issuetype string + Summary string + Component string + LinkedPRCount int + IsBotMajority bool + ReleaseDate time.Time + Case string + + // Stage durations in hours. NaN means the issue is excluded from + // that stage's distribution (typically C3 — no first_commit_at). + DevHours float64 + ReviewHours float64 + ReleaseHours float64 + TotalHours float64 +} + +// stageBreakdown is the aggregated per-stage view returned to the API. +type stageBreakdown struct { + Name string `json:"name"` + Label string `json:"label"` + Definition string `json:"definition"` + P50Hours float64 `json:"p50_hours"` + P75Hours float64 `json:"p75_hours"` + SampleCount int `json:"sample_count"` + Bottleneck bool `json:"bottleneck"` + BottleneckReason string `json:"bottleneck_reason,omitempty"` +} + +// worstIssue is a top-1 record for one stage. The API returns up to +// three (one per stage that has a candidate). +type worstIssue struct { + JiraKey string `json:"jira_key"` + Issuetype string `json:"issuetype"` + Summary string `json:"summary"` + BottleneckStage string `json:"bottleneck_stage"` + BottleneckDurationHours float64 `json:"bottleneck_duration_hours"` + TotalLeadTimeHours float64 `json:"total_lead_time_hours"` + PRCount int `json:"pr_count"` + IsBotMajority bool `json:"is_bot_majority"` +} + +// coverageStats reports how every issue was classified by the fallback +// matrix (plan §4 C1..C6). Exposed in the API so the UI can show +// "linked-PR coverage X% · N excluded". +type coverageStats struct { + IssuesTotal int `json:"issues_total"` + IssuesFull int `json:"issues_full"` // C1 + IssuesPartial int `json:"issues_partial"` // C2 + IssuesDevMissing int `json:"issues_dev_missing"` // C3 + IssuesNoPRs int `json:"issues_no_prs"` // C4 (excluded) + IssuesUnreleased int `json:"issues_unreleased"` // C5 (excluded, count-only) + IssuesDataAnomaly int `json:"issues_data_anomaly"` // C6 (excluded) + IssuesLongDev int `json:"issues_long_dev"` // E9 (excluded) + CoveragePct int `json:"coverage_pct"` +} + +// trendPoint is one month's bucket aggregate. P50Hours is *float64 so +// the API can encode "sample too small" as null (plan §11). +type trendPoint struct { + Period string `json:"period"` // "2026-05" + P50Hours *float64 `json:"p50_hours"` + SampleCount int `json:"sample_count"` + IsCurrent bool `json:"is_current,omitempty"` +} + +// trendData is the panel payload (D17). +type trendData struct { + Granularity string `json:"granularity"` + Points []trendPoint `json:"points"` + MoMPct *float64 `json:"mom_pct"` + QoQPct *float64 `json:"qoq_pct"` + MoMDirection string `json:"mom_direction"` + QoQDirection string `json:"qoq_direction"` +} + +// percentileStats packages the common quartile view. +type percentileStats struct { + P25Hours float64 `json:"p25_hours"` + P50Hours float64 `json:"p50_hours"` + P75Hours float64 `json:"p75_hours"` + P90Hours float64 `json:"p90_hours"` + SampleCount int `json:"sample_count"` +} + +// LeadTimeCalculator implements the DORA Lead Time for Changes metric +// with 3-stage attribution (Dev / Review / Release). type LeadTimeCalculator struct { BaseCalculator } -// NewLeadTimeCalculator creates a new lead time calculator +// NewLeadTimeCalculator wires the calculator with default options. +// +// Recognised options: +// +// include_bots (bool, default false) — when false, bot PRs are +// filtered out of T0/T1/T2 computation and is_bot_majority is +// evaluated against the original set. +// with_trend (bool, default true) — emit the trend block. func NewLeadTimeCalculator(options map[string]interface{}) *LeadTimeCalculator { return &LeadTimeCalculator{ BaseCalculator: NewBaseCalculator( "lead_time_to_release", - "Time from Epic creation to release date", + "Time from first commit on a linked PR to release", + // Unit stays "days" for backward compatibility with the + // existing MetricCard / MetricBreakdown / Prometheus + // consumers (DORA day thresholds, lead_time_days). The + // hour-precision values live in MetricResult.Metadata + // (total.p50_hours, stages[].p50_hours, trend.points[]). "days", - []string{"component", "pillar"}, + []string{"component"}, options, ), } } -// Calculate computes the lead time metric +// Calculate runs the full 3-stage attribution + trend computation. +// Per-request flags from data.Options (include_bots, with_trend) override +// the calculator's startup defaults. +// +// When no PR storage backend is configured (PRStoreAvailable == false), +// Calculate falls back to a Jira-only calendar lead time (issue.created +// → release.releaseDate, in days) and marks the result as degraded. +// This preserves the pre-Phase-2 Lead Time signal for minimal +// deployments where storage.enabled is still false. func (c *LeadTimeCalculator) Calculate(ctx context.Context, data *models.CalculationContext) ([]models.MetricResult, error) { - percentile := c.GetIntOption("percentile", 50) // Default to median + if !data.PRStoreAvailable { + return c.calculateJiraOnlyFallback(data), nil + } + includeBots := readBoolOpt(data.Options, "include_bots", c.GetBoolOption("include_bots", false)) + withTrend := readBoolOpt(data.Options, "with_trend", c.GetBoolOption("with_trend", true)) + + // Indexes for the per-issue join. + versionByName := indexReleases(data.Releases) + prsByKey := groupPRsByJiraKey(data.PullRequests, includeBots) + + // Two passes: + // 1) classify every issue, collect staged issues per component + // 2) aggregate, identify bottlenecks, pick worst_issues, build trend + componentBuckets := make(map[string][]stagedIssue) + componentCoverage := make(map[string]*coverageStats) - // Group epics by component - componentEpics := make(map[string][]models.EnrichedIssue) - for _, epic := range data.Epics { - // Skip epics without release date - if epic.ReleaseDate.IsZero() || epic.CreatedDate.IsZero() { + for _, iss := range mergeIssueLists(data.Epics, data.Issues) { + // Window check is by release date, not issue.created (DORA + // convention; plan §9 E6). + relDate, release := matchReleasedVersion(iss.Versions, versionByName, data.TimeRange) + if relDate.IsZero() { + recordCoverage(componentCoverage, "", "C5") continue } - - // Apply time range filter on release date - if epic.ReleaseDate.Before(data.TimeRange.Start) || epic.ReleaseDate.After(data.TimeRange.End) { + component := release.Component + if component == "" { + // Collector drops unparsable releases, but guard here too: a + // raw version name (e.g. "0.3") is not a component bucket. continue } - - for _, comp := range epic.Components { - // Apply component filter if specified - if len(data.Filters.Components) > 0 && !Contains(data.Filters.Components, comp) { - continue - } - componentEpics[comp] = append(componentEpics[comp], epic) + if len(data.Filters.Components) > 0 && !containsString(data.Filters.Components, component) { + continue } - // If epic has no components, use "unknown" - if len(epic.Components) == 0 { - componentEpics["unknown"] = append(componentEpics["unknown"], epic) + linked := prsByKey[iss.Key] + linked = filterPreReleasePRs(linked, relDate) + + si, kase := classifyIssue(iss, linked, relDate, component, data.TimeRange) + recordCoverage(componentCoverage, component, kase) + switch kase { + case "C1", "C2", "C3": + componentBuckets[component] = append(componentBuckets[component], si) } } - results := make([]models.MetricResult, 0, len(componentEpics)) + // Team baseline p75 per stage — drives the bottleneck rule (D15). + teamBaseline := computeTeamBaselineP75(componentBuckets) - for component, epics := range componentEpics { - var leadTimes []float64 - for _, epic := range epics { - leadTime := epic.ReleaseDate.Sub(epic.CreatedDate).Hours() / 24 // Convert to days - if leadTime >= 0 { - leadTimes = append(leadTimes, leadTime) - } + results := make([]models.MetricResult, 0, len(componentBuckets)) + for component, issues := range componentBuckets { + coverage := componentCoverage[component] + if coverage == nil { + coverage = &coverageStats{} } + coverage.CoveragePct = computeCoveragePct(coverage) - if len(leadTimes) == 0 { - continue + // Total percentiles over qualified issues. + totals := collectTotals(issues) + totalStats := percentileSummary(totals) + + stages := computeStageBreakdowns(issues, teamBaseline) + worst := pickWorstPerStage(issues, stages) + var consistencyWarning string + if !consistencyOK(stages, totalStats) { + consistencyWarning = fmt.Sprintf( + "sum(stages.p50)=%.0fh vs total.p50=%.0fh — fallback exclusion exceeds 5%% drift", + sumStageP50(stages), totalStats.P50Hours, + ) } - // Calculate percentile - sort.Float64s(leadTimes) - idx := int(float64(len(leadTimes)-1) * float64(percentile) / 100) - if idx >= len(leadTimes) { - idx = len(leadTimes) - 1 + var trend *trendData + if withTrend { + trend = buildTrend(issues, data.TimeRange) + } + + // Legacy metadata keys (days) consumed by MetricBreakdown.jsx + // and any other day-based reader. Sourced from `totals` (hours) + // and converted; structured hour-precision data stays in + // metadata.total / metadata.stages. + minDays, maxDays, avgDays := legacyDaySummary(totals) + + results = append(results, models.MetricResult{ + Name: c.Name(), + // Value is in days for the existing consumers (P1-1); the + // hour-precision values are in Metadata. + Value: totalStats.P50Hours / 24, + Unit: c.Unit(), + Labels: map[string]string{ + "component": component, + }, + Timestamp: time.Now(), + Metadata: map[string]interface{}{ + // New (hours-precision) structured payload. + "total": totalStats, + "stages": stages, + "worst_issues": worst, + "coverage": coverage, + "trend": trend, + "consistency_warning": stringPtrOrNil(consistencyWarning), + "include_bots": includeBots, + // Legacy backward-compat (days) for MetricBreakdown.jsx + // `min`/`max`/`count` reads. See P2 review fix. + "min": minDays, + "max": maxDays, + "average": avgDays, + "count": totalStats.SampleCount, + "sample_size": totalStats.SampleCount, + "percentile": 50, + }, + }) + } + + return results, nil +} + +// legacyDaySummary returns min/max/average of the totals slice +// (which is in hours) converted to days. Empty slice yields zeros. +func legacyDaySummary(totals []float64) (minDays, maxDays, avgDays float64) { + if len(totals) == 0 { + return 0, 0, 0 + } + sorted := append([]float64(nil), totals...) + sort.Float64s(sorted) + minDays = sorted[0] / 24 + maxDays = sorted[len(sorted)-1] / 24 + var sum float64 + for _, h := range sorted { + sum += h + } + avgDays = (sum / float64(len(sorted))) / 24 + return +} + +// calculateJiraOnlyFallback runs the pre-Phase-2 calendar Lead Time +// (issue.created → release.releaseDate, days) when no PR store is +// available. Output is intentionally minimal: Value + the legacy +// metadata keys (min/max/average/count) so MetricBreakdown.jsx renders +// numbers instead of N/A. Stage attribution, worst_issues, trend, and +// coverage are absent and metadata.degraded is set so the UI can warn +// the operator that storage.enabled is off. +// +// This breaks D3 (commit-centric start) — but only on minimal +// deployments where PR data is not even ingested. The trade-off is +// documented in plan §10. +func (c *LeadTimeCalculator) calculateJiraOnlyFallback(data *models.CalculationContext) []models.MetricResult { + versionByName := indexReleases(data.Releases) + componentBuckets := make(map[string][]float64) + + for _, iss := range mergeIssueLists(data.Epics, data.Issues) { + if iss.CreatedDate.IsZero() { + continue + } + relDate, release := matchReleasedVersion(iss.Versions, versionByName, data.TimeRange) + if relDate.IsZero() { + continue + } + component := release.Component + if component == "" { + // Same guard as the PR-backed path: unparsable version names + // are not component buckets. + continue + } + if len(data.Filters.Components) > 0 && !containsString(data.Filters.Components, component) { + continue + } + leadDays := relDate.Sub(iss.CreatedDate).Hours() / 24 + if leadDays < 0 { + continue } - value := leadTimes[idx] + componentBuckets[component] = append(componentBuckets[component], leadDays) + } - // Calculate additional stats + results := make([]models.MetricResult, 0, len(componentBuckets)) + for component, days := range componentBuckets { + if len(days) == 0 { + continue + } + sorted := append([]float64(nil), days...) + sort.Float64s(sorted) var sum float64 - for _, lt := range leadTimes { - sum += lt + for _, d := range sorted { + sum += d } - avg := sum / float64(len(leadTimes)) + avg := sum / float64(len(sorted)) results = append(results, models.MetricResult{ Name: c.Name(), - Value: value, + Value: percentile(sorted, 50), Unit: c.Unit(), Labels: map[string]string{ "component": component, }, Timestamp: time.Now(), Metadata: map[string]interface{}{ - "sample_size": len(leadTimes), - "percentile": percentile, - "min": leadTimes[0], - "max": leadTimes[len(leadTimes)-1], - "average": avg, + "min": sorted[0], + "max": sorted[len(sorted)-1], + "average": avg, + "count": len(sorted), + "sample_size": len(sorted), + "percentile": 50, + "degraded": "no_pr_store", + "degraded_reason": "PR storage is not enabled; falling back to issue.created → release.releaseDate calendar lead time (days). Stage attribution, worst_issues, trend, and coverage are unavailable until storage.enabled = true and a PR sync has run.", }, }) } - - return results, nil + return results } -// PrometheusMetrics returns the Prometheus metric descriptors +// PrometheusMetrics returns the Prometheus metric descriptors. func (c *LeadTimeCalculator) PrometheusMetrics() []models.PrometheusMetricDesc { return []models.PrometheusMetricDesc{ { - Name: "lead_time_days", - Help: "Lead time from epic creation to release in days", + Name: "lead_time_hours_p50", + Help: "Lead Time for Changes (DORA) p50, first commit → released, in hours", Type: "gauge", - LabelNames: []string{"component", "pillar"}, + LabelNames: []string{"component"}, }, } } + +// ───────────────────────────────────────────────────────────────────── +// Internal helpers — kept package-private; the structured payload is +// the only externally observable surface. +// ───────────────────────────────────────────────────────────────────── + +func indexReleases(rs []models.EnrichedRelease) map[string]models.EnrichedRelease { + out := make(map[string]models.EnrichedRelease, len(rs)) + for _, r := range rs { + out[r.Name] = r + } + return out +} + +func groupPRsByJiraKey(prs []models.EnrichedPR, includeBots bool) map[string][]models.EnrichedPR { + out := make(map[string][]models.EnrichedPR) + for _, p := range prs { + if p.JiraKey == "" { + continue + } + if !includeBots && p.IsBot { + continue + } + out[p.JiraKey] = append(out[p.JiraKey], p) + } + return out +} + +// mergeIssueLists yields Epics + Issues without duplicates by Key. +// Probe A confirms the bulk of fix_version-carrying issues are Story / +// Bug / Technical Debt (under `Issues`), not Epic — but we sweep both +// since the collector populates each slice from different JQL. +func mergeIssueLists(epics, issues []models.EnrichedIssue) []models.EnrichedIssue { + seen := make(map[string]struct{}, len(epics)+len(issues)) + out := make([]models.EnrichedIssue, 0, len(epics)+len(issues)) + for _, i := range epics { + if _, ok := seen[i.Key]; ok { + continue + } + seen[i.Key] = struct{}{} + out = append(out, i) + } + for _, i := range issues { + if _, ok := seen[i.Key]; ok { + continue + } + seen[i.Key] = struct{}{} + out = append(out, i) + } + return out +} + +// matchReleasedVersion picks the earliest released version on this +// issue whose releaseDate falls inside `window`. Returns the matching +// EnrichedRelease so callers can reuse its parsed Component field +// (set by collector via ConvertJiraVersionToVersion); zero values map +// to C5. +// +// The returned T3 is normalised to end-of-day (23:59:59.999999999). +// Jira version releaseDate only carries a calendar date, parsed at +// midnight; comparing PR merge timestamps directly would drop any PR +// merged later on the same calendar day as "after release", losing +// the last PR on release day for many shipped issues. +func matchReleasedVersion(versionNames []string, byName map[string]models.EnrichedRelease, window models.TimeRange) (time.Time, models.EnrichedRelease) { + var bestDate time.Time + var best models.EnrichedRelease + for _, name := range versionNames { + r, ok := byName[name] + if !ok || !r.Released || r.ReleaseDate.IsZero() { + continue + } + if r.ReleaseDate.Before(window.Start) || r.ReleaseDate.After(window.End) { + continue + } + if bestDate.IsZero() || r.ReleaseDate.Before(bestDate) { + bestDate = r.ReleaseDate + best = r + } + } + if !bestDate.IsZero() { + bestDate = endOfDay(bestDate) + } + return bestDate, best +} + +// endOfDay snaps a timestamp to 23:59:59.999999999 of the same calendar +// day in its original location. Used for Jira release date comparison; +// see matchReleasedVersion. +func endOfDay(t time.Time) time.Time { + return time.Date(t.Year(), t.Month(), t.Day(), 23, 59, 59, 999999999, t.Location()) +} + +// filterPreReleasePRs drops PRs whose merged_at is *after* the release +// date — those are hotfixes against the released version and belong to +// the *next* release window (plan §9 E3). +func filterPreReleasePRs(prs []models.EnrichedPR, releaseDate time.Time) []models.EnrichedPR { + out := prs[:0:0] // new slice, don't mutate caller + for _, p := range prs { + if p.MergedAt == nil || p.MergedAt.After(releaseDate) { + continue + } + out = append(out, p) + } + return out +} + +// classifyIssue runs the C1–C6 fallback matrix and produces a +// stagedIssue with stage durations populated where possible. +func classifyIssue(iss models.EnrichedIssue, linked []models.EnrichedPR, releaseDate time.Time, component string, window models.TimeRange) (stagedIssue, string) { + si := stagedIssue{ + Key: iss.Key, + Issuetype: iss.IssueType, + Summary: iss.Name, + Component: component, + LinkedPRCount: len(linked), + ReleaseDate: releaseDate, + DevHours: math.NaN(), + ReviewHours: math.NaN(), + ReleaseHours: math.NaN(), + TotalHours: math.NaN(), + } + + if len(linked) == 0 { + return si, "C4" + } + + // Bot-majority badge is based on the *unfiltered* set as it exists + // after include_bots/hotfix filtering — i.e. what is being charted. + botCount := 0 + for _, p := range linked { + if p.IsBot { + botCount++ + } + } + si.IsBotMajority = botCount*2 > len(linked) + + // T1, T2 are non-null as long as one PR is linked (created_at and + // merged_at are always present on shipped PRs). + var t1, t2 time.Time + for _, p := range linked { + if t1.IsZero() || p.CreatedAt.Before(t1) { + t1 = p.CreatedAt + } + if p.MergedAt != nil && p.MergedAt.After(t2) { + t2 = *p.MergedAt + } + } + + // T0 — first_commit_at. May be missing on some/all PRs. + var ( + t0 time.Time + anyT0 bool + allT0 = true + t0Source int + ) + for _, p := range linked { + if p.FirstCommitAt == nil { + allT0 = false + continue + } + t0Source++ + if !anyT0 || p.FirstCommitAt.Before(t0) { + t0 = *p.FirstCommitAt + anyT0 = true + } + } + + t3 := releaseDate + + // E9 — long-dev exclusion (plan §9 E9). + if anyT0 && t0.Before(window.Start.Add(-excludedLongDevThreshold)) { + return si, "long_dev" + } + + // Compute stages where data is present. + si.ReleaseHours = t3.Sub(t2).Hours() + si.ReviewHours = t2.Sub(t1).Hours() + + if anyT0 { + si.DevHours = t1.Sub(t0).Hours() + si.TotalHours = t3.Sub(t0).Hours() + } + + // C6 — any computed stage is negative (timezone / squash / draft + // occupancy bug). Drop completely. + if anyT0 && si.DevHours < 0 { + return si, "C6" + } + if si.ReviewHours < 0 { + return si, "C6" + } + if si.ReleaseHours < 0 { + return si, "C6" + } + + switch { + case !anyT0: + // C3 — Dev stage excluded; Review's effective start becomes T1. + si.TotalHours = t3.Sub(t1).Hours() + return si, "C3" + case !allT0: + // C2 — partial first_commit_at coverage; Dev still computed + // from the earliest available. + return si, "C2" + default: + return si, "C1" + } +} + +func recordCoverage(byComp map[string]*coverageStats, component, kase string) { + if component == "" { + component = "_unscoped" + } + cov, ok := byComp[component] + if !ok { + cov = &coverageStats{} + byComp[component] = cov + } + cov.IssuesTotal++ + switch kase { + case "C1": + cov.IssuesFull++ + case "C2": + cov.IssuesPartial++ + case "C3": + cov.IssuesDevMissing++ + case "C4": + cov.IssuesNoPRs++ + case "C5": + cov.IssuesUnreleased++ + case "C6": + cov.IssuesDataAnomaly++ + case "long_dev": + cov.IssuesLongDev++ + } +} + +func computeCoveragePct(c *coverageStats) int { + denom := c.IssuesTotal - c.IssuesUnreleased - c.IssuesDataAnomaly - c.IssuesLongDev + if denom <= 0 { + return 0 + } + num := c.IssuesFull + c.IssuesPartial + c.IssuesDevMissing + return int(math.Round(float64(num) * 100 / float64(denom))) +} + +// computeTeamBaselineP75 averages the per-stage p75 across every +// component bucket. This avoids letting one component's noisy month +// throw the bottleneck rule on every other component. For the v1 +// rollout the team has only 9 months of data — calibrate later. +func computeTeamBaselineP75(buckets map[string][]stagedIssue) map[string]float64 { + out := map[string]float64{stageDev: 0, stageReview: 0, stageRelease: 0} + if len(buckets) == 0 { + return out + } + merged := map[string][]float64{stageDev: {}, stageReview: {}, stageRelease: {}} + for _, issues := range buckets { + for _, si := range issues { + if !math.IsNaN(si.DevHours) { + merged[stageDev] = append(merged[stageDev], si.DevHours) + } + if !math.IsNaN(si.ReviewHours) { + merged[stageReview] = append(merged[stageReview], si.ReviewHours) + } + if !math.IsNaN(si.ReleaseHours) { + merged[stageRelease] = append(merged[stageRelease], si.ReleaseHours) + } + } + } + for stage, samples := range merged { + out[stage] = percentile(samples, 75) + } + return out +} + +func computeStageBreakdowns(issues []stagedIssue, baselineP75 map[string]float64) []stageBreakdown { + devSamples := make([]float64, 0, len(issues)) + revSamples := make([]float64, 0, len(issues)) + rlsSamples := make([]float64, 0, len(issues)) + for _, si := range issues { + if !math.IsNaN(si.DevHours) { + devSamples = append(devSamples, si.DevHours) + } + if !math.IsNaN(si.ReviewHours) { + revSamples = append(revSamples, si.ReviewHours) + } + if !math.IsNaN(si.ReleaseHours) { + rlsSamples = append(rlsSamples, si.ReleaseHours) + } + } + + dev := buildStage(stageDev, "Dev", "first_commit_at → first_pr_opened", devSamples) + rev := buildStage(stageReview, "Review", "first_pr_opened → last_pr_merged", revSamples) + rls := buildStage(stageRelease, "Release", "last_pr_merged → fix_version.release_date", rlsSamples) + + stages := []stageBreakdown{dev, rev, rls} + sumP50 := dev.P50Hours + rev.P50Hours + rls.P50Hours + if sumP50 <= 0 { + return stages + } + for i := range stages { + share := stages[i].P50Hours / sumP50 + if share <= bottleneckShareThreshold { + continue + } + baseline := baselineP75[stages[i].Name] + if baseline <= 0 || stages[i].P50Hours < baseline { + continue + } + stages[i].Bottleneck = true + stages[i].BottleneckReason = fmt.Sprintf( + "%.0f%% of total Lead Time · >= team baseline P75 (%.0fh)", + share*100, baseline, + ) + } + return stages +} + +func buildStage(name, label, definition string, samples []float64) stageBreakdown { + return stageBreakdown{ + Name: name, + Label: label, + Definition: definition, + P50Hours: percentile(samples, 50), + P75Hours: percentile(samples, 75), + SampleCount: len(samples), + } +} + +// pickWorstPerStage returns up to three worst_issues — one per stage — +// chosen as the issue whose *dominant* stage is this stage AND whose +// duration there is the largest. The dominant-stage filter avoids the +// "release segment always wins by absolute hours" trap (plan §7 P2-4). +func pickWorstPerStage(issues []stagedIssue, stages []stageBreakdown) []worstIssue { + winners := map[string]stagedIssue{} + durations := map[string]float64{} + for _, si := range issues { + if math.IsNaN(si.TotalHours) || si.TotalHours <= 0 { + continue + } + dom := dominantStage(si) + var d float64 + switch dom { + case stageDev: + d = si.DevHours + case stageReview: + d = si.ReviewHours + case stageRelease: + d = si.ReleaseHours + default: + continue + } + if math.IsNaN(d) { + continue + } + if existing, ok := winners[dom]; !ok || d > durations[dom] { + _ = existing + winners[dom] = si + durations[dom] = d + } + } + order := []string{stageDev, stageReview, stageRelease} + out := make([]worstIssue, 0, 3) + for _, stage := range order { + si, ok := winners[stage] + if !ok { + continue + } + out = append(out, worstIssue{ + JiraKey: si.Key, + Issuetype: si.Issuetype, + Summary: si.Summary, + BottleneckStage: stage, + BottleneckDurationHours: durations[stage], + TotalLeadTimeHours: si.TotalHours, + PRCount: si.LinkedPRCount, + IsBotMajority: si.IsBotMajority, + }) + } + return out +} + +// dominantStage returns the stage with the largest share of the issue's +// total Lead Time. Returns "" when the issue has insufficient data. +func dominantStage(si stagedIssue) string { + if math.IsNaN(si.TotalHours) || si.TotalHours <= 0 { + return "" + } + type pair struct { + name string + dur float64 + } + candidates := []pair{ + {stageDev, si.DevHours}, + {stageReview, si.ReviewHours}, + {stageRelease, si.ReleaseHours}, + } + best := "" + bestDur := -1.0 + for _, p := range candidates { + if math.IsNaN(p.dur) { + continue + } + if p.dur > bestDur { + bestDur = p.dur + best = p.name + } + } + return best +} + +func collectTotals(issues []stagedIssue) []float64 { + out := make([]float64, 0, len(issues)) + for _, si := range issues { + if !math.IsNaN(si.TotalHours) { + out = append(out, si.TotalHours) + } + } + return out +} + +func percentileSummary(samples []float64) percentileStats { + return percentileStats{ + P25Hours: percentile(samples, 25), + P50Hours: percentile(samples, 50), + P75Hours: percentile(samples, 75), + P90Hours: percentile(samples, 90), + SampleCount: len(samples), + } +} + +// percentile returns the q-th percentile (0-100) using the +// nearest-rank method. Empty slice yields 0. +func percentile(samples []float64, q int) float64 { + if len(samples) == 0 { + return 0 + } + sorted := append([]float64(nil), samples...) + sort.Float64s(sorted) + if q <= 0 { + return sorted[0] + } + if q >= 100 { + return sorted[len(sorted)-1] + } + rank := math.Ceil(float64(q)*float64(len(sorted))/100) - 1 + if rank < 0 { + rank = 0 + } + if int(rank) >= len(sorted) { + rank = float64(len(sorted) - 1) + } + return sorted[int(rank)] +} + +func sumStageP50(stages []stageBreakdown) float64 { + var sum float64 + for _, s := range stages { + sum += s.P50Hours + } + return sum +} + +func consistencyOK(stages []stageBreakdown, total percentileStats) bool { + if total.P50Hours <= 0 { + return true + } + sum := sumStageP50(stages) + ratio := sum / total.P50Hours + return ratio >= 0.95 && ratio <= 1.05 +} + +func stringPtrOrNil(s string) interface{} { + if s == "" { + return nil + } + return s +} + +// readBoolOpt fetches a bool flag from an options map, falling back to +// the supplied default when the key is missing or has the wrong type. +func readBoolOpt(opts map[string]interface{}, key string, fallback bool) bool { + if opts == nil { + return fallback + } + v, ok := opts[key] + if !ok { + return fallback + } + if b, ok := v.(bool); ok { + return b + } + return fallback +} + +func containsString(xs []string, x string) bool { + for _, v := range xs { + if v == x { + return true + } + } + return false +} + +// ───────────────────────────────────────────────────────────────────── +// Trend (D17 / §11) — month-bucket p50 + MoM/QoQ. +// ───────────────────────────────────────────────────────────────────── + +const minTrendSamples = 3 + +func buildTrend(issues []stagedIssue, window models.TimeRange) *trendData { + if window.Start.IsZero() || window.End.IsZero() { + return nil + } + // Anchor month buckets to (window.End - N months) ... window.End, N=9. + // We snap each end to the first of its calendar month so partial + // windows still produce nine evenly-spaced points. + end := monthStart(window.End) + start := end.AddDate(0, -8, 0) // 9 buckets inclusive + if start.Before(monthStart(window.Start)) { + start = monthStart(window.Start) + } + + // Bucket issues by release-date month. + bucket := make(map[string][]float64) + for _, si := range issues { + if math.IsNaN(si.TotalHours) { + continue + } + key := si.ReleaseDate.Format("2006-01") + bucket[key] = append(bucket[key], si.TotalHours) + } + + points := make([]trendPoint, 0, 9) + currentKey := end.Format("2006-01") + for m := start; !m.After(end); m = m.AddDate(0, 1, 0) { + key := m.Format("2006-01") + samples := bucket[key] + p := trendPoint{Period: key, SampleCount: len(samples), IsCurrent: key == currentKey} + if len(samples) >= minTrendSamples { + v := percentile(samples, 50) + p.P50Hours = &v + } + points = append(points, p) + } + + mom, momDir := momPct(points) + qoq, qoqDir := qoqPct(points) + return &trendData{ + Granularity: "month", + Points: points, + MoMPct: mom, + QoQPct: qoq, + MoMDirection: momDir, + QoQDirection: qoqDir, + } +} + +func monthStart(t time.Time) time.Time { + return time.Date(t.Year(), t.Month(), 1, 0, 0, 0, 0, t.Location()) +} + +// momPct compares the last month vs the prior month. Returns nil when +// either value is missing. +func momPct(points []trendPoint) (*float64, string) { + if len(points) < 2 { + return nil, "n/a" + } + last := points[len(points)-1].P50Hours + prev := points[len(points)-2].P50Hours + if last == nil || prev == nil || *prev == 0 { + return nil, "n/a" + } + pct := (*last - *prev) / *prev * 100 + dir := "degraded" + if pct < 0 { + dir = "improved" + } + return &pct, dir +} + +// qoqPct compares the last 3 months vs the previous 3 months. Returns +// nil if any month is missing. +func qoqPct(points []trendPoint) (*float64, string) { + if len(points) < 6 { + return nil, "n/a" + } + curr := mean3(points[len(points)-3:]) + prev := mean3(points[len(points)-6 : len(points)-3]) + if curr == nil || prev == nil || *prev == 0 { + return nil, "n/a" + } + pct := (*curr - *prev) / *prev * 100 + dir := "degraded" + if pct < 0 { + dir = "improved" + } + return &pct, dir +} + +func mean3(window []trendPoint) *float64 { + var sum float64 + for _, p := range window { + if p.P50Hours == nil { + return nil + } + sum += *p.P50Hours + } + v := sum / float64(len(window)) + return &v +} diff --git a/roadmap-planner/backend/internal/metrics/calculators/lead_time_test.go b/roadmap-planner/backend/internal/metrics/calculators/lead_time_test.go new file mode 100644 index 00000000..3f350b55 --- /dev/null +++ b/roadmap-planner/backend/internal/metrics/calculators/lead_time_test.go @@ -0,0 +1,568 @@ +/* +Copyright 2024 The AlaudaDevops Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +*/ + +package calculators + +import ( + "context" + "math" + "testing" + "time" + + "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/metrics/models" +) + +// --- helpers ---------------------------------------------------------- + +func ptrTime(t time.Time) *time.Time { return &t } + +func mustClose(t *testing.T, got, want, tol float64) { + t.Helper() + if math.Abs(got-want) > tol { + t.Fatalf("got %.4f, want %.4f (±%.4f)", got, want, tol) + } +} + +// --- percentile ------------------------------------------------------- + +// TestPercentile_NearestRank verifies the nearest-rank percentile +// implementation across the boundary cases the calculator hits in +// practice — empty input, single sample, exact boundaries. +func TestPercentile_NearestRank(t *testing.T) { + if got := percentile(nil, 50); got != 0 { + t.Errorf("empty p50 = %v, want 0", got) + } + if got := percentile([]float64{42}, 50); got != 42 { + t.Errorf("single p50 = %v, want 42", got) + } + // 10 samples 1..10 → p50 rounds up to rank 5 → value 5. + xs := []float64{10, 9, 8, 7, 6, 5, 4, 3, 2, 1} + if got := percentile(xs, 50); got != 5 { + t.Errorf("p50 of 1..10 = %v, want 5", got) + } + if got := percentile(xs, 75); got != 8 { + t.Errorf("p75 of 1..10 = %v, want 8", got) + } + if got := percentile(xs, 100); got != 10 { + t.Errorf("p100 = %v, want 10", got) + } +} + +// --- classifyIssue fallback matrix ----------------------------------- + +// TestClassifyIssue_C1AllFieldsPresent exercises the happy path: a +// shipped issue with a linked PR carrying first_commit_at, created_at, +// merged_at — all three stage durations are populated and the case is +// C1. +func TestClassifyIssue_C1AllFieldsPresent(t *testing.T) { + t0 := time.Date(2026, 4, 1, 9, 0, 0, 0, time.UTC) + t1 := t0.Add(18 * time.Hour) // dev = 18h + t2 := t1.Add(96 * time.Hour) // review = 96h + t3 := t2.Add(168 * time.Hour) // release = 168h + + iss := models.EnrichedIssue{Key: "DEVOPS-1", IssueType: "Story", Versions: []string{"tektoncd-operator-v4.6.3"}} + linked := []models.EnrichedPR{ + {ID: "AlaudaDevops/tektoncd-operator#1", JiraKey: "DEVOPS-1", + FirstCommitAt: ptrTime(t0), CreatedAt: t1, MergedAt: ptrTime(t2)}, + } + window := models.TimeRange{Start: t0.AddDate(0, -1, 0), End: t3.AddDate(0, 1, 0)} + + si, kase := classifyIssue(iss, linked, t3, "tektoncd-operator", window) + if kase != "C1" { + t.Fatalf("case = %q, want C1", kase) + } + mustClose(t, si.DevHours, 18, 0.01) + mustClose(t, si.ReviewHours, 96, 0.01) + mustClose(t, si.ReleaseHours, 168, 0.01) + mustClose(t, si.TotalHours, 18+96+168, 0.01) +} + +// TestClassifyIssue_C3DevStageMissing — every linked PR lacks +// first_commit_at. Dev stage is dropped (NaN); Review's effective +// start becomes T1; total = T3 − T1. +func TestClassifyIssue_C3DevStageMissing(t *testing.T) { + t1 := time.Date(2026, 4, 1, 9, 0, 0, 0, time.UTC) + t2 := t1.Add(96 * time.Hour) + t3 := t2.Add(168 * time.Hour) + + iss := models.EnrichedIssue{Key: "DEVOPS-2", IssueType: "Bug", Versions: []string{"tektoncd-operator-v4.6.3"}} + linked := []models.EnrichedPR{ + {ID: "x#1", JiraKey: "DEVOPS-2", FirstCommitAt: nil, CreatedAt: t1, MergedAt: ptrTime(t2)}, + } + window := models.TimeRange{Start: t1.AddDate(0, -1, 0), End: t3.AddDate(0, 1, 0)} + + si, kase := classifyIssue(iss, linked, t3, "c", window) + if kase != "C3" { + t.Fatalf("case = %q, want C3", kase) + } + if !math.IsNaN(si.DevHours) { + t.Errorf("DevHours = %v, want NaN", si.DevHours) + } + mustClose(t, si.ReviewHours, 96, 0.01) + mustClose(t, si.ReleaseHours, 168, 0.01) + mustClose(t, si.TotalHours, 96+168, 0.01) +} + +// TestClassifyIssue_C4NoLinkedPRs — no PRs linked to this issue. +// Should be excluded entirely (returns C4 with NaN stages). +func TestClassifyIssue_C4NoLinkedPRs(t *testing.T) { + t3 := time.Date(2026, 4, 1, 0, 0, 0, 0, time.UTC) + iss := models.EnrichedIssue{Key: "DEVOPS-3"} + window := models.TimeRange{Start: t3.AddDate(0, -3, 0), End: t3.AddDate(0, 1, 0)} + _, kase := classifyIssue(iss, nil, t3, "c", window) + if kase != "C4" { + t.Fatalf("case = %q, want C4", kase) + } +} + +// TestClassifyIssue_E9LongDev — first_commit_at is 200 days before the +// window start. The issue should be dropped (excluded_long_dev) to +// avoid dragging the Dev/Total p50. +func TestClassifyIssue_E9LongDev(t *testing.T) { + t3 := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC) + t0 := time.Date(2025, 5, 1, 0, 0, 0, 0, time.UTC) // 365d before t3 + t1 := t3.Add(-48 * time.Hour) + t2 := t3.Add(-24 * time.Hour) + + iss := models.EnrichedIssue{Key: "DEVOPS-4"} + linked := []models.EnrichedPR{{ID: "x#1", JiraKey: "DEVOPS-4", + FirstCommitAt: ptrTime(t0), CreatedAt: t1, MergedAt: ptrTime(t2)}} + // Window starts 30 days before release; t0 is well outside the + // 180-day soft cap. + window := models.TimeRange{Start: t3.AddDate(0, 0, -30), End: t3} + _, kase := classifyIssue(iss, linked, t3, "c", window) + if kase != "long_dev" { + t.Fatalf("case = %q, want long_dev", kase) + } +} + +// --- matchReleasedVersion -------------------------------------------- + +func TestMatchReleasedVersion_PicksEarliestInWindow(t *testing.T) { + a := time.Date(2026, 3, 1, 0, 0, 0, 0, time.UTC) // in window — earlier + b := time.Date(2026, 4, 1, 0, 0, 0, 0, time.UTC) // in window — later + c := time.Date(2026, 7, 1, 0, 0, 0, 0, time.UTC) // outside window + releases := map[string]models.EnrichedRelease{ + "tektoncd-operator-v4.5.0": {Name: "tektoncd-operator-v4.5.0", Component: "tektoncd-operator", Released: true, ReleaseDate: a}, + "tektoncd-operator-v4.6.0": {Name: "tektoncd-operator-v4.6.0", Component: "tektoncd-operator", Released: true, ReleaseDate: b}, + "tektoncd-operator-v4.7.0": {Name: "tektoncd-operator-v4.7.0", Component: "tektoncd-operator", Released: true, ReleaseDate: c}, + } + window := models.TimeRange{ + Start: time.Date(2026, 2, 1, 0, 0, 0, 0, time.UTC), + End: time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC), + } + names := []string{"tektoncd-operator-v4.7.0", "tektoncd-operator-v4.6.0", "tektoncd-operator-v4.5.0"} + gotDate, gotRelease := matchReleasedVersion(names, releases, window) + // gotDate is normalised to end-of-day (P1-2); compare calendar day. + if gotDate.Year() != a.Year() || gotDate.Month() != a.Month() || gotDate.Day() != a.Day() { + t.Errorf("date = %v, want same day as %v", gotDate, a) + } + if gotRelease.Name != "tektoncd-operator-v4.5.0" { + t.Errorf("name = %q, want tektoncd-operator-v4.5.0", gotRelease.Name) + } + if gotRelease.Component != "tektoncd-operator" { + t.Errorf("component = %q, want tektoncd-operator", gotRelease.Component) + } +} + +// TestMatchReleasedVersion_NormalisesToEndOfDay verifies T3 is snapped +// to 23:59:59.999999999, so a PR merged later on release day is still +// inside the release window (and not treated as a hotfix). +func TestMatchReleasedVersion_NormalisesToEndOfDay(t *testing.T) { + midnight := time.Date(2026, 4, 10, 0, 0, 0, 0, time.UTC) + releases := map[string]models.EnrichedRelease{ + "tektoncd-operator-v4.6.3": {Name: "tektoncd-operator-v4.6.3", Component: "tektoncd-operator", Released: true, ReleaseDate: midnight}, + } + window := models.TimeRange{Start: midnight.AddDate(0, -1, 0), End: midnight.AddDate(0, 1, 0)} + + gotDate, _ := matchReleasedVersion([]string{"tektoncd-operator-v4.6.3"}, releases, window) + if gotDate.Hour() != 23 || gotDate.Minute() != 59 || gotDate.Second() != 59 { + t.Errorf("releaseDate not normalised to end-of-day: %s", gotDate) + } + if gotDate.Day() != midnight.Day() { + t.Errorf("calendar day shifted: got %s, want same day as %s", gotDate, midnight) + } + + // A PR merged at 14:00 on the same day must survive the hotfix filter. + pr := models.EnrichedPR{ + ID: "p1", + JiraKey: "X-1", + CreatedAt: midnight.Add(-72 * time.Hour), + MergedAt: ptrTime(midnight.Add(14 * time.Hour)), + } + if kept := filterPreReleasePRs([]models.EnrichedPR{pr}, gotDate); len(kept) != 1 { + t.Fatalf("same-day PR dropped by filterPreReleasePRs: kept %d, want 1", len(kept)) + } +} + +// TestMatchReleasedVersion_HandlesNoVPrefix verifies the calculator +// reuses the collector-parsed Component field, so version names that +// do not match a `{component}-vX.Y.Z` shape (e.g. `argo-cd-2.9.0`) +// still bucket correctly under the configured component, matching +// how release_frequency and other metrics group their results. +func TestMatchReleasedVersion_HandlesNoVPrefix(t *testing.T) { + d := time.Date(2026, 3, 1, 0, 0, 0, 0, time.UTC) + releases := map[string]models.EnrichedRelease{ + "argo-cd-2.9.0": {Name: "argo-cd-2.9.0", Component: "argo-cd", Released: true, ReleaseDate: d}, + } + window := models.TimeRange{ + Start: time.Date(2026, 2, 1, 0, 0, 0, 0, time.UTC), + End: time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC), + } + _, got := matchReleasedVersion([]string{"argo-cd-2.9.0"}, releases, window) + if got.Component != "argo-cd" { + t.Errorf("component = %q, want argo-cd (collector-parsed, not regex-derived)", got.Component) + } +} + +// --- pickWorstPerStage ----------------------------------------------- + +// TestPickWorstPerStage_OnePerStage verifies that the function picks +// one issue per stage where that stage is the issue's *dominant* +// contributor — preventing the "release segment always wins on +// absolute hours" trap (plan §7 P2-4). +func TestPickWorstPerStage_OnePerStage(t *testing.T) { + issues := []stagedIssue{ + // Dev-dominant + {Key: "A", Summary: "dev hog", DevHours: 150, ReviewHours: 10, ReleaseHours: 20, TotalHours: 180}, + // Review-dominant + {Key: "B", Summary: "rev hog", DevHours: 10, ReviewHours: 200, ReleaseHours: 30, TotalHours: 240}, + // Release-dominant + absolute biggest + {Key: "C", Summary: "rls hog", DevHours: 5, ReviewHours: 10, ReleaseHours: 900, TotalHours: 915}, + // Smaller Dev-dominant — should NOT displace A + {Key: "D", Summary: "small dev", DevHours: 100, ReviewHours: 1, ReleaseHours: 1, TotalHours: 102}, + } + worst := pickWorstPerStage(issues, nil) + if len(worst) != 3 { + t.Fatalf("len(worst) = %d, want 3 (one per stage)", len(worst)) + } + got := map[string]string{} + for _, w := range worst { + got[w.BottleneckStage] = w.JiraKey + } + if got[stageDev] != "A" { + t.Errorf("dev winner = %q, want A", got[stageDev]) + } + if got[stageReview] != "B" { + t.Errorf("review winner = %q, want B", got[stageReview]) + } + if got[stageRelease] != "C" { + t.Errorf("release winner = %q, want C", got[stageRelease]) + } +} + +// --- bottleneck rule ------------------------------------------------- + +// TestStageBreakdown_BottleneckRequiresBothConditions exercises D15: +// stage is flagged bottleneck iff p50 > 40% of total AND p50 >= team +// baseline p75. One condition alone must not trigger. +func TestStageBreakdown_BottleneckRequiresBothConditions(t *testing.T) { + // Issues where Review absolutely dominates and is way above team baseline. + issues := []stagedIssue{ + {DevHours: 10, ReviewHours: 200, ReleaseHours: 40, TotalHours: 250}, + {DevHours: 12, ReviewHours: 240, ReleaseHours: 30, TotalHours: 282}, + {DevHours: 8, ReviewHours: 180, ReleaseHours: 20, TotalHours: 208}, + } + // Baseline p75 set so Review (>=190) trips it but Dev/Release do not. + baseline := map[string]float64{stageDev: 100, stageReview: 100, stageRelease: 100} + + stages := computeStageBreakdowns(issues, baseline) + + var rev, dev, rls stageBreakdown + for _, s := range stages { + switch s.Name { + case stageReview: + rev = s + case stageDev: + dev = s + case stageRelease: + rls = s + } + } + if !rev.Bottleneck { + t.Errorf("Review should be a bottleneck (share > 40%% and >= P75): %+v", rev) + } + if dev.Bottleneck { + t.Errorf("Dev should not be a bottleneck (share < 40%%): %+v", dev) + } + if rls.Bottleneck { + t.Errorf("Release should not be a bottleneck (share < 40%%): %+v", rls) + } +} + +// --- trend MoM / QoQ -------------------------------------------------- + +// TestTrend_MoMDirection — building a trend over 9 months with the +// current month showing improvement (lower Lead Time) and the quarter +// showing degradation. Verifies the +/- sign mapping is right and the +// "improved/degraded" direction string matches. +func TestTrend_MoMDirection(t *testing.T) { + end := time.Date(2026, 5, 21, 0, 0, 0, 0, time.UTC) + window := models.TimeRange{Start: end.AddDate(0, -9, 0), End: end} + + // Pre-build the per-month synthetic issues using one issue per + // month (sample == 1) → these are below minTrendSamples=3 so the + // point would be null. Use three issues per month instead. + hoursByMonth := map[string]float64{ + "2025-09": 576, "2025-10": 504, "2025-11": 624, + "2025-12": 456, "2026-01": 420, "2026-02": 380, + "2026-03": 528, "2026-04": 456, "2026-05": 432, // MoM ≈ -5.3%, QoQ degraded + } + var issues []stagedIssue + for monthKey, h := range hoursByMonth { + m, _ := time.Parse("2006-01", monthKey) + // Three samples per month so trend bucket meets minTrendSamples. + for i := 0; i < 3; i++ { + issues = append(issues, stagedIssue{ + Key: monthKey + "-" + string(rune('a'+i)), + ReleaseDate: m.AddDate(0, 0, 5), + TotalHours: h, + }) + } + } + + tr := buildTrend(issues, window) + if tr == nil { + t.Fatal("buildTrend returned nil") + } + if len(tr.Points) != 9 { + t.Fatalf("len(points) = %d, want 9", len(tr.Points)) + } + if tr.MoMPct == nil { + t.Fatal("MoMPct is nil") + } + // MoM = (432 - 456) / 456 * 100 ≈ -5.26 + if math.Abs(*tr.MoMPct - -5.263158) > 0.01 { + t.Errorf("MoMPct = %.4f, want ≈ -5.26", *tr.MoMPct) + } + if tr.MoMDirection != "improved" { + t.Errorf("MoM direction = %q, want improved", tr.MoMDirection) + } + // QoQ: current 3 (528, 456, 432) avg = 472; prev 3 (456, 420, 380) avg = 418.667 + // pct = (472 - 418.667) / 418.667 * 100 ≈ 12.7 + if tr.QoQPct == nil { + t.Fatal("QoQPct is nil") + } + if math.Abs(*tr.QoQPct - 12.738854) > 0.05 { + t.Errorf("QoQPct = %.4f, want ≈ 12.74", *tr.QoQPct) + } + if tr.QoQDirection != "degraded" { + t.Errorf("QoQ direction = %q, want degraded", tr.QoQDirection) + } +} + +// TestCalculate_FallbackWhenNoPRStore exercises the path that fires +// when storage.enabled is false: PRStoreAvailable=false makes +// Calculate return the legacy Jira-only days output, with the degraded +// flag set so the UI can warn. Prevents the regression where Lead Time +// would otherwise disappear on minimal deployments. +func TestCalculate_FallbackWhenNoPRStore(t *testing.T) { + c := NewLeadTimeCalculator(nil) + + relDate := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC) + window := models.TimeRange{ + Start: relDate.AddDate(0, -9, 0), + End: relDate.AddDate(0, 1, 0), + } + releases := []models.EnrichedRelease{ + {ID: "v1", Name: "argo-cd-2.9.0", Component: "argo-cd", Released: true, ReleaseDate: relDate}, + } + issues := []models.EnrichedIssue{ + {Key: "DEVOPS-1", Name: "one", IssueType: "Story", + Versions: []string{"argo-cd-2.9.0"}, + CreatedDate: relDate.AddDate(0, 0, -10), ReleaseDate: relDate}, + {Key: "DEVOPS-2", Name: "two", IssueType: "Bug", + Versions: []string{"argo-cd-2.9.0"}, + CreatedDate: relDate.AddDate(0, 0, -30), ReleaseDate: relDate}, + } + + ctx := &models.CalculationContext{ + Releases: releases, Issues: issues, + PullRequests: nil, + PRStoreAvailable: false, // ← storage.enabled = false scenario + TimeRange: window, + } + results, err := c.Calculate(context.Background(), ctx) + if err != nil { + t.Fatalf("Calculate: %v", err) + } + if len(results) != 1 { + t.Fatalf("len(results) = %d, want 1 (Lead Time must NOT disappear when PR store is unavailable)", len(results)) + } + r := results[0] + if r.Unit != "days" { + t.Errorf("fallback Unit = %q, want days", r.Unit) + } + if r.Value <= 0 { + t.Errorf("fallback Value = %v, want > 0 (median of {10, 30} days)", r.Value) + } + if got := r.Metadata["degraded"]; got != "no_pr_store" { + t.Errorf("metadata.degraded = %v, want \"no_pr_store\" so UI knows to warn", got) + } + if _, ok := r.Metadata["degraded_reason"].(string); !ok { + t.Error("metadata.degraded_reason missing — operator needs the failure mode spelled out") + } + // Stage / trend / coverage are intentionally absent in fallback. + for _, missing := range []string{"stages", "worst_issues", "trend", "coverage"} { + if _, present := r.Metadata[missing]; present { + t.Errorf("metadata.%s leaked into fallback path — fallback must stay minimal", missing) + } + } +} + +// TestCalculate_SkipsEmptyComponentReleases locks in the calculator-side +// guard for the invalid-component fix: a release whose name did not +// parse (Component == "", e.g. legacy "0.3") must not produce a metric +// bucket — neither under its raw name nor under "" — on both the +// PR-backed and the Jira-only fallback paths. The collector already +// drops such releases; this guards the in-calculator defense line +// against a silent revert. +func TestCalculate_SkipsEmptyComponentReleases(t *testing.T) { + relDate := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC) + window := models.TimeRange{ + Start: relDate.AddDate(0, -9, 0), + End: relDate.AddDate(0, 1, 0), + } + releases := []models.EnrichedRelease{ + {ID: "v1", Name: "argo-cd-2.9.0", Component: "argo-cd", Released: true, ReleaseDate: relDate}, + {ID: "v2", Name: "0.3", Component: "", Released: true, ReleaseDate: relDate}, + } + issues := []models.EnrichedIssue{ + {Key: "DEVOPS-1", Name: "valid", IssueType: "Story", + Versions: []string{"argo-cd-2.9.0"}, + CreatedDate: relDate.AddDate(0, 0, -10), ReleaseDate: relDate}, + {Key: "DEVOPS-2", Name: "legacy", IssueType: "Bug", + Versions: []string{"0.3"}, + CreatedDate: relDate.AddDate(0, 0, -30), ReleaseDate: relDate}, + } + + for _, tc := range []struct { + name string + prStoreAvailable bool + }{ + {"PR-backed path", true}, + {"Jira-only fallback path", false}, + } { + t.Run(tc.name, func(t *testing.T) { + c := NewLeadTimeCalculator(map[string]interface{}{"with_trend": false}) + ctx := &models.CalculationContext{ + Releases: releases, Issues: issues, + PRStoreAvailable: tc.prStoreAvailable, + TimeRange: window, + } + results, err := c.Calculate(context.Background(), ctx) + if err != nil { + t.Fatalf("Calculate: %v", err) + } + for _, r := range results { + switch r.Labels["component"] { + case "argo-cd": // the valid control bucket + case "", "0.3": + t.Errorf("empty-component release leaked into bucket %q", r.Labels["component"]) + default: + t.Errorf("unexpected component bucket %q", r.Labels["component"]) + } + } + }) + } +} + +// --- end-to-end Calculate sanity check -------------------------------- + +// TestCalculate_EndToEnd builds a tiny CalculationContext with two +// issues across one component and verifies Calculate produces a sane +// MetricResult: stages populated, coverage filled, trend present. +func TestCalculate_EndToEnd(t *testing.T) { + c := NewLeadTimeCalculator(map[string]interface{}{ + "include_bots": false, + "with_trend": true, + }) + + relDate := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC) + window := models.TimeRange{ + Start: relDate.AddDate(0, -9, 0), + End: relDate.AddDate(0, 1, 0), + } + + releases := []models.EnrichedRelease{ + {ID: "v1", Name: "tektoncd-operator-v4.6.3", Component: "tektoncd-operator", Released: true, ReleaseDate: relDate}, + } + issues := []models.EnrichedIssue{ + {Key: "DEVOPS-1", Name: "thing one", IssueType: "Story", + Versions: []string{"tektoncd-operator-v4.6.3"}, + CreatedDate: relDate.AddDate(0, 0, -30), ReleaseDate: relDate}, + {Key: "DEVOPS-2", Name: "thing two", IssueType: "Bug", + Versions: []string{"tektoncd-operator-v4.6.3"}, + CreatedDate: relDate.AddDate(0, 0, -20), ReleaseDate: relDate}, + } + prs := []models.EnrichedPR{ + // DEVOPS-1: C1 (full data) + {ID: "p1", Source: "github", JiraKey: "DEVOPS-1", + AuthorLogin: "alice", IsBot: false, + FirstCommitAt: ptrTime(relDate.AddDate(0, 0, -10)), + CreatedAt: relDate.AddDate(0, 0, -8), + MergedAt: ptrTime(relDate.AddDate(0, 0, -3))}, + // DEVOPS-2: C3 (no first_commit_at) + {ID: "p2", Source: "github", JiraKey: "DEVOPS-2", + AuthorLogin: "bob", + FirstCommitAt: nil, + CreatedAt: relDate.AddDate(0, 0, -5), + MergedAt: ptrTime(relDate.AddDate(0, 0, -1))}, + } + + ctx := &models.CalculationContext{ + Releases: releases, Issues: issues, PullRequests: prs, + PRStoreAvailable: true, + TimeRange: window, + } + results, err := c.Calculate(context.Background(), ctx) + if err != nil { + t.Fatalf("Calculate: %v", err) + } + if len(results) != 1 { + t.Fatalf("len(results) = %d, want 1", len(results)) + } + r := results[0] + if r.Labels["component"] != "tektoncd-operator" { + t.Errorf("component label = %q, want tektoncd-operator", r.Labels["component"]) + } + if r.Unit != "days" { + t.Errorf("Unit = %q, want days (backward-compat with MetricCard / Prometheus)", r.Unit) + } + if r.Value <= 0 { + t.Errorf("Value = %v, want > 0", r.Value) + } + // Sanity check that Value is days, not hours: a 30-day window + // here can never produce > 60 days of Lead Time. + if r.Value > 60 { + t.Errorf("Value = %v days, suspiciously high — is Value still hours?", r.Value) + } + + cov, ok := r.Metadata["coverage"].(*coverageStats) + if !ok { + t.Fatalf("coverage missing or wrong type: %T", r.Metadata["coverage"]) + } + if cov.IssuesFull != 1 || cov.IssuesDevMissing != 1 { + t.Errorf("coverage = %+v, want IssuesFull=1 IssuesDevMissing=1", cov) + } + + // MetricBreakdown.jsx reads legacy days fields directly off the + // Metadata map. P2 review fix — make sure they are populated so + // the UI does not show Min=0 / Max=0 / Count=0. + for _, key := range []string{"min", "max", "average"} { + v, ok := r.Metadata[key].(float64) + if !ok || v <= 0 { + t.Errorf("metadata[%q] = %v (%T), want positive days value", key, r.Metadata[key], r.Metadata[key]) + } + } + for _, key := range []string{"count", "sample_size"} { + v, ok := r.Metadata[key].(int) + if !ok || v <= 0 { + t.Errorf("metadata[%q] = %v (%T), want positive int sample count", key, r.Metadata[key], r.Metadata[key]) + } + } +} diff --git a/roadmap-planner/backend/internal/metrics/calculators/patch_ratio.go b/roadmap-planner/backend/internal/metrics/calculators/patch_ratio.go index 9c8fcc92..22ca62e1 100644 --- a/roadmap-planner/backend/internal/metrics/calculators/patch_ratio.go +++ b/roadmap-planner/backend/internal/metrics/calculators/patch_ratio.go @@ -70,7 +70,9 @@ func (c *PatchRatioCalculator) Calculate(ctx context.Context, data *models.Calcu component := release.Component if component == "" { - component = "unknown" + // Collector drops these, but guard here too: an unparsable + // version name is not a component bucket. + continue } // Apply component filter if specified diff --git a/roadmap-planner/backend/internal/metrics/calculators/patch_ratio_test.go b/roadmap-planner/backend/internal/metrics/calculators/patch_ratio_test.go new file mode 100644 index 00000000..03ccf8bf --- /dev/null +++ b/roadmap-planner/backend/internal/metrics/calculators/patch_ratio_test.go @@ -0,0 +1,51 @@ +/* +Copyright 2024 The AlaudaDevops Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package calculators + +import ( + "context" + "testing" + "time" + + "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/metrics/models" +) + +// TestPatchRatio_SkipsEmptyComponentReleases locks in the +// invalid-component guard: a release whose name did not parse +// (Component == "") used to be bucketed under "unknown"; it must now be +// skipped entirely so legacy version names like "0.3" never surface as +// component buckets. +func TestPatchRatio_SkipsEmptyComponentReleases(t *testing.T) { + c := NewPatchRatioCalculator(nil) + + relDate := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC) + ctx := &models.CalculationContext{ + Releases: []models.EnrichedRelease{ + {ID: "v1", Name: "argo-cd-2.9.1", Component: "argo-cd", Released: true, ReleaseDate: relDate, Type: "patch"}, + {ID: "v2", Name: "0.3", Component: "", Released: true, ReleaseDate: relDate, Type: "unknown"}, + }, + TimeRange: models.TimeRange{ + Start: relDate.AddDate(0, -6, 0), + End: relDate.AddDate(0, 1, 0), + }, + } + + results, err := c.Calculate(context.Background(), ctx) + if err != nil { + t.Fatalf("Calculate: %v", err) + } + if len(results) != 1 { + t.Fatalf("len(results) = %d, want 1 (only the valid component)", len(results)) + } + if got := results[0].Labels["component"]; got != "argo-cd" { + t.Errorf("component = %q, want argo-cd (empty-component release must not bucket as %q)", got, got) + } +} diff --git a/roadmap-planner/backend/internal/metrics/calculators/release_frequency.go b/roadmap-planner/backend/internal/metrics/calculators/release_frequency.go index bb75098a..7e2c0056 100644 --- a/roadmap-planner/backend/internal/metrics/calculators/release_frequency.go +++ b/roadmap-planner/backend/internal/metrics/calculators/release_frequency.go @@ -62,7 +62,9 @@ func (c *ReleaseFrequencyCalculator) Calculate(ctx context.Context, data *models component := release.Component if component == "" { - component = "unknown" + // Collector drops these, but guard here too: an unparsable + // version name is not a component bucket. + continue } // Apply component filter if specified diff --git a/roadmap-planner/backend/internal/metrics/calculators/release_frequency_test.go b/roadmap-planner/backend/internal/metrics/calculators/release_frequency_test.go new file mode 100644 index 00000000..56e823e0 --- /dev/null +++ b/roadmap-planner/backend/internal/metrics/calculators/release_frequency_test.go @@ -0,0 +1,51 @@ +/* +Copyright 2024 The AlaudaDevops Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package calculators + +import ( + "context" + "testing" + "time" + + "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/metrics/models" +) + +// TestReleaseFrequency_SkipsEmptyComponentReleases locks in the +// invalid-component guard: a release whose name did not parse +// (Component == "") used to be bucketed under "unknown"; it must now be +// skipped entirely so legacy version names like "0.3" never surface as +// component buckets. +func TestReleaseFrequency_SkipsEmptyComponentReleases(t *testing.T) { + c := NewReleaseFrequencyCalculator(nil) + + relDate := time.Date(2026, 5, 10, 12, 0, 0, 0, time.UTC) + ctx := &models.CalculationContext{ + Releases: []models.EnrichedRelease{ + {ID: "v1", Name: "argo-cd-2.9.0", Component: "argo-cd", Released: true, ReleaseDate: relDate, Type: "minor"}, + {ID: "v2", Name: "0.3", Component: "", Released: true, ReleaseDate: relDate, Type: "unknown"}, + }, + TimeRange: models.TimeRange{ + Start: relDate.AddDate(0, -3, 0), + End: relDate.AddDate(0, 1, 0), + }, + } + + results, err := c.Calculate(context.Background(), ctx) + if err != nil { + t.Fatalf("Calculate: %v", err) + } + if len(results) != 1 { + t.Fatalf("len(results) = %d, want 1 (only the valid component)", len(results)) + } + if got := results[0].Labels["component"]; got != "argo-cd" { + t.Errorf("component = %q, want argo-cd (empty-component release must not bucket as %q)", got, got) + } +} diff --git a/roadmap-planner/backend/internal/metrics/collector.go b/roadmap-planner/backend/internal/metrics/collector.go index e1b05a01..f324a175 100644 --- a/roadmap-planner/backend/internal/metrics/collector.go +++ b/roadmap-planner/backend/internal/metrics/collector.go @@ -30,12 +30,14 @@ import ( "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/logger" "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/metrics/models" baseModels "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/models" + "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/storage" "go.uber.org/zap" ) // Collector handles periodic data collection from Jira type Collector struct { jiraClient *jira.Client + store storage.Store // optional — for PR-backed metrics (Lead Time Phase 2) config *config.Metrics logger *zap.Logger @@ -43,18 +45,35 @@ type Collector struct { releases []models.EnrichedRelease epics []models.EnrichedIssue issues []models.EnrichedIssue + pullRequests []models.EnrichedPR lastCollected time.Time + // prFetchOK reflects whether the last fetchPullRequests call + // succeeded. When false, GetData reports PRStoreAvailable=false so + // the Lead Time calculator falls back to the Jira-only path — + // otherwise a transient PR query failure would silently produce an + // empty PR set, classify every issue as C4, and drop the metric. + // Optimistic on startup (no fetch attempted yet); flipped only when + // fetchPullRequests actually returns an error against a configured + // store. + prFetchOK bool } -// NewCollector creates a new Jira data collector -func NewCollector(jiraClient *jira.Client, cfg *config.Metrics) *Collector { +// NewCollector creates a new Jira data collector. +// +// store is optional — when nil, PR-backed metrics (DORA Lead Time +// Phase 2) silently degrade to an empty PullRequests slice. main.go +// should pass the production storage; the Jira-only tests pass nil. +func NewCollector(jiraClient *jira.Client, store storage.Store, cfg *config.Metrics) *Collector { return &Collector{ - jiraClient: jiraClient, - config: cfg, - logger: logger.WithComponent("metrics-collector"), - releases: []models.EnrichedRelease{}, - epics: []models.EnrichedIssue{}, - issues: []models.EnrichedIssue{}, + jiraClient: jiraClient, + store: store, + config: cfg, + logger: logger.WithComponent("metrics-collector"), + releases: []models.EnrichedRelease{}, + epics: []models.EnrichedIssue{}, + issues: []models.EnrichedIssue{}, + pullRequests: []models.EnrichedPR{}, + prFetchOK: true, } } @@ -121,10 +140,27 @@ func (c *Collector) Collect(ctx context.Context) error { return fmt.Errorf("failed to fetch issues: %w", err) } + // Fetch pull requests (for DORA Lead Time Phase 2). Soft-fail if the + // store is unavailable or the query errors — Jira-only metrics still + // work, only Lead Time stage attribution degrades. Track the failure + // so GetData reports PRStoreAvailable=false and the Lead Time + // calculator falls back to the Jira-only days output; otherwise the + // PR-backed path runs against an empty PR slice and the metric + // vanishes entirely. + prs, err := c.fetchPullRequests(ctx) + prFetchOK := true + if err != nil { + c.logger.Warn("PR fetch failed; Lead Time will degrade to Jira-only fallback", zap.Error(err)) + prs = nil + prFetchOK = false + } + // Update cached data c.mu.Lock() c.epics = epics c.issues = issues + c.pullRequests = prs + c.prFetchOK = prFetchOK c.lastCollected = time.Now() c.mu.Unlock() @@ -132,11 +168,51 @@ func (c *Collector) Collect(ctx context.Context) error { zap.Int("releases", len(releases)), zap.Int("epics", len(epics)), zap.Int("issues", len(issues)), + zap.Int("pull_requests", len(prs)), zap.Duration("duration", time.Since(startTime))) return nil } +// prFetchLookbackDays is the extra lookback applied when loading PRs +// for the Lead Time calculator. The calculator filters issues by +// release date, but their linked PRs may have merged well before the +// release-date window opens. The buffer is aligned with the +// excludedLongDevThreshold (180d) — issues older than that are dropped +// by Lead Time E9 anyway, so we never need PRs older than (window +// start - 180d). +const prFetchLookbackDays = 180 + +// fetchPullRequests returns all merged PRs in the historical window +// (plus a lookback buffer; see prFetchLookbackDays) from the storage +// layer for use by the DORA Lead Time calculator. Returns an empty +// slice (not error) when no store is configured — this lets the +// Jira-only stack work without storage. +func (c *Collector) fetchPullRequests(ctx context.Context) ([]models.EnrichedPR, error) { + if c.store == nil { + return nil, nil + } + since := time.Now().AddDate(0, 0, -(c.config.HistoricalDays + prFetchLookbackDays)) + rows, err := c.store.ListPullRequestsSince(ctx, since) + if err != nil { + return nil, err + } + out := make([]models.EnrichedPR, 0, len(rows)) + for _, r := range rows { + out = append(out, models.EnrichedPR{ + ID: r.ID, + Source: r.Source, + JiraKey: r.JiraKey, + AuthorLogin: r.AuthorLogin, + IsBot: r.AuthorID == "bot", + CreatedAt: r.CreatedAt, + FirstCommitAt: r.FirstCommitAt, + MergedAt: r.MergedAt, + }) + } + return out, nil +} + // fetchReleases gets release/version data from Jira func (c *Collector) fetchReleases(ctx context.Context) ([]models.EnrichedRelease, error) { // Get project details which includes versions @@ -159,10 +235,54 @@ func (c *Collector) fetchReleases(ctx context.Context) ([]models.EnrichedRelease c.logger.Debug("Filtered releases") } + releases = c.dropInvalidComponentReleases(releases) + c.logger.Debug("Fetched releases", zap.Int("count", len(releases)), zap.Int("original", originalCount)) return releases, nil } +// dropInvalidComponentReleases removes releases that must not feed any +// component dimension: names that do not parse to component-X.Y.Z +// (Component is empty — legacy names like "0.3", "v2.1") and components +// in metrics.exclude_plugins (D6 — v3-era plugins). Dropping here keeps +// every calculator clean without per-calculator filtering, and also +// removes the releases from the versionDates maps so issues linked only +// to invalid versions stay out of release-scoped metrics. +func (c *Collector) dropInvalidComponentReleases(releases []models.EnrichedRelease) []models.EnrichedRelease { + kept := make([]models.EnrichedRelease, 0, len(releases)) + dropped := make([]string, 0) + for _, r := range releases { + if r.Component == "" || c.config.IsPluginExcluded(r.Component) { + dropped = append(dropped, r.Name) + continue + } + kept = append(kept, r) + } + if len(dropped) > 0 { + c.logger.Debug("Dropped releases with invalid or excluded components", + zap.Int("count", len(dropped)), zap.Strings("names", dropped)) + } + return kept +} + +// dropExcludedComponents removes metrics.exclude_plugins entries from an +// issue's component list (both Jira Component fields and components +// extracted from version names) so D6 plugins disappear from issue-based +// component dimensions (cycle_time, time_to_patch). +func (c *Collector) dropExcludedComponents(components []string) []string { + if len(c.config.ExcludePlugins) == 0 || len(components) == 0 { + return components + } + kept := make([]string, 0, len(components)) + for _, comp := range components { + if c.config.IsPluginExcluded(comp) { + continue + } + kept = append(kept, comp) + } + return kept +} + // enrichRelease converts a basic Version to an EnrichedRelease func (c *Collector) enrichRelease(v baseModels.Version) models.EnrichedRelease { release := models.EnrichedRelease{ @@ -205,8 +325,11 @@ func parseVersionName(name string) (component string, major, minor, patch int) { return } - // If no version pattern found, the whole name is the component - component = name + // No version pattern found — the name does not identify a component. + // Legacy names like "0.3", "v2.1" or "1.0" used to fall back to the + // whole name here, which polluted every per-component metric with + // bogus component buckets. Return empty so collectors/calculators + // drop the release from component dimensions. return } @@ -286,6 +409,7 @@ func (c *Collector) fetchEpics(ctx context.Context) ([]models.EnrichedIssue, err } } } + enriched.Components = c.dropExcludedComponents(enriched.Components) epics = append(epics, enriched) } @@ -355,6 +479,7 @@ func (c *Collector) fetchIssues(ctx context.Context) ([]models.EnrichedIssue, er } } } + enriched.Components = c.dropExcludedComponents(enriched.Components) issues = append(issues, enriched) } @@ -377,10 +502,15 @@ func (c *Collector) GetData() (*models.CalculationContext, error) { issues := make([]models.EnrichedIssue, len(c.issues)) copy(issues, c.issues) + prs := make([]models.EnrichedPR, len(c.pullRequests)) + copy(prs, c.pullRequests) + return &models.CalculationContext{ - Releases: releases, - Epics: epics, - Issues: issues, + Releases: releases, + Epics: epics, + Issues: issues, + PullRequests: prs, + PRStoreAvailable: c.store != nil && c.prFetchOK, TimeRange: models.TimeRange{ Start: time.Now().AddDate(0, 0, -c.config.HistoricalDays), End: time.Now(), diff --git a/roadmap-planner/backend/internal/metrics/collector_test.go b/roadmap-planner/backend/internal/metrics/collector_test.go new file mode 100644 index 00000000..bca5252d --- /dev/null +++ b/roadmap-planner/backend/internal/metrics/collector_test.go @@ -0,0 +1,183 @@ +/* +Copyright 2024 The AlaudaDevops Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package metrics + +import ( + "context" + "database/sql" + "reflect" + "testing" + "time" + + "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/config" + "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/metrics/models" + "github.com/AlaudaDevops/toolbox/roadmap-planner/backend/internal/storage" + "go.uber.org/zap" +) + +// prStoreStub satisfies storage.Store for the GetData truth-table test. +// Only ListPullRequestsSince is meaningful; the rest panic so any +// accidental dependency on a real store surfaces immediately instead of +// returning silent zero values. +type prStoreStub struct{} + +func (prStoreStub) ListPullRequestsSince(context.Context, time.Time) ([]storage.PullRequest, error) { + return nil, nil +} +func (prStoreStub) ListMergedPRsMissingFirstCommit(context.Context, string, time.Time, int) ([]storage.PullRequest, error) { + panic("not used") +} +func (prStoreStub) UpdatePRFirstCommitAt(context.Context, string, *time.Time) error { + panic("not used") +} +func (prStoreStub) Close() error { panic("not used") } +func (prStoreStub) Migrate(context.Context) error { panic("not used") } +func (prStoreStub) DB() *sql.DB { panic("not used") } +func (prStoreStub) WriteCollectionRun(context.Context, storage.CollectionRun) error { panic("not used") } +func (prStoreStub) LatestCollectionRun(context.Context, string) (*storage.CollectionRun, error) { + panic("not used") +} +func (prStoreStub) WriteIssueSnapshots(context.Context, string, []storage.IssueSnapshot) error { + panic("not used") +} +func (prStoreStub) UpsertPullRequests(context.Context, []storage.PullRequest) error { + panic("not used") +} +func (prStoreStub) UpsertPRReviews(context.Context, []storage.PRReview) error { panic("not used") } +func (prStoreStub) UpsertMember(context.Context, storage.Member) error { panic("not used") } +func (prStoreStub) SetMemberIdentity(context.Context, string, storage.MemberIdentity) error { + panic("not used") +} +func (prStoreStub) ListMembers(context.Context) ([]storage.Member, error) { panic("not used") } +func (prStoreStub) MemberWeekMetrics(context.Context, storage.MemberWeekQuery) ([]storage.MemberWeekRow, error) { + panic("not used") +} + +// TestGetData_PRStoreAvailable_ReflectsLastFetchOutcome guards the P2 +// fix: PRStoreAvailable must encode runtime fetch health, not just +// "store is configured". When fetchPullRequests last failed against a +// configured store, GetData has to report PRStoreAvailable=false so the +// Lead Time calculator falls back to its Jira-only days path — +// otherwise the PR-backed path runs against an empty PR slice, every +// issue is classified C4, and the metric disappears. +func TestGetData_PRStoreAvailable_ReflectsLastFetchOutcome(t *testing.T) { + cases := []struct { + name string + store storage.Store + prFetchOK bool + want bool + }{ + {"no store configured: legacy minimal deployment", nil, true, false}, + {"store configured, last fetch ok: PR-backed path", prStoreStub{}, true, true}, + {"store configured, last fetch failed: must degrade", prStoreStub{}, false, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + c := &Collector{ + store: tc.store, + prFetchOK: tc.prFetchOK, + config: &config.Metrics{HistoricalDays: 90}, + } + data, err := c.GetData() + if err != nil { + t.Fatalf("GetData: %v", err) + } + if data.PRStoreAvailable != tc.want { + t.Errorf("PRStoreAvailable = %v, want %v", data.PRStoreAvailable, tc.want) + } + }) + } +} + +// TestParseVersionName guards the invalid-component fix: names that do +// not carry a component-X.Y.Z pattern must return an empty component +// instead of the legacy whole-name fallback that polluted every +// per-component metric with buckets like "0.3" or "v2.1". +func TestParseVersionName(t *testing.T) { + cases := []struct { + name string + wantComponent string + major, minor, patch int + }{ + // Valid names — component extracted. + {"argo-cd-2.9.0", "argo-cd", 2, 9, 0}, + {"tektoncd-operator-v4.6.3", "tektoncd-operator", 4, 6, 3}, + {"harbor 1.2.3", "harbor", 1, 2, 3}, + {"connectors-operator-1.2.3-rc1", "connectors-operator", 1, 2, 3}, + // Legacy / invalid names — no component. + {"0.3", "", 0, 0, 0}, + {"v2.1", "", 0, 0, 0}, + {"1.0", "", 0, 0, 0}, + {"Sprint 2024", "", 0, 0, 0}, + {"", "", 0, 0, 0}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + component, major, minor, patch := parseVersionName(tc.name) + if component != tc.wantComponent || major != tc.major || minor != tc.minor || patch != tc.patch { + t.Errorf("parseVersionName(%q) = (%q, %d, %d, %d), want (%q, %d, %d, %d)", + tc.name, component, major, minor, patch, + tc.wantComponent, tc.major, tc.minor, tc.patch) + } + }) + } +} + +// TestDropInvalidComponentReleases covers both invalid classes: empty +// component (unparsable version name) and metrics.exclude_plugins (D6 — +// v3-era plugins). +func TestDropInvalidComponentReleases(t *testing.T) { + c := &Collector{ + config: &config.Metrics{ + ExcludePlugins: []string{"katanomi", "knative", "jenkins", "tekton-operator"}, + }, + logger: zap.NewNop(), + } + in := []models.EnrichedRelease{ + {Name: "tektoncd-operator-v4.6.3", Component: "tektoncd-operator"}, + {Name: "0.3", Component: ""}, // unparsable → dropped + {Name: "katanomi-v3.1.0", Component: "katanomi"}, // D6 → dropped + {Name: "tekton-operator-v3.20.0", Component: "tekton-operator"}, // D6 → dropped + {Name: "argo-cd-2.9.0", Component: "argo-cd"}, + } + got := c.dropInvalidComponentReleases(in) + want := []string{"tektoncd-operator", "argo-cd"} + gotComponents := make([]string, 0, len(got)) + for _, r := range got { + gotComponents = append(gotComponents, r.Component) + } + if !reflect.DeepEqual(gotComponents, want) { + t.Errorf("kept components = %v, want %v", gotComponents, want) + } +} + +// TestDropExcludedComponents covers the issue-side path: D6 plugins are +// removed from issue component lists while everything else is kept. +func TestDropExcludedComponents(t *testing.T) { + c := &Collector{config: &config.Metrics{ExcludePlugins: []string{"katanomi", "jenkins"}}} + cases := []struct { + name string + in []string + want []string + }{ + {"mixed", []string{"katanomi", "tektoncd-operator", "jenkins"}, []string{"tektoncd-operator"}}, + {"all excluded", []string{"katanomi"}, []string{}}, + {"none excluded", []string{"argo-cd"}, []string{"argo-cd"}}, + {"empty", nil, nil}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := c.dropExcludedComponents(tc.in); !reflect.DeepEqual(got, tc.want) { + t.Errorf("dropExcludedComponents(%v) = %v, want %v", tc.in, got, tc.want) + } + }) + } +} diff --git a/roadmap-planner/backend/internal/metrics/models/metrics.go b/roadmap-planner/backend/internal/metrics/models/metrics.go index 58b17100..4cbcdb6c 100644 --- a/roadmap-planner/backend/internal/metrics/models/metrics.go +++ b/roadmap-planner/backend/internal/metrics/models/metrics.go @@ -77,11 +77,24 @@ type MetricFilters struct { // CalculationContext provides all data needed for metric calculation type CalculationContext struct { - Releases []EnrichedRelease - Epics []EnrichedIssue - Issues []EnrichedIssue - TimeRange TimeRange - Filters MetricFilters + Releases []EnrichedRelease + Epics []EnrichedIssue + Issues []EnrichedIssue + PullRequests []EnrichedPR // DORA Phase 2 — required for Lead Time 3-stage attribution + // PRStoreAvailable is true when the collector has a configured + // storage backend that can serve PR data. Distinguishes "no PR + // store configured" (legacy / minimal deployments) from "store + // configured but the slice is empty for this window" — the Lead + // Time calculator falls back to a Jira-only path in the former + // case to avoid silently dropping the whole metric. + PRStoreAvailable bool + TimeRange TimeRange + Filters MetricFilters + // Options carries per-request flags (e.g. include_bots, with_trend + // for Lead Time) that override the calculator's startup defaults. + // Calculators should read these first and fall back to GetBoolOption + // only when the key is absent here. + Options map[string]interface{} } // EnrichedRelease represents a Jira version/release with additional metadata @@ -116,6 +129,23 @@ type EnrichedIssue struct { PillarID string `json:"pillar_id,omitempty"` } +// EnrichedPR is the minimal slice of pull_requests rows the Lead Time +// calculator needs. We deliberately keep this narrow — additional +// fields (additions, deletions, author_id, etc.) can be added when +// future calculators need them. JiraKey is the linker output from W5; +// pre-Phase-2 PRs may have NULL FirstCommitAt and the calculator's +// fallback matrix (plan §4) handles that. +type EnrichedPR struct { + ID string `json:"id"` + Source string `json:"source"` // "github" | "gitlab" + JiraKey string `json:"jira_key"` // W5 linker — empty if unmatched + AuthorLogin string `json:"author_login"` + IsBot bool `json:"is_bot"` // derived from author_id == "bot" + CreatedAt time.Time `json:"created_at"` // T1 for the Dev → Review boundary + FirstCommitAt *time.Time `json:"first_commit_at"` // T0 for Dev start (Phase 2 nullable) + MergedAt *time.Time `json:"merged_at"` // T2 for Review → Release boundary +} + // PrometheusMetricDesc describes a Prometheus metric type PrometheusMetricDesc struct { Name string diff --git a/roadmap-planner/backend/internal/metrics/prometheus.go b/roadmap-planner/backend/internal/metrics/prometheus.go index f00a652b..2c8dba5c 100644 --- a/roadmap-planner/backend/internal/metrics/prometheus.go +++ b/roadmap-planner/backend/internal/metrics/prometheus.go @@ -219,7 +219,7 @@ func (e *PrometheusExporter) Update(ctx context.Context) error { filters := models.MetricFilters{} // Update release frequency - if results, err := e.service.CalculateMetric(ctx, "release_frequency", filters, models.TimeRange{}); err == nil { + if results, err := e.service.CalculateMetric(ctx, "release_frequency", filters, models.TimeRange{}, nil); err == nil { for _, r := range results { component := r.Labels["component"] if component == "" { @@ -233,7 +233,7 @@ func (e *PrometheusExporter) Update(ctx context.Context) error { } // Update lead time - if results, err := e.service.CalculateMetric(ctx, "lead_time_to_release", filters, models.TimeRange{}); err == nil { + if results, err := e.service.CalculateMetric(ctx, "lead_time_to_release", filters, models.TimeRange{}, nil); err == nil { for _, r := range results { component := r.Labels["component"] if component == "" { @@ -247,7 +247,7 @@ func (e *PrometheusExporter) Update(ctx context.Context) error { } // Update cycle time - if results, err := e.service.CalculateMetric(ctx, "cycle_time", filters, models.TimeRange{}); err == nil { + if results, err := e.service.CalculateMetric(ctx, "cycle_time", filters, models.TimeRange{}, nil); err == nil { for _, r := range results { component := r.Labels["component"] if component == "" { @@ -261,7 +261,7 @@ func (e *PrometheusExporter) Update(ctx context.Context) error { } // Update patch ratio - if results, err := e.service.CalculateMetric(ctx, "patch_ratio", filters, models.TimeRange{}); err == nil { + if results, err := e.service.CalculateMetric(ctx, "patch_ratio", filters, models.TimeRange{}, nil); err == nil { for _, r := range results { component := r.Labels["component"] if component == "" { @@ -275,7 +275,7 @@ func (e *PrometheusExporter) Update(ctx context.Context) error { } // Update time to patch - if results, err := e.service.CalculateMetric(ctx, "time_to_patch", filters, models.TimeRange{}); err == nil { + if results, err := e.service.CalculateMetric(ctx, "time_to_patch", filters, models.TimeRange{}, nil); err == nil { for _, r := range results { component := r.Labels["component"] if component == "" { diff --git a/roadmap-planner/backend/internal/metrics/service.go b/roadmap-planner/backend/internal/metrics/service.go index ab3b201c..d7c922fd 100644 --- a/roadmap-planner/backend/internal/metrics/service.go +++ b/roadmap-planner/backend/internal/metrics/service.go @@ -80,8 +80,10 @@ func (s *Service) ListAvailableMetrics() []models.MetricInfo { return s.registry.ListMetricInfo() } -// CalculateMetric calculates a specific metric with the given filters -func (s *Service) CalculateMetric(ctx context.Context, name string, filters models.MetricFilters, timeRange models.TimeRange) ([]models.MetricResult, error) { +// CalculateMetric calculates a specific metric with the given filters. +// opts carries optional per-request flags (e.g. include_bots, with_trend +// for Lead Time); pass nil when none are supplied. +func (s *Service) CalculateMetric(ctx context.Context, name string, filters models.MetricFilters, timeRange models.TimeRange, opts map[string]interface{}) ([]models.MetricResult, error) { calc, exists := s.registry.Get(name) if !exists { return nil, fmt.Errorf("metric %s not found", name) @@ -98,6 +100,9 @@ func (s *Service) CalculateMetric(ctx context.Context, name string, filters mode if !timeRange.Start.IsZero() || !timeRange.End.IsZero() { data.TimeRange = timeRange } + if len(opts) > 0 { + data.Options = opts + } // Calculate metric results, err := calc.Calculate(ctx, data) diff --git a/roadmap-planner/backend/internal/storage/first_commit_backfill_test.go b/roadmap-planner/backend/internal/storage/first_commit_backfill_test.go new file mode 100644 index 00000000..b9cbce28 --- /dev/null +++ b/roadmap-planner/backend/internal/storage/first_commit_backfill_test.go @@ -0,0 +1,156 @@ +/* +Copyright 2024 The AlaudaDevops Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package storage + +import ( + "context" + "path/filepath" + "testing" + "time" +) + +// TestListMergedPRsMissingFirstCommit_Backfill guards the post-0009 +// backfill path. The query must return: +// - only the requested source (no cross-platform leak), +// - only merged PRs (open / closed-without-merge are excluded), +// - only rows still missing first_commit_at, +// - only rows whose merged_at is within the window, +// - at most `limit` rows, ordered merged_at DESC (recent history first). +// UpdatePRFirstCommitAt should populate the column so a follow-up +// listing returns one fewer row — that is the "convergence" property +// the syncer relies on. +func TestListMergedPRsMissingFirstCommit_Backfill(t *testing.T) { + dir := t.TempDir() + store, err := OpenSQLite(filepath.Join(dir, "test.db")) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { _ = store.Close() }) + + ctx := context.Background() + if err := store.Migrate(ctx); err != nil { + t.Fatalf("migrate: %v", err) + } + + now := time.Now().UTC().Truncate(time.Second) + merged60d := now.AddDate(0, 0, -60) + merged30d := now.AddDate(0, 0, -30) + merged1d := now.AddDate(0, 0, -1) + existingFirstCommit := merged30d.Add(-2 * 24 * time.Hour) + + prs := []PullRequest{ + // In window, gitlab, missing first_commit_at — must show up. + { + ID: "org/proj!10", Source: "gitlab", RepoID: "org/proj", Number: 10, + Title: "gl old", State: "merged", + CreatedAt: merged60d.Add(-time.Hour), MergedAt: &merged60d, FetchedAt: now, + }, + // In window, github, missing first_commit_at — must show up. + { + ID: "org/repo#11", Source: "github", RepoID: "org/repo", Number: 11, + Title: "gh recent", State: "merged", + CreatedAt: merged1d.Add(-time.Hour), MergedAt: &merged1d, FetchedAt: now, + }, + // In window, github, ALREADY has first_commit_at — must be excluded. + { + ID: "org/repo#12", Source: "github", RepoID: "org/repo", Number: 12, + Title: "gh with commit", State: "merged", + CreatedAt: merged30d.Add(-time.Hour), + FirstCommitAt: &existingFirstCommit, + MergedAt: &merged30d, FetchedAt: now, + }, + // Open (no merged_at) — must be excluded even though first_commit_at is NULL. + { + ID: "org/repo#13", Source: "github", RepoID: "org/repo", Number: 13, + Title: "gh open", State: "open", + CreatedAt: now.Add(-time.Hour), FetchedAt: now, + }, + // Merged BEFORE the window — must be excluded. + { + ID: "org/repo#14", Source: "github", RepoID: "org/repo", Number: 14, + Title: "gh ancient", State: "merged", + CreatedAt: now.AddDate(-1, 0, 0), MergedAt: ptrTime(now.AddDate(-1, 0, 0)), FetchedAt: now, + }, + // In window, github, also missing — second github row to exercise ordering. + { + ID: "org/repo#15", Source: "github", RepoID: "org/repo", Number: 15, + Title: "gh mid", State: "merged", + CreatedAt: merged30d.Add(-time.Hour), MergedAt: &merged30d, FetchedAt: now, + }, + } + if err := store.UpsertPullRequests(ctx, prs); err != nil { + t.Fatalf("seed PRs: %v", err) + } + + windowStart := now.AddDate(0, 0, -180) + + // --- GitHub source: exactly the two NULL rows, recent first. --- + gh, err := store.ListMergedPRsMissingFirstCommit(ctx, "github", windowStart, 10) + if err != nil { + t.Fatalf("list github: %v", err) + } + if len(gh) != 2 { + t.Fatalf("github rows = %d, want 2 (excludes #12 with commit, #13 open, #14 out-of-window)", len(gh)) + } + if gh[0].ID != "org/repo#11" || gh[1].ID != "org/repo#15" { + t.Errorf("github order = [%s, %s], want [org/repo#11, org/repo#15] (DESC by merged_at)", + gh[0].ID, gh[1].ID) + } + + // --- GitLab source: one row, no cross-platform leak. --- + gl, err := store.ListMergedPRsMissingFirstCommit(ctx, "gitlab", windowStart, 10) + if err != nil { + t.Fatalf("list gitlab: %v", err) + } + if len(gl) != 1 || gl[0].ID != "org/proj!10" { + t.Fatalf("gitlab rows = %+v, want [org/proj!10] only", gl) + } + + // --- limit honoured. --- + limited, err := store.ListMergedPRsMissingFirstCommit(ctx, "github", windowStart, 1) + if err != nil { + t.Fatalf("list limited: %v", err) + } + if len(limited) != 1 { + t.Fatalf("limited rows = %d, want 1", len(limited)) + } + + // --- UpdatePRFirstCommitAt removes a row from the result set. --- + commitDate := merged1d.Add(-3 * 24 * time.Hour) + if err := store.UpdatePRFirstCommitAt(ctx, "org/repo#11", &commitDate); err != nil { + t.Fatalf("update first_commit_at: %v", err) + } + after, err := store.ListMergedPRsMissingFirstCommit(ctx, "github", windowStart, 10) + if err != nil { + t.Fatalf("list after update: %v", err) + } + if len(after) != 1 || after[0].ID != "org/repo#15" { + t.Fatalf("after update = %+v, want only org/repo#15 (the still-NULL row)", after) + } + // Sanity: the update actually persisted the value (so a future + // listing won't oscillate). + listed, err := store.ListPullRequestsSince(ctx, windowStart) + if err != nil { + t.Fatalf("list since: %v", err) + } + var got *time.Time + for _, p := range listed { + if p.ID == "org/repo#11" { + got = p.FirstCommitAt + break + } + } + if got == nil || !got.Equal(commitDate) { + t.Errorf("persisted first_commit_at = %v, want %v", got, commitDate) + } +} + +func ptrTime(t time.Time) *time.Time { return &t } diff --git a/roadmap-planner/backend/internal/storage/generic.go b/roadmap-planner/backend/internal/storage/generic.go index ccf301e0..8a94ba37 100644 --- a/roadmap-planner/backend/internal/storage/generic.go +++ b/roadmap-planner/backend/internal/storage/generic.go @@ -124,9 +124,9 @@ func (s *genericStore) UpsertPullRequests(ctx context.Context, prs []PullRequest INSERT INTO pull_requests ( id, source, repo_id, number, title, state, author_id, author_login, head_branch, base_branch, additions, deletions, changed_files, - jira_key, created_at, first_review_at, first_human_review_at, + jira_key, created_at, first_commit_at, first_review_at, first_human_review_at, merged_at, closed_at, fetched_at - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(id) DO UPDATE SET source = excluded.source, title = excluded.title, @@ -137,6 +137,7 @@ func (s *genericStore) UpsertPullRequests(ctx context.Context, prs []PullRequest deletions = excluded.deletions, changed_files = excluded.changed_files, jira_key = excluded.jira_key, + first_commit_at = COALESCE(excluded.first_commit_at, pull_requests.first_commit_at), first_review_at = excluded.first_review_at, first_human_review_at = excluded.first_human_review_at, merged_at = excluded.merged_at, @@ -155,7 +156,7 @@ func (s *genericStore) UpsertPullRequests(ctx context.Context, prs []PullRequest _, err := stmt.ExecContext(ctx, p.ID, source, p.RepoID, p.Number, p.Title, p.State, nullable(p.AuthorID), nullable(p.AuthorLogin), nullable(p.HeadBranch), nullable(p.BaseBranch), p.Additions, p.Deletions, p.ChangedFiles, - nullable(p.JiraKey), p.CreatedAt, p.FirstReviewAt, p.FirstHumanReviewAt, p.MergedAt, p.ClosedAt, p.FetchedAt, + nullable(p.JiraKey), p.CreatedAt, p.FirstCommitAt, p.FirstReviewAt, p.FirstHumanReviewAt, p.MergedAt, p.ClosedAt, p.FetchedAt, ) if err != nil { return fmt.Errorf("upsert pr %s: %w", p.ID, err) @@ -164,6 +165,99 @@ func (s *genericStore) UpsertPullRequests(ctx context.Context, prs []PullRequest return tx.Commit() } +func (s *genericStore) ListPullRequestsSince(ctx context.Context, since time.Time) ([]PullRequest, error) { + q := rebind(s.d, ` + SELECT id, source, repo_id, number, title, state, + COALESCE(author_id, '') AS author_id, + COALESCE(author_login, '') AS author_login, + COALESCE(head_branch, '') AS head_branch, + COALESCE(base_branch, '') AS base_branch, + COALESCE(additions, 0) AS additions, + COALESCE(deletions, 0) AS deletions, + COALESCE(changed_files, 0) AS changed_files, + COALESCE(jira_key, '') AS jira_key, + created_at, first_commit_at, first_review_at, first_human_review_at, + merged_at, closed_at, fetched_at + FROM pull_requests + WHERE merged_at IS NOT NULL AND merged_at >= ? + ORDER BY merged_at ASC`) + rows, err := s.db.QueryContext(ctx, q, since) + if err != nil { + return nil, fmt.Errorf("list PRs since %s: %w", since.Format(time.RFC3339), err) + } + defer rows.Close() + out := make([]PullRequest, 0, 256) + for rows.Next() { + var p PullRequest + if err := rows.Scan( + &p.ID, &p.Source, &p.RepoID, &p.Number, &p.Title, &p.State, + &p.AuthorID, &p.AuthorLogin, &p.HeadBranch, &p.BaseBranch, + &p.Additions, &p.Deletions, &p.ChangedFiles, &p.JiraKey, + &p.CreatedAt, &p.FirstCommitAt, &p.FirstReviewAt, &p.FirstHumanReviewAt, + &p.MergedAt, &p.ClosedAt, &p.FetchedAt, + ); err != nil { + return nil, fmt.Errorf("scan PR: %w", err) + } + out = append(out, p) + } + return out, rows.Err() +} + +func (s *genericStore) ListMergedPRsMissingFirstCommit(ctx context.Context, source string, since time.Time, limit int) ([]PullRequest, error) { + if limit <= 0 { + return nil, nil + } + q := rebind(s.d, ` + SELECT id, source, repo_id, number, title, state, + COALESCE(author_id, '') AS author_id, + COALESCE(author_login, '') AS author_login, + COALESCE(head_branch, '') AS head_branch, + COALESCE(base_branch, '') AS base_branch, + COALESCE(additions, 0) AS additions, + COALESCE(deletions, 0) AS deletions, + COALESCE(changed_files, 0) AS changed_files, + COALESCE(jira_key, '') AS jira_key, + created_at, first_commit_at, first_review_at, first_human_review_at, + merged_at, closed_at, fetched_at + FROM pull_requests + WHERE source = ? + AND merged_at IS NOT NULL + AND merged_at >= ? + AND first_commit_at IS NULL + ORDER BY merged_at DESC + LIMIT ?`) + rows, err := s.db.QueryContext(ctx, q, source, since, limit) + if err != nil { + return nil, fmt.Errorf("list PRs missing first_commit_at (%s since %s): %w", + source, since.Format(time.RFC3339), err) + } + defer rows.Close() + out := make([]PullRequest, 0, limit) + for rows.Next() { + var p PullRequest + if err := rows.Scan( + &p.ID, &p.Source, &p.RepoID, &p.Number, &p.Title, &p.State, + &p.AuthorID, &p.AuthorLogin, &p.HeadBranch, &p.BaseBranch, + &p.Additions, &p.Deletions, &p.ChangedFiles, &p.JiraKey, + &p.CreatedAt, &p.FirstCommitAt, &p.FirstReviewAt, &p.FirstHumanReviewAt, + &p.MergedAt, &p.ClosedAt, &p.FetchedAt, + ); err != nil { + return nil, fmt.Errorf("scan PR: %w", err) + } + out = append(out, p) + } + return out, rows.Err() +} + +func (s *genericStore) UpdatePRFirstCommitAt(ctx context.Context, id string, firstCommitAt *time.Time) error { + q := rebind(s.d, `UPDATE pull_requests SET first_commit_at = ? WHERE id = ?`) + _, err := s.db.ExecContext(ctx, q, firstCommitAt, id) + if err != nil { + return fmt.Errorf("update first_commit_at for %s: %w", id, err) + } + return nil +} + func (s *genericStore) UpsertPRReviews(ctx context.Context, reviews []PRReview) error { if len(reviews) == 0 { return nil diff --git a/roadmap-planner/backend/internal/storage/migrations/0009_pr_first_commit_at.sql b/roadmap-planner/backend/internal/storage/migrations/0009_pr_first_commit_at.sql new file mode 100644 index 00000000..0b7c34df --- /dev/null +++ b/roadmap-planner/backend/internal/storage/migrations/0009_pr_first_commit_at.sql @@ -0,0 +1,34 @@ +-- ---------------------------------------------------------------------- +-- 0009_pr_first_commit_at — DORA Lead Time Phase 2 (2026-05-21). +-- +-- Adds pull_requests.first_commit_at so the Lead Time calculator can +-- measure from "first commit author date" rather than PR.created_at. +-- The squash-merge workflow rewrites commit author dates on the local +-- ref, so we fetch this value once at sync time via the platform PR +-- commits API: +-- +-- GitHub: GET /repos/{owner}/{repo}/pulls/{number}/commits +-- GitLab: GET /projects/{id}/merge_requests/{iid}/commits +-- +-- and take the earliest commit's author.date. +-- +-- This is the only schema change for the DORA Lead Time 3-stage +-- attribution work (Dev → Review → Release, see +-- docs/plans/2026-05-21-dora-optimization-plan.md §Phase 2). All other +-- inputs (pull_requests.jira_key from W5, issue_snapshots.versions +-- from W1, in-memory data.Releases) are already in place. +-- +-- pull_requests.first_commit_at +-- Earliest commit author date on the PR. Nullable because: +-- (a) Backfill is incremental — historical PRs populate over the +-- first sync cycles after this migration applies. +-- (b) Some PRs (e.g. forks with squash-merged history) may not +-- expose the original commits even via the API. +-- (c) The Lead Time calculator handles NULL through the C2/C3 +-- fallback rules in the plan's detailed design §4. +-- No index — column is read per-issue inside the calculator after +-- the existing idx_pr_jira lookup has narrowed to that issue's +-- linked PRs (typically <10 rows). +-- ---------------------------------------------------------------------- + +ALTER TABLE pull_requests ADD COLUMN first_commit_at TIMESTAMP; diff --git a/roadmap-planner/backend/internal/storage/store.go b/roadmap-planner/backend/internal/storage/store.go index b0c2fbd8..a9414f52 100644 --- a/roadmap-planner/backend/internal/storage/store.go +++ b/roadmap-planner/backend/internal/storage/store.go @@ -49,6 +49,25 @@ type Store interface { // GitHub. UpsertPullRequests(ctx context.Context, prs []PullRequest) error UpsertPRReviews(ctx context.Context, reviews []PRReview) error + // ListPullRequestsSince returns all PRs (github + gitlab) that merged + // on or after `since`. Open / closed-without-merge rows are excluded — + // the DORA Lead Time calculator only attributes shipped work. + ListPullRequestsSince(ctx context.Context, since time.Time) ([]PullRequest, error) + // ListMergedPRsMissingFirstCommit returns up to `limit` merged PRs + // for the given source that still have first_commit_at IS NULL and + // merged on or after `since`. Drives the post-0009 backfill: rows + // ingested before migration 0009 (or by an older syncer build) keep + // NULL first_commit_at until a later sync touches them, which the + // incremental PR fetch can never do for already-merged history. + // Without backfill those rows fall to Lead Time C3 (no Dev stage), + // distorting the metric on deployments upgraded across 0009. + // Ordered by merged_at DESC so recent history is rehydrated first. + ListMergedPRsMissingFirstCommit(ctx context.Context, source string, since time.Time, limit int) ([]PullRequest, error) + // UpdatePRFirstCommitAt writes a single first_commit_at value + // without touching any other column. Use for backfill paths — full + // UpsertPullRequests would require re-fetching all PR fields just to + // move one column. + UpdatePRFirstCommitAt(ctx context.Context, id string, firstCommitAt *time.Time) error // Members. UpsertMember(ctx context.Context, m Member) error @@ -125,6 +144,7 @@ type PullRequest struct { ChangedFiles int JiraKey string // any Jira key matched by the linker; was misnamed `EpicKey` pre-W5 CreatedAt time.Time + FirstCommitAt *time.Time // DORA Phase 2: MIN(commit.author.date) from PR/MR commits API FirstReviewAt *time.Time FirstHumanReviewAt *time.Time // W2: MIN(submitted_at) over non-bot reviews MergedAt *time.Time diff --git a/roadmap-planner/docs/plans/2026-05-21-dora-optimization-plan.md b/roadmap-planner/docs/plans/2026-05-21-dora-optimization-plan.md new file mode 100644 index 00000000..8f9b0a2a --- /dev/null +++ b/roadmap-planner/docs/plans/2026-05-21-dora-optimization-plan.md @@ -0,0 +1,758 @@ +--- +title: DORA Optimization — Engineering Delivery Dashboard +type: feat +status: handoff +date: 2026-05-21 +companion: docs/team-analytics/dora-optimization-mockup.html +brainstorm_partial_merge: docs/brainstorms/2026-05-21-dora-drilldown-brainstorm.md +--- + +# DORA Optimization — Engineering Delivery Dashboard + +> **本文档目的**:把 DORA Lead Time 优化的**决策、数据现状、mock 形态、未完成项**整理成可交接的 plan。接手者拿这份文档 + 配套 mock 即可进入实施阶段。 +> +> **本文档是逻辑 spec**(算法、字段、数据源、UI 形态、文件位置);配套 mock 是视觉 spec。 + +## Overview + +把 `MetricsDashboard.jsx` 的 **Lead Time for Changes** 卡片从单一数字升级到 **Dev → Review → Release** 3 段拆分 + 瓶颈高亮 + hover worst_issues,再加一个独立 9 月趋势 panel。严守 DORA 经典语义(first commit → release,不含 backlog)。 + +> Deployment Frequency / Change Failure Rate / Mean Time to Recovery / Cycle Time 4 个指标 **calculator + UI 完全不动**(见 §Decisions D2)。本 plan 的实施范围聚焦在 Lead Time 一个指标。 + +**前置事实**(决策依赖): +- 上架 AC 时间 == Jira version releaseDate(团队纪律保证) +- DEVOPS Jira project 版本命名规范:`{component}-v{semver}`(如 `tektoncd-operator-v4.6.3`) +- 排除 v3 旧插件:katanomi / knative / jenkins(不在 v4 团队关注范围) + +## Problem Statement + +`MetricsDashboard.jsx` 的 Lead Time 现状偏差: + +| 维度 | 现状 | 目标 | +|---|---|---| +| 起点 | `Epic.created`(含 backlog 等待) | first commit author date(纯工程交付) | +| 展示形态 | **单一数字** | **3 段拆分(Dev/Review/Release)+ 瓶颈高亮 + 9 月趋势 + MoM/QoQ** | +| 行动信号 | 数字变化时看不到瓶颈在哪段、是否在恶化 | 卡片直接指向需要改进的 stage + worst issues | + +`Epic.created` 起点被"epic 早建但晚开始"扭曲;单一数字让团队"看到指标动了但不知道改哪儿"。 + +## Ground truth (2026-05-21) + +实施前已验证的项目实际状态,作为算法 / 数据流的事实输入。 + +### DEVOPS issuetype 分布(挂 fix_version 的 issue) + +``` +206 Story +151 Bug + 98 Technical Debt + 21 Job + 11 Epic ← 仅 2.2% + 11 Document + 1 Sub-task + 1 Improvement +``` + +**结论**:`Epic` 几乎不挂 fix_version,主流是 `Story / Bug / Technical Debt`(共 91%)。**计算粒度必须用通用 "issue",不是 "epic"**。Lead Time worst_issues 应按 issue_type 分组展示。 + +### Probe A · fix_version 命名实证 + +| Pattern | 示例 | 处置 | +|---|---|---| +| `tektoncd-operator-v4.X.X` | `tektoncd-operator-v4.6.3` | v4 主流,**计入**统计 | +| `tekton-operator-v3.X.X` | `tekton-operator-v3.20.0` | v3 旧系列,**排除**(D6) | +| 简单数字 / `v2.X` | `0.3`, `v2.1`, `1.0` | 老命名(早期 issue),component filter 上不显示 | + +Component filter 解析正则:`^([a-z][a-z0-9-]+)-v(\d+(\.\d+)*)$` 提取 component 名。 + +### Probe B · `pull_requests.jira_key` 字段实证 + +- **来源**:`backend/internal/storage/migrations/0006_rename_epic_key_to_jira_key.sql`(**W5**,2026-05-19)。**不是 W2**——plan 之前所有"W2 jira_key"应理解为 W5 +- **schema**:单值 `TEXT`(从 `epic_key` RENAME 来;W5 commit message 解释了原因 = 关联的不只 Epic) +- **索引**:`idx_pr_jira ON pull_requests(jira_key)` +- **多 issue 关联**:schema 不支持,单 PR 只能关联 1 个 jira_key → 详细设计 E2"1 PR 引用多 issue"实际**几乎不发生**,简化为退化场景 + +### Probe B · Lead Time 数据流(3 跳 join,已验证全部就位) + +``` +pull_requests.first_commit_at ──┐ + └─ Phase 2 新增字段(唯一新增) │ +pull_requests.jira_key (W5) ──┼──→ T0..T3 计算 + └─ TEXT 单值 │ +issue_snapshots.versions (0001) ──┤ + └─ JSON array of version names │ +data.Releases (in-memory cache) ──┘ + └─ Jira API pull · Version{ReleaseDate, Released, Archived} +``` + +**关键事实**: +- `issue_snapshots` 表已存 fix_version names(`versions` 列,JSON array),无需新增表 +- Jira version 的 `ReleaseDate` **不持久化 SQL**——存在 collector 内存 cache `data.Releases` 里,calculator 通过 `CalculationContext` 读取(参考 `release_frequency.go:50-65`) +- Lead Time calculator 拼接:取 issue_snapshots 最新 run 的 issue → versions[0] 名字 → 在 data.Releases 里查同名 version 的 ReleaseDate +- 注意 `models.Version.Archieved` 字段(typo 在源码中) + +### Probe B · pull_requests 现有 schema + +```sql +-- 0001_init.sql + 0002-0008 演化结果 +pull_requests ( + id, repo_id, number, title, state, author_id, + head_branch, base_branch, additions, deletions, changed_files, + jira_key, -- W5 改名自 epic_key + created_at, first_review_at, merged_at, closed_at, fetched_at, + first_human_review_at -- 0005 (W2) + -- ↓ Phase 2 新增 + -- first_commit_at TIMESTAMP NULL +) +``` + +### Probe 结论给 Phase 2 的影响 + +| 之前假设 | Probe 验证后 | +|---|---| +| 计算单位 = "epic" | **改为 "issue"**(详细设计已用,与 Probe A 一致)| +| W2 jira_key 字段 | **W5(0006 migration)** | +| jira_key 多值 | **单值 TEXT**(E2 简化)| +| 需要新增 issue/version 表 | **不需要**——issue_snapshots + data.Releases 已就位 | +| 唯一新增字段 | **`pull_requests.first_commit_at`** 确认 | +| Phase 2 schema migration 编号 | **0009_pr_first_commit_at.sql**(沿用 0001-0008 命名约定)| + +## Scope + +### What's in + +- **Lead Time 起点改**:从 `Epic.created` 改为 PR first commit author date(涉及 GitHub PR API + W5 `jira_key` 字段,见 §Ground truth) +- **Lead Time stage 3 段拆分**:Dev → Review → Release(严守 D3 起点 = first commit,**不含 Backlog**),calculator 输出 3 个 duration,UI 在 Lead Time 卡片内嵌 3 段水平 bar,瓶颈段高亮(D15),hover 展开 worst_issues(每 stage top-1) +- **Lead Time Trend Panel**:KPI grid **下方**独立 panel(不在卡片内)· 全宽 sparkline + 当月点高亮 + 端点日期标签 + 右上方向箭头 MoM/QoQ chip · 新建 `LeadTimeTrendPanel.jsx` +- **Lead Time 卡片 transparency footer**:单行显示 `linked-PR coverage X% · N excluded` +- **v3 plugin 排除**:katanomi / knative / jenkins 在数据采集层过滤 + +### What's out (explicit defer) + +- ❌ **其他 4 指标**(Deployment Frequency / Change Failure Rate / Mean Time to Recovery / Cycle Time):**calculator + UI 完全不动**(见 §Decisions D2) +- ❌ **Dashboard chrome 重塑**:顶部 masthead / tabs / filter bar 保持项目原状;不加 contributors / window 切换 chip;不新建 `MetricsTrend.jsx` / `MetricsNarrative.jsx`;不加底部 footnote +- ⚠️ **Drill-down 完整详情页**(部分 defer):本次仅吸收 Lead Time stage 3 段 + hover worst_issues。以下仍 defer 到独立 plan:Backlog 段拆分(brainstorm §4 的 4 段方案,跟 D3 不自洽)/ per-epic 全表 / 7d 滚动 leading signals / notable months 自动 flag / `/metric/` 独立路由 +- ❌ **Trend panel 内嵌 worst_issues / detail view**:panel 只展示 sparkline + chip,不混入其他信息密度 + + +## Decisions (frozen) + +记录所有已敲定的决策,避免走回头路。 + +| ID | Decision | Why | +|---|---|---| +| D1 | "上架 AC 时间 == Jira version releaseDate" 作为 Lead Time 终点的事实依据 | 团队纪律保证 | +| D2 | 其他 4 指标(Deployment Frequency / Change Failure Rate / Mean Time to Recovery / Cycle Time)**calculator + UI 完全不动** | 本 plan 范围明确聚焦 Lead Time;其他指标改造需要独立 plan | +| D3 | Lead Time 起点改为 PR first commit author date | Epic.created 含 backlog 等待,不反工程效能 | +| D6 | 数据采集层 + UI 层 **排除 v3 插件**:katanomi / knative / jenkins | v4 团队不关注,避免历史产物稀释水位 | +| D7 | UI 默认 **human only**,可切换含 bot | bot 占 71%,含 bot 数字漂亮但 actionable 弱 | +| D8 | UI 文本**全英文** + 全称(Deployment Frequency 等),禁止 DORA 缩写 | 团队读者不熟悉 DORA 缩写;跟 `MetricsDashboard.jsx` 原始 displayName 风格一致 | +| D14 | Lead Time **不含 Backlog 段**——严守 D3 起点(first commit),只输出 Dev / Review / Release 3 段;brainstorm §4 的 4 段方案(含 Backlog)跟 D3 矛盾,保留在 brainstorm 留待未来 drill-down plan | 方向 A 决议:按 DORA 经典语义严守 commit-centric 起点,Backlog 决策延迟可见度让位给指标自洽 | +| D15 | **瓶颈高亮规则**:某 stage 同时满足「占总时长 > 40%」**且**「该 stage duration ≥ 全员该 stage P75」时标红;只满足一项不标 | 单一阈值要么噪音多要么漏检;占比阈值找"主导段",P75 阈值过滤"整体都快但比例高"的伪瓶颈 | +| D16 | Stage 数字 / 瓶颈高亮跟随顶部 human/include-bots chip 切换;卡片左下角带小字 `human only` / `incl. bots` 标识当前模式 | bot PR 占 71%,自动 merge 会把 Review stage 大幅压低;不绑定 chip 会让 stage 数字跟头部 KPI 数字脱节 | +| D17 | Lead Time 的「9-month trend」放在 **KPI grid 下方独立 trend panel**(不嵌入卡片):全宽 sparkline(每月 total p50 一个点)+ 右上 **MoM / QoQ chip**(含方向箭头 + 百分比);不展示 stage 分趋势;不开 detail page;Lead Time 卡片本身**保持瘦**(不显示 trend section) | (1) 痛点 2「看不到月份环比」是 brainstorm §1 明确写的;(2) sparkline 在卡片内信息密度过高,2026-05-21 mockup 实测后决定外移;(3) 分趋势 / 详情页留给 future drill-down plan;(4) YoY 因窗口 9 月不计算 | + +## Dashboard layout (final spec) + +> **范围收紧** · 仅改造 Lead Time 卡片,**不动 dashboard chrome、不动其他卡片、不新增 panel**。Canonical 视觉态:`docs/team-analytics/dora-optimization-mockup.html`(mockup 用灰盒占位标识"unchanged · existing"区域,唯一渲染细节的是 Lead Time 卡片)。 + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ App chrome (existing · unchanged): │ +│ masthead · tabs · filter bar │ +│ ⇒ 跟主分支 baseline 像素级 diff 为零 │ +├─────────────────────────────────────────────────────────────────────┤ +│ KPI cards (5 columns; 现有 grid 不动): │ +│ ┌────────┐ ┌────────────┐ ┌────────┐ ┌────────┐ ┌────────┐ │ +│ │ Deploy │ │ Lead Time │ │Change │ │ MTTR │ │ Cycle │ │ +│ │ Freq │ │ ──(NEW)─── │ │Failure │ │unchang.│ │ Time │ │ +│ │unchang.│ │ 18d Medium │ │unchang.│ │existing│ │unchang.│ │ +│ │existing│ │ ───────── │ │existing│ │ │ │existing│ │ +│ │ │ │ ▰▰▰▰▰▰▰▰ │ │ │ │ │ │ │ │ +│ │ │ │ Dev/Rev⚠/ │ │ │ │ │ │ │ │ +│ │ │ │ Release │ │ │ │ │ │ │ │ +│ │ │ │ hover→worst│ │ │ │ │ │ │ │ +│ │ │ │ coverage96%│ │ │ │ │ │ │ │ +│ └────────┘ └────────────┘ └────────┘ └────────┘ └────────┘ │ +├─────────────────────────────────────────────────────────────────────┤ +│ Lead Time Trend (NEW panel · full-width · D17): │ +│ ┌───────────────────────────────────────────────────────────────┐ │ +│ │ Lead Time Trend · 9mo · total p50 ▼5% MoM ▲13% QoQ │ │ +│ │ │ │ +│ │ ╱╲___╱╲ │ │ +│ │ ╱ ╲___╱╲___╱╲ ╱╲ │ │ +│ │ ╱ ╲___╱ ╲___╱ ● ← current month │ │ +│ │ │ │ +│ │ 2025-09 10 11 12 01 02 03 04 2026-05 │ │ +│ └───────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────┘ + +⇒ Lead Time 卡片瘦身(不含 trend),独立 Trend Panel 单独占一行。 +⇒ 其他 4 卡 + 上下 chrome 全部保持项目原有渲染路径。 +⇒ 文件层:新建 LeadTimeTrendPanel.jsx;MetricsDashboard.jsx 仅 1 行 JSX 改动(append panel);MetricCard.jsx 按 metric.name 分支渲染。 +``` + +**Lead Time 卡片内部信息层级**(自上而下,**瘦身后 8 slot**): + +| Slot | 内容 | Spec 锚点 | +|---|---|---| +| 1 | Title `Lead Time for Changes` | existing | +| 2 | Value `18d` | existing | +| 3 | Unit `first commit → released · p50` + DORA band | D3 + D9 | +| 4 | separator | new | +| 5 | 3-stage horizontal bar (Dev / Review ⚠ / Release) | D14 + D15 + 详细设计 §6 | +| 6 | Stage legend (per-stage p50 hours + bottleneck flag) | D15 | +| 7 | "hover → worst issues per stage" cue | 详细设计 §7 | +| 8 | Footer `linked-PR coverage X% · N excluded` | Q1 a + 详细设计 §4 | +| (`:hover`) | worst_issues popup · 每 stage top-1 | 详细设计 §7 | + +**Lead Time Trend Panel 信息层级**(D17 修订 · 独立 panel): + +| Slot | 内容 | Spec 锚点 | +|---|---|---| +| 1 | Header 左:title `Lead Time Trend · 9 months · total p50` | D17 | +| 2 | Header 右:MoM chip + QoQ chip (▼/▲ + % + period label) | D17 | +| 3 | Main:inline SVG sparkline · 9 个点连线 · null 月虚线 · 当月点 accent 高亮 | §11 | +| 4 | Axis:左 `2025-09` 右 `2026-05 ●` | §11 | +| 5 | Footnote(可选):`Lead Time 越短越好 · 负值方向 (▼) = 改善` | §11 | + +**其他 4 卡片** · 完全不动: +- Deploy Freq · Change Failure · MTTR · Cycle Time 渲染逻辑、字段、文案、CSS 跟主分支 baseline 像素级一致 +- 不加 band 渲染、不英文化 displayName、不加 placeholder/proxy/unchanged 视觉标识(mockup 里灰盒占位**只是文档说明**,不是 React 渲染目标) + +## Phase split + +### Phase 0 · backend 启动 + Jira sync + PR 数据采集 + +**目的**:让 Lead Time 计算依赖的所有数据源真的能流通——Jira issue/version 同步 + GitHub/GitLab PR 同步 + 新字段 `first_commit_at` 填充。 + +**改动**: +1. Backend `.env`:填入 `JIRA_BASE_URL=https://jira.alauda.cn`、`JIRA_USERNAME=systembot`、`JIRA_PASSWORD=`、`JIRA_PROJECT=DEVOPS` +2. `backend/internal/config/config.go`:把 `storage.enabled` 默认改为 `true`(或新建 production yaml) +3. `backend/internal/metrics/collector.go`:确认 collector 启动时跑一次 Jira + PR sync +4. **新增 plug exclusion filter**(D6):`metrics.exclude_plugins: [katanomi, knative, jenkins]`,collector 在拉 versions 时按 name regex 过滤 +5. **数据归档**:把采集的 Tekton 14 仓库 PR + Jira version 样本归档到 `backend/testdata/`(作为离线测试数据) + +**验收**: +- `curl /api/metrics/lead_time_to_release?component=tektoncd-operator` 返回非空 `metadata.stages` +- `SELECT COUNT(*) FROM pull_requests WHERE first_commit_at IS NULL` 在一轮 sync 后保持小数(< 5% 全量) + +### Phase 1 · Lead Time 卡片改造 + 独立 Trend Panel + +**目的**:在现有 `MetricsDashboard.jsx` 上:(1) **升级 Lead Time 卡片**(stage bar + 瓶颈高亮 + hover worst_issues + transparency footer,**不含** trend);(2) **在 KPI grid 下方插入独立 `LeadTimeTrendPanel`**(sparkline + MoM/QoQ chip)。**不动其他卡片、不动 dashboard chrome / masthead / tabs / filter bar**。 + +**范围红线**(与 Q1 a / Q2 b / D17 修订一致): +- ❌ 不重塑顶部 dash-head / masthead / tabs / filter bar +- ❌ 不加 contributors `human only / include bots` 切换 chip +- ❌ 不加 window 切换 chip +- ❌ **不新建** `MetricsTrend.jsx`(泛化柱图);仅新建专用 `LeadTimeTrendPanel.jsx` +- ❌ **不新建** `MetricsNarrative.jsx`(叙事 panel 不做) +- ❌ Change Failure / MTTR / Cycle Time / **Deploy Freq** 四卡视觉零改动 +- ❌ Component filter 形态、CSS 不改 + +**改动**(2 个文件修改 + 1 个新建): + +**1. 修改 `frontend/src/components/metrics/MetricCard.jsx`**——按 metric name 分支渲染: + +``` +if (metric.name === "lead_time"): + ┌─ title (existing) + ├─ value + band (existing) + ├─ ─────── (新增 separator) + ├─ stagebar (3 段 · 瓶颈高亮 ⚠) + ├─ stage legend (Dev / Review / Release · 各段 p50) + ├─ "hover → worst issues per stage" cue + ├─ footer · "linked-PR coverage X% · {N} excluded" ← Q1 a + └─ :hover → worst_issues popup (每 stage top-1) + ⇒ 卡片瘦身:不含 trend section(trend 移到独立 panel) + +else: + return ← 完全不动 +``` + +**2. 新建 `frontend/src/components/metrics/LeadTimeTrendPanel.jsx`**: + +``` +┌─ panel header (横向 flex) +│ ├─ 左:title "Lead Time Trend · 9 months · total p50" +│ └─ 右:MoM chip + QoQ chip +├─ 全宽 sparkline · 9 个点 · 当月点 accent 高亮 +├─ axis labels: 2025-09 ... 2026-05 +└─ optional footnote: "Lead Time 越短越好 · 负值 = 改善" +``` + +**3. 修改 `frontend/src/components/MetricsDashboard.jsx`**——**仅 1 行 JSX 改动**:在现有 KPI grid 下方 append ``。dashboard 顶部 / grid 渲染 / 其他卡片传 prop 全部不动。 + +UI 细节: +- **stage bar**:3 段 flex 比例按 `p50_hours`;瓶颈段按 D15 高亮(红边框 + ⚠ 角标) +- **MoM / QoQ chip**(panel 内):方向箭头 + 百分比;负数 (▼) = improved 绿色;正数 (▲) = degraded 红色;null = 灰色 "n/a" +- **sparkline**:inline SVG ``,9 月点连线,null 月画虚线,当月圆点放大 + accent 色,高度 80-100px +- **hover preview**:纯 CSS `:hover`(无 JS);浮层显示 worst_issues 每 stage top-1 +- **footer**:单行 mono 文本,coverage % 来自 API `coverage.coverage_pct` + +**API 调用变化**: +- Lead Time 单独请求 `?include_bots=false&component=...&with_trend=true`(trend 字段按 D17 §11) +- `MetricCard` (Lead Time) 用同一 API 响应的 `stages` + `worst_issues` + `coverage` 部分 +- `LeadTimeTrendPanel` 用同一 API 响应的 `trend` 部分(无需独立请求 → 复用 cache) +- 其他 4 个 metric 走原有 API endpoint,不变 + +**验收**: +- Lead Time **卡片**显示 3 段 bar + stage legend + footer + hover preview;**不**含 trend +- KPI grid **下方**显示独立 `Lead Time Trend` panel:sparkline + 当月点高亮 + MoM/QoQ chip + 端点日期 +- Deploy Freq / Change Failure / MTTR / Cycle Time 四卡跟主分支 baseline **像素级 diff 为零**(除 grid 高度自适应) +- dashboard chrome(masthead、tabs、filter bar)跟主分支 baseline **像素级 diff 为零** +- 浏览器访问 `/metrics` 第一屏跟 `dora-optimization-mockup.html` 视觉一致 + +### Phase 2 · Lead Time 起点改正 + 3 段 stage 拆分 + +**目的**:把 Lead Time 从 "Epic.created → releaseDate · 单一数字" 改为 "first commit → releaseDate · Dev / Review / Release 3 段拆分"。 + +**3 段定义**(方向 A · 严守 D3): + +| Stage | 起点 | 终点 | 来源字段 | +|---|---|---|---| +| Dev | `pull_requests.first_commit_at` | `pull_requests.created_at`(first PR opened on linked epic) | GitHub PR commits API | +| Review | `pull_requests.created_at`(first PR) | `pull_requests.merged_at`(last PR on linked epic) | 现有 PR sync | +| Release | `pull_requests.merged_at`(last PR) | Jira version `releaseDate` | 现有 Jira sync | + +**核心数据洞察**:3 段所需字段中 first_commit_at 是唯一新字段,其他全部已有。**不需要** Jira changelog 提取(Backlog 段在方向 A 下不存在,省掉 brainstorm 决策 7 关联的实现成本)。 + +**改动**: + +1. `backend/internal/metrics/calculators/lead_time.go`: + - 算法起点改为 `pull_requests.first_commit_at`(D3) + - 输出从单一 duration 改为 `{ total_hours, stages: [{name: "dev"|"review"|"release", duration_hours, p50, p75}], worst_issues: [...] }` + - 瓶颈识别按 D15:占比 >40% **且** ≥ P75 才标 bottleneck=true + - human/include_bots 参数透传,影响 worst_issues 排序和 stage 数字(D16) + +2. `backend/internal/github/`:新增 `ListPRCommits(repo, prNumber)` 客户端方法,调用 `GET /repos/{owner}/{repo}/pulls/{n}/commits` 取 first commit author date + +3. `backend/internal/gitlab/`:同上,GitLab MR 的 commits API(`GET /projects/:id/merge_requests/:iid/commits`) + +4. 数据模型扩展: + - **schema migration**:新建 `backend/internal/storage/migrations/0009_pr_first_commit_at.sql`,内容 = `ALTER TABLE pull_requests ADD COLUMN first_commit_at TIMESTAMP NULL`(SQLite + Postgres 同一份,沿用 0001-0008 命名约定) + - sync 时调 PR commits API 填充该字段;**retroactive backfill 复用现有 `BackfillDays` 配置**(不需要新建一次性脚本)——启动 sync 时以 `Since=now-BackfillDays` 拉 PR,每个 PR 触发一次 `ListPRCommits`/`ListMRCommits` 填字段;`COALESCE` UPSERT 保护已有值 + +5. PR ↔ Jira issue 关联(**已验证**,见 §Probe results): + - **3 跳 join**:`pull_requests.jira_key (W5, TEXT 单值)` → `issue_snapshots.versions[0]` (取最新 run 的 issue 快照;JSON array 第一个 name) → `data.Releases` (in-memory cache,含 ReleaseDate) + - **不需要新增表**:`issue_snapshots`(0001)已存 versions,`data.Releases` 由现有 collector cache + - **前置依赖**:`docs/plans/2026-05-20-feat-pr-jira-link-audit-plan.md` 的覆盖率 ≥ 70%(见 Risks R2) + +6. **Fallback 规则**(与详细设计 §4 矩阵一致,此处仅列实施要点;详细决策见详细设计 §4 C1-C6): + - 单 PR 缺 first_commit_at → 该 PR 标 NULL,calculator 在该 issue 内**取剩余 PR 的 min(first_commit_at)** 作 T0;该 issue 仍计入 3 段统计(C2) + - 该 issue 全部 PR 都缺 first_commit_at → 排除 Dev 段,Review 段起点改用 T1(C3) + - issue 无 linked PR → **完全排除统计**(不引入非 commit-centric 起点,严守 D14;C4) + - issue 无 released fix_version → 完全排除(C5) + - 任一 stage < 0 → 完全排除 + 记 `data_anomaly`(C6) + - 透明度:API 返回 `coverage: { full, partial, dev_missing, no_prs, unreleased, data_anomaly, coverage_pct }` + +7. **趋势计算**(D17 + §11):calculator 同时输出 `trend.points[9]` + `trend.mom_pct` + `trend.qoq_pct` + `trend.mom_direction` + `trend.qoq_direction`;month bucket 按 `fix_version.releaseDate` 分组;sample < 3 的月 `p50 = null` + +8. UI 数据交付: + - MetricCard Lead Time 卡片读 `stages` + `worst_issues` + `trend` + `coverage` prop(Phase 1 渲染逻辑已收 prop) + - 卡片 unit 文案 "first commit → released · p50" + - 卡片 footer transparency "linked-PR coverage X% · {N} excluded"(来自 `coverage.coverage_pct` + 排除 issue 数) + +**验收**: +- `curl /api/metrics/lead_time?include_bots=false&component=tektoncd-operator&with_trend=true` 返回 `{ total, stages: [3], worst_issues: [top 3 per stage], coverage: {...}, trend: { points: [9 月], mom_pct, qoq_pct, mom_direction, qoq_direction }, consistency_warning: ... }` +- Lead Time API 输出的 `total.p50_hours` 中位数明显高于 PR merge lead time(含 release 等待) +- `sum(stages[].p50_hours)` 与 `total.p50_hours` 的偏差 < 5%(详细设计 §4 一致性约束) +- 切换 `include_bots=true` 后 Review stage 数字下降(bot auto-merge 拉低) +- worst_issues 按每 stage top-1 凑齐 3 个,bot PR 携带 `is_bot_majority: true` 标识 +- Bot PR 在 UI hover preview 上带 🤖 徽章 +- `trend.points` 数组长度严格等于 `window_months`,按时间升序 +- `mom_pct` / `qoq_pct` 在数据不足时返回 `null`(不是 0);UI chip 显示 `n/a` + +### Phase 3 · 扩展到全部 67 活跃 repo + +**目的**:从 Tekton 14 个 repo 验证形态,扩到 AlaudaDevops org 全部 67 个活跃 repo(每个 plugin 一个独立视图)。 + +**改动**: +1. `backend/internal/github/sync.go`:repos 配置从手动 list 改成 `OWNER/*` 通配符 + 排除规则(archived / `_pac-quota-pad-*` / v3 plugins) +2. Backend 跑 first-run backfill,拉所有 active repo PR 数据 +3. 仪表盘 Component selector 显示全部 plugin +4. 默认选中 "tektoncd-operator"(高频 plugin),保留"留空 = 全部合并"模式 + +**验收**: +- 切换 Component 能看到 connectors-operator / harbor-ce-operator / gitlab-ce-operator / sonarqube-ce-operator / nexus-ce-operator 各自的 DORA 数字 +- 留空时显示组织整体水位 + +## Lead Time 3 段计算 · 详细设计 + +> Phase 1 / Phase 2 / Dashboard layout / Risks R7-R8 都引用本节。落到 `backend/internal/metrics/calculators/lead_time.go`。 + +### 1 · 输入数据 + +| 来源 | 字段 | 备注 | +|---|---|---| +| `pull_requests` 表 | `id, jira_key, author_id, head_branch, additions, deletions, changed_files, first_commit_at, created_at, first_review_at, first_human_review_at, merged_at, closed_at, fetched_at` | **Probe B 已验证**:`jira_key` 来源 W5/0006 migration,TEXT 单值;`first_commit_at` 为 Phase 2 新增字段(0009 migration),PR commits API 填充;其他字段全部已有 | +| `issue_snapshots` 表 | `run_id, issue_key, issue_type, status, versions (JSON array of fix_version names), created_at, resolved_at` | **Probe B 已验证**:fix_version names 以 JSON array 存 `versions` 列;按 `(MAX(run_id), issue_key)` 拿最新快照 | +| `data.Releases` (in-memory) | `models.Version{ID, Name, ReleaseDate, UserReleaseDate, Released, Archieved}` | **Probe B 已验证**:Jira version 不持久化 SQL,collector cache 在内存,calculator 通过 `CalculationContext` 读取(参考 `release_frequency.go:50-65`)。注意源码 typo:字段名 `Archieved` | + +**单位**:所有时间戳 UTC;duration 用小时(`hours`)存储和返回。 + +**术语澄清**:本节后续用 **"issue"** 作计算单位(Probe A 实证:Epic 仅占 2.2%,主流为 Story 41% + Bug 30% + Technical Debt 20%)。UI 文案 / worst_issues 显示 issue 的实际 issuetype(Story / Task / Bug / Technical Debt)。 + +### 2 · Issue 级 3 段公式 + +对每个**目标 issue** s(定义:fix_version 在统计窗口内 released = true 且 fix_version.name 匹配 component filter),计算 4 个时间锚点: + +``` +T0 = min(p.first_commit_at) for p in s.linked_prs // 第一次写代码 +T1 = min(p.created_at) for p in s.linked_prs // 第一次开 PR (ready-for-review) +T2 = max(p.merged_at) for p in s.linked_prs // 最后一个 PR merge +T3 = s.fix_version.releaseDate (snapped to end-of-day) // 上架时刻 +``` + +3 段定义(**保证 dev + review + release = total,无 gap**): + +``` +dev_s = T1 − T0 // 编码到首次 review 提交(含独立写代码时间) +review_s = T2 − T1 // 首次 review 到全部 merge(含多 PR 串联 / 反复修改 / review 等待) +release_s = T3 − T2 // 全部 merge 到上架(含发版纪律 / 集成测试等待) +total_s = T3 − T0 // = dev_s + review_s + release_s +``` + +**关键设计取舍**:Review 段是"剩余时间",吸收所有 PR 间 gap、多 PR 串联、reopen、rebase 等情形,使得 3 段在 issue 维度严格可加。代价是 Review 段语义比 brainstorm 4 段方案宽——包含"中间编码"和"等待复评"。在 footnote 用一句话向团队解释。 + +### 3 · linked_prs 解析规则 + +- `pull_requests.jira_key` 为来源(待 Probe B 验证字段细节) +- 单 PR 引用多 jira_key → 该 PR 计入每个 issue 的 linked_prs 各一次(重复计入符合"该 issue 的贡献 PR"语义;worst_N 展示侧按 jira_key 去重避免视觉重复,见 P3-4) +- `include_bots=false` 模式下:在求 T0/T1/T2 之前**先过滤掉** `is_bot=true` 的 PR;过滤后无 PR 的 issue 走 Case 3 fallback +- PR 的 `merged_at > T3`(先 release 后 hotfix)→ 该 PR **排除**在当前 issue 的 T0/T1/T2 计算外(hotfix 自然归到下一个 release 的 issue) + +### 4 · Fallback 矩阵 + +| Case | 触发 | 处理 | +|---|---|---| +| C1 数据齐全 | issue 有 PR · 所有 PR 有 first_commit_at · fix_version 有 releaseDate | 正常 3 段 | +| C2 部分 PR 缺 first_commit_at | 至少一个 linked PR 有 first_commit_at(其他 NULL) | 用 available PR 的 min(first_commit_at) 作 T0;该 issue 计入 3 段统计但 `coverage.partial += 1` | +| C3 issue 全部 PR 都缺 first_commit_at | 所有 linked PR 的 first_commit_at NULL | issue 排除 Dev 段;Review 段起点改为 T1 = min(created_at),Release 段不变;total = T3 − T1;`coverage.dev_missing += 1` | +| C4 issue 无 linked PR | jira_key 关联失败或无 PR | **完全排除**(KPI + stages 都不计,避免引入非 commit-centric 起点违反 D14);`coverage.no_prs += 1` 仅用于 transparency footnote 暴露 | +| C5 issue 无 fix_version / fix_version 未 released | fix_version.released=false 或 releaseDate=NULL | issue 完全**排除**(KPI + stages 都不计);`coverage.unreleased += 1` | +| C6 任一 stage duration < 0 | 时区错乱 / squash 重写 / draft PR 占位等导致 T0 > T1 或 T1 > T2 或 T2 > T3 | issue 完全**排除**;`coverage.data_anomaly += 1`,API 在 `consistency_warning` 提示 | + +**一致性约束**:calculator 强制校验 `sum(stages.p50) / total.p50 ∈ [0.95, 1.05]`,否则 API 返回 `consistency_warning`(见 R8)。 + +### 5 · 聚合到团队 + +对所有符合 C1/C2/C3 的 issue,计算各 stage duration 的分布统计: + +``` +For each stage s in [dev, review, release]: + s.p25, s.p50, s.p75, s.p90 = percentile(issue_s for issue in qualified_issues) + s.sample_count = len(issue_s) +``` + +⚠️ **中位数不可加性**:`p50(dev) + p50(review) + p50(release) ≠ p50(total)` —— 团队读图时必须知道这一点。 + +**UI 处理**: +- Stage bar 长度按 **stage p50 的相对比例**渲染(归一化为 100%),不按绝对值 +- bar 旁标注每段 p50 绝对小时数(如 `Dev 18h · Review 96h · Release 168h`) +- 顶部 KPI 数字仍是 `total.p50` +- Footnote 一句话:「Stage 数字为各段中位数;因中位数不可加,stage 之和 ≠ 顶部 Lead Time 数字」 + +### 6 · 瓶颈识别(D15 落地) + +对 stage s 标 `bottleneck=true` 当且仅当: + +``` +s.p50 / sum(stages.p50) > 0.40 // 占主导比例 + AND +s.p50 ≥ team_baseline.p75 for stage s // 该段绝对值偏高 +``` + +`team_baseline.p75` 来源:过去 12 个月所有 issue 的 stage p75(rolling baseline,存于内存或单独表,初次启动用 9 个月可用数据;后续按需引入 `metric_period_values` 表持久化,见 brainstorm §5.1,本 plan 内**不实施**)。 + +UI 高亮:bar 红色边框 + ⚠ 图标 + tooltip 文案 `45% of total · >= team baseline P75 (210h)`(英文,D8/D9)。 + +### 7 · worst_issues 排序 + +**每个 stage 各 top-1,凑齐 3 个**(避免 release 段绝对值大压倒 review/dev 异常的 visibility 问题): + +``` +For each stage in [dev, review, release]: + candidates = qualified_issues where argmax(stage.duration_in_issue / total_in_issue) == stage + worst_per_stage[stage] = candidates sorted by stage.duration DESC, take 1 + +worst_issues = [worst_per_stage["dev"], worst_per_stage["review"], worst_per_stage["release"]] + filter out None // stage 无 candidate 时省略 +``` + +**取舍说明**:原 plan 写"按 bottleneck_duration_hours 全局 DESC top 3" 会导致 worst_3 全是 release 段瓶颈(release 段绝对值通常远大于其他段)。每 stage top-1 让团队同时看到 3 段各自的最差案例,可指认改进点更平衡。 + +每条返回 `{ jira_key, issuetype, summary, bottleneck_stage, bottleneck_duration_hours, total_lead_time_hours, pr_count, is_bot_majority }`。 + +`is_bot_majority`:issue 关联 PR 中 bot PR 占比 > 50%(用于 UI 显示 🤖 徽章)。 + +### 8 · API 返回结构 + +`MetricResult.Value` 字段为 **days**(向后兼容现有 `MetricCard` / `MetricBreakdown` / Prometheus exporter `lead_time_days` 的 day 阈值消费)。Metadata 含两层: +- **新的 hour-precision payload**:`total` / `stages` / `worst_issues` / `coverage` / `trend` 等(Phase 1 frontend 用) +- **legacy days 兼容字段**:`min` / `max` / `average` / `count` / `sample_size` / `percentile`,供 `MetricBreakdown.jsx` 直接读旧路径 + +```json +GET /api/metrics/lead_time?component=tektoncd-operator&window_months=9&include_bots=false + +{ + "metric": "lead_time", + "value": 18, // ← p50 in days, backward-compat with MetricCard + "unit": "days", + "window": { "start": "2025-09-01", "end": "2026-05-21" }, + "filters": { "component": "tektoncd-operator", "include_bots": false }, + "total": { + "p25_hours": 56, "p50_hours": 432, "p75_hours": 1320, "p90_hours": 4080, + "sample_count": 47 + }, + "stages": [ + { + "name": "dev", "label": "Dev", + "definition": "first_commit_at → first_pr_opened", + "p50_hours": 18, "p75_hours": 72, "sample_count": 45, + "bottleneck": false + }, + { + "name": "review", "label": "Review", + "definition": "first_pr_opened → last_pr_merged", + "p50_hours": 96, "p75_hours": 240, "sample_count": 45, + "bottleneck": true, + "bottleneck_reason": "45% of total Lead Time · >= team baseline P75 (210h)" + }, + { + "name": "release", "label": "Release", + "definition": "last_pr_merged → fix_version.release_date", + "p50_hours": 168, "p75_hours": 480, "sample_count": 47, + "bottleneck": false + } + ], + "worst_issues": [ + { + "jira_key": "DEVOPS-12345", + "issuetype": "Story", + "summary": "Tekton operator: support tier overrides", + "bottleneck_stage": "review", + "bottleneck_duration_hours": 720, + "total_lead_time_hours": 1100, + "pr_count": 5, + "is_bot_majority": false + } + // ...one entry per stage (dev / review / release), 3 total + ], + "coverage": { + "issues_total": 50, + "issues_full": 45, // C1 + "issues_partial": 3, // C2 + "issues_dev_missing": 1, // C3 + "issues_no_prs": 1, // C4 + "issues_unreleased": 0, // C5 (excluded, count only) + "issues_data_anomaly": 0,// C6 (excluded, count only) + "coverage_pct": 96 // (full + partial + dev_missing) / (total - unreleased - data_anomaly) + }, + "consistency_warning": null // string when sum(stages.p50) vs total.p50 偏差 > 5% +} +``` + +### 9 · Edge cases 清单(实施时逐条验证) + +| # | 场景 | 处理 | +|---|---|---| +| E1 | 1 个 issue 关联多 fix_version | **按 component filter 范围筛选**:先 filter `fix_version.name` 与 component prefix 匹配的子集,再取最早 released=true 的 releaseDate 作 T3;如全集都 released=true(极少),取最早 | +| E2 | 1 个 PR 引用多 issue | 该 PR 计入每个 issue 的 linked_prs 各一次(重复但正确);**worst_issues 展示侧按 jira_key 去重**——同一 PR 让多个 issue 同时进 top-N 时仅显示第一个并标 `cross_referenced: true` | +| E3 | PR.merged_at 晚于 release day(calendar day 比较)| 该 PR 不算入当前 issue(视为 hotfix,归下一 release)。**Jira release date 是 calendar date,calculator 在 release 当日 EOD 之前 merge 的 PR 不算 hotfix**——避免丢失 release day 当天的 last PR | +| E4 | PR.first_commit_at < issue.created(提前编码)| 仍以 PR.first_commit_at 为 T0(commit-centric 严守 D3)| +| E5 | issue.fix_version 名称不匹配 `{component}-v{semver}`(如 `argo-cd-2.9.0` 无 `v` 前缀) | calculator 直接复用 `EnrichedRelease.Component`(collector 已 parsed,跟 release_frequency 同源),不重新解析;component filter 仍可命中 | +| E6 | 窗口边界:first_commit 在窗口外但 release 在窗口内 | 计入(窗口判定按 release 日期,对齐 DORA 惯例)| +| E7 | issue 跨 release(早 fix_version → 又重新 fix_version)| 用最新 released=true 的 fix_version;如有多个,记录 `multiple_releases=true` | +| E8 | 任一 stage duration < 0(dev/review/release 中任一为负,常见于时区错乱 / squash 重写 / draft PR 占位早于 first commit)| 该 issue 完全排除,记入 `coverage.data_anomaly`(与详细设计 §4 C6 一致)| +| E9 | 超长 dev 段:T0 早于 window.start 超过 180d(first_commit 在 2024 写,2026 才 release)| issue 完全排除该窗口的 Lead Time 统计 + 记入 `coverage.excluded_long_dev`;transparency footnote 暴露占比。避免长尾拉飞 p50 | + +### 10 · 数据维护与刷新 + +- calculator 是**纯函数**,不写表;按需调用、按需聚合 +- 数据源刷新走现有 collector 链:Jira sync(按 fixVersion / changelog) + GitHub PR sync(含新增 commits API) +- **PR fetch lookback**:collector 拉 PR 时用 `since = now - (HistoricalDays + 180d)`。Lead Time 按 release date 判定 window,但关联 PR 可能在 release window 起点前 merge;buffer 跟 E9 long-dev 阈值(180d)对齐,超出 180d 的 issue 已经被 E9 排除,所以更早的 PR 不需要 fetch +- **无 PR store fallback**:当 `storage.enabled = false`(最小部署)时 `PRStoreAvailable` 为 false,calculator 退化到 Jira-only calendar lead time(issue.created → release.releaseDate,days),输出 Value + legacy `min/max/count` + `degraded: "no_pr_store"` 标记。这破坏 D3 commit-centric 起点,但只在 PR 数据完全不可达时生效,避免老 deployment 升级后 Lead Time 整体消失。UI 可读 `metadata.degraded` 显示配置警告 +- 性能预算:9 月 ~50 issue × 平均 3 PR ≈ 150 PR 维度查询,单次 calculator 调用 < 200ms(含 percentile 计算) +- 如未来 issue 数量级 > 1000,再考虑预聚合(brainstorm §5.1 的 `metric_period_values` 表可启用,本 plan 内不实施) + +**依赖**: +- percentile 计算:Go 标准库无;用 `gonum.org/v1/gonum/stat`(如项目已 vendor 则复用,否则 Phase 2 一并加入 go.mod)。fallback:自实现 `quickselect` 算法(< 30 行,无外部依赖) + +**缓存策略**: +- 同一 dashboard 加载触发 5 个 metric 并发请求 → 5 × 200ms = 1s+ 用户感知 +- calculator 结果按 `(metric, component, window_months, include_bots)` 作 cache key,TTL = **1 小时**(与现有 Jira sync 间隔对齐) +- 缓存 invalidate 触发器:collector 完成一次 sync 后,pub `metrics.cache_invalidate` 事件清空整个 namespace +- 本地开发模式可用 `?no_cache=1` 绕过 + +**Future enhancement**(不在本 plan 内): +- **Review 段细粒度拆分**:当前 Review 段含 "PR open → first review comment / first approve / last merge" 整体;团队反馈"看不清是 review 等待慢还是 merge 等待慢"时,可拆为 `review_wait → approve_wait → merge_wait` 3 子段。**前置数据**:需采集 PR review event(comments / approvals)—— W2/W8 当前不含,需新增 GitHub events sync +- **Backlog 段重新引入**:如 Probe A 确认 Story 普遍走 backlog→in_progress→resolution 状态流,且团队明确要求看 backlog 决策延迟,再开独立 plan 实施 brainstorm §4 的 4 段方案(含 Backlog) +- **Stage 分趋势 / 详情页**:当前 trend 只画 total p50;如团队要看"Review 段在哪个月恶化"则需 3 段叠加趋势 / 独立 detail page,留给 future drill-down plan + +### 11 · 趋势计算(D17 落地) + +> 卡片内 9-month sparkline + MoM / QoQ chip 的算法。沿用 `lead_time.go` calculator,不开新文件。 + +**Month bucketing 规则**: + +``` +For each month m in [window.start, window.start + 1mo, ..., window.end]: + qualified_issues_m = filter(qualified_issues, + fix_version.releaseDate falls in [m.start, m.end]) + month_p50_m = p50(total_s for s in qualified_issues_m) + month_sample_m = len(qualified_issues_m) +``` + +- 月份判定**按 `fix_version.releaseDate`**(与 release_frequency 一致),不按 issue.created +- `sample < 3` 的月份 `month_p50 = null`(点上标空心圆,sparkline 在该月画虚线) +- 月份桶不受 component / include_bots filter 影响逻辑,但 filter 参数会作用在 qualified_issues 上 + +**MoM / QoQ 公式**: + +``` +last_month = month_p50[-1] // 当月 (e.g., 2026-05) +prev_month = month_p50[-2] // 上月 (e.g., 2026-04) +mom_pct = (last_month - prev_month) / prev_month * 100 + +curr_q_avg = mean(month_p50[-3:]) // 最近 3 月 +prev_q_avg = mean(month_p50[-6:-3]) // 再往前 3 月 +qoq_pct = (curr_q_avg - prev_q_avg) / prev_q_avg * 100 +``` + +- 任一参与的月份 `month_p50 = null` 时,对应 MoM / QoQ chip 显示 `n/a` 而不是 0 +- Lead Time 是越短越好的指标:**正数 (▲) = 恶化(红色 chip)**;**负数 (▼) = 改善(绿色 chip)** + +**API 返回结构扩展**(详细设计 §8 基础上): + +```jsonc +{ + "metric": "lead_time", + // ...existing fields (window, filters, total, stages, worst_issues, coverage) + "trend": { + "granularity": "month", + "points": [ + { "period": "2025-09", "p50_hours": 576, "sample_count": 5 }, + { "period": "2025-10", "p50_hours": 504, "sample_count": 6 }, + { "period": "2025-11", "p50_hours": null,"sample_count": 1 }, // < 3 samples + // ... 9 月共 9 个 entries + { "period": "2026-05", "p50_hours": 432, "sample_count": 5, "is_current": true } + ], + "mom_pct": -5.3, // last month vs prev month + "qoq_pct": 12.7, // last 3 mo avg vs prev 3 mo avg + "mom_direction": "improved", // "improved" (negative for lead time) | "degraded" | "n/a" + "qoq_direction": "degraded" + } +} +``` + +**性能与缓存**: +- Month 聚合用 SQL 一次取出(`GROUP BY strftime('%Y-%m', releaseDate)`),无 N+1 +- 复用 §10 cache key + 增加 `granularity=month` 维度 → 同一 dashboard 加载一次 trend 计算服务整个会话 +- 性能预算 +30ms(9 个 percentile × 10ms 上限) + +**UI 渲染规则**(Phase 1 落地 · D17 修订后): +- **位置**:KPI grid 下方独立 panel(**不嵌 Lead Time 卡片**),新文件 `LeadTimeTrendPanel.jsx` +- **panel 宽度**:跟随 dashboard 容器(full-width),不强求 5-column 对齐 +- panel 顶部 header:左侧标题 `Lead Time Trend · 9 months · total p50`;右侧 MoM / QoQ chip +- panel 主体:inline SVG `` 9 个点连线,高度 80-100px(比卡片内更高,信息更舒展) +- null 点 → 段画虚线 +- 当月点(`is_current=true`)画放大圆点 + accent color +- panel 底部:端点日期标签 (`2025-09 ... 2026-05`);可选最小化 footnote `Lead Time 越短越好 · 负值方向 = 改善` +- panel 自身不带 hover preview(保持纯展示),worst_issues hover 仍在 Lead Time 卡片 +- panel 跟随 component / include_bots filter 联动(与 Lead Time 卡片同源 API 调用,复用 cache) + +## Open Questions + +实施阶段的 sub-task 决定(每个有 default,plan 不阻塞)。 + +| Q | Question | Default assumption | +|---|---|---| +| Q1 | Bot 识别规则:仅按 GitHub `is_bot` 还是按 login pattern? | **混合**:is_bot OR login 含 bot/renovate/dependabot/alaudaa 关键字。Sub-task of Phase 1 | +| Q2 | 单 plugin 视图 vs 合并视图的优先级 | **单 plugin 优先**(默认 tektoncd-operator),合并视图 filter 留空触发。Sub-task of Phase 3 | +| Q3 | DORA band 阈值:用 DORA 2024 报告标准 vs 内部 calibrate? | **DORA 2024 标准**作为 v1,UI 旁标 "DORA 2024 bands",团队自行解读 | +| Q4 | Lead Time stage bar 窄屏(< 768px)降级 | **垂直堆叠**(3 段横→竖);hover preview 改为 tap 展开。Sub-task of Phase 1 | + +## Files referenced + +### 配套 mock(视觉 spec) +- `docs/team-analytics/dora-optimization-mockup.html` — 静态 HTML,无依赖,打开即可 + +### Brainstorm 配套文档 +- `docs/brainstorms/2026-05-21-dora-drilldown-brainstorm.md` — drill-down 完整方案;本 plan 仅吸收 **Lead Time 3 段拆分子集**(方向 A),其余(per-epic 全表 / leading signals / notable months / 独立路由 / 4 段含 Backlog 方案)保留在 brainstorm 留待未来独立 plan + +### Backend 关键文件(Probe B 已验证存在性 + 内容) +- `backend/internal/config/config.go` — Storage / Jira / GitHub / GitLab 配置入口 +- `backend/internal/metrics/collector.go` — Jira sync 入口 +- `backend/internal/metrics/calculators/release_frequency.go` — **完全不动**(D2),但其 `CalculationContext.Releases` 消费模式被 Lead Time calculator 复用(一样从 in-memory cache 读 Jira version) +- `backend/internal/metrics/calculators/lead_time.go` — **Phase 2 重点改造对象**(起点改 + 输出 3 个 stage duration + worst_issues + coverage) +- `backend/internal/metrics/calculators/{cycle_time,patch_ratio,time_to_patch}.go` — **完全不动**(D4 + D5) +- `backend/internal/github/{client,sync}.go` — 新增 `ListPRCommits` 方法 + `pull_requests.first_commit_at` 字段填充 +- `backend/internal/gitlab/` — 同上,MR commits API +- `backend/internal/jirasync/sync.go` + `backend/internal/jira/snapshot_search.go` — issue_snapshots.versions 已经存 fix_version names(JSON array,`snapshot_search.go:278-279`);**不需要新增 changelog 提取**(方向 A 下 Backlog 段不在范围) +- `backend/internal/jira/models.go` — `ConvertJiraVersionToVersion`(line 311)已实现 Jira API → `models.Version{ReleaseDate, Released, Archieved}` 转换;calculator 读 `data.Releases` 即可 +- `backend/internal/storage/migrations/0001_init.sql` — `pull_requests` + `issue_snapshots` 基础 schema +- `backend/internal/storage/migrations/0006_rename_epic_key_to_jira_key.sql` — **W5**,`jira_key` 字段来源(不是 W2) +- `backend/internal/storage/migrations/0009_pr_first_commit_at.sql` — **Phase 2 新建**,唯一 schema 变更 +- `backend/testdata/` — 持久化 `/tmp/dora/*.json{,l}` 到这里 + +### Frontend 关键文件(Lead Time 卡片 + 独立 Trend Panel 范围) +- `frontend/src/components/MetricsDashboard.jsx` — **仅 1 行 JSX 改动**:KPI grid 下方 append ``;顶层 chrome、grid、其他卡片渲染路径完全不动 +- `frontend/src/components/metrics/MetricCard.jsx` — 按 `metric.name === "lead_time"` 分支渲染,新增 stage bar / 瓶颈高亮 / footer / hover worst_issues popup(**不含** trend section);其他 4 卡(Deploy Freq / Change Failure / MTTR / Cycle Time)渲染路径**完全不动** +- `frontend/src/components/metrics/LeadTimeTrendPanel.jsx` — **新建**:full-width panel 渲染 sparkline + MoM/QoQ chip;消费 `metric.trend` API 子结构;纯展示(无 hover preview / drill-down) +- `frontend/src/components/MetricsDashboard.css` — 新增 Lead Time 卡片 CSS(stage bar、瓶颈红框、popup)+ TrendPanel CSS(sparkline、chip);不改现有规则 +- ❌ ~~`frontend/src/components/MetricsTrend.jsx`~~ — **不新建**(泛化版本不做,仅做专用 LeadTimeTrendPanel) +- ❌ ~~`frontend/src/components/MetricsNarrative.jsx`~~ — **不新建**(范围收紧后取消) + +## Risks + +| R | Risk | Mitigation | +|---|---|---| +| R1 | Jira version 命名不规范(漏填 `releaseDate` 或不用 `{component}-v{semver}` 格式) | Phase 0 启动时加纪律监控指标:检测 `released=true` 但 `releaseDate=null` 的版本,仪表盘上暴露给团队 | +| R2 | PR ↔ Jira issue 关联率低(**W5** 的 `jira_key` 字段没覆盖到所有 PR) | Phase 2 启动前先跑 `docs/plans/2026-05-20-feat-pr-jira-link-audit-plan.md` 的审计,确认覆盖率 ≥ 70% | +| R3 | squash merge 后 author date 重写,影响 Lead Time 准确性 | Phase 2 改用 GitHub PR API 的 `/pulls/{n}/commits` 拿原始 first commit time(不依赖 git log) | +| R4 | Bot 识别遗漏(如 `alauda-github-idpbot` 不以 `bot$` 结尾) | Phase 1 实现时维护一份 known-bots config,定期 review。Sub-task of Q2. | +| R5 | UI 默认 human only 让"包含 bot 工作量"的工程视角隐形 | 切换 chip 设计为显眼的 toggle,保留 1-click 切换到 include bots | +| R6 | 67 repo 全拉对 GitHub API rate limit 压力大 | 使用 GitHub App 而不是 PAT(rate limit 高一倍),按 repo 串行 + 失败重试 | +| R7 | first_commit_at 抽取失败率高(PR API 404 / squash 后无原始 commit / 私有 fork 不可达),导致 Dev 段统计样本不足 | Phase 2 改动 6 已定义 fallback 规则:单 PR 缺失 → epic 排除 Dev 段(其他段仍计);epic 完全无 PR → 该 epic 仅计 KPI 总数不进 stage bar。API 返回 `coverage` 字段,UI footnote 暴露 `linked-PR coverage X%`;< 60% 时仪表盘顶部加 banner 提示 stage 数据可信度低 | +| R8 | Lead Time stage 数字与顶部 KPI 总数不一致(fallback epic 排除带来误差) | calculator 内做一致性校验:stages 之和 / total 偏差 > 5% 时 API 返回 `consistency_warning`,UI 展示 ⓘ 图标可点开看哪些 epic 被排除及原因 | + +## Handoff checklist + +接手者按这个清单逐项确认: + +- [x] Brainstorm 决策清单冻结(见 §Decisions · D1-D16) +- [x] 方向 A 决议(Lead Time 拆 3 段 · 不含 Backlog · 严守 D3) +- [x] Cycle Time / Patch Ratio / Time to Patch 三者**完全不动**(D4 + D5) +- [x] 真实数据已采集并验证(19 Jira releases + 2,457 GitHub PRs) +- [x] Mock 文件英文化 + react-select 形态最终态 (`docs/team-analytics/dora-optimization-mockup.html`) +- [x] Open Questions 都有 default assumption(可直接进 Phase 0-3) +- [x] **Probe A · Jira 探针**(2026-05-21 已跑,见 §Probe results): + ```bash + # 验证 DEVOPS project 里哪些 issuetype 真正挂 fix_version + curl -sk -u "$JIRA_USER:$JIRA_PASS" \ + "$JIRA_URL/rest/api/2/search?jql=project=DEVOPS+AND+fixVersion+is+not+EMPTY&fields=issuetype&maxResults=200" | \ + jq -r '.issues[].fields.issuetype.name' | sort | uniq -c | sort -rn + ``` + - 预期产出:issuetype 分布(如 `Story 180 / Task 45 / Bug 30 / Epic 5`),决定计算单位 + - 如果 Epic 占比 < 10% → 确认按 "issue" 通用计算(当前详细设计假设) + - 如果 Epic 占主导 → 维持 "epic" 语义,无需改设计 + - 如果 fixVersion 在多 issuetype 都分布 → 维持通用 "issue" 计算 + UI 按 issuetype 分组展示 +- [x] **Probe B · jira_key 字段探针**(2026-05-21 已跑,见 §Probe results): + ```bash + # 在 roadmap-planner 项目里 grep schema 定义 + grep -rn "jira_key\|JiraKey" backend/internal/db/ backend/internal/ | grep -i "schema\|migration\|struct" + ``` + - 验证 `pull_requests.jira_key` 字段实际存在 + schema 形态(TEXT 单值 / TEXT[] / 关联表) + - 如不存在 → Phase 2 前置依赖增加"先实现 jira_key 关联"(参考 W2 / `2026-05-20-feat-pr-jira-link-audit-plan.md`) +- [x] **mockup 视觉补充**:Lead Time 卡片 3 段水平 stage bar + 瓶颈高亮(Review 段 ⚠)+ hover worst_issues(每 stage top-1)已落地到 `docs/team-analytics/dora-optimization-mockup.html`(2026-05-21) +- [ ] **数据从 `/tmp/dora/` 持久化到 `backend/testdata/`**(Phase 0 第 1 步) +- [ ] Phase 0 启动:backend 启动 Jira + PR sync,让 Lead Time 数据流通(`first_commit_at` 字段在一轮 sync 后填充到位) +- [ ] Phase 1 启动:UI 改造(英文 + 卡片 + 趋势图 + Lead Time stage bar) +- [x] **Phase 2 完成**(2026-05-21,feature 分支 `feat/dora-lead-time-phase2`): + - migration 0009 · `pull_requests.first_commit_at` + - PR/MR commits API client + sync 集成(COALESCE-保护 UPSERT) + - `lead_time.go` 算法重写:3 段 + fallback C1-C6 + worst_issues 每 stage top-1 + coverage + trend (MoM/QoQ) + consistency_warning + - `/api/metrics/lead_time_to_release` 加 `include_bots` / `with_trend` query params + - 11 个 unit test 全部 PASS +- [ ] Phase 3 启动:扩展到 67 repo diff --git a/roadmap-planner/docs/team-analytics/dora-optimization-mockup.html b/roadmap-planner/docs/team-analytics/dora-optimization-mockup.html new file mode 100644 index 00000000..ca6dac12 --- /dev/null +++ b/roadmap-planner/docs/team-analytics/dora-optimization-mockup.html @@ -0,0 +1,538 @@ + + + + + +DORA Optimization — Lead Time Card + Trend Panel (Spec) + + + + + + + +
+ Scope · this plan + NEW · Lead Time card (lean) + Lead Time Trend Panel (red borders) + UNCHANGED · existing project rendering (grey hatched) +
+ + + + +
+ Existing dashboard chrome + masthead · tabs (Roadmap | Metrics | Team) · filter bar (react-select) + pixel-diff vs main branch baseline = 0 +
+ + + + + +
+ + +
+ Deployment Frequency + project default + unchanged +
+ + +
+ NEW · lean (no trend) + +
Lead Time for Changes
+
18 d
+
first commit → released · p50
+ Medium + +
+ +
+
18h
+
96h
+
168h
+
+
+
+ Dev + 18 h +
+
+ Review ⚠ + 96 h +
+
+ Release + 168 h +
+
+ +
hover → worst issues per stage
+ + + + +
+
Worst issues · one per stage
+ +
+ Dev +
+
CI pipeline tier-override migration prep + Story +
+
DEVOPS-14821 · 4 PRs
+
+ 142 h +
+ +
+ Review +
+
Tekton operator: support tier overrides + Story +
+
DEVOPS-12345 · 5 PRs
+
+ 720 h +
+ +
+ Release +
+
Renovate: bump tektoncd/pipeline v0.62 → v0.66 + Technical Debt + 🤖 +
+
DEVOPS-15007 · 1 PR
+
+ 912 h +
+
+
+ + +
+ Change Failure Rate
(Patch Ratio)
+ project default + unchanged +
+ + +
+ Mean Time to Recovery
(Time to Patch)
+ project default + unchanged +
+ + +
+ Cycle Time + project default + unchanged +
+ +
+ + + + +
+ NEW · Lead Time Trend Panel + +
+
+
Lead Time Trend
+
9 months · total p50 · first commit → released
+
+
+ + + 5% + MoM + + + + 13% + QoQ + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + 24d + 26d (peak) + 16d (low) + 18d + + + 2025-09 + 10 + 11 + 12 + 01 + 02 + 03 + 04 + 2026-05 ● + + +
+ Lead Time 越短越好 · 负方向 (▼ MoM) = 改善 · 正方向 (▲ QoQ) = 恶化 + · sample < 3 月份的 p50 = null(点上画空心 · 段线画虚线) +
+
+ +
spec · card + trend panel · 2026-05-21
+ + +