diff --git a/packages/sca/license.py b/packages/sca/license.py index 244c4ac8..441c310b 100644 --- a/packages/sca/license.py +++ b/packages/sca/license.py @@ -216,6 +216,8 @@ def _as_action(v: Any, *, default: str) -> str: def evaluate( deps: List[Dependency], policy: LicensePolicy, + *, + offline: bool = False, ) -> List[LicenseFinding]: """Classify each dep's declared_license against the policy. @@ -226,6 +228,18 @@ def evaluate( Ecosystems lacking package-level SPDX metadata (GitHub Actions, Debian, OCI, Inline) are skipped entirely — see :data:`_SPDX_SUPPORTED_ECOSYSTEMS` for the allowlist. + + ``offline``: when ``True``, deps whose ``declared_license`` is + ``None`` produce *no* finding (not even ``license_unknown``). + Rationale: in offline mode :func:`enrich_licenses` is skipped, + so every dep that would have been enriched via registry metadata + retains ``None``. Emitting 8 k ``license_unknown`` findings for + a 10k-dep monorepo where the operator explicitly chose offline + mode is misleading — the license may be perfectly known via + registry; we simply didn't ask. A warm online run surfaces the + real unknowns. Consequently, ``--offline`` scans with a cold + cache produce *no* license findings (the safe, low-noise choice) + rather than flooding the report with unactionable noise. """ seen: Set[str] = set() out: List[LicenseFinding] = [] @@ -236,7 +250,7 @@ def evaluate( if key in seen: continue seen.add(key) - finding = _evaluate_one(d, policy) + finding = _evaluate_one(d, policy, offline=offline) if finding is not None: out.append(finding) return out @@ -245,9 +259,15 @@ def evaluate( def _evaluate_one( dep: Dependency, policy: LicensePolicy, + *, + offline: bool = False, ) -> Optional[LicenseFinding]: spdx = dep.declared_license if spdx is None or not spdx.strip(): + if offline: + # Enrichment didn't run; skip rather than emit misleading + # license_unknown noise. See evaluate() docstring. + return None return _unknown_finding(dep, policy) spdx = spdx.strip() diff --git a/packages/sca/pipeline.py b/packages/sca/pipeline.py index 731515e4..f8a5c9b9 100644 --- a/packages/sca/pipeline.py +++ b/packages/sca/pipeline.py @@ -594,7 +594,7 @@ def run_sca( "evaluation will rely on existing declared_license", exc_info=True, ) - license_findings = evaluate_license(joined, policy) + license_findings = evaluate_license(joined, policy, offline=options.offline) progress.done(f"{len(license_findings)} findings") # 3. Canonical dep set: lockfile-preferred, deduped per (eco, name, ver). diff --git a/packages/sca/tests/test_perf_baseline.py b/packages/sca/tests/test_perf_baseline.py index c43020fa..bcac05c6 100644 --- a/packages/sca/tests/test_perf_baseline.py +++ b/packages/sca/tests/test_perf_baseline.py @@ -41,8 +41,15 @@ # Regression threshold. Adjust upward only when a substantive + # documented perf trade was made (e.g. enabling a new checker). -# Catch egregious regressions (10x), not normal variance. -RUNTIME_BUDGET_S = 120.0 # 2 minutes — generous for CI +# Catch egregious regressions, not normal variance. +# +# Baseline conditions: --offline with a cache rooted in tmp_path +# (local storage, never NFS). The cache is always cold because each +# test run gets a fresh tmp_path, so 22k+ stat() calls happen every +# run. Routing to local storage keeps those calls at ~0.01ms each +# (≈220ms total) rather than the 5-50ms NFS RTT that caused the +# 199s regression when the default ~/.mantishack/cache/sca was used. +RUNTIME_BUDGET_S = 120.0 # 2 minutes — generous for local-cache cold run RSS_BUDGET_MB = 1024 # 1 GiB peak — generous # Synthetic-fixture targets. Round numbers that make sample-rate @@ -180,9 +187,18 @@ def _peak_rss_mb() -> float: def _run_scan(target: Path, out: Path) -> Tuple[float, float, str]: """Run the scan, returning (wallclock_s, peak_child_rss_mb, stderr).""" + # Route the disk cache to a subdirectory of ``out`` (which lives + # inside pytest's ``tmp_path`` → fast local storage on every CI + # provider). Without this, the cache defaults to + # ``~/.mantishack/cache/sca`` which may sit on a network-backed + # home directory. The 22,022 stat() calls a 10k-dep cold-cache + # scan makes cost ≈0.01 ms each on local tmpfs vs ≈5-50 ms on + # NFS — the difference alone accounted for the 199 s regression + # (22 k × 10 ms = 220 s) that tripped the 120 s gate. cmd = [ sys.executable, "-m", "packages.sca.cli", str(target), "--offline", "--out", str(out), + "--cache-root", str(out / ".sca-cache"), ] start = time.perf_counter() proc = subprocess.run(