Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion packages/sca/license.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ def _as_action(v: Any, *, default: str) -> str:
def evaluate(
deps: List[Dependency],
policy: LicensePolicy,
*,
offline: bool = False,
) -> List[LicenseFinding]:
"""Classify each dep's declared_license against the policy.

Expand All @@ -226,6 +228,18 @@ def evaluate(
Ecosystems lacking package-level SPDX metadata (GitHub Actions,
Debian, OCI, Inline) are skipped entirely — see
:data:`_SPDX_SUPPORTED_ECOSYSTEMS` for the allowlist.

``offline``: when ``True``, deps whose ``declared_license`` is
``None`` or blank produce *no* finding (not even ``license_unknown``).
Rationale: in offline mode :func:`enrich_licenses` is skipped,
so every dep that would have been enriched via registry metadata
retains ``None``. Emitting 8k ``license_unknown`` findings for
a 10k-dep monorepo where the operator explicitly chose offline
mode is misleading — the license may be perfectly well known to
the registry; we simply didn't ask. A warm online run surfaces the
real unknowns. Consequently, ``--offline`` scans with a cold
cache produce *no* ``license_unknown`` findings (the safe,
low-noise choice) rather than flooding the report with
unactionable noise; deps that already carry a declared license
are still classified against the policy.
"""
seen: Set[str] = set()
out: List[LicenseFinding] = []
Expand All @@ -236,7 +250,7 @@ def evaluate(
if key in seen:
continue
seen.add(key)
finding = _evaluate_one(d, policy)
finding = _evaluate_one(d, policy, offline=offline)
if finding is not None:
out.append(finding)
return out
Expand All @@ -245,9 +259,15 @@ def evaluate(
def _evaluate_one(
dep: Dependency,
policy: LicensePolicy,
*,
offline: bool = False,
) -> Optional[LicenseFinding]:
spdx = dep.declared_license
if spdx is None or not spdx.strip():
if offline:
# Enrichment didn't run; skip rather than emit misleading
# license_unknown noise. See evaluate() docstring.
return None
return _unknown_finding(dep, policy)

spdx = spdx.strip()
Expand Down
2 changes: 1 addition & 1 deletion packages/sca/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ def run_sca(
"evaluation will rely on existing declared_license",
exc_info=True,
)
license_findings = evaluate_license(joined, policy)
license_findings = evaluate_license(joined, policy, offline=options.offline)
progress.done(f"{len(license_findings)} findings")

# 3. Canonical dep set: lockfile-preferred, deduped per (eco, name, ver).
Expand Down
20 changes: 18 additions & 2 deletions packages/sca/tests/test_perf_baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,15 @@

# Regression threshold. Adjust upward only when a substantive +
# documented perf trade was made (e.g. enabling a new checker).
# Catch egregious regressions (10x), not normal variance.
RUNTIME_BUDGET_S = 120.0 # 2 minutes — generous for CI
# Catch egregious regressions, not normal variance.
#
# Baseline conditions: --offline with a cache rooted in tmp_path
# (local storage, never NFS). The cache is always cold because each
# test run gets a fresh tmp_path, so the scan's 22k+ stat() calls
# happen on every run. Routing the cache to local storage keeps those
# calls at ~0.01 ms each (≈220 ms total) rather than the 5-50 ms NFS
# round trip that caused the 199 s regression when the default
# ~/.mantishack/cache/sca was used.
RUNTIME_BUDGET_S = 120.0 # 2 minutes — generous for local-cache cold run
RSS_BUDGET_MB = 1024 # 1 GiB peak — generous

# Synthetic-fixture targets. Round numbers that make sample-rate
Expand Down Expand Up @@ -180,9 +187,18 @@ def _peak_rss_mb() -> float:
def _run_scan(target: Path, out: Path) -> Tuple[float, float, str]:
"""Run the scan, returning (wallclock_s, peak_child_rss_mb,
stderr)."""
# Route the disk cache to a subdirectory of ``out`` (which lives
# inside pytest's ``tmp_path`` → fast local storage on every CI
# provider). Without this, the cache defaults to
# ``~/.mantishack/cache/sca`` which may sit on a network-backed
# home directory. The 22,022 stat() calls a 10k-dep cold-cache
# scan makes cost ≈0.01 ms each on local storage vs ≈5-50 ms on
# NFS. At ~10 ms per call that is 22k × 10 ms ≈ 220 s, which is
# consistent with the 199 s regression that tripped the 120 s gate.
cmd = [
sys.executable, "-m", "packages.sca.cli",
str(target), "--offline", "--out", str(out),
"--cache-root", str(out / ".sca-cache"),
]
start = time.perf_counter()
proc = subprocess.run(
Expand Down