From 9ba0e9e2c8b7a9d2543f6112d6e369aa7421cea1 Mon Sep 17 00:00:00 2001 From: hello-args Date: Fri, 12 Jun 2026 01:36:35 +0530 Subject: [PATCH 01/16] feat(scoring): integrate v2 multi-factor scoring on develop Ship parallel absolute_risk scoring alongside frozen legacy overall, with corpus calibration, analyzer evidence emitters, and reporting across CLI, API, dashboard, and CI while defaulting to dual-score mode. --- .github/workflows/scoring-v2.yml | 20 ++ README.md | 2 +- action/README.md | 8 +- action/action.yml | 44 ++- docs/analysis/adr-003-scoring-v2.md | 36 ++ docs/analysis/architecture.md | 18 + docs/migration/scoring-v2.md | 69 ++++ docs/platform/ci-integration.md | 25 +- docs/platform/cli.md | 16 +- docs/platform/rest-api.md | 11 +- docs/reporting/sarif-score-v2.md | 27 ++ docs/reporting/scoring-spec-v2.md | 120 +++++++ docs/reporting/scoring-spec.md | 1 + pyproject.toml | 1 + scripts/calibrate_scoring_weights.py | 70 ++++ scripts/run_scoring_corpus.py | 44 +++ src/mcts/analyzers/attack_chains.py | 34 +- src/mcts/analyzers/behavioral_static.py | 3 +- src/mcts/analyzers/command_execution.py | 3 +- src/mcts/analyzers/cross_server.py | 3 +- src/mcts/analyzers/data_leakage.py | 3 +- src/mcts/analyzers/jailbreak.py | 5 +- src/mcts/analyzers/path_validation.py | 3 +- src/mcts/analyzers/permissions.py | 3 +- src/mcts/analyzers/prompt_injection.py | 3 +- src/mcts/analyzers/schema_surface.py | 3 +- src/mcts/analyzers/tool_abuse.py | 3 +- src/mcts/api/app.py | 33 +- src/mcts/cli/machine_wide.py | 10 +- src/mcts/cli/main.py | 146 +++++++- src/mcts/core/config.py | 18 +- src/mcts/core/scanner.py | 63 +++- src/mcts/discovery/static_meta.py | 7 +- src/mcts/governance/policy.py | 36 +- src/mcts/governance/scan_gates.py | 78 +++++ src/mcts/inventory/scan_all.py | 19 +- src/mcts/mcp_server/server.py | 27 +- src/mcts/output/artifacts.py | 5 + src/mcts/output/history.py | 27 +- src/mcts/pentest/models.py | 1 + src/mcts/pentest/runner.py | 1 + 
src/mcts/probe/discovery_meta.py | 4 +- src/mcts/report/assets/dashboard.js | 276 ++++++++++++++- src/mcts/report/assets/styles.css | 62 +++- src/mcts/report/data.py | 328 +++++++++++++++--- src/mcts/report/scan_meta.py | 17 + src/mcts/report/templates/dashboard.html | 52 ++- src/mcts/reporting/models.py | 3 + src/mcts/reporting/sarif.py | 6 + src/mcts/scan/machine_wide.py | 52 ++- src/mcts/scoring/__init__.py | 29 +- src/mcts/scoring/asset.py | 37 ++ src/mcts/scoring/chains.py | 43 +++ src/mcts/scoring/context.py | 71 ++++ src/mcts/scoring/corpus.py | 21 ++ src/mcts/scoring/corpus_runner.py | 177 ++++++++++ .../scoring/data/scoring_v2_corpus_stats.json | 50 +++ src/mcts/scoring/engine_v2.py | 190 ++++++++++ src/mcts/scoring/evidence_emit.py | 28 ++ src/mcts/scoring/evidence_tags.py | 210 +++++++++++ src/mcts/scoring/exposure.py | 27 ++ src/mcts/scoring/factors.py | 143 ++++++++ src/mcts/scoring/graph.py | 184 ++++++++++ src/mcts/scoring/levels.py | 23 ++ src/mcts/scoring/models.py | 104 ++++++ src/mcts/scoring/normalize.py | 19 + src/mcts/scoring/pipeline_trace.py | 13 + src/mcts/scoring/preconditions.py | 21 ++ src/mcts/scoring/reachability.py | 24 ++ src/mcts/scoring/uncertainty.py | 85 +++++ src/mcts/scoring/weights.py | 39 +++ src/mcts/scoring/weights_learned.yaml | 39 +++ src/mcts/scoring/weights_v1.yaml | 39 +++ src/mcts/testing/regression_harness.py | 7 + src/mcts/ui/alternate_formats.py | 9 +- src/mcts/ui/dashboard.py | 22 ++ src/mcts/ui/report_renderer.py | 8 +- src/mcts/ui/theme.py | 10 + tests/fixtures/rfc_worked_example.json | 9 + tests/fixtures/scoring_corpus/README.md | 8 + .../scoring_corpus/expected_order.json | 1 + .../scoring_corpus/expert_rankings.json | 17 + tests/fixtures/scoring_corpus/servers.json | 35 ++ tests/scoring/test_analyzer_evidence.py | 193 +++++++++++ tests/scoring/test_category_scores_v2.py | 52 +++ tests/scoring/test_chains.py | 84 +++++ tests/scoring/test_corpus_ordering.py | 21 ++ tests/scoring/test_corpus_runner.py | 28 ++ 
tests/scoring/test_engine_v2.py | 176 ++++++++++ tests/scoring/test_evidence_coverage.py | 39 +++ tests/scoring/test_evidence_emit.py | 61 ++++ tests/scoring/test_factors.py | 83 +++++ tests/scoring/test_graph.py | 74 ++++ tests/scoring/test_history_trend.py | 88 +++++ tests/scoring/test_import_layers.py | 20 ++ tests/scoring/test_levels.py | 10 + tests/scoring/test_pentest_paths.py | 15 + tests/scoring/test_scanner_bypass.py | 27 ++ tests/scoring/test_scanner_v2.py | 96 +++++ tests/scoring/test_spearman.py | 30 ++ tests/scoring/test_uncertainty.py | 43 +++ tests/test_analysis_output.py | 63 ++++ tests/test_api_gate_violations.py | 29 ++ tests/test_attack_graph.py | 38 +- tests/test_cli_gates_v2.py | 42 +++ tests/test_cli_report.py | 12 + tests/test_governance.py | 33 ++ tests/test_html_report.py | 19 + tests/test_inventory_scan_all.py | 42 +++ tests/test_mcp_server.py | 17 + tests/test_ui.py | 10 + 111 files changed, 4844 insertions(+), 192 deletions(-) create mode 100644 .github/workflows/scoring-v2.yml create mode 100644 docs/analysis/adr-003-scoring-v2.md create mode 100644 docs/migration/scoring-v2.md create mode 100644 docs/reporting/sarif-score-v2.md create mode 100644 docs/reporting/scoring-spec-v2.md create mode 100644 scripts/calibrate_scoring_weights.py create mode 100644 scripts/run_scoring_corpus.py create mode 100644 src/mcts/governance/scan_gates.py create mode 100644 src/mcts/scoring/asset.py create mode 100644 src/mcts/scoring/chains.py create mode 100644 src/mcts/scoring/context.py create mode 100644 src/mcts/scoring/corpus.py create mode 100644 src/mcts/scoring/corpus_runner.py create mode 100644 src/mcts/scoring/data/scoring_v2_corpus_stats.json create mode 100644 src/mcts/scoring/engine_v2.py create mode 100644 src/mcts/scoring/evidence_emit.py create mode 100644 src/mcts/scoring/evidence_tags.py create mode 100644 src/mcts/scoring/exposure.py create mode 100644 src/mcts/scoring/factors.py create mode 100644 src/mcts/scoring/graph.py create 
mode 100644 src/mcts/scoring/levels.py create mode 100644 src/mcts/scoring/models.py create mode 100644 src/mcts/scoring/normalize.py create mode 100644 src/mcts/scoring/pipeline_trace.py create mode 100644 src/mcts/scoring/preconditions.py create mode 100644 src/mcts/scoring/reachability.py create mode 100644 src/mcts/scoring/uncertainty.py create mode 100644 src/mcts/scoring/weights.py create mode 100644 src/mcts/scoring/weights_learned.yaml create mode 100644 src/mcts/scoring/weights_v1.yaml create mode 100644 tests/fixtures/rfc_worked_example.json create mode 100644 tests/fixtures/scoring_corpus/README.md create mode 100644 tests/fixtures/scoring_corpus/expected_order.json create mode 100644 tests/fixtures/scoring_corpus/expert_rankings.json create mode 100644 tests/fixtures/scoring_corpus/servers.json create mode 100644 tests/scoring/test_analyzer_evidence.py create mode 100644 tests/scoring/test_category_scores_v2.py create mode 100644 tests/scoring/test_chains.py create mode 100644 tests/scoring/test_corpus_ordering.py create mode 100644 tests/scoring/test_corpus_runner.py create mode 100644 tests/scoring/test_engine_v2.py create mode 100644 tests/scoring/test_evidence_coverage.py create mode 100644 tests/scoring/test_evidence_emit.py create mode 100644 tests/scoring/test_factors.py create mode 100644 tests/scoring/test_graph.py create mode 100644 tests/scoring/test_history_trend.py create mode 100644 tests/scoring/test_import_layers.py create mode 100644 tests/scoring/test_levels.py create mode 100644 tests/scoring/test_pentest_paths.py create mode 100644 tests/scoring/test_scanner_bypass.py create mode 100644 tests/scoring/test_scanner_v2.py create mode 100644 tests/scoring/test_spearman.py create mode 100644 tests/scoring/test_uncertainty.py create mode 100644 tests/test_api_gate_violations.py create mode 100644 tests/test_cli_gates_v2.py create mode 100644 tests/test_inventory_scan_all.py diff --git a/.github/workflows/scoring-v2.yml 
b/.github/workflows/scoring-v2.yml new file mode 100644 index 0000000..16791e2 --- /dev/null +++ b/.github/workflows/scoring-v2.yml @@ -0,0 +1,20 @@ +name: scoring-v2 + +on: + push: + pull_request: + +jobs: + scoring: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - run: uv sync --group dev + - run: uv run pytest tests/scoring/ tests/test_attack_graph.py tests/test_cli_gates_v2.py tests/test_cli_report.py tests/test_analysis_output.py tests/test_html_report.py tests/test_governance.py tests/test_mcp_server.py tests/test_api_gate_violations.py tests/test_inventory_scan_all.py -v + - run: uv run pytest tests/test_scoring.py -v + - run: uv run python scripts/calibrate_scoring_weights.py --min-rho 0.80 + - run: uv build + - run: | + uv run python -c "from mcts.scoring.weights import load_weights; load_weights('manual_v1'); load_weights('weights_learned')" + uv run python -c "from mcts.scoring.corpus import load_corpus_stats; load_corpus_stats()" diff --git a/README.md b/README.md index c011f40..425e1ed 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ MCTS is **alpha** software with a local-first MCP security pipeline — no cloud | Capability | How | |------------|-----| -| Risk scoring | Exponential 0–100 score, risk index, category breakdown | +| Risk scoring | Legacy 0–100 index (default) + opt-in v2 multi-factor `absolute_risk` (`--scoring v2\|both`) | | Compliance mapping | OWASP LLM Top 10 + OWASP MCP Top 10 (non-scoring meta-findings) | | Terminal UI | Rich dashboard — themes, progress, `--terminal-format` views | | Export formats | JSON, SARIF (`--format sarif`), raw envelope, HTML (`mcts report`) | diff --git a/action/README.md b/action/README.md index 565a219..6e14779 100644 --- a/action/README.md +++ b/action/README.md @@ -82,7 +82,13 @@ If the action lives in your repo under `action/`: |-------|---------|-------------| | `target` | `./server.py` | Path to MCP server entrypoint or repo directory | | 
`fail-on-critical` | `true` | Fail workflow if any critical finding is detected | -| `min-score` | — | Fail if overall score is below this threshold (0–100) | +| `min-score` | — | Fail if legacy overall score is below this threshold (0–100) | +| `scoring` | `both` | `legacy`, `v2`, or `both` — scoring mode (`v2` and `both` enable multi-factor scoring) | +| `min-security-score` | — | Fail if v2 benchmark security score is below threshold (requires `scoring: v2` or `both`) | +| `max-absolute-risk` | — | Fail if v2 absolute risk exceeds threshold | +| `max-risk-level` | — | Fail if v2 risk level exceeds band (`low` / `medium` / `high` / `critical`) | +| `weights-profile` | `manual_v1` | v2 weights profile when `scoring` is `v2` or `both` | +| `assets-path` | — | Optional `.mcts/assets.yaml` for v2 asset-value overrides | | `extras` | `mcp,sast` | Comma-separated optional extras (`all` installs every extra) | --- diff --git a/action/action.yml b/action/action.yml index 1737d70..30c9651 100644 --- a/action/action.yml +++ b/action/action.yml @@ -14,7 +14,31 @@ inputs: required: false default: "true" min-score: - description: Fail if security score is below this value (0-100). Leave empty to skip. 
+ required: false + default: "" + scoring: + description: Scoring mode — legacy, v2, or both (default both) + required: false + default: "both" + min-security-score: + description: Fail if v2 benchmark security score is below this value (requires scoring v2 or both) + required: false + default: "" + max-absolute-risk: + description: Fail if v2 absolute risk exceeds this value (requires scoring v2 or both) + required: false + default: "" + max-risk-level: + description: Fail if v2 risk level exceeds this band (low, medium, high, critical) + required: false + default: "" + weights-profile: + description: v2 weights profile (default manual_v1) + required: false + default: "manual_v1" + assets-path: + description: Optional .mcts/assets.yaml path for v2 asset-value overrides required: false default: "" extras: @@ -72,6 +96,24 @@ runs: if [ -n "${{ inputs.min-score }}" ]; then ARGS+=(--min-score "${{ inputs.min-score }}") fi + if [ -n "${{ inputs.scoring }}" ] && [ "${{ inputs.scoring }}" != "legacy" ]; then + ARGS+=(--scoring "${{ inputs.scoring }}") + fi + if [ -n "${{ inputs.min-security-score }}" ]; then + ARGS+=(--min-security-score "${{ inputs.min-security-score }}") + fi + if [ -n "${{ inputs.max-absolute-risk }}" ]; then + ARGS+=(--max-absolute-risk "${{ inputs.max-absolute-risk }}") + fi + if [ -n "${{ inputs.max-risk-level }}" ]; then + ARGS+=(--max-risk-level "${{ inputs.max-risk-level }}") + fi + if [ -n "${{ inputs.weights-profile }}" ] && [ "${{ inputs.weights-profile }}" != "manual_v1" ]; then + ARGS+=(--weights "${{ inputs.weights-profile }}") + fi + if [ -n "${{ inputs.assets-path }}" ]; then + ARGS+=(--assets-path "${{ inputs.assets-path }}") + fi uv run mcts "${ARGS[@]}" cp "$REPO_ROOT/mcts_analysis/scan-report.sarif" "$SARIF_OUT" diff --git a/docs/analysis/adr-003-scoring-v2.md b/docs/analysis/adr-003-scoring-v2.md new file mode 100644 index 0000000..ba07f16 --- /dev/null +++ b/docs/analysis/adr-003-scoring-v2.md @@ -0,0 +1,36 @@ +# ADR-003: MCTS Risk 
Score v2 + +**Status:** Accepted +**Date:** 2026-06-11 +**Spec:** [scoring-spec-v2.md](../reporting/scoring-spec-v2.md) + +## Context + +Legacy scoring (`score.overall`) uses severity-only exponential decay. Clients need explainable, stable absolute risk with factor breakdowns and attack-chain amplification without double-counting chain meta-findings. + +## Decisions + +| Topic | Choice | +|-------|--------| +| Dual score in CI | `--min-score` stays on legacy `overall` until v2.2 | +| `scoring_mode="v2"` | Runs **both** engines: legacy `score` + `score_v2` | +| Chain meta-findings in v2 sum | **Exclude** — `attack_chains` in `NON_SCORING_V2` | +| Chain multiplier | `paths_v1` tool correlation on validated paths (`medium+` severity) | +| `hop_count` | `len(path_nodes) - 1` on edge-validated paths | +| Analyzer when v2 on | Always run `AttackChainAnalyzer`; bypass `--analyzers` / `--surfaces` | +| `chain_factor` gating | `enable_attack_chains` / `--no-attack-chains` sets `chain_factor_mode: disabled` | +| `weights_hash` | `ScoreV2Basis.weights_hash` only — not on `RiskScoreV2` | +| API score gates | CLI-only in v2.0; API returns JSON without HTTP gate exit | +| Canonical graph | `scoring/graph.py` owns paths; `report/data.build_attack_graph()` delegates | +| Fake path rejection | BFS returns `None` when disconnected — never `[start, end]` | +| Model location | v2 types in `scoring/models.py`; `ScanReport` imports `RiskScoreV2` | +| `dimension_scores` | RFC factor axes only; OWASP in `category_scores_v2()` (PR-4d) | +| Bracket formula | `1 + Σ factor_increments` — no YAML bracket double-weight | +| Confidence | Affects `confidence_score` / `risk_range` only — never `absolute_risk` | + +## Consequences + +- `ScanReport.score` remains always populated (backward compatible). +- `ScanReport.score_v2` is additive when v2/both is enabled. +- Under v2/both, attack chains analyzer always runs; `--no-attack-chains` disables multiplier only. 
+- Legacy and v2 scores may diverge on the same scan — expected (different formulas and scorable sets). diff --git a/docs/analysis/architecture.md b/docs/analysis/architecture.md index 21ff37e..e557fc7 100644 --- a/docs/analysis/architecture.md +++ b/docs/analysis/architecture.md @@ -379,6 +379,24 @@ Used by `behavioral_static`. Python AST taint + optional tree-sitter for TS/Go/R `capability/inferrer.py` assigns per-tool flags (`reads_untrusted_input`, `egresses_network`, `executes_commands`, …). BFS finds paths like read → exfiltrate. Graph stored on `ScanReport.attack_graph`. +When `scoring_mode` is `v2` or `both`, paths are built at scan time via `scoring/graph.build_paths()` and stored on the canonical graph: + +```json +{ + "nodes": [{"id": "read_file", "label": "read_file", "type": "tool"}], + "edges": [{"from": "read_file", "to": "send_webhook", "label": "read→exfil"}], + "paths": [{ + "id": "path-chain-credential-theft-2", + "nodes": ["read_file", "get_env", "send_webhook"], + "tools_on_path": ["read_file", "get_env", "send_webhook"], + "hop_count": 2, + "finding_ids": ["chain-credential-theft"] + }] +} +``` + +`hop_count` is validated edge hops only (`len(nodes) - 1`). Scanner, v2 engine, and HTML dashboard all use `canonical_attack_graph(report)` (invariant I3/I11). + --- ## Scoring and reporting diff --git a/docs/migration/scoring-v2.md b/docs/migration/scoring-v2.md new file mode 100644 index 0000000..e2c7340 --- /dev/null +++ b/docs/migration/scoring-v2.md @@ -0,0 +1,69 @@ +# Migrating to MCTS Risk Score v2 + +## Enable v2 + +```bash +mcts scan --scoring v2 # legacy + score_v2 in JSON +mcts scan --scoring both # same; UI shows both when supported +``` + +Since GA the default is `--scoring both` (dual-score mode; see integration plan §15). Pass `--scoring legacy` to skip v2 scoring. 
+ +## Score differences (expected) + +| Metric | Legacy `score.overall` | v2 `absolute_risk` | +|--------|------------------------|-------------------| +| Scorable set | All except `compliance` | Excludes `compliance` **and** `attack_chains` | +| Chain signal | Critical chain meta-rows in sum | `chain_factor` on tool-attributed findings | +| Scale | 0–100 (higher = better) | Unbounded integer (higher = worse) | + +Same scan can show different numbers — not a regression. + +## `--no-attack-chains` + +Under `--scoring v2|both`, the attack chains analyzer **still runs** (graph + meta-findings). The flag disables the chain multiplier only (`chain_factor_mode: disabled`). Use `--scoring legacy` to omit chain meta-findings entirely. + +## CI gates + +| Flag | Applies to | +|------|------------| +| `--min-score` | Legacy `score.overall` only | +| `--min-security-score` | v2 `security_score` (requires corpus stats) | +| `--max-absolute-risk` | v2 `absolute_risk` | +| `--max-risk-level` | v2 `risk_level` | +| `--fail-on-category` | Legacy category points only | + +## API + +`ScanRequest` accepts `scoring_mode`, `weights_profile`, and v2 gate fields. HTTP responses do not fail on gates in v2.0 — inspect `score_v2` client-side (ADR-003). + +## History & trends + +`mcts_analysis/history.json` entries include `scoring_version`. When all runs use v2/both, the HTML trend chart plots `absolute_risk` (never mixed with legacy score on one axis). Mixed history shows legacy score with a warning. + +## Machine-wide & inventory + +`mcts scan --machine-wide` and inventory batch scans include `absolute_risk`, `security_score`, and `risk_level` per server when v2 is enabled. `worst_absolute_risk` is reported in machine-wide summaries. 
+ +## Governance policy + +Optional `.mcts/policy.yaml` fields: + +```yaml +min_score: 70 # legacy overall only +min_security_score: 50 # v2 benchmark score +max_absolute_risk: 500 # v2 absolute risk ceiling +max_risk_level: medium # v2 band gate +``` + +## Asset overrides + +Optional `.mcts/assets.yaml` for v2 `asset_value` overrides: + +```yaml +overrides: + customer_db: 0.9 + temp_cache: 0.2 +``` + +Pass `--assets-path .mcts/assets.yaml` or set `assets_path` on `ScanConfig`. diff --git a/docs/platform/ci-integration.md b/docs/platform/ci-integration.md index 755e229..1048826 100644 --- a/docs/platform/ci-integration.md +++ b/docs/platform/ci-integration.md @@ -113,7 +113,30 @@ mcts scan ./repo/ \ --fail-on-category execution:10 ``` -Category semantics: [Scoring Specification](../reporting/scoring-spec.md). +Category semantics: [Scoring Specification](../reporting/scoring-spec.md). Category gates apply to **legacy** v1 tiles only. + +### Scoring v2 gates (opt-in) + +Enable multi-factor scoring, then gate on v2 fields: + +```bash +mcts scan ./server.py \ + --scoring v2 \ + --max-absolute-risk 500 \ + --max-risk-level high \ + --min-security-score 40 \ + -o report.json +``` + +| Flag | Metric | +|------|--------| +| `--scoring v2\|both` | Enables `score_v2` in report JSON | +| `--min-score` | Legacy `score.overall` only (unchanged) | +| `--min-security-score` | v2 benchmark percentile score | +| `--max-absolute-risk` | v2 stable integer risk sum | +| `--max-risk-level` | v2 band (`low` < `medium` < `high` < `critical`) | + +GitHub Action equivalents: `scoring`, `min-security-score`, `max-absolute-risk`, `max-risk-level` inputs. See [Scoring v2 migration](../migration/scoring-v2.md). ### SARIF for code scanning diff --git a/docs/platform/cli.md b/docs/platform/cli.md index 6054d3e..b116096 100644 --- a/docs/platform/cli.md +++ b/docs/platform/cli.md @@ -84,11 +84,19 @@ When `-o` is set, format determines serialization. 
SARIF uses `reporting/sarif.p | Flag | Default | Description | |------|---------|-------------| | `--fail-on-critical` | false | Exit **1** if any critical finding | -| `--min-score` | — | Exit **1** if `score.overall` < N (0–100) | +| `--min-score` | — | Exit **1** if legacy `score.overall` < N (0–100) | | `--max-critical` | — | Exit **1** if critical count > N | -| `--fail-on-category` | — | Repeatable. Format: `category:limit`. Exit **1** when category score ≥ limit | - -Valid category keys: `permissions`, `injection`, `execution`, `data_leakage`, `attack_chains`, `shadowing`, `jailbreak`. See [Scoring Specification](../reporting/scoring-spec.md). +| `--fail-on-category` | — | Repeatable. Format: `category:limit`. Exit **1** when **legacy** category score ≥ limit | +| `--scoring` | `both` | `legacy`, `v2`, or `both` — enable multi-factor scoring | +| `--min-security-score` | — | Exit **1** if v2 benchmark security score < N (requires `--scoring v2` or `both`) | +| `--max-absolute-risk` | — | Exit **1** if v2 `absolute_risk` > N (requires `--scoring v2` or `both`) | +| `--max-risk-level` | — | Exit **1** if v2 `risk_level` exceeds band (`low` < `medium` < `high` < `critical`) | +| `--min-category-score-v2` | — | Repeatable. Format: `category:min`. Exit **1** when v2 OWASP tile score < min (100=good) | +| `--weights` | `manual_v1` | v2 weights profile name | +| `--corpus-stats-path` | packaged default | Override corpus stats JSON for v2 percentile scoring | +| `--no-attack-chains` | false | With `--scoring legacy`, skip the attack-chain analyzer. With `v2`/`both` the analyzer still runs and only the chain multiplier is disabled (`chain_factor` = 1.0) | + +Valid **legacy** category keys: `permissions`, `injection`, `execution`, `data_leakage`, `attack_chains`, `shadowing`, `jailbreak`. Category gates apply to v1 tiles only — not `category_scores_v2`. See [Scoring Specification](../reporting/scoring-spec.md) and [Scoring v2 migration](../migration/scoring-v2.md). 
### Terminal UI flags diff --git a/docs/platform/rest-api.md b/docs/platform/rest-api.md index 4c6012e..756386a 100644 --- a/docs/platform/rest-api.md +++ b/docs/platform/rest-api.md @@ -123,6 +123,13 @@ All scan endpoints accept these fields (plus endpoint-specific fields where note | `analyzer_filter` | string[] | `[]` | Limit output to named analyzers | | `fanout_offset` | int | `0` | Pagination offset for batch scan endpoints | | `fanout_limit` | int | env max (50) | Page size for batch scan endpoints | +| `scoring_mode` | string | `"both"` | `legacy`, `v2`, or `both` | +| `weights_profile` | string | `"manual_v1"` | v2 weights profile when scoring is enabled | +| `corpus_stats_path` | string | — | Optional path to corpus stats JSON for v2 percentiles | +| `min_security_score` | int | — | Gate: fail when v2 security score below threshold (not enforced server-side by default) | +| `max_absolute_risk` | int | — | Gate: fail when v2 absolute risk above threshold | +| `max_risk_level` | string | — | Gate: fail when v2 risk level exceeds band | +| `assets_path` | string | — | Optional `.mcts/assets.yaml` path for v2 asset-value overrides | Batch endpoints (`/scan-all-tools`, `/scan-all-prompts`, `/scan-all-resources`) run one full analyzer pass per item. Use `fanout_offset` and `fanout_limit` to paginate; responses include `truncated` and `truncation_warning` when more items remain. @@ -163,7 +170,9 @@ Batch endpoints (`/scan-all-tools`, `/scan-all-prompts`, `/scan-all-resources`) } ``` -Response: full `ScanReport` JSON (`model_dump()`). +Response: `ScanResponse` shape — full `ScanReport` fields plus echoed `scoring_mode` and `gate_violations` (string array). When `scoring_mode` is `v2` or `both`, the payload includes `score_v2` (absolute risk, dimension scores, top contributors) and `scoring_version`. Legacy `score.overall` is always populated (invariant I1). 
The REST API does not fail HTTP status on gate violations — consumers inspect `gate_violations` or use the CLI for exit-code enforcement. + +Optional request field `min_category_score_v2`: map of OWASP category key → minimum health score (100=good). ### Planned API extensions diff --git a/docs/reporting/sarif-score-v2.md b/docs/reporting/sarif-score-v2.md new file mode 100644 index 0000000..9ab4eb5 --- /dev/null +++ b/docs/reporting/sarif-score-v2.md @@ -0,0 +1,27 @@ +# SARIF `mcts/scoreV2` extension + +MCTS SARIF output (`--format sarif`) includes optional run properties when `score_v2` is present: + +```json +{ + "runs": [{ + "properties": { + "mcts/scoreV2": { + "absoluteRisk": 2260, + "securityScore": 12, + "riskLevel": "critical" + } + } + }] +} +``` + +## Code Scanning adoption + +GitHub Code Scanning ingests SARIF by default but **does not surface custom run properties** in the Security tab. Consumers must: + +1. Parse SARIF JSON in CI or dashboards. +2. Read `runs[].properties["mcts/scoreV2"]` explicitly. +3. Gate on `absoluteRisk` / `securityScore` with `--min-security-score` or `--max-absolute-risk` in the MCTS CLI/Action instead of relying on Code Scanning UI alone. + +Legacy `score.overall` is not written to SARIF run properties in v2.0 — use CLI gates or custom SARIF post-processing for dual-score policies. diff --git a/docs/reporting/scoring-spec-v2.md b/docs/reporting/scoring-spec-v2.md new file mode 100644 index 0000000..fddeeca --- /dev/null +++ b/docs/reporting/scoring-spec-v2.md @@ -0,0 +1,120 @@ +# MCTS Risk Score v2 — Specification + +**Status:** GA (default `--scoring both`) +**ADR:** [adr-003-scoring-v2.md](../analysis/adr-003-scoring-v2.md) +**Legacy spec:** [scoring-spec.md](scoring-spec.md) +**SARIF:** [sarif-score-v2.md](sarif-score-v2.md) + +## Overview + +v2 adds `score_v2` with a stable **absolute risk** integer (higher = worse) alongside frozen legacy `score.overall`. Default is `--scoring both`. 
+ +## Scorable set + +Excluded from v2 sum: `compliance`, `attack_chains` meta-findings. Tool-attributed findings from other analyzers are scored. + +## Per-finding formula (RFC §4.1) + +``` +bracket = 1 + Σ factor_increments +base_risk = severity_w × bracket +finding_risk = round(base_risk × chain_factor) +absolute_risk = Σ finding_risk +``` + +Factor increments come from classifiers in `weights_v1.yaml` under `classifiers:`. Evidence tags on findings refine classifiers when emitters populate `reachability_tag`, `exploitability_class`, etc. + +## Chain multiplier + +`chain_factor` applies to tool findings on validated graph paths (`hop_count` ≥ 1). Severity floor: medium+. Meta chain rows are display-only. + +| hop_count | chain_factor | +|-----------|--------------| +| 0–1 | 1.0 | +| 2 | 1.15 | +| 3 | 1.35 | +| 4+ | 1.50 | + +## Output (`score_v2`) + +| Field | Description | +|-------|-------------| +| `absolute_risk` | Stable integer sum | +| `security_score` | `100 - percentile(absolute_risk, corpus)` when stats available | +| `risk_level` | Band from corpus or literals: low/medium/high/critical | +| `risk_range` | Confidence interval around `absolute_risk`; finding confidence widens the range but never changes `absolute_risk` | +| `dimension_scores` | Eight factor axes 0–100 (higher = worse) | +| `top_contributors` | Top 10 findings/paths by contribution | +| `category_scores_v2` | Separate OWASP tiles, 100 = good (dashboard JSON) | +| `basis` | Scorable counts, excluded meta-rows, `weights_hash` | + +## Aggregation formulas (§8.8–8.10) + +### §8.8 `confidence_score` (RFC §4.3) + +Confidence affects `confidence_score` and `risk_range` only — **never** `absolute_risk`. 
Inputs are v2-scorable findings with aligned per-finding risks: + +``` +pairs = [(risk, finding) for finding, risk in zip(scorable, risks) if risk > 0] +if no pairs → confidence_score = 100 +else confidence_score = round(100 × Σ(effective_confidence(f) × risk) / Σ risk) +``` + +`effective_confidence` applies per-analyzer caps from `uncertainty.py` when `finding.confidence >= 0.99`. + +### §8.9 `risk_range` spread (RFC §4.12) + +``` +if absolute_risk == 0 → risk_range = (0, 0), label = "high" +mean_conf = weighted mean of effective_confidence by finding_risk +base_spread = absolute_risk × (1 - mean_conf) × 0.35 +spread = base_spread × evidence_quality_factor × analyzer_disagreement_factor +low = max(0, round(absolute_risk - spread)) +high = round(absolute_risk + spread) +label = high if mean_conf >= 0.85 else medium if mean_conf >= 0.65 else low +``` + +- `evidence_quality_factor`: 0.8 when live_probe + handler_traced tags present; else 1.2 +- `analyzer_disagreement_factor`: 1.4 when conflicting severities share a tool; else 1.0 + +### §8.10 `top_contributors` selection (RFC §4.14) + +1. Rank scorable findings by `finding_risk` descending; take up to **9** rows (`type=finding`). +2. Append one explainability row (`type=attack_chain`) for the highest `hop_count` path when paths exist and total rows < 10. +3. JSON export caps at **10** rows and omits verbose `evidence_tags`. + +Per-finding contributor fields: `risk_contribution`, `confidence` (effective × 100), `chain_factor`, `factors` breakdown. + +### `dimension_scores` normalization (§7.5) + +Per-axis raw sum = Σ factor increment for that axis across scorable findings. Normalized 0–100 (higher = worse): + +``` +if raw <= 0 → 0 +elif corpus dimension_p95[axis] > 0 → min(100, round(100 × raw / p95)) +else → min(100, round(100 × raw / max(absolute_risk, 1))) +``` + +`dimension_p95` per axis is recomputed from corpus scans via `scripts/calibrate_scoring_weights.py --write-package-stats`. 
+ +## CI gates + +| Flag | Applies to | +|------|------------| +| `--min-score` | Legacy only | +| `--min-security-score` | v2 benchmark score | +| `--max-absolute-risk` | v2 absolute risk | +| `--max-risk-level` | v2 band | +| `--min-category-score-v2` | v2 OWASP tiles (100=good; fail when below minimum) | +| `--fail-on-category` | Legacy category tiles only | + +## Implementation map + +| Module | Role | +|--------|------| +| `scoring/engine_v2.py` | Sum, verify, contributors | +| `scoring/context.py` | `build_scoring_context`, chain factors | +| `scoring/graph.py` | `canonical_attack_graph`, `build_paths` | +| `scoring/evidence_tags.py` | PR-4b analyzer evidence tag helpers | +| `scoring/evidence_emit.py` | Graph/scope-dependent evidence enrichment | +| `scoring/weights_v1.yaml` | Classifier lookup tables | diff --git a/docs/reporting/scoring-spec.md b/docs/reporting/scoring-spec.md index ac9d3a8..98ce531 100644 --- a/docs/reporting/scoring-spec.md +++ b/docs/reporting/scoring-spec.md @@ -5,6 +5,7 @@ This document explains how MCTS calculates the **security score** (0–100) and **risk index** from findings. Use it to set CI gate thresholds, explain scores to stakeholders, or verify that scoring is working correctly. > **Just want to set a CI gate?** Use `--min-score 70 --fail-on-critical`. See [CI Integration](../platform/ci-integration.md). +> **Multi-factor scoring (v2)?** See [Scoring v2 specification](scoring-spec-v2.md) and [migration guide](../migration/scoring-v2.md). > **Unfamiliar with terms?** See the [Glossary](../glossary.md). 
--- diff --git a/pyproject.toml b/pyproject.toml index ea5903b..a5e80d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -117,6 +117,7 @@ path = "src/mcts/__init__.py" [tool.hatch.build.targets.wheel] packages = ["src/mcts"] +force-include = { "src/mcts/scoring/weights_v1.yaml" = "mcts/scoring/weights_v1.yaml", "src/mcts/scoring/weights_learned.yaml" = "mcts/scoring/weights_learned.yaml", "src/mcts/scoring/data/scoring_v2_corpus_stats.json" = "mcts/scoring/data/scoring_v2_corpus_stats.json" } [tool.hatch.build.targets.sdist] only-include = [ diff --git a/scripts/calibrate_scoring_weights.py b/scripts/calibrate_scoring_weights.py new file mode 100644 index 0000000..7be3388 --- /dev/null +++ b/scripts/calibrate_scoring_weights.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +"""Refresh packaged corpus stats and print Spearman correlation vs expert rankings.""" + +from __future__ import annotations + +import argparse +import json + +from mcts.scoring.corpus_runner import ( + EXPERT_RANKINGS_PATH, + PACKAGE_STATS_PATH, + build_package_stats_from_metrics, + scan_corpus_metrics, + spearman_rho, +) +from mcts.scoring.weights import PACKAGE_DIR + + +def main() -> int: + """Scan the corpus, optionally rewrite packaged stats/learned weights, and exit 1 when Spearman rho is below --min-rho.""" + parser = argparse.ArgumentParser(description="Calibrate v2 scoring corpus stats") + parser.add_argument("--scoring", default="v2", choices=["v2", "both"]) + parser.add_argument("--write-package-stats", action="store_true") + parser.add_argument("--min-rho", type=float, default=0.0, help="Exit 1 if Spearman rho below threshold") + parser.add_argument( + "--stats-version", + default="corpus-2026-06", + help="Version label written into packaged corpus stats JSON", + ) + parser.add_argument( + "--write-learned-weights", + action="store_true", + help="Copy manual_v1 weights to weights_learned.yaml (offline calibration placeholder)", + ) + args = parser.parse_args() + + metrics = scan_corpus_metrics(scoring_mode=args.scoring) + risks = metrics.risks + for server_id, absolute_risk in risks.items(): + 
print(f"{server_id}: absolute_risk={absolute_risk}") + + if args.write_package_stats and risks: + stats = build_package_stats_from_metrics(metrics, version=args.stats_version) + PACKAGE_STATS_PATH.write_text(json.dumps(stats, indent=2) + "\n", encoding="utf-8") + print(f"Wrote {PACKAGE_STATS_PATH}") + + if args.write_learned_weights: + manual = PACKAGE_DIR / "weights_v1.yaml" + learned = PACKAGE_DIR / "weights_learned.yaml" + text = manual.read_text(encoding="utf-8").replace("version: manual_v1", "version: learned_v1", 1) + learned.write_text(text, encoding="utf-8") + print(f"Wrote {learned}") + + if EXPERT_RANKINGS_PATH.exists(): + expert = json.loads(EXPERT_RANKINGS_PATH.read_text(encoding="utf-8")) + ids = [row["server_id"] for row in expert["rankings"] if row["server_id"] in risks] + model_vals = [float(risks[sid]) for sid in ids] + expert_vals = [ + # Fall back to the rank-derived score only when expert_score is absent; an explicit 0 is a valid score. + float(max(0, 100 - (int(row["rank"]) - 1) * 15) if row.get("expert_score") is None else row["expert_score"]) + for row in expert["rankings"] + if row["server_id"] in risks + ] + rho = spearman_rho(model_vals, expert_vals) + print(f"Spearman rho={rho:.3f} (n={len(ids)})") + if rho < args.min_rho: + raise SystemExit(1) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_scoring_corpus.py b/scripts/run_scoring_corpus.py new file mode 100644 index 0000000..aca23ea --- /dev/null +++ b/scripts/run_scoring_corpus.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +"""Batch-scan scoring corpus servers and optionally refresh packaged stats.""" + +from __future__ import annotations + +import argparse +import json + +from mcts.scoring.corpus_runner import ( + PACKAGE_STATS_PATH, + build_package_stats_from_metrics, + scan_corpus_metrics, +) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run v2 scoring across corpus servers") + parser.add_argument("--scoring", default="v2", choices=["v2", "both"]) + parser.add_argument( + "--write-package-stats", + action="store_true", + help="Write distribution 
snapshot to packaged corpus stats JSON", + ) + parser.add_argument( + "--stats-version", + default="corpus-2026-06", + help="Version label written into packaged corpus stats JSON", + ) + args = parser.parse_args() + + metrics = scan_corpus_metrics(scoring_mode=args.scoring) + risks = metrics.risks + for server_id, absolute_risk in risks.items(): + print(f"{server_id}: absolute_risk={absolute_risk}") + + if args.write_package_stats and risks: + stats = build_package_stats_from_metrics(metrics, version=args.stats_version) + PACKAGE_STATS_PATH.write_text(json.dumps(stats, indent=2) + "\n", encoding="utf-8") + print(f"Wrote {PACKAGE_STATS_PATH}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/mcts/analyzers/attack_chains.py b/src/mcts/analyzers/attack_chains.py index 13b6330..e9915b2 100644 --- a/src/mcts/analyzers/attack_chains.py +++ b/src/mcts/analyzers/attack_chains.py @@ -2,12 +2,13 @@ from __future__ import annotations -from collections import deque from typing import Any from mcts.analyzers.base import BaseAnalyzer from mcts.mcp.models import MCPServerInfo, MCPTool from mcts.reporting.models import Finding, Severity +from mcts.scoring.evidence_tags import tag_attack_chain_finding +from mcts.scoring.graph import bfs_path, build_paths class AttackChainAnalyzer(BaseAnalyzer): @@ -28,7 +29,7 @@ def analyze(self, server: MCPServerInfo) -> list[Finding]: exec_tools = [t for t in server.tools if _cap(t, "executes_commands")] if read_tools and exfil_tools: - path = _find_path(self.last_graph, read_tools[0].name, exfil_tools[0].name) + path = bfs_path(self.last_graph, read_tools[0].name, exfil_tools[0].name) findings.append( Finding( id="chain-read-exfil", @@ -81,7 +82,9 @@ def analyze(self, server: MCPServerInfo) -> list[Finding]: ) ) - return findings + paths = build_paths(self.last_graph, findings) + self.last_graph = {**self.last_graph, "paths": paths} + return [tag_attack_chain_finding(f) for f in findings] def 
_build_graph(self, server: MCPServerInfo) -> dict[str, Any]: nodes: dict[str, dict[str, str]] = {} @@ -135,9 +138,10 @@ def _can_chain(src: MCPTool, dst: MCPTool) -> bool: if not src.capability or not dst.capability: return False s, d = src.capability, dst.capability - return (s.reads_untrusted_input and (d.egresses_network or d.executes_commands)) or ( - s.accesses_sensitive_data and d.egresses_network - ) + return ( + s.reads_untrusted_input + and (d.egresses_network or d.executes_commands or d.accesses_sensitive_data) + ) or (s.accesses_sensitive_data and d.egresses_network) def _edge_label(src: MCPTool, dst: MCPTool) -> str: @@ -150,21 +154,3 @@ def _edge_label(src: MCPTool, dst: MCPTool) -> str: return "→ chain" -def _find_path(graph: dict[str, Any], start: str, end: str) -> list[str]: - adjacency: dict[str, list[str]] = {} - for edge in graph.get("edges", []): - adjacency.setdefault(edge["from"], []).append(edge["to"]) - - queue: deque[list[str]] = deque([[start]]) - visited = {start} - while queue: - path = queue.popleft() - node = path[-1] - if node == end: - return path - for neighbor in adjacency.get(node, []): - if neighbor in visited: - continue - visited.add(neighbor) - queue.append([*path, neighbor]) - return [start, end] diff --git a/src/mcts/analyzers/behavioral_static.py b/src/mcts/analyzers/behavioral_static.py index a067c9c..8bd7f80 100644 --- a/src/mcts/analyzers/behavioral_static.py +++ b/src/mcts/analyzers/behavioral_static.py @@ -8,6 +8,7 @@ from mcts.analyzers.base import BaseAnalyzer from mcts.mcp.models import MCPServerInfo, MCPTool from mcts.reporting.models import Finding, Severity, SourceLocation +from mcts.scoring.evidence_tags import tag_behavioral_static_finding from mcts.sast.go.sinks import detect_go_sinks from mcts.sast.go.taint import analyze_go_taint from mcts.sast.python.crossfile import expand_python_handler @@ -83,7 +84,7 @@ def analyze(self, server: MCPServerInfo) -> list[Finding]: if not snippet: continue 
findings.extend(self._analyze_tool(tool, snippet, server)) - return findings + return [tag_behavioral_static_finding(f) for f in findings] def _analyze_tool( self, diff --git a/src/mcts/analyzers/command_execution.py b/src/mcts/analyzers/command_execution.py index 79149e1..684f899 100644 --- a/src/mcts/analyzers/command_execution.py +++ b/src/mcts/analyzers/command_execution.py @@ -7,6 +7,7 @@ from mcts.analyzers.base import BaseAnalyzer from mcts.mcp.models import MCPServerInfo, MCPTool from mcts.reporting.models import Finding, Severity, SourceLocation +from mcts.scoring.evidence_tags import tag_command_execution_finding DANGEROUS_CALLS: dict[str, tuple[str, Severity]] = { "subprocess": ("subprocess invocation", Severity.CRITICAL), @@ -25,7 +26,7 @@ def analyze(self, server: MCPServerInfo) -> list[Finding]: findings: list[Finding] = [] for tool in server.tools: findings.extend(self._analyze_tool(tool, server.source_files)) - return findings + return [tag_command_execution_finding(f) for f in findings] def _analyze_tool(self, tool: MCPTool, source_files: dict[str, str]) -> list[Finding]: if not tool.source_file or tool.source_file not in source_files: diff --git a/src/mcts/analyzers/cross_server.py b/src/mcts/analyzers/cross_server.py index 0cfc51c..a0ede1c 100644 --- a/src/mcts/analyzers/cross_server.py +++ b/src/mcts/analyzers/cross_server.py @@ -8,6 +8,7 @@ from mcts.inventory.models import InventoryEntry from mcts.mcp.models import MCPServerInfo from mcts.reporting.models import Finding, Severity +from mcts.scoring.evidence_tags import tag_cross_server_finding def _similarity(a: str, b: str) -> float: @@ -87,4 +88,4 @@ def analyze_inventory(self, inventory: list[InventoryEntry]) -> list[Finding]: ) ) - return findings + return [tag_cross_server_finding(f) for f in findings] diff --git a/src/mcts/analyzers/data_leakage.py b/src/mcts/analyzers/data_leakage.py index 73bc44a..488d205 100644 --- a/src/mcts/analyzers/data_leakage.py +++ 
b/src/mcts/analyzers/data_leakage.py @@ -7,6 +7,7 @@ from mcts.analyzers.base import BaseAnalyzer from mcts.mcp.models import MCPServerInfo from mcts.reporting.models import Finding, Severity, SourceLocation +from mcts.scoring.evidence_tags import tag_data_leakage_finding SECRET_PATTERNS: list[tuple[str, re.Pattern[str], Severity]] = [ ("OpenAI API Key", re.compile(r"sk-[A-Za-z0-9]{20,}"), Severity.CRITICAL), @@ -68,7 +69,7 @@ def analyze(self, server: MCPServerInfo) -> list[Finding]: findings: list[Finding] = [] findings.extend(self._scan_metadata(server)) findings.extend(self._scan_source_files(server)) - return findings + return [tag_data_leakage_finding(f) for f in findings] def _scan_metadata(self, server: MCPServerInfo) -> list[Finding]: findings: list[Finding] = [] diff --git a/src/mcts/analyzers/jailbreak.py b/src/mcts/analyzers/jailbreak.py index 0fcdf7f..c7eaae7 100644 --- a/src/mcts/analyzers/jailbreak.py +++ b/src/mcts/analyzers/jailbreak.py @@ -8,6 +8,7 @@ from mcts.mcp.models import MCPServerInfo, MCPTool from mcts.probe.jailbreak import summarize_jailbreak_events from mcts.reporting.models import Finding, Severity +from mcts.scoring.evidence_tags import tag_jailbreak_finding class JailbreakAnalyzer(BaseAnalyzer): @@ -26,7 +27,7 @@ def analyze(self, server: MCPServerInfo) -> list[Finding]: elif score >= 5: severity = Severity.MEDIUM else: - return findings + return [tag_jailbreak_finding(f) for f in findings] findings.append( Finding( @@ -51,7 +52,7 @@ def analyze(self, server: MCPServerInfo) -> list[Finding]: }, ) ) - return findings + return [tag_jailbreak_finding(f) for f in findings] def _live_finding(self, summary: dict[str, Any]) -> Finding: accepted = int(summary["accepted_count"]) diff --git a/src/mcts/analyzers/path_validation.py b/src/mcts/analyzers/path_validation.py index 8bd2519..f5f5470 100644 --- a/src/mcts/analyzers/path_validation.py +++ b/src/mcts/analyzers/path_validation.py @@ -8,6 +8,7 @@ from mcts.analyzers.tool_classification 
import is_file_access_tool from mcts.mcp.models import MCPServerInfo from mcts.reporting.models import Finding, Severity, SourceLocation +from mcts.scoring.evidence_tags import tag_path_validation_finding CANONICALIZATION_HINTS = re.compile( r"\b(resolve|realpath|abspath|canonicalize|normpath|is_relative_to|startswith)\b", @@ -44,4 +45,4 @@ def analyze(self, server: MCPServerInfo) -> list[Finding]: evidence={"missing": "path_canonicalization"}, ) ) - return findings + return [tag_path_validation_finding(f) for f in findings] diff --git a/src/mcts/analyzers/permissions.py b/src/mcts/analyzers/permissions.py index cea151f..64c4ba7 100644 --- a/src/mcts/analyzers/permissions.py +++ b/src/mcts/analyzers/permissions.py @@ -7,6 +7,7 @@ from mcts.analyzers.base import BaseAnalyzer from mcts.mcp.models import MCPServerInfo, MCPTool from mcts.reporting.models import Finding, Severity +from mcts.scoring.evidence_tags import tag_permission_finding DESTRUCTIVE_PATTERNS = re.compile( r"\b(delete|drop|remove|destroy|wipe|purge|truncate|kill|shutdown)\b", @@ -27,7 +28,7 @@ def analyze(self, server: MCPServerInfo) -> list[Finding]: findings: list[Finding] = [] for tool in server.tools: findings.extend(self._analyze_tool(tool)) - return findings + return [tag_permission_finding(f) for f in findings] def _analyze_tool(self, tool: MCPTool) -> list[Finding]: findings: list[Finding] = [] diff --git a/src/mcts/analyzers/prompt_injection.py b/src/mcts/analyzers/prompt_injection.py index c83aea9..35925dc 100644 --- a/src/mcts/analyzers/prompt_injection.py +++ b/src/mcts/analyzers/prompt_injection.py @@ -21,6 +21,7 @@ ) from mcts.mcp.models import MCPServerInfo, MCPTool from mcts.reporting.models import Finding, Severity +from mcts.scoring.evidence_tags import tag_prompt_injection_finding INSTRUCTION_LIKE = re.compile( r"(?i)\b(ignore|disregard|forget|override|system prompt|you must|always|never reveal)\b" @@ -36,7 +37,7 @@ def analyze(self, server: MCPServerInfo) -> list[Finding]: 
findings: list[Finding] = [] for surface in scan_surfaces(server): findings.extend(self._analyze_surface(server, surface)) - return findings + return [tag_prompt_injection_finding(f) for f in findings] def _analyze_surface(self, server: MCPServerInfo, surface: ScanSurface) -> list[Finding]: findings: list[Finding] = [] diff --git a/src/mcts/analyzers/schema_surface.py b/src/mcts/analyzers/schema_surface.py index 0fafd68..f99dab3 100644 --- a/src/mcts/analyzers/schema_surface.py +++ b/src/mcts/analyzers/schema_surface.py @@ -12,6 +12,7 @@ ) from mcts.mcp.models import MCPServerInfo, MCPTool from mcts.reporting.models import Finding, Severity, SourceLocation +from mcts.scoring.evidence_tags import tag_schema_surface_finding CREDENTIAL_PARAM_NAMES = re.compile( r"(?i)^(password|secret|token|api_key|apikey|credential|auth|private_key)$" @@ -27,7 +28,7 @@ def analyze(self, server: MCPServerInfo) -> list[Finding]: findings: list[Finding] = [] for tool in server.tools: findings.extend(self._analyze_tool(tool)) - return findings + return [tag_schema_surface_finding(f) for f in findings] def _analyze_tool(self, tool: MCPTool) -> list[Finding]: findings: list[Finding] = [] diff --git a/src/mcts/analyzers/tool_abuse.py b/src/mcts/analyzers/tool_abuse.py index 642b7f7..1dc3342 100644 --- a/src/mcts/analyzers/tool_abuse.py +++ b/src/mcts/analyzers/tool_abuse.py @@ -7,6 +7,7 @@ from mcts.analyzers.tool_classification import is_file_access_tool from mcts.mcp.models import MCPServerInfo from mcts.reporting.models import Finding, Severity +from mcts.scoring.evidence_tags import tag_tool_abuse_finding class ToolAbuseAnalyzer(BaseAnalyzer): @@ -37,4 +38,4 @@ def analyze(self, server: MCPServerInfo) -> list[Finding]: }, ) ) - return findings + return [tag_tool_abuse_finding(f) for f in findings] diff --git a/src/mcts/api/app.py b/src/mcts/api/app.py index fab08d1..1061beb 100644 --- a/src/mcts/api/app.py +++ b/src/mcts/api/app.py @@ -3,7 +3,7 @@ from __future__ import annotations from 
pathlib import Path -from typing import Any +from typing import Any, Literal from fastapi import Depends, FastAPI, HTTPException, Request from pydantic import BaseModel, Field, field_validator @@ -52,6 +52,14 @@ class ScanRequest(BaseModel): runtime_events: list[dict[str, Any]] = Field(default_factory=list) fail_on_critical: bool = False min_score: int | None = Field(default=None, ge=0, le=100) + scoring_mode: Literal["legacy", "v2", "both"] = "both" + weights_profile: str = "manual_v1" + corpus_stats_path: str | None = None + min_security_score: int | None = Field(default=None, ge=0, le=100) + max_absolute_risk: int | None = Field(default=None, ge=0) + max_risk_level: Literal["low", "medium", "high", "critical"] | None = None + min_category_score_v2: dict[str, int] = Field(default_factory=dict) + assets_path: str | None = None understand_live_risk: bool = False fanout_offset: int = Field(default=0, ge=0) fanout_limit: int | None = Field(default=None, ge=1) @@ -65,6 +73,13 @@ def _limit_runtime_events(cls, value: list[dict[str, Any]]) -> list[dict[str, An return value +class ScanResponse(ScanReport): + """REST scan payload with echoed scoring mode and gate violations.""" + + scoring_mode: str = "both" + gate_violations: list[str] = Field(default_factory=list) + + class ToolScanRequest(ScanRequest): tool_name: str @@ -123,6 +138,14 @@ def _build_config(req: ScanRequest, *, request: Request | None = None) -> ScanCo runtime_events=req.runtime_events, fail_on_critical=req.fail_on_critical, min_score=req.min_score, + scoring_mode=req.scoring_mode, + weights_profile=req.weights_profile, + corpus_stats_path=Path(req.corpus_stats_path) if req.corpus_stats_path else None, + min_security_score=req.min_security_score, + max_absolute_risk=req.max_absolute_risk, + max_risk_level=req.max_risk_level, + min_category_score_v2=req.min_category_score_v2, + assets_path=Path(req.assets_path) if req.assets_path else None, oauth_client_id=req.oauth_client_id, 
oauth_client_secret=req.oauth_client_secret, oauth_token_url=req.oauth_token_url, @@ -149,7 +172,13 @@ def _scan_server( report: ScanReport = Scanner(config).analyze_server(server) except Exception as exc: raise HTTPException(status_code=400, detail=str(exc)) from exc - return report.model_dump() + from mcts.governance.scan_gates import evaluate_scan_gate_violations + + return ScanResponse( + **report.model_dump(), + scoring_mode=config.scoring_mode, + gate_violations=evaluate_scan_gate_violations(report, config), + ).model_dump() async def _discover_async(req: ScanRequest, *, request: Request) -> MCPServerInfo: diff --git a/src/mcts/cli/machine_wide.py b/src/mcts/cli/machine_wide.py index abe04f8..07645cd 100644 --- a/src/mcts/cli/machine_wide.py +++ b/src/mcts/cli/machine_wide.py @@ -31,9 +31,13 @@ def run_machine_wide_cli( for row in summary.results: label = f"[{row.entry.client}] {row.entry.server_name}" if row.report is not None: - console.print( - f" {label} — score {row.report.score.overall}/100, {len(row.report.findings)} finding(s)" - ) + line = f" {label} — score {row.report.score.overall}/100" + if row.report.score_v2 is not None: + line += f", absolute_risk {row.report.score_v2.absolute_risk}" + if row.report.score_v2.security_score is not None: + line += f", security_score {row.report.score_v2.security_score}/100" + line += f", {len(row.report.findings)} finding(s)" + console.print(line) elif row.error: console.print(f" {label} — [dim]skipped: {row.error}[/dim]") diff --git a/src/mcts/cli/main.py b/src/mcts/cli/main.py index 9275702..c1ced07 100644 --- a/src/mcts/cli/main.py +++ b/src/mcts/cli/main.py @@ -20,7 +20,11 @@ resolve_report_input_path, ) from mcts.output.artifacts import persist_scan_artifacts -from mcts.report.data import category_gate_failures, parse_category_gates +from mcts.report.data import ( + category_gate_failures, + parse_category_gates, + parse_min_category_score_v2, +) from mcts.reporting.sarif import write_sarif_report from 
mcts.ui.progress import print_scan_command, run_with_progress from mcts.ui.report_renderer import ReportRenderer @@ -186,25 +190,56 @@ def _print_min_score_gate_failure(report, min_score: int) -> None: f"[dim]Lowest bucket ({lowest_label}) is below the overall minimum; " "review findings in that area before changing MCP tool code.[/dim]" ) + if report.score_v2 is not None: + console.print( + f"[dim]v2 absolute_risk={report.score_v2.absolute_risk}, " + f"risk_level={report.score_v2.risk_level}[/dim]" + ) + if report.score_v2.legacy_overall is not None: + console.print( + f"[dim]Legacy overall (includes chain meta-findings): " + f"{report.score_v2.legacy_overall}[/dim]" + ) + + +_LEVEL_ORDER = {"low": 0, "medium": 1, "high": 2, "critical": 3} + + +def _any_v2_gate(config: ScanConfig) -> bool: + from mcts.governance.scan_gates import _any_v2_gate as gate_any_v2 + + return gate_any_v2(config) + + +def _level_exceeds(actual: str, maximum: str) -> bool: + return _LEVEL_ORDER.get(actual, 0) > _LEVEL_ORDER.get(maximum, 0) def _check_gates(report, config: ScanConfig) -> None: - if config.fail_on_critical and report.summary.critical > 0: - raise typer.Exit(code=1) + from mcts.governance.scan_gates import evaluate_scan_gate_violations + if config.min_score is not None and report.score.overall < config.min_score: _print_min_score_gate_failure(report, config.min_score) - raise typer.Exit(code=1) - if config.max_critical is not None and report.summary.critical > config.max_critical: - console.print( - f"[red]Critical findings ({report.summary.critical}) exceed maximum ({config.max_critical})[/red]" - ) - raise typer.Exit(code=1) - category_failures = category_gate_failures(report.findings, config.fail_on_category) + + violations = evaluate_scan_gate_violations(report, config) + if not violations: + return + + category_failures = [item for item in violations if "risk score" in item] + other_failures = [ + item + for item in violations + if item not in category_failures and not 
item.startswith("legacy overall") + ] if category_failures: console.print("[red]Category risk thresholds exceeded:[/red]") for failure in category_failures: console.print(f" [red]•[/red] {failure}") - raise typer.Exit(code=1) + if other_failures: + console.print("[red]CI gate failed:[/red]") + for failure in other_failures: + console.print(f"[red]{failure}[/red]") + raise typer.Exit(code=1) @app.callback() @@ -289,9 +324,9 @@ def scan( typer.Option( "--fail-on-category", help=( - "Exit 1 when category risk score meets or exceeds threshold (inclusive). " - "e.g. permissions:0 fails when score is 0 or more. " - "Use permissions:1 to allow zero-point categories. Repeatable." + "Exit 1 when legacy category risk score meets or exceeds threshold (inclusive). " + "Legacy v1 tiles only — not category_scores_v2. " + "e.g. permissions:0 fails when score is 0 or more. Repeatable." ), ), ] = None, @@ -568,6 +603,65 @@ def scan( help="When --surfaces is a subset, run only analyzers relevant to those surfaces", ), ] = True, + scoring: Annotated[ + str, + typer.Option( + "--scoring", + help="Scoring mode: legacy, v2, or both (default: both)", + case_sensitive=False, + ), + ] = "both", + no_attack_chains: Annotated[ + bool, + typer.Option( + "--no-attack-chains", + help="Disable chain multiplier (chain_factor=1.0); under v2/both the analyzer still runs", + ), + ] = False, + min_security_score: Annotated[ + int | None, + typer.Option( + "--min-security-score", + help="Exit 1 when v2 security_score is below this (requires --scoring v2 or both)", + ), + ] = None, + max_absolute_risk: Annotated[ + int | None, + typer.Option( + "--max-absolute-risk", + help="Exit 1 when v2 absolute_risk exceeds this (requires --scoring v2 or both)", + ), + ] = None, + max_risk_level: Annotated[ + str | None, + typer.Option( + "--max-risk-level", + help="Exit 1 when v2 risk_level exceeds threshold (low|medium|high|critical)", + case_sensitive=False, + ), + ] = None, + min_category_score_v2: Annotated[ + 
list[str] | None, + typer.Option( + "--min-category-score-v2", + help=( + "Exit 1 when v2 OWASP category health score is below minimum (100=good). " + "e.g. injection:80. Requires --scoring v2 or both." + ), + ), + ] = None, + weights_profile: Annotated[ + str, + typer.Option("--weights", help="Scoring weights profile (default: manual_v1)"), + ] = "manual_v1", + corpus_stats_path: Annotated[ + Path | None, + typer.Option("--corpus-stats-path", help="Override packaged v2 corpus statistics JSON"), + ] = None, + assets_path: Annotated[ + Path | None, + typer.Option("--assets-path", help="YAML asset value overrides for v2 scoring (.mcts/assets.yaml)"), + ] = None, ) -> None: """Run a full security scan against an MCP server.""" import json @@ -669,6 +763,12 @@ def scan( console.print(f"[red]Error:[/red] {exc}") raise typer.Exit(code=2) from exc + try: + category_gates_v2 = parse_min_category_score_v2(min_category_score_v2) + except ValueError as exc: + console.print(f"[red]Error:[/red] {exc}") + raise typer.Exit(code=2) from exc + output_format = format.lower() if output_format not in ("json", "sarif", "raw"): console.print(f"[red]Error:[/red] Unknown format {format!r}. 
Use json, sarif, or raw.") @@ -758,6 +858,15 @@ def scan( instruction_files=instruction_file or [], skills_dirs=skills_dir or [], surface_scoped_analyzers=surface_scoped, + scoring_mode=scoring.lower(), + enable_attack_chains=not no_attack_chains, + min_security_score=min_security_score, + max_absolute_risk=max_absolute_risk, + max_risk_level=max_risk_level.lower() if max_risk_level else None, + min_category_score_v2=category_gates_v2, + weights_profile=weights_profile, + corpus_stats_path=corpus_stats_path, + assets_path=assets_path, ) try: @@ -872,9 +981,9 @@ def _execute_scan(): raw_path = resolve_output_path(output, "scan-report.raw.json") _write_report(report, raw_path, "raw", target=str(display_target), remote_url=url) renderer.render_saved_notice(str(raw_path)) - renderer.render_saved_notice(str(json_path)) - renderer.render_saved_notice(str(html_path)) - renderer.render_saved_notice(str(sarif_path)) + renderer.render_saved_notice(str(json_path), report) + renderer.render_saved_notice(str(html_path), report) + renderer.render_saved_notice(str(sarif_path), report) console.print(f"[dim] mcts report {json_path}[/dim] [dim](or open {html_path})[/dim]") _print_discovery_warnings(report.server, stderr_file) @@ -888,6 +997,9 @@ def _execute_scan(): critical=report.summary.critical, high=report.summary.high, servers=[str(display_target)], + absolute_risk=report.score_v2.absolute_risk if report.score_v2 else None, + security_score=report.score_v2.security_score if report.score_v2 else None, + risk_level=report.score_v2.risk_level if report.score_v2 else None, ) if violations: console.print("[red]Governance policy violations:[/red]") diff --git a/src/mcts/core/config.py b/src/mcts/core/config.py index 6afb223..ba34f9e 100644 --- a/src/mcts/core/config.py +++ b/src/mcts/core/config.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Any -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator DEFAULT_EXCLUDE_DIRS = ( 
".git", @@ -123,3 +123,19 @@ class ScanConfig(BaseModel): instruction_files: list[Path] = Field(default_factory=list) skills_dirs: list[Path] = Field(default_factory=list) surface_scoped_analyzers: bool = True + scoring_mode: str = "both" + weights_profile: str = "manual_v1" + corpus_stats_path: Path | None = None + assets_path: Path | None = None + min_security_score: int | None = Field(default=None, ge=0, le=100) + max_absolute_risk: int | None = Field(default=None, ge=0) + max_risk_level: str | None = None + min_category_score_v2: dict[str, int] = Field(default_factory=dict) + + @field_validator("scoring_mode") + @classmethod + def _validate_scoring_mode(cls, value: str) -> str: + normalized = value.lower() + if normalized not in {"legacy", "v2", "both"}: + raise ValueError("scoring_mode must be legacy, v2, or both") + return normalized diff --git a/src/mcts/core/scanner.py b/src/mcts/core/scanner.py index d34b2f4..794b2eb 100644 --- a/src/mcts/core/scanner.py +++ b/src/mcts/core/scanner.py @@ -47,14 +47,18 @@ from mcts.mcp.models import MCPServerInfo, SurfaceScanOptions from mcts.probe.protocol_checks import probe_protocol_security from mcts.report.scan_meta import ( + append_chain_scan_notes, build_scan_notes, infer_scan_scope, is_config_static_scan, tool_discovery_notice_text, ) from mcts.reporting.models import Finding, ScanReport, ScanSummary +from mcts.scoring.context import build_scoring_context from mcts.scoring.engine import RiskScoringEngine +from mcts.scoring.engine_v2 import RiskScoringEngineV2 from mcts.scoring.partitions import score_partitioned +from mcts.scoring.pipeline_trace import record as _trace_pipeline from mcts.taxonomy.mapper import enrich_findings @@ -205,19 +209,51 @@ def analyze_server(self, server_info: MCPServerInfo) -> ScanReport: findings = enrich_findings(findings) findings.extend(self.compliance.check(findings, tools_discovered=len(server_info.tools))) analyzers_executed.append("compliance") - score = 
self.scoring.score(findings) - summary = ScanSummary.from_findings(findings) + if "attack_chains" in analyzers_executed: + raw_graph = self.attack_chains.last_graph + else: + raw_graph = {} + _trace_pipeline("graph") + + scan_scope = infer_scan_scope(self.config) + from mcts.scoring.evidence_emit import enrich_scoring_evidence + + findings = enrich_scoring_evidence( + findings, attack_graph=raw_graph, scan_scope=scan_scope + ) + _trace_pipeline("scope") + scan_notes = build_scan_notes(self.config) + + score = self.scoring.score(findings) + _trace_pipeline("v1") if not RiskScoringEngine.verify(findings, score): raise RuntimeError("Risk score does not match findings — scoring regression") - attack_graph = self.attack_chains.last_graph if self.config.enable_attack_chains else {} + score_v2 = None + report_attack_graph = raw_graph + if self.config.scoring_mode in {"v2", "both"}: + chain_factor_mode = "paths_v1" if self.config.enable_attack_chains else "disabled" + ctx = build_scoring_context( + findings=findings, + server=server_info, + attack_graph=raw_graph, + scan_scope=scan_scope, + config=self.config, + chain_factor_mode=chain_factor_mode, + ) + score_v2 = RiskScoringEngineV2().score(ctx, legacy_overall=score.overall) + if not RiskScoringEngineV2.verify(ctx, score_v2): + raise RuntimeError( + "Risk score v2 does not match context — scoring regression" + ) + report_attack_graph = ctx.attack_graph + _trace_pipeline("v2") + + summary = ScanSummary.from_findings(findings) if self.config.save_baseline_path is not None: save_baseline(server_info, self.config.save_baseline_path, target=str(self.config.target)) - - scan_scope = infer_scan_scope(self.config) - scan_notes = build_scan_notes(self.config) if server_info.agent_skills or server_info.instruction_sources: scan_notes.append( "Instruction discovery: found " @@ -226,7 +262,7 @@ def analyze_server(self, server_info: MCPServerInfo) -> ScanReport: f"{len(server_info.instruction_sources)} system instruction file(s) in 
repository markdown." ) - return ScanReport( + report = ScanReport( version=__version__, target=str(self.config.target), scanned_at=datetime.now(UTC), @@ -234,13 +270,17 @@ def analyze_server(self, server_info: MCPServerInfo) -> ScanReport: findings=findings, summary=summary, score=score, - attack_graph=attack_graph, + score_v2=score_v2, + scoring_version=self.config.scoring_mode, + attack_graph=report_attack_graph, scan_scope=scan_scope, scan_notes=scan_notes, score_breakdown=score_partitioned(findings), tool_discovery_notice=tool_discovery_notice_text(server_info, scan_scope=scan_scope), analyzers_executed=analyzers_executed, ) + append_chain_scan_notes(report.scan_notes, report, self.config) + return report def _attach_surface_options(self, server_info: MCPServerInfo) -> MCPServerInfo: cfg = self.config @@ -266,6 +306,8 @@ def _is_enabled(self, analyzer: object) -> bool: if name == "JailbreakAnalyzer": return self.config.enable_jailbreak if name == "AttackChainAnalyzer": + if self.config.scoring_mode in {"v2", "both"}: + return True return self.config.enable_attack_chains if name == "MetadataDiffAnalyzer": return self.config.baseline_path is not None @@ -276,6 +318,11 @@ def _is_enabled(self, analyzer: object) -> bool: return True def _analyzer_allowed(self, analyzer: object) -> bool: + if ( + self.config.scoring_mode in {"v2", "both"} + and getattr(analyzer, "name", None) == "attack_chains" + ): + return True if self.config.analyzers: name = getattr(analyzer, "name", type(analyzer).__name__) if name not in self.config.analyzers and type(analyzer).__name__ not in self.config.analyzers: diff --git a/src/mcts/discovery/static_meta.py b/src/mcts/discovery/static_meta.py index 6e994b7..ca6cb00 100644 --- a/src/mcts/discovery/static_meta.py +++ b/src/mcts/discovery/static_meta.py @@ -9,6 +9,7 @@ from mcts.discovery.language_detect import RUST_MCP_INDICATORS, detect_repo_languages from mcts.mcp.models import MCPServerInfo from mcts.reporting.models import Finding, 
Severity +from mcts.scoring.evidence_tags import tag_static_discovery_finding def static_discovery_meta_findings(server: MCPServerInfo, config: ScanConfig) -> list[Finding]: @@ -28,6 +29,7 @@ def static_discovery_meta_findings(server: MCPServerInfo, config: ScanConfig) -> if rust_sources and ("rust" in langs or "rs" in langs): return [ + tag_static_discovery_finding( Finding( id="static-discovery-rust-incomplete", analyzer="static_discovery", @@ -48,11 +50,12 @@ def static_discovery_meta_findings(server: MCPServerInfo, config: ScanConfig) -> "detected_languages": sorted(detected), "discovery_mode": server.discovery_mode, }, - ) + )) ] if detected & langs: return [ + tag_static_discovery_finding( Finding( id="static-discovery-incomplete", analyzer="static_discovery", @@ -72,7 +75,7 @@ def static_discovery_meta_findings(server: MCPServerInfo, config: ScanConfig) -> "detected_languages": sorted(detected), "discovery_mode": server.discovery_mode, }, - ) + )) ] return [] diff --git a/src/mcts/governance/policy.py b/src/mcts/governance/policy.py index 7eb8c76..c091857 100644 --- a/src/mcts/governance/policy.py +++ b/src/mcts/governance/policy.py @@ -11,6 +11,9 @@ class GovernancePolicy(BaseModel): min_score: int | None = Field(default=None, ge=0, le=100) + min_security_score: int | None = Field(default=None, ge=0, le=100) + max_absolute_risk: int | None = Field(default=None, ge=0) + max_risk_level: str | None = Field(default=None) max_critical: int | None = Field(default=None, ge=0) max_high: int | None = Field(default=None, ge=0) allowed_servers: list[str] = Field(default_factory=list) @@ -40,10 +43,41 @@ def evaluate_policy( critical: int, high: int, servers: list[str], + absolute_risk: int | None = None, + security_score: int | None = None, + risk_level: str | None = None, ) -> list[str]: + _LEVEL_ORDER = {"low": 0, "medium": 1, "high": 2, "critical": 3} violations: list[str] = [] if policy.min_score is not None and score < policy.min_score: - 
violations.append(f"score {score} below minimum {policy.min_score}") + violations.append(f"legacy score {score} below minimum {policy.min_score}") + if policy.min_security_score is not None: + if security_score is None: + violations.append( + f"min_security_score {policy.min_security_score} requires v2 scoring (use --scoring v2 or both)" + ) + elif security_score < policy.min_security_score: + violations.append( + f"security score {security_score} below minimum {policy.min_security_score}" + ) + if policy.max_absolute_risk is not None: + if absolute_risk is None: + violations.append( + f"max_absolute_risk {policy.max_absolute_risk} requires v2 scoring (use --scoring v2 or both)" + ) + elif absolute_risk > policy.max_absolute_risk: + violations.append( + f"absolute risk {absolute_risk} exceeds maximum {policy.max_absolute_risk}" + ) + if policy.max_risk_level is not None: + if risk_level is None: + violations.append( + f"max_risk_level {policy.max_risk_level!r} requires v2 scoring (use --scoring v2 or both)" + ) + elif _LEVEL_ORDER.get(risk_level, 0) > _LEVEL_ORDER.get(policy.max_risk_level, 0): + violations.append( + f"risk level {risk_level!r} exceeds maximum {policy.max_risk_level!r}" + ) if policy.max_critical is not None and critical > policy.max_critical: violations.append(f"critical findings {critical} exceed max {policy.max_critical}") if policy.max_high is not None and high > policy.max_high: diff --git a/src/mcts/governance/scan_gates.py b/src/mcts/governance/scan_gates.py new file mode 100644 index 0000000..e5e447c --- /dev/null +++ b/src/mcts/governance/scan_gates.py @@ -0,0 +1,78 @@ +"""Evaluate CI/policy scan gates without exiting the process.""" + +from __future__ import annotations + +from mcts.core.config import ScanConfig +from mcts.report.data import category_gate_failures, category_scores_v2_gate_failures +from mcts.reporting.models import ScanReport + +_LEVEL_ORDER = {"low": 0, "medium": 1, "high": 2, "critical": 3} + + +def 
_level_exceeds(actual: str, maximum: str) -> bool: + return _LEVEL_ORDER.get(actual, 0) > _LEVEL_ORDER.get(maximum, 0) + + +def _any_v2_gate(config: ScanConfig) -> bool: + return any( + value is not None + for value in ( + config.min_security_score, + config.max_absolute_risk, + config.max_risk_level, + ) + ) or bool(config.min_category_score_v2) + + +def evaluate_scan_gate_violations(report: ScanReport, config: ScanConfig) -> list[str]: + """Return human-readable gate violations for CLI, API, and GitHub Action consumers.""" + violations: list[str] = [] + + if config.fail_on_critical and report.summary.critical > 0: + violations.append(f"critical findings present ({report.summary.critical})") + + if config.min_score is not None and report.score.overall < config.min_score: + violations.append( + f"legacy overall score {report.score.overall}/100 below minimum {config.min_score}" + ) + + if _any_v2_gate(config): + if report.score_v2 is None: + violations.append("v2 gate requires scoring_mode v2 or both") + elif report.score_v2 is not None: + if config.min_security_score is not None: + if report.score_v2.security_score is None: + violations.append("min_security_score requires packaged corpus stats") + elif report.score_v2.security_score < config.min_security_score: + violations.append( + f"security_score {report.score_v2.security_score} " + f"below minimum {config.min_security_score}" + ) + if ( + config.max_absolute_risk is not None + and report.score_v2.absolute_risk > config.max_absolute_risk + ): + violations.append( + f"absolute_risk {report.score_v2.absolute_risk} " + f"exceeds maximum {config.max_absolute_risk}" + ) + if ( + config.max_risk_level is not None + and _level_exceeds(report.score_v2.risk_level, config.max_risk_level) + ): + violations.append( + f"risk_level {report.score_v2.risk_level} " + f"exceeds maximum {config.max_risk_level}" + ) + + if config.max_critical is not None and report.summary.critical > config.max_critical: + violations.append( + 
f"critical findings ({report.summary.critical}) exceed maximum ({config.max_critical})" + ) + + violations.extend(category_gate_failures(report.findings, config.fail_on_category)) + if config.min_category_score_v2 and report.score_v2 is not None: + violations.extend( + category_scores_v2_gate_failures(report.findings, config.min_category_score_v2) + ) + return violations diff --git a/src/mcts/inventory/scan_all.py b/src/mcts/inventory/scan_all.py index 38b41cf..ef24744 100644 --- a/src/mcts/inventory/scan_all.py +++ b/src/mcts/inventory/scan_all.py @@ -27,14 +27,17 @@ def run_inventory_scan_all(base_config: ScanConfig) -> tuple[InventoryReport, li except Exception as exc: # noqa: BLE001 rows.append(_row(entry, error=str(exc))) continue - rows.append( - _row( - entry, - report=report, - score=report.score.overall, - findings=len(report.findings), - ) - ) + row_payload: dict = { + "score": report.score.overall, + "findings": len(report.findings), + "scoring_version": report.scoring_version, + "report": report.model_dump(mode="json"), + } + if report.score_v2 is not None: + row_payload["absolute_risk"] = report.score_v2.absolute_risk + row_payload["security_score"] = report.score_v2.security_score + row_payload["risk_level"] = report.score_v2.risk_level + rows.append(_row(entry, **row_payload)) return inventory, rows diff --git a/src/mcts/mcp_server/server.py b/src/mcts/mcp_server/server.py index f333d3a..702b646 100644 --- a/src/mcts/mcp_server/server.py +++ b/src/mcts/mcp_server/server.py @@ -71,13 +71,26 @@ def compare_baselines(baseline_report_json: str, current_report_json: str) -> st """Compare two scan reports and summarize score and finding deltas.""" baseline = _report_summary(json.loads(baseline_report_json)) current = _report_summary(json.loads(current_report_json)) - delta = { + delta: dict[str, Any] = { "baseline": baseline, "current": current, "score_delta": current["overall_score"] - baseline["overall_score"], "finding_delta": current["finding_count"] - 
baseline["finding_count"], "new_findings": _new_finding_ids(baseline, current), } + if baseline.get("absolute_risk") is not None and current.get("absolute_risk") is not None: + delta["absolute_risk_delta"] = current["absolute_risk"] - baseline["absolute_risk"] + if baseline.get("security_score") is not None and current.get("security_score") is not None: + delta["security_score_delta"] = current["security_score"] - baseline["security_score"] + if baseline.get("scoring_version") or current.get("scoring_version"): + delta["scoring_mode_note"] = ( + "Legacy overall_score and v2 absolute_risk use different scales — compare like with like." + ) + chain_delta = (current.get("critical") or 0) - (baseline.get("critical") or 0) + if chain_delta and delta.get("finding_delta", 0) != chain_delta: + delta["chain_meta_note"] = ( + "Finding deltas may include attack_chains meta-rows excluded from v2 absolute_risk." + ) return json.dumps(delta, indent=2) @@ -103,14 +116,24 @@ def create_server(): def _report_summary(payload: dict[str, Any]) -> dict[str, Any]: score = payload.get("score") or {} + score_v2 = payload.get("score_v2") or {} findings = payload.get("findings") or [] - return { + summary: dict[str, Any] = { "overall_score": int(score.get("overall") or 0), "finding_count": len(findings), "finding_ids": sorted(str(row.get("id")) for row in findings if row.get("id")), "critical": int((payload.get("summary") or {}).get("critical") or 0), "high": int((payload.get("summary") or {}).get("high") or 0), + "scoring_version": payload.get("scoring_version") or "legacy", } + if score_v2: + if score_v2.get("absolute_risk") is not None: + summary["absolute_risk"] = int(score_v2["absolute_risk"]) + if score_v2.get("security_score") is not None: + summary["security_score"] = int(score_v2["security_score"]) + if score_v2.get("risk_level"): + summary["risk_level"] = str(score_v2["risk_level"]) + return summary def _new_finding_ids(baseline: dict[str, Any], current: dict[str, Any]) -> 
list[str]: diff --git a/src/mcts/output/artifacts.py b/src/mcts/output/artifacts.py index 87868b8..0634754 100644 --- a/src/mcts/output/artifacts.py +++ b/src/mcts/output/artifacts.py @@ -25,8 +25,13 @@ def _report_with_scan_history(report: ScanReport) -> ScanReport: "date": scanned.strftime("%b %d"), "score": report.score.overall, "scanned_at": scanned.isoformat(), + "scoring_version": report.scoring_version, } ] + if report.score_v2 is not None: + points[0]["absolute_risk"] = report.score_v2.absolute_risk + if report.score_v2.security_score is not None: + points[0]["security_score"] = report.score_v2.security_score return report.model_copy(update={"scan_history": points}) diff --git a/src/mcts/output/history.py b/src/mcts/output/history.py index 76f6e29..17ea936 100644 --- a/src/mcts/output/history.py +++ b/src/mcts/output/history.py @@ -78,12 +78,17 @@ def record_scan_run(report: ScanReport, root: Path | None = None) -> None: store = _load_store(root) runs: list[dict[str, Any]] = store["runs"] key = normalize_target(report.target) - entry = { + entry: dict[str, Any] = { "scanned_at": report.scanned_at.astimezone(UTC).isoformat(), "target": key, + "scoring_version": report.scoring_version, "score": report.score.overall, "findings_total": report.summary.total, } + if report.score_v2 is not None: + entry["absolute_risk"] = report.score_v2.absolute_risk + entry["security_score"] = report.score_v2.security_score + entry["risk_level"] = report.score_v2.risk_level if runs and runs[-1].get("scanned_at") == entry["scanned_at"] and runs[-1].get("target") == key: runs[-1] = entry else: @@ -122,13 +127,19 @@ def trend_points_for_target(target: str, root: Path | None = None) -> list[dict[ scanned_at = datetime.fromisoformat(str(raw)) if scanned_at.tzinfo is None: scanned_at = scanned_at.replace(tzinfo=UTC) - points.append( - { - "date": _trend_label(scanned_at, day_counts), - "score": int(row.get("score", 0)), - "scanned_at": scanned_at.isoformat(), - } - ) + point: 
dict[str, Any] = { + "date": _trend_label(scanned_at, day_counts), + "score": int(row.get("score", 0)), + "scanned_at": scanned_at.isoformat(), + "scoring_version": row.get("scoring_version", "legacy"), + } + if "absolute_risk" in row: + point["absolute_risk"] = int(row["absolute_risk"]) + if row.get("security_score") is not None: + point["security_score"] = int(row["security_score"]) + if row.get("risk_level"): + point["risk_level"] = str(row["risk_level"]) + points.append(point) return points diff --git a/src/mcts/pentest/models.py b/src/mcts/pentest/models.py index 08e93ff..b7e04cc 100644 --- a/src/mcts/pentest/models.py +++ b/src/mcts/pentest/models.py @@ -16,6 +16,7 @@ class PentestReport(BaseModel): target: str verdict: str score: int + absolute_risk: int | None = None phases: list[PentestPhase] = Field(default_factory=list) attack_paths: list[dict] = Field(default_factory=list) top_findings: list[dict] = Field(default_factory=list) diff --git a/src/mcts/pentest/runner.py b/src/mcts/pentest/runner.py index c9cf7f2..f0814e5 100644 --- a/src/mcts/pentest/runner.py +++ b/src/mcts/pentest/runner.py @@ -77,6 +77,7 @@ def run_pentest(config: ScanConfig, *, run_fuzz: bool = True) -> PentestReport: target=str(config.target), verdict=verdict, score=static_report.score.overall, + absolute_risk=static_report.score_v2.absolute_risk if static_report.score_v2 else None, phases=phases, attack_paths=attack_paths[:20], top_findings=[row.model_dump(mode="json") for row in combined[:15]], diff --git a/src/mcts/probe/discovery_meta.py b/src/mcts/probe/discovery_meta.py index e12636b..fd475ed 100644 --- a/src/mcts/probe/discovery_meta.py +++ b/src/mcts/probe/discovery_meta.py @@ -4,6 +4,7 @@ from mcts.mcp.models import MCPServerInfo from mcts.reporting.models import Finding, Severity +from mcts.scoring.evidence_tags import tag_live_discovery_finding def list_failure_warning(operation: str, exc: Exception, stderr_file: str | None) -> str: @@ -42,6 +43,7 @@ def 
discovery_meta_findings(server: MCPServerInfo) -> list[Finding]: ) return [ + tag_live_discovery_finding( Finding( id="live-discovery-incomplete", analyzer="live_discovery", @@ -61,7 +63,7 @@ def discovery_meta_findings(server: MCPServerInfo) -> list[Finding]: "initialize_succeeded": server.initialize_succeeded, }, confidence=1.0, - ) + )) ] diff --git a/src/mcts/report/assets/dashboard.js b/src/mcts/report/assets/dashboard.js index d156baa..3705018 100644 --- a/src/mcts/report/assets/dashboard.js +++ b/src/mcts/report/assets/dashboard.js @@ -79,12 +79,181 @@ return `${value} / 100 pts`; } + const V2_DIMENSION_LABELS = { + exploitability: "Exploitability", + reachability: "Reachability", + exposure: "Exposure", + blast_radius: "Blast radius", + business_impact: "Business impact", + asset_value: "Asset value", + attack_preconditions: "Preconditions", + threat_maturity: "Threat maturity", + }; + + function fillScoreV2() { + const v2 = DATA.score_v2; + const section = document.getElementById("v2-score-section"); + if (!section || !v2) return; + section.hidden = false; + + const absEl = document.getElementById("v2-absolute-risk"); + const pill = document.getElementById("v2-risk-pill"); + const rangeEl = document.getElementById("v2-risk-range"); + const secEl = document.getElementById("v2-security-score"); + const confEl = document.getElementById("v2-confidence"); + const pctEl = document.getElementById("v2-percentile"); + const legacyNote = document.getElementById("v2-legacy-note"); + const titleEl = document.getElementById("score-card-title"); + + if (absEl) absEl.textContent = String(v2.absolute_risk); + if (pill) { + pill.textContent = `${String(v2.risk_level || "low").toUpperCase()} RISK`; + pill.className = `risk-pill ${v2.risk_level || "low"}`; + } + if (rangeEl && Array.isArray(v2.risk_range)) { + const rangeConf = v2.risk_range_confidence != null ? 
String(v2.risk_range_confidence) : "—"; + rangeEl.textContent = `Estimated range ${v2.risk_range[0]}–${v2.risk_range[1]} (confidence ${rangeConf})`; + } + if (secEl) { + secEl.textContent = v2.security_score != null ? `${v2.security_score} / 100 pts` : "—"; + } + if (confEl) { + confEl.textContent = v2.confidence_score != null ? `${v2.confidence_score}%` : "—"; + } + if (pctEl) { + pctEl.textContent = v2.risk_percentile != null ? `${v2.risk_percentile}th percentile` : "—"; + } + if (legacyNote) { + legacyNote.hidden = !(DATA.scoring_version === "both" || v2.legacy_overall != null); + } + if (titleEl && (DATA.scoring_version === "both" || v2.legacy_overall != null)) { + titleEl.textContent = "Legacy Security Index (deprecated)"; + } + + const metricsRow = document.querySelector(".metrics-primary-row"); + if (metricsRow && metricsRow.parentNode && section) { + metricsRow.parentNode.insertBefore(section, metricsRow); + } + const legacyCard = document.getElementById("score-card"); + if (legacyCard && DATA.scoring_version === "both") { + legacyCard.classList.add("legacy-score-secondary"); + } + + fillV2Contributors(v2.top_contributors || []); + fillV2Categories(DATA.category_scores_v2 || []); + initV2DimensionRadar(v2.dimension_scores || {}); + } + + function fillV2Categories(categories) { + const list = document.getElementById("v2-category-list"); + const card = document.getElementById("v2-categories-card"); + if (!list || !card) return; + if (!categories.length) { + card.hidden = true; + return; + } + card.hidden = false; + list.innerHTML = categories + .map((c) => { + const pct = Math.max(0, Math.min(100, Number(c.score) || 0)); + const barColor = pct >= 80 ? COLORS.low : pct >= 50 ? COLORS.medium : COLORS.critical; + return `
  • +
    + ${escapeHtml(c.label)} + ${escapeHtml(c.display)} +
    +
    +
  • `; + }) + .join(""); + } + + function fillV2Contributors(contributors) { + const tbody = document.getElementById("v2-contributors-body"); + const card = document.getElementById("v2-contributors-card"); + if (!tbody || !card) return; + if (!contributors.length) { + card.hidden = true; + return; + } + card.hidden = false; + const findingById = Object.fromEntries((DATA.findings || []).map((f) => [f.id, f])); + tbody.innerHTML = contributors + .map((row) => { + const finding = row.finding_id ? findingById[row.finding_id] : null; + const title = finding ? finding.title : row.type === "attack_chain" ? "Attack path" : row.finding_id || "—"; + const tool = finding ? finding.tool : row.nodes ? row.nodes.join(" → ") : "—"; + const factors = row.factors + ? Object.entries(row.factors) + .filter(([, v]) => Number(v) > 0) + .map(([k, v]) => `${k}: ${v}`) + .join(", ") + : row.hop_count != null + ? `hop_count: ${row.hop_count}` + : "—"; + return ` + ${escapeHtml(title)} + ${escapeHtml(tool || "—")} + ${escapeHtml(String(row.risk_contribution ?? 
"—"))} + ${escapeHtml(factors)} + `; + }) + .join(""); + } + + function initV2DimensionRadar(dimensions) { + const canvas = document.getElementById("v2-dimension-radar"); + if (!canvas || typeof Chart === "undefined") return; + const keys = Object.keys(V2_DIMENSION_LABELS).filter((k) => k in dimensions); + if (!keys.length) return; + const labels = keys.map((k) => V2_DIMENSION_LABELS[k]); + const values = keys.map((k) => Number(dimensions[k]) || 0); + new Chart(canvas, { + type: "radar", + data: { + labels, + datasets: [ + { + label: "Factor load", + data: values, + borderColor: COLORS.high, + backgroundColor: "rgba(249,115,22,0.15)", + borderWidth: 2, + pointRadius: 3, + }, + ], + }, + options: { + responsive: true, + maintainAspectRatio: false, + scales: { + r: { + beginAtZero: true, + max: 100, + ticks: { display: false, stepSize: 25 }, + grid: { color: COLORS.grid }, + angleLines: { color: COLORS.grid }, + pointLabels: { color: COLORS.text, font: { size: 10 } }, + }, + }, + plugins: { legend: { display: false } }, + }, + }); + } + function fillScoreBreakdown() { const section = document.getElementById("score-breakdown-section"); const row = document.getElementById("score-breakdown-row"); const b = DATA.score && DATA.score.breakdown; if (!section || !row || !b) return; section.hidden = false; + if (DATA.score_v2) { + const intro = section.querySelector(".metrics-section-intro"); + if (intro) { + intro.textContent += + " Partition scores use the legacy v1 formula and may shift when attack chains run."; + } + } const cards = [ ["MCP Surface", b.mcp_surface], ["Supply Chain", b.supply_chain], @@ -122,11 +291,18 @@ const pill = document.getElementById("risk-pill"); const gaugeScore = document.getElementById("gauge-score-value"); const gradeEl = document.getElementById("security-grade"); - const scoreText = String(DATA.score.overall); + const v2 = DATA.score_v2; + const useV2Primary = v2 && DATA.scoring_version === "v2"; + const scoreText = useV2Primary && 
v2.security_score != null + ? String(v2.security_score) + : String(DATA.score.overall); - if (pill) { + if (pill && !useV2Primary) { pill.textContent = DATA.risk.badge; pill.className = `risk-pill ${DATA.risk.level}`; + } else if (pill && useV2Primary) { + pill.textContent = `${String(v2.risk_level || "low").toUpperCase()} RISK`; + pill.className = `risk-pill ${v2.risk_level || "low"}`; } if (gaugeScore) gaugeScore.textContent = scoreText; @@ -136,7 +312,11 @@ gradeEl.className = `grade-badge grade-${(grade.letter || "f").toLowerCase()}`; } const briefEl = document.getElementById("score-brief"); - if (briefEl) briefEl.textContent = DATA.risk.brief || DATA.risk.description || "—"; + if (briefEl) { + briefEl.textContent = useV2Primary + ? `Absolute risk ${v2.absolute_risk} — see v2 section below` + : DATA.risk.brief || DATA.risk.description || "—"; + } const detailEl = document.getElementById("score-detail"); const basis = DATA.score?.basis; @@ -161,10 +341,22 @@ s.low ? `${s.low} low` : null, ].filter(Boolean); const breakdown = parts.length ? ` (${parts.join(" + ")})` : ""; + let scoreLine = `Security score: ${score} / 100 points (rating, not a percentage).`; + if (DATA.score_v2) { + scoreLine = + `Absolute risk: ${DATA.score_v2.absolute_risk} (${DATA.score_v2.risk_level})` + + (DATA.score_v2.security_score != null + ? ` · benchmark security score ${DATA.score_v2.security_score}/100` + : "") + + (DATA.scoring_version === "both" + ? ` · legacy index ${score}/100` + : "") + + "."; + } el.innerHTML = `${s.total || 0} issue${s.total === 1 ? "" : "s"} (count) across ` + `${tools} MCP tool${tools === 1 ? "" : "s"}${breakdown}. 
` + - `Security score: ${score} / 100 points (rating, not a percentage).`; + scoreLine; } function fillIssuesSummary() { @@ -747,36 +939,58 @@ }); } + function trendSeriesKey() { + return (DATA.trend_meta && DATA.trend_meta.series_key) || "score"; + } + + function trendValue(point) { + if (point.trend_value != null) return Number(point.trend_value); + const key = trendSeriesKey(); + if (key === "absolute_risk") return Number(point.absolute_risk) || 0; + if (key === "security_score") return Number(point.security_score) || 0; + return Number(point.score) || 0; + } + + function trendValueLabel(value) { + const key = trendSeriesKey(); + if (key === "absolute_risk") return `${value} risk`; + return `${value} / 100 pts`; + } + function renderTrendTable(points) { const wrap = document.getElementById("trend-table-wrap"); if (!wrap || !points.length) return; wrap.hidden = false; + const col = trendSeriesKey() === "absolute_risk" ? "Absolute risk" : "Score"; const rows = points .map( (point) => - `${escapeHtml(point.date)}${scorePtsHtml(point.score)}` + `${escapeHtml(point.date)}${escapeHtml(trendValueLabel(trendValue(point)))}` ) .join(""); - wrap.innerHTML = `${rows}
    DateScore
    `; + wrap.innerHTML = `${rows}
    Date${escapeHtml(col)}
    `; } function trendYRange(values) { if (!values.length) return { min: 0, max: 100 }; const minVal = Math.min(...values); const maxVal = Math.max(...values); + const isLegacyScore = trendSeriesKey() === "score" || trendSeriesKey() === "security_score"; if (minVal === maxVal) { - if (minVal <= 5) return { min: 0, max: 25 }; - if (minVal >= 95) return { min: 75, max: 100 }; - const pad = Math.max(8, Math.round(minVal * 0.15)); + if (isLegacyScore) { + if (minVal <= 5) return { min: 0, max: 25 }; + if (minVal >= 95) return { min: 75, max: 100 }; + } + const pad = Math.max(8, Math.round(Math.max(minVal * 0.15, 10))); return { min: Math.max(0, minVal - pad), - max: Math.min(100, maxVal + pad), + max: isLegacyScore ? Math.min(100, maxVal + pad) : maxVal + pad, }; } const pad = Math.max(4, Math.round((maxVal - minVal) * 0.12)); return { min: Math.max(0, minVal - pad), - max: Math.min(100, maxVal + pad), + max: isLegacyScore ? Math.min(100, maxVal + pad) : maxVal + pad, }; } @@ -784,7 +998,7 @@ const wrap = document.getElementById("trend-chart-wrap"); if (!wrap || !points.length) return; - const values = points.map((p) => Number(p.score) || 0); + const values = points.map((p) => trendValue(p)); const { min: yMin, max: yMax } = trendYRange(values); const width = 640; const height = 220; @@ -818,7 +1032,7 @@ const dots = coords .map( (pt, index) => - `${escapeHtml(points[index].date)}: ${values[index]} / 100 pts` + `${escapeHtml(points[index].date)}: ${escapeHtml(trendValueLabel(values[index]))}` ) .join(""); const gridLines = [0, 0.5, 1] @@ -839,7 +1053,7 @@ : `${escapeHtml(points[0].date)}${escapeHtml(points[count - 1].date)}`; const flatLabel = allSame && count > 1 - ? `Score flat at ${values[0]} / 100 pts across ${count} scans` + ? 
`Flat at ${escapeHtml(trendValueLabel(values[0]))} across ${count} scans` : ""; wrap.hidden = false; @@ -858,14 +1072,22 @@ "1 scan recorded — run mcts scan again from the same project folder to compare over time."; return; } + if (meta.mixed_metrics) { + note.hidden = false; + note.textContent = + "History mixes legacy and v2 scoring — chart shows legacy security score only. Re-scan with a consistent --scoring mode for comparable trends."; + return; + } if (meta.score_unchanged && points.length > 1) { note.hidden = false; - note.textContent = `${meta.runs} scans recorded — score unchanged at ${meta.latest_score} / 100 pts.`; + const suffix = meta.series_label ? ` (${meta.series_label})` : ""; + note.textContent = `${meta.runs} scans recorded — value unchanged at ${trendValueLabel(meta.latest_score)}${suffix}.`; return; } if (meta.runs >= 2) { note.hidden = false; - note.textContent = `${meta.runs} scans recorded for this target.`; + const suffix = meta.series_label ? ` ${meta.series_label}` : ""; + note.textContent = `${meta.runs} scans recorded for this target.${suffix}`; return; } note.hidden = true; @@ -911,6 +1133,27 @@ function fillRiskGuide() { const container = document.getElementById("risk-guide"); if (!container) return; + if (DATA.score_v2) { + const bands = [ + ["low", "0 – 99", COLORS.low], + ["medium", "100 – 249", COLORS.medium], + ["high", "250 – 499", COLORS.high], + ["critical", "500+", COLORS.critical], + ]; + const active = String(DATA.score_v2.risk_level || "low").toLowerCase(); + container.innerHTML = bands + .map(([key, range, color]) => { + const isActive = key === active; + return `
    +

    ${escapeHtml(key.toUpperCase())}

    +
    Absolute risk ${escapeHtml(range)}
    +
    ${isActive ? "Current band" : ""}
    +

    v2 multi-factor sum — higher = worse. Legacy 0–100 gauge may differ.

    +
    `; + }) + .join(""); + return; + } const score = DATA.score.overall; const iconMap = { critical: "critical", @@ -1522,6 +1765,7 @@ fillMetricsHeadline(); fillIssuesSummary(); fillScoreBreakdown(); + fillScoreV2(); fillChecksSummary(); fillOverviewPanels(); fillScanMeta(); diff --git a/src/mcts/report/assets/styles.css b/src/mcts/report/assets/styles.css index e02fec8..221b77b 100644 --- a/src/mcts/report/assets/styles.css +++ b/src/mcts/report/assets/styles.css @@ -1201,6 +1201,65 @@ body.modal-open { background: #22c55e !important; } +.v2-score-section { + margin-bottom: var(--section-gap); +} + +.legacy-score-secondary { + opacity: 0.92; +} + +.legacy-score-secondary #score-card-title { + color: var(--muted, #94a3b8); +} + +.v2-metrics-primary-row { + display: grid; + grid-template-columns: 1fr 1fr; + gap: var(--grid-gap); + margin-bottom: var(--grid-gap); +} + +.v2-score-card { + text-align: center; + background: linear-gradient(180deg, rgba(249, 115, 22, 0.1), rgba(11, 23, 48, 1)); +} + +.v2-absolute-risk { + font-size: 3rem; + font-weight: 700; + line-height: 1.1; + margin: 8px 0; +} + +.v2-meta-dl { + display: grid; + grid-template-columns: auto 1fr; + gap: 6px 16px; + margin: 16px 0 0; + text-align: left; + font-size: 13px; +} + +.v2-meta-dl dt { + color: var(--muted); +} + +.v2-meta-dl dd { + margin: 0; + font-weight: 600; +} + +.v2-radar-box { + min-height: 260px; +} + +.v2-legacy-note { + color: var(--muted); + font-size: 13px; + margin-top: 12px; +} + .score-card { display: flex; flex-direction: column; @@ -2354,7 +2413,8 @@ body.modal-open { } @media (max-width: 1280px) { - .metrics-primary-row { + .metrics-primary-row, + .v2-metrics-primary-row { grid-template-columns: 1fr; } .scores-legend-grid { diff --git a/src/mcts/report/data.py b/src/mcts/report/data.py index ad127dd..32cf8cc 100644 --- a/src/mcts/report/data.py +++ b/src/mcts/report/data.py @@ -228,6 +228,29 @@ def _score_brief(score: int) -> str: return "Strong security posture maintained" 
+def risk_description_v2(risk_level: str, absolute_risk: int) -> str: + level = risk_level.lower() + if level == "critical": + return ( + f"Critical multi-factor risk (absolute risk {absolute_risk}). " + "Remediate tool-attributed findings on attack paths immediately." + ) + if level == "high": + return ( + f"High multi-factor risk (absolute risk {absolute_risk}). " + "Prioritize high-severity tool findings and chain-exposed tools." + ) + if level == "medium": + return ( + f"Moderate multi-factor risk (absolute risk {absolute_risk}). " + "Schedule hardening for elevated factor dimensions." + ) + return ( + f"Low multi-factor risk (absolute risk {absolute_risk}). " + "Maintain controls; re-scan after material changes." + ) + + def risk_description(score: int) -> str: if score <= 25: return "Your MCP server has critical security issues that require immediate attention." @@ -400,6 +423,117 @@ def parse_category_gates(raw_values: list[str] | None) -> dict[str, int]: return gates +CATEGORY_TAGS_V2: dict[str, frozenset[str]] = { + "injection": frozenset({ + "prompt_injection", "jailbreak", "schema_surface", "metadata_integrity", + "skill_md", "sigma_metadata", "surface_metadata", + }), + "exfiltration": frozenset({"data_leakage", "embedding_secrets"}), + "privilege": frozenset({ + "permission_analyzer", "command_execution", "path_validation", "tool_abuse", + }), + "supply_chain": frozenset({ + "supply_chain", "vulnerable_package", "npm_audit", "virustotal", "semgrep_sast", + }), + "protocol": frozenset({"oauth_config", "runtime_events", "cloud_inspect"}), +} +CATEGORY_PRIORITY_V2 = ("injection", "exfiltration", "privilege", "supply_chain", "protocol") +CATEGORY_LABELS_V2: dict[str, str] = { + "injection": "Injection & Metadata", + "exfiltration": "Data Exfiltration", + "privilege": "Privilege & Execution", + "supply_chain": "Supply Chain", + "protocol": "Protocol & Runtime", +} +_CATEGORY_V2_PENALTY = { + Severity.CRITICAL: 35, + Severity.HIGH: 20, + Severity.MEDIUM: 10, 
+ Severity.LOW: 5, +} + + +def assign_category_v2(analyzer: str) -> str | None: + """First-match category assignment for v2 OWASP tiles.""" + for cat in CATEGORY_PRIORITY_V2: + if analyzer in CATEGORY_TAGS_V2[cat]: + return cat + return None + + +def category_scores_v2_gate_keys() -> frozenset[str]: + return frozenset(CATEGORY_PRIORITY_V2) + + +def parse_min_category_score_v2(raw_values: list[str] | None) -> dict[str, int]: + """Parse `--min-category-score-v2 injection:80` style minimum health scores.""" + gates: dict[str, int] = {} + if not raw_values: + return gates + valid = category_scores_v2_gate_keys() + for raw in raw_values: + for part in raw.split(","): + part = part.strip() + if not part: + continue + if ":" not in part: + raise ValueError( + f"Invalid --min-category-score-v2 value {part!r}. Use category:min_score." + ) + category, limit_text = part.split(":", 1) + category = category.strip() + if category not in valid: + valid_list = ", ".join(sorted(valid)) + raise ValueError(f"Unknown v2 category {category!r}. 
Valid categories: {valid_list}") + minimum = int(limit_text.strip()) + if not 0 <= minimum <= 100: + raise ValueError(f"v2 category minimum must be 0–100, got {minimum}") + gates[category] = minimum + return gates + + +def category_scores_v2_gate_failures(findings: list[Finding], gates: dict[str, int]) -> list[str]: + """Fail when OWASP v2 tile score falls below minimum (100 = good polarity).""" + if not gates: + return [] + by_key = {row["key"]: row for row in category_scores_v2(findings)} + failures: list[str] = [] + for category, minimum in gates.items(): + row = by_key.get(category) + if not row: + continue + if row["score"] < minimum: + failures.append( + f"{row['label']}: v2 category score {row['score']} below minimum {minimum} " + f"(100=good; {row['findings_count']} findings)" + ) + return failures + + +def category_scores_v2(findings: list[Finding]) -> list[dict[str, Any]]: + """OWASP category health scores — 100 = good (RFC §4.15 polarity).""" + from mcts.scoring.context import scorable_findings_v2 + + scorable = scorable_findings_v2(findings) + rows: list[dict[str, Any]] = [] + for key in CATEGORY_PRIORITY_V2: + matched = [f for f in scorable if assign_category_v2(f.analyzer) == key] + penalty = sum(_CATEGORY_V2_PENALTY.get(f.severity, 5) for f in matched) + score = max(0, 100 - min(100, penalty)) + passed = len(matched) == 0 + rows.append( + { + "key": key, + "label": CATEGORY_LABELS_V2[key], + "score": score, + "display": "100/100" if passed else f"{score}/100", + "findings_count": len(matched), + "passed": passed, + } + ) + return rows + + def category_gate_failures(findings: list[Finding], gates: dict[str, int]) -> list[str]: """Return human-readable failures when a category score meets/exceeds its gate.""" if not gates: @@ -763,72 +897,153 @@ def build_recommendations(findings: list[Finding]) -> list[dict[str, Any]]: def build_attack_graph(report: ScanReport) -> dict[str, Any]: - if report.attack_graph.get("edges") or 
report.attack_graph.get("nodes"): - return report.attack_graph + from mcts.scoring.graph import canonical_attack_graph - nodes: dict[str, dict[str, str]] = {} - edges: list[dict[str, str]] = [] + return canonical_attack_graph(report) - for tool in report.server.tools: - nodes[tool.name] = {"id": tool.name, "label": tool.name, "type": "tool"} - for finding in report.findings: - if finding.analyzer != "attack_chains": - continue - evidence = finding.evidence - read_tools = evidence.get("read_tools", []) - exfil_tools = evidence.get("exfil_tools", []) - cred_tools = evidence.get("credential_tools", []) - exec_tools = evidence.get("exec_tools", []) - - for name in read_tools + exfil_tools + cred_tools + exec_tools: - nodes[name] = {"id": name, "label": name, "type": "tool"} - - for src in read_tools: - for dst in exfil_tools: - edges.append({"from": src, "to": dst, "label": "exfil"}) - for src in cred_tools: - for dst in exfil_tools: - edges.append({"from": src, "to": dst, "label": "credential → exfil"}) - for src in read_tools: - for dst in cred_tools: - edges.append({"from": src, "to": dst, "label": "read → cred"}) - for src in read_tools: - for dst in exec_tools: - edges.append({"from": src, "to": dst, "label": "read → exec"}) +def _trend_series_key(points: list[dict[str, Any]]) -> str: + """Pick Y-axis metric — never mix legacy score with v2 absolute_risk.""" + if not points: + return "score" + versions = {str(row.get("scoring_version", "legacy")) for row in points} + if versions == {"legacy"}: + return "score" + if versions.isdisjoint({"legacy"}) and all("absolute_risk" in row for row in points): + return "absolute_risk" + if versions.isdisjoint({"legacy"}) and all(row.get("security_score") is not None for row in points): + return "security_score" + return "score" - return { - "nodes": list(nodes.values()), - "edges": edges, - } + +def _trend_value(row: dict[str, Any], series_key: str) -> int: + if series_key == "absolute_risk": + return 
int(row.get("absolute_risk", 0)) + if series_key == "security_score": + return int(row.get("security_score", 0)) + return int(row.get("score", 0)) def score_trend(report: ScanReport) -> list[dict[str, Any]]: if report.scan_history: - return list(report.scan_history) - from mcts.output.history import trend_points_for_target + points = list(report.scan_history) + else: + from mcts.output.history import trend_points_for_target - points = trend_points_for_target(report.target) + points = trend_points_for_target(report.target) if points: + series_key = _trend_series_key(points) + for row in points: + row["trend_value"] = _trend_value(row, series_key) return points label = report.scanned_at.strftime("%b %d") - return [{"date": label, "score": report.score.overall}] + row: dict[str, Any] = { + "date": label, + "score": report.score.overall, + "scoring_version": report.scoring_version, + "trend_value": report.score.overall, + } + if report.score_v2 is not None: + row["absolute_risk"] = report.score_v2.absolute_risk + if report.score_v2.security_score is not None: + row["security_score"] = report.score_v2.security_score + row["risk_level"] = report.score_v2.risk_level + series_key = _trend_series_key([row]) + row["trend_value"] = _trend_value(row, series_key) + return [row] def trend_meta(report: ScanReport, points: list[dict[str, Any]]) -> dict[str, Any]: - scores = [int(row.get("score", 0)) for row in points] - unique_scores = sorted(set(scores)) + series_key = _trend_series_key(points) + values = [_trend_value(row, series_key) for row in points] + unique_values = sorted(set(values)) + latest = values[-1] if values else ( + report.score_v2.absolute_risk + if series_key == "absolute_risk" and report.score_v2 is not None + else report.score.overall + ) + labels = { + "score": "Security score (legacy, 0–100 pts, higher=better)", + "absolute_risk": "Absolute risk (v2, higher=worse)", + "security_score": "Security score (v2 benchmark, 0–100, higher=better)", + } return { 
"runs": len(points), - "unique_scores": len(unique_scores), - "latest_score": scores[-1] if scores else report.score.overall, - "score_unchanged": len(unique_scores) <= 1 and len(points) > 1, + "unique_scores": len(unique_values), + "latest_score": latest, + "score_unchanged": len(unique_values) <= 1 and len(points) > 1, + "series_key": series_key, + "series_label": labels.get(series_key, labels["score"]), + "mixed_metrics": len({str(row.get("scoring_version", "legacy")) for row in points}) > 1 + if points + else False, } +def _score_v2_payload(report: ScanReport) -> dict[str, Any] | None: + if report.score_v2 is None: + return None + score = report.score_v2 + return { + "absolute_risk": score.absolute_risk, + "risk_range": list(score.risk_range), + "risk_range_confidence": score.risk_range_confidence, + "risk_level": score.risk_level, + "security_score": score.security_score, + "risk_percentile": score.risk_percentile, + "confidence_score": score.confidence_score, + "legacy_overall": score.legacy_overall, + "dimension_scores": score.dimension_scores, + "top_contributors": [c.model_dump() for c in score.top_contributors[:10]], + "weights_profile": score.weights_profile, + "chain_factor_mode": score.chain_factor_mode, + "benchmark_corpus_version": score.benchmark_corpus_version, + "basis": score.basis.model_dump(), + } + + +def _build_score_help(report: ScanReport) -> dict[str, Any]: + items = [ + "Security points from 0–100 (not a percentage of tests passed)", + "Critical, High, Medium, and Low findings (severity-weighted)", + "Attack chain detections", + "Exponential decay: more severe findings lower the score", + ] + if report.score_v2 is not None: + items.extend( + [ + "Absolute risk: multi-factor sum on tool-attributed findings (higher = worse)", + "Security score: benchmark percentile when corpus stats are available", + "Chain multiplier applies to tool findings on validated attack paths only", + ] + ) + if report.score_v2.legacy_overall is not None: + 
items.append( + "Legacy overall includes attack_chains meta-findings; absolute risk excludes them" + ) + title = "Score derived from:" + if report.score_v2 is not None: + title = "Scores derived from:" + return {"title": title, "items": items} + + +def _primary_risk_header(report: ScanReport) -> tuple[str, str, str]: + if report.score_v2 is not None: + level = report.score_v2.risk_level.upper() + badge = f"{level} RISK" + brief = ( + f"Absolute risk {report.score_v2.absolute_risk} " + f"(range {report.score_v2.risk_range[0]}–{report.score_v2.risk_range[1]})" + ) + return badge, level.lower(), brief + return risk_rating(report.score.overall)[0], risk_rating(report.score.overall)[1], _score_brief( + report.score.overall + ) + + def build_dashboard_payload(report: ScanReport) -> dict[str, Any]: scanned_at: datetime = report.scanned_at - badge, level = risk_rating(report.score.overall) + badge, level, score_brief = _primary_risk_header(report) executed = list(report.analyzers_executed) or sorted({f.analyzer for f in report.findings}) analyzer_results = build_analyzer_results(report.findings, executed, report=report) categories = category_scores(report.findings) @@ -907,24 +1122,27 @@ def build_dashboard_payload(report: ScanReport) -> dict[str, Any]: "grade": security_grade(report.score.overall), "breakdown": breakdown_payload, }, + **({"score_v2": _score_v2_payload(report)} if report.score_v2 is not None else {}), + **( + {"category_scores_v2": category_scores_v2(report.findings)} + if report.score_v2 is not None + else {} + ), + "scoring_version": report.scoring_version, "summary": report.summary.model_dump(), "risk": { "badge": badge, "level": level, - "description": risk_description(report.score.overall), - "brief": _score_brief(report.score.overall), + "description": ( + risk_description_v2(report.score_v2.risk_level, report.score_v2.absolute_risk) + if report.score_v2 is not None + else risk_description(report.score.overall) + ), + "brief": score_brief, }, 
"executive_summary": executive, "checks_summary": checks_summary, - "score_help": { - "title": "Score derived from:", - "items": [ - "Security points from 0–100 (not a percentage of tests passed)", - "Critical, High, Medium, and Low findings (severity-weighted)", - "Attack chain detections", - "Exponential decay: more severe findings lower the score", - ], - }, + "score_help": _build_score_help(report), "categories": categories, "trend": trend_points, "trend_meta": trend_meta(report, trend_points), diff --git a/src/mcts/report/scan_meta.py b/src/mcts/report/scan_meta.py index 3ffa340..2a79b5d 100644 --- a/src/mcts/report/scan_meta.py +++ b/src/mcts/report/scan_meta.py @@ -86,6 +86,23 @@ def tool_discovery_context(report: ScanReport, *, live: bool, snapshot: bool) -> } +def append_chain_scan_notes( + scan_notes: list[str], report: ScanReport, config: ScanConfig +) -> None: + if config.scoring_mode == "legacy": + return + if "attack_chains" in report.analyzers_executed: + if not config.enable_attack_chains: + scan_notes.append( + "Chain multiplier disabled (chain_factor=1.0); graph and meta-findings still shown." + ) + return + scan_notes.append( + "Attack chains analyzer did not run (--analyzers filter or --surfaces without tool) " + "— chain_factor=1.0." + ) + + def _rel_path(path: Path | None) -> str: if path is None: return "" diff --git a/src/mcts/report/templates/dashboard.html b/src/mcts/report/templates/dashboard.html index 6d21e42..6171dab 100644 --- a/src/mcts/report/templates/dashboard.html +++ b/src/mcts/report/templates/dashboard.html @@ -121,7 +121,7 @@

    Scores vs counts — read this first

    View sub-scores →
    -

    Security Score

    +

    Security Score

    @@ -162,6 +162,56 @@

    Issues found

    + + `; }) .join(""); diff --git a/src/mcts/report/assets/styles.css b/src/mcts/report/assets/styles.css index 221b77b..539e5eb 100644 --- a/src/mcts/report/assets/styles.css +++ b/src/mcts/report/assets/styles.css @@ -1205,14 +1205,6 @@ body.modal-open { margin-bottom: var(--section-gap); } -.legacy-score-secondary { - opacity: 0.92; -} - -.legacy-score-secondary #score-card-title { - color: var(--muted, #94a3b8); -} - .v2-metrics-primary-row { display: grid; grid-template-columns: 1fr 1fr; @@ -1254,12 +1246,6 @@ body.modal-open { min-height: 260px; } -.v2-legacy-note { - color: var(--muted); - font-size: 13px; - margin-top: 12px; -} - .score-card { display: flex; flex-direction: column; diff --git a/src/mcts/report/data.py b/src/mcts/report/data.py index 32cf8cc..00c4bff 100644 --- a/src/mcts/report/data.py +++ b/src/mcts/report/data.py @@ -1017,10 +1017,6 @@ def _build_score_help(report: ScanReport) -> dict[str, Any]: "Chain multiplier applies to tool findings on validated attack paths only", ] ) - if report.score_v2.legacy_overall is not None: - items.append( - "Legacy overall includes attack_chains meta-findings; absolute risk excludes them" - ) title = "Score derived from:" if report.score_v2 is not None: title = "Scores derived from:" diff --git a/src/mcts/report/templates/dashboard.html b/src/mcts/report/templates/dashboard.html index 6171dab..aa21924 100644 --- a/src/mcts/report/templates/dashboard.html +++ b/src/mcts/report/templates/dashboard.html @@ -206,10 +206,6 @@

    Top risk contributors

    -

    Risk factor dimensions

    -

    0 = best · 100 = worst per axis

    +

    Relative load on this scan · 100 = dominant axis · 0 = none

    diff --git a/src/mcts/scoring/engine_v2.py b/src/mcts/scoring/engine_v2.py index ab9a212..21f7f19 100644 --- a/src/mcts/scoring/engine_v2.py +++ b/src/mcts/scoring/engine_v2.py @@ -51,20 +51,18 @@ def dimension_raw_sums(findings: list[Finding], ctx: ScoringContext) -> dict[str def compute_dimension_scores(findings: list[Finding], ctx: ScoringContext) -> dict[str, int]: + """Relative factor load per axis on this scan (0–100; highest axis = 100).""" dim_raw = dimension_raw_sums(findings, ctx) - return {dim: normalize_dim(dim_raw[dim], dim, ctx) for dim in FACTOR_DIMENSIONS} + return {dim: normalize_dim(dim_raw[dim], dim_raw) for dim in FACTOR_DIMENSIONS} -def normalize_dim(raw: float, dim: str, ctx: ScoringContext) -> int: +def normalize_dim(raw: float, dim_raw: dict[str, float]) -> int: if raw <= 0: return 0 - corpus_p95 = None - if ctx.corpus_stats: - corpus_p95 = ctx.corpus_stats.dimension_p95.get(dim) - if corpus_p95 and corpus_p95 > 0: - return min(100, round(100 * raw / corpus_p95)) - denom = max(ctx.last_absolute_risk or raw, 1.0) - return min(100, round(100 * raw / denom)) + max_raw = max(dim_raw.values()) if dim_raw else 0.0 + if max_raw <= 0: + return 0 + return min(100, round(100 * raw / max_raw)) def build_top_contributors( diff --git a/tests/scoring/test_engine_v2.py b/tests/scoring/test_engine_v2.py index c02b5e8..28f0705 100644 --- a/tests/scoring/test_engine_v2.py +++ b/tests/scoring/test_engine_v2.py @@ -6,7 +6,12 @@ from mcts.reporting.models import Finding, Severity from mcts.scoring.context import scorable_findings_v2 -from mcts.scoring.engine_v2 import RiskScoringEngineV2, finding_risk +from mcts.scoring.engine_v2 import ( + RiskScoringEngineV2, + compute_dimension_scores, + dimension_raw_sums, + finding_risk, +) from mcts.scoring.factors import ScoringContext from mcts.scoring.models import RiskFactorVector from mcts.scoring.weights import load_weights @@ -152,6 +157,46 @@ def test_absolute_risk_invariant_to_confidence() -> None: assert 
score_low.confidence_score != score_high.confidence_score +def test_dimension_scores_are_relative_not_flat() -> None: + """Radar axes must differ when factor loads differ (not all corpus-saturated 100).""" + weights = load_weights("manual_v1") + findings = [ + Finding( + id="exec", + analyzer="command_execution", + title="Exec", + description="d", + severity=Severity.HIGH, + recommendation="fix", + tool="run", + ), + Finding( + id="perm", + analyzer="permissions", + title="Perm", + description="delete all", + severity=Severity.CRITICAL, + recommendation="fix", + tool="wipe", + ), + ] + ctx = ScoringContext( + findings=findings, + tools=[], + attack_graph={}, + scan_scope="entrypoint", + weights=weights, + corpus_stats=None, + chain_factors={}, + ) + raw = dimension_raw_sums(findings, ctx) + scores = compute_dimension_scores(findings, ctx) + assert max(scores.values()) == 100 + assert min(scores.values()) < 100 + assert scores["threat_maturity"] < scores["exploitability"] + assert sum(raw.values()) > 0 + + def test_attack_chains_excluded_from_scorable() -> None: findings = [ Finding( From ec45fc69f8b476338cf41b3a209e9912bd287dac Mon Sep 17 00:00:00 2001 From: hello-args Date: Fri, 12 Jun 2026 02:21:51 +0530 Subject: [PATCH 07/16] fix(reporting): enforce hiding legacy score card when v2 is present CSS display:flex on .score-card overrode the hidden attribute, so the legacy gauge still appeared on v2 scans. Hide at render time and with !important so only the v2 absolute-risk block shows. 
--- src/mcts/report/assets/dashboard.js | 5 +++++ src/mcts/report/assets/styles.css | 8 ++++++++ src/mcts/report/generators/html_report.py | 1 + src/mcts/report/templates/dashboard.html | 4 ++-- tests/test_html_report.py | 1 + 5 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/mcts/report/assets/dashboard.js b/src/mcts/report/assets/dashboard.js index f00ddb0..98c058b 100644 --- a/src/mcts/report/assets/dashboard.js +++ b/src/mcts/report/assets/dashboard.js @@ -278,6 +278,11 @@ const el = document.getElementById(id); if (el) el.textContent = val; }); + const legacyCard = document.getElementById("score-card"); + if (DATA.score_v2 && legacyCard) { + legacyCard.hidden = true; + return; + } const pill = document.getElementById("risk-pill"); const gaugeScore = document.getElementById("gauge-score-value"); const gradeEl = document.getElementById("security-grade"); diff --git a/src/mcts/report/assets/styles.css b/src/mcts/report/assets/styles.css index 539e5eb..f0ed3b5 100644 --- a/src/mcts/report/assets/styles.css +++ b/src/mcts/report/assets/styles.css @@ -1255,6 +1255,14 @@ body.modal-open { background: linear-gradient(180deg, rgba(239, 68, 68, 0.08), rgba(11, 23, 48, 1)); } +#score-card[hidden] { + display: none !important; +} + +.metrics-primary-row:has(#score-card[hidden]) { + grid-template-columns: 1fr; +} + .score-title-row { display: flex; align-items: center; diff --git a/src/mcts/report/generators/html_report.py b/src/mcts/report/generators/html_report.py index 6948d83..1b98755 100644 --- a/src/mcts/report/generators/html_report.py +++ b/src/mcts/report/generators/html_report.py @@ -48,6 +48,7 @@ def write_html_report(report: ScanReport, output: Path) -> None: logo_src=logo_data_uri(for_report=True), icons_json=json.dumps(_load_icons()), app_version=report.version, + hide_legacy_score_card=report.score_v2 is not None, ) output.parent.mkdir(parents=True, exist_ok=True) output.write_text(html, encoding="utf-8") diff --git 
a/src/mcts/report/templates/dashboard.html b/src/mcts/report/templates/dashboard.html index 39b3148..04b7ba3 100644 --- a/src/mcts/report/templates/dashboard.html +++ b/src/mcts/report/templates/dashboard.html @@ -118,7 +118,7 @@

    Scores vs counts — read this first

    -
    +
    View sub-scores →

    Security Score

    @@ -162,7 +162,7 @@

    Issues found

    -