From a1b214dc835c13de1ba68a859d83ced1137dadcd Mon Sep 17 00:00:00 2001 From: tuirk <65666288+tuirk@users.noreply.github.com> Date: Sat, 6 Jun 2026 21:22:22 +0300 Subject: [PATCH 1/2] fix(security): clear Scorecard #70 transformers OSV alerts Bump torch to 2.6.0 (CPU) and exact-pin transformers to 5.9.0. The exact pin avoids Scorecard/OSV treating a vulnerable range as still affected, while staying below transformers 5.10+ imports that require torch float8 symbols 2.6 lacks. Align unit-tests-nlp CI with Dockerfile torch install. Add OSV triage script and document fix in scorecard-deferred Bucket E. Signed-off-by: tuirk <65666288+tuirk@users.noreply.github.com> --- .github/dependabot.yml | 4 +- .github/workflows/integration-test.yml | 3 ++ CHANGELOG.md | 7 +++ docs/security/scorecard-deferred.md | 14 ++++- nlp-service/Dockerfile | 18 +++---- nlp-service/requirements.txt | 11 ++-- .../tests/test_embedding_import_smoke.py | 8 +-- scripts/osv-transformers-triage.py | 54 +++++++++++++++++++ 8 files changed, 98 insertions(+), 21 deletions(-) create mode 100644 scripts/osv-transformers-triage.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml index eb270df..34abd02 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -73,9 +73,9 @@ updates: versions: [">=0.5.0"] - dependency-name: numpy versions: [">=2.0.0"] - # transformers 5.x breaks torch 2.5.1 + Dockerfile 4.x assert; cap in requirements.txt. + # transformers >=5.10 imports torch.float8_e8m0fnu; torch 2.6.0 lacks it. - dependency-name: transformers - versions: [">=5.0.0"] + versions: [">=5.10.0"] # pyrate-limiter 4.x API change; defer until rate-limiter path is retested. 
- dependency-name: pyrate-limiter versions: [">=4.0.0"] diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 4c65529..c7472f1 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -63,6 +63,9 @@ jobs: python-version: '3.11' cache: 'pip' cache-dependency-path: nlp-service/requirements.txt + # Match nlp-service/Dockerfile: CPU torch before requirements (avoids + # CUDA wheel + keeps transformers 5.x compatible with torch 2.6.x). + - run: pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu - run: pip install -r requirements.txt working-directory: nlp-service - run: python -m pytest diff --git a/CHANGELOG.md b/CHANGELOG.md index dbcf898..d8d094d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,13 @@ ship with a `migrate.py` step that runs at boot. ## [Unreleased] +### Security + +- **nlp-service:** bump `torch` to 2.6.0 (CPU) and exact-pin `transformers` + to 5.9.0 to clear Scorecard alert #70 (27 OSV IDs). Keep below 5.10 because + 5.10+ requires torch float8 symbols 2.6.0 lacks. CI `unit-tests-nlp` now + installs the CPU torch wheel before `requirements.txt` (Dockerfile parity). + ## [0.2.2] — 2026-05-30 Incremental release over 0.2.1: pre-compile health checks, DeepSeek V4 Flash, diff --git a/docs/security/scorecard-deferred.md b/docs/security/scorecard-deferred.md index 83d82be..892bf2a 100644 --- a/docs/security/scorecard-deferred.md +++ b/docs/security/scorecard-deferred.md @@ -109,7 +109,7 @@ improvement at disproportionate maintenance cost for this project. | Alert | File:Line | |---|---| -| 40 | nlp-service/Dockerfile:46 — `pip install ... torch==2.5.1 --index-url ` (was :44 pre-#121) | +| 40 | nlp-service/Dockerfile:46 — `pip install ... 
torch==2.6.0 --index-url ` | | 41 | nlp-service/Dockerfile:48 — `pip install -r requirements.txt` (was :46 pre-#121) | | 62 | .github/workflows/integration-test.yml:67 — `pip install -r requirements.txt` | | 71 | nlp-service/Dockerfile:48 — same finding as #41 (Scorecard re-file after line shift) | @@ -134,6 +134,18 @@ Re-evaluate when: - Upstream PyPI compromise affects any pinned dependency. - The pip dependency surface shrinks below ~30 transitive deps. +## Bucket E — resolved (alert #70, VulnerabilitiesID) + +**Rule:** `VulnerabilitiesID` — OSV scan of dependency manifests. + +Scorecard #70 reported 27 open OSV IDs; triage (2026-06-06) found all 27 in +`transformers` on PyPI. npm lockfiles (`app`, `cli`, `mcp-server`) were clean. +Fix: bump `torch` to **2.6.0** (CPU wheel) and exact-pin `transformers` to +**5.9.0** in [nlp-service/requirements.txt](../../nlp-service/requirements.txt). +The exact pin avoids Scorecard/OSV treating a broad vulnerable range as still +affected; 5.10+ still needs torch float8 symbols 2.6 lacks. Alert should +auto-close on the next Scorecard run after merge. + ## Bucket D — tracked TODO (1 alert, left open) **Alert 27 — `BranchProtectionID`.** diff --git a/nlp-service/Dockerfile b/nlp-service/Dockerfile index 8212865..9a722cd 100644 --- a/nlp-service/Dockerfile +++ b/nlp-service/Dockerfile @@ -36,19 +36,19 @@ COPY requirements.txt /app/requirements.txt # (~2 GB) from PyPI, bloating the image to 10 GB+. sentence-transformers and # keybert find torch already installed and skip their own resolution. # -# Torch MUST be pinned. sentence-transformers==3.3.1 (Nov 2024) calls -# `.to(device)` on meta-device-initialised weights; torch >=2.7 raises -# NotImplementedError("Cannot copy out of meta tensor") and requires -# `.to_empty()` instead. 
An unpinned install after a --no-cache rebuild -# pulls the latest torch and breaks /resolve/embedding + /extract/keybert -# at runtime — and the unit tests stub sentence_transformers, so nothing -# catches the drift. -RUN pip install --no-cache-dir torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu +# Torch MUST be pinned. sentence-transformers calls `.to(device)` on meta- +# device-initialised weights; torch >=2.7 raises NotImplementedError +# ("Cannot copy out of meta tensor") and requires `.to_empty()` instead. +# 2.6.x has float8 symbols transformers 5.x needs; 2.5.1 does not. +# An unpinned install after a --no-cache rebuild pulls the latest torch +# and breaks /resolve/embedding + /extract/keybert at runtime — and unit +# tests stub sentence_transformers, so nothing catches the drift. +RUN pip install --no-cache-dir torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu RUN pip install --no-cache-dir -r requirements.txt # Fail the image build if torch/transformers pins drift (pytest stubs hide this). -RUN python -c "import transformers; from sentence_transformers import SentenceTransformer; assert transformers.__version__.startswith('4.')" +RUN python -c "import transformers; from sentence_transformers import SentenceTransformer; assert transformers.__version__.startswith('5.')" # Override markitdown's youtube-transcript-api~=1.0.0 pin. 1.0.x returns empty # timedtext bodies on current YouTube (ParseError) even from residential IPs; diff --git a/nlp-service/requirements.txt b/nlp-service/requirements.txt index 19491d0..5246e18 100644 --- a/nlp-service/requirements.txt +++ b/nlp-service/requirements.txt @@ -25,11 +25,12 @@ rake-nltk==1.0.6 yake==0.7.3 keybert==0.9.0 sentence-transformers==5.5.0 -# Dockerfile pins torch==2.5.1 (meta-tensor compat). transformers 5.x imports -# torch.float8_e8m0fnu at load time, which 2.5.1 lacks — breaks -# /resolve/embedding on fresh pip installs. 
Cap at 4.x; pytest stubs ST so -# only test_embedding_import_smoke.py catches drift. -transformers>=4.41.0,<5.0.0 +# Dockerfile pins torch==2.6.0 (CPU wheel). torch 2.5.1 breaks transformers +# 5.x imports; torch >=2.7 breaks sentence-transformers meta-device init +# (NotImplementedError on .to(device)). +# Exact pin avoids Scorecard/OSV treating a broad vulnerable range as still +# affected by #70, while staying below 5.10+ imports of torch.float8_e8m0fnu. +transformers==5.9.0 # pytextrank: spaCy-native TextRank component (PyPI-available, no git dep). pytextrank==3.3.0 scikit-learn==1.8.0 diff --git a/nlp-service/tests/test_embedding_import_smoke.py b/nlp-service/tests/test_embedding_import_smoke.py index cf2980e..80f08d3 100644 --- a/nlp-service/tests/test_embedding_import_smoke.py +++ b/nlp-service/tests/test_embedding_import_smoke.py @@ -1,12 +1,12 @@ """Smoke test: real sentence-transformers import stack (not conftest-stubbed). conftest.py stubs sentence_transformers for speed, so a fresh ``pip install`` -can pull transformers 5.x incompatible with the Dockerfile's torch==2.5.1 pin +can pull transformers/torch versions incompatible with the Dockerfile pins without any unit test failing. This runs the import in a subprocess so the stub never applies. -Regression: resolve 500 ``embedding_failed`` when transformers 5.10+ met -torch 2.5.1 after a Docker requirements-layer cache bust (2026-06-06). +Regression: resolve 500 ``embedding_failed`` when transformers 5.x met +torch 2.5.1 (missing float8) or torch >=2.7 (meta-tensor .to(device)). 
""" from __future__ import annotations @@ -19,7 +19,7 @@ def test_sentence_transformer_imports_with_pinned_torch() -> None: script = ( "import transformers; " "from sentence_transformers import SentenceTransformer; " - "assert transformers.__version__.startswith('4.'), transformers.__version__; " + "assert transformers.__version__.startswith('5.'), transformers.__version__; " "print('ok')" ) result = subprocess.run(
diff --git a/scripts/osv-transformers-triage.py b/scripts/osv-transformers-triage.py
new file mode 100644
index 0000000..88b77ff
--- /dev/null
+++ b/scripts/osv-transformers-triage.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""Map Scorecard #70 OSV IDs to transformers fix boundaries.
+
+Queries api.osv.dev for each advisory ID, prints its highest ``last_affected``
+and ``fixed`` versions, then summarises the overall highest ``last_affected``
+version and the advisories whose fix only ships in a 5.x release.
+"""
+from __future__ import annotations
+
+import json
+import re
+import urllib.request
+
+IDS = [
+    "PYSEC-2023-299", "GHSA-282v-666c-3fvg",
+    "GHSA-37mw-44qp-f5jm", "GHSA-37q5-v5qm-c9v8",
+    "PYSEC-2023-300", "GHSA-3863-2447-669p",
+    "GHSA-4w7r-h757-3r74", "GHSA-59p9-h35m-wg4g",
+    "GHSA-69w3-r845-3855", "GHSA-6rvg-6v2m-4j46",
+    "GHSA-9356-575x-2w9m", "GHSA-fpwr-67px-3qhx",
+    "PYSEC-2024-229", "GHSA-hxxf-235m-72v3",
+    "GHSA-jjph-296x-mrcr", "GHSA-phhr-52qp-3mj4",
+    "GHSA-q2wp-rjmx-x6x9",
+    "PYSEC-2025-40", "GHSA-qq3j-4f4f-9583",
+    "PYSEC-2024-227", "GHSA-qxrp-vhvm-j765",
+    "GHSA-rcv9-qm8p-9p6j",
+    "PYSEC-2023-301", "GHSA-v68g-wm8c-6x7j",
+    "PYSEC-2024-228", "GHSA-wrfc-pvp9-mr9g",
+    "PYSEC-2025-211", "PYSEC-2025-212", "PYSEC-2025-213",
+    "PYSEC-2025-214", "PYSEC-2025-215", "PYSEC-2025-216",
+    "PYSEC-2025-217", "PYSEC-2025-218",
+]
+
+
+def version_key(version: str) -> tuple[int, ...]:
+    """Return a numeric sort key for *version*.
+
+    Plain string comparison ranks "4.9.0" above "4.48.0"; comparing the
+    integer components avoids that. Pre-release tags are not modelled.
+    """
+    return tuple(int(part) for part in re.findall(r"\d+", version))
+
+
+def newest(current: str | None, candidate: str) -> str:
+    """Return whichever of *current* and *candidate* is the higher version."""
+    if current is None or version_key(candidate) > version_key(current):
+        return candidate
+    return current
+
+
+def boundaries(vuln: dict) -> tuple[str | None, str | None]:
+    """Return the highest (last_affected, fixed) versions recorded in *vuln*.
+
+    Every ``affected`` entry is scanned, not only the first. GIT ranges are
+    skipped because their events carry commit hashes, not release versions.
+    """
+    la = fix = None
+    for pkg in vuln.get("affected", []):
+        for rng in pkg.get("ranges", []):
+            if rng.get("type") == "GIT":
+                continue
+            for ev in rng.get("events", []):
+                if "last_affected" in ev:
+                    la = newest(la, ev["last_affected"])
+                if "fixed" in ev:
+                    fix = newest(fix, ev["fixed"])
+    return la, fix
+
+
+def main() -> None:
+    """Fetch every advisory once and print the per-ID and summary report."""
+    max_la: str | None = None
+    needs_5x: list[str] = []
+    # dict.fromkeys drops duplicate IDs while keeping the listed order.
+    for vid in dict.fromkeys(IDS):
+        url = f"https://api.osv.dev/v1/vulns/{vid}"
+        with urllib.request.urlopen(url, timeout=20) as r:
+            la, fix = boundaries(json.load(r))
+        if la:
+            max_la = newest(max_la, la)
+        if fix and fix.startswith("5."):
+            needs_5x.append(f"{vid} (fix {fix})")
+        print(f"{vid:22} last_affected={la or '-':10} fixed={fix or '-'}")
+
+    print(f"\nMax last_affected in 4.x: {max_la or ''}")
+    print(f"Need 5.x to fix: {len(needs_5x)}")
+    for x in needs_5x:
+        print(f" {x}")
+
+
+if __name__ == "__main__":
+    main()
From 87eab6296db476d321844370ec6b3e5ef6482f26 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 7 Jun 2026 05:30:38 +0000 Subject: [PATCH 2/2] chore(deps): update json-repair requirement in /nlp-service Updates the requirements on [json-repair](https://github.com/mangiucugna/json_repair) to permit the latest version. - [Release notes](https://github.com/mangiucugna/json_repair/releases) - [Commits](https://github.com/mangiucugna/json_repair/compare/v0.59.10...v0.60.1) --- updated-dependencies: - dependency-name: json-repair dependency-version: 0.60.1 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- nlp-service/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nlp-service/requirements.txt b/nlp-service/requirements.txt index 5246e18..bbb088c 100644 --- a/nlp-service/requirements.txt +++ b/nlp-service/requirements.txt @@ -40,7 +40,7 @@ nltk==3.9.4 rapidfuzz>=3.14.5 # Recovers truncated JSON from Gemini 2.5 Flash repetition-loop bug # (see issue in repo) — salvages already-billed output on parse failure. -json-repair>=0.59.10 +json-repair>=0.60.1 # Commit 7: Chroma vector store (embedded, no separate server). 0.4.x API: # PersistentClient, get_or_create_collection, cosine distance. chromadb==0.4.24