-
Notifications
You must be signed in to change notification settings - Fork 0
Harden public reporting and leak gates #11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import re | ||
| import subprocess | ||
| import sys | ||
| from pathlib import Path | ||
|
|
||
|
|
@@ -15,18 +16,110 @@ | |
| "sk-public-test-DO-NOT-LEAK", | ||
| } | ||
|
|
||
| SKIP_DIRS = {".git", ".venv", "venv", "__pycache__", "private", "runs", "traces", "artifacts", "build", "dist"} | ||
| SKIP_CONTENT_DIRS = { | ||
| ".git", | ||
| ".venv", | ||
| "venv", | ||
| "__pycache__", | ||
| "private", | ||
| "runs", | ||
| "traces", | ||
| "artifacts", | ||
| "build", | ||
| "dist", | ||
| } | ||
| SKIP_WALK_DIRS = {".git", ".venv", "venv", "__pycache__", "build", "dist"} | ||
| SKIP_WALK_PREFIXES = { | ||
| "runs/", | ||
| "traces/", | ||
| "artifacts/", | ||
| "examples/artifacts/", | ||
| "reports/generated/", | ||
| } | ||
| TEXT_SUFFIXES = {".md", ".py", ".json", ".txt", ".csv", ".html", ".env", ".toml", ".yaml", ".yml", ".gitignore", ""} | ||
|
|
||
| DENIED_PATH_PREFIXES = { | ||
| "runs/", | ||
| "traces/", | ||
| "private/", | ||
| "fixtures/private/", | ||
| "examples/artifacts/", | ||
| "artifacts/", | ||
| "reports/generated/", | ||
| } | ||
| DENIED_PATH_PARTS = {"runs", "traces", "private", "artifacts"} | ||
| DENIED_PATH_TERMS = { | ||
| ".env", | ||
| ".env.", | ||
| "secret", | ||
| "token", | ||
| "key", | ||
| "answer_key", | ||
| "hidden_label", | ||
| "customer_private", | ||
| } | ||
|
|
||
| def should_skip(path: Path) -> bool: | ||
| return any(part in SKIP_DIRS for part in path.parts) | ||
|
|
||
def should_skip_content(path: Path) -> bool:
    """Return True when any component of *path* is listed in SKIP_CONTENT_DIRS."""
    # A shared element between the path components and the skip set means skip.
    return not SKIP_CONTENT_DIRS.isdisjoint(path.parts)
|
|
||
| def scan(root: Path) -> list[str]: | ||
| findings = [] | ||
|
|
||
def _relative_path(path: Path) -> str:
    """Return *path* as a forward-slash string with one leading "./" removed."""
    posix_text = path.as_posix()
    return posix_text.removeprefix("./")
|
|
||
|
|
||
def denylisted_path_reason(path: str) -> str | None:
    """Explain why *path* is denylisted, or return None when it is allowed.

    Checks run in this order, and the first hit wins:

    1. the lowercased path starts with an entry of DENIED_PATH_PREFIXES;
    2. a path component equals an entry of DENIED_PATH_PARTS, or is an env
       file (".env" or ".env.<anything>");
    3. the path contains an entry of DENIED_PATH_TERMS as a whole word.

    Terms are matched on word boundaries instead of raw substrings, so
    "src/monkeypatch.py" and "docs/tokenization.md" are no longer reported
    for containing "key" / "token". A word boundary is any non-alphanumeric
    character, the start/end of the path, or a camelCase transition; a
    trailing plural "s" is tolerated ("secrets.yaml", "tokens.json").
    Fused lowercase compounds such as "secretkey" are deliberately NOT
    matched by this rule.

    Args:
        path: Repository-relative path, using "/" separators.

    Returns:
        A short human-readable reason, or None if no rule matched.
    """
    relative = _relative_path(Path(path))
    normalized = relative.lower()

    # Sorted iteration keeps the reported reason stable between runs; plain
    # set iteration order varies with string hash randomization.
    for prefix in sorted(DENIED_PATH_PREFIXES):
        if normalized.startswith(prefix):
            return f"denied path prefix {prefix}"

    for part in normalized.split("/"):
        if part in DENIED_PATH_PARTS:
            return f"denied path component {part}"
        if part == ".env" or part.startswith(".env."):
            return "denied env file path"

    # Mark camelCase transitions before lowercasing so "apiKey" exposes "key"
    # as its own word.
    searchable = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", "_", relative).lower()

    # Longest term first so the most specific reason is reported
    # ("answer_key" rather than "key").
    for term in sorted(DENIED_PATH_TERMS, key=lambda item: (-len(item), item)):
        # Only require a boundary on a side where the term itself begins/ends
        # with an alphanumeric character; terms such as ".env." keep plain
        # substring semantics on their punctuation side.
        left = r"(?<![a-z0-9])" if term[:1].isalnum() else ""
        right = r"s?(?![a-z0-9])" if term[-1:].isalnum() else ""
        if re.search(left + re.escape(term) + right, searchable):
            return f"denied path term {term}"
    return None
|
|
||
|
|
||
| def git_tracked_paths(root: Path) -> list[str] | None: | ||
| try: | ||
| result = subprocess.run( | ||
| ["git", "-C", str(root), "ls-files", "-z"], | ||
| check=True, | ||
| stdout=subprocess.PIPE, | ||
| stderr=subprocess.DEVNULL, | ||
| ) | ||
| except (FileNotFoundError, subprocess.CalledProcessError): | ||
| return None | ||
| return [item.decode("utf-8") for item in result.stdout.split(b"\0") if item] | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎. |
||
|
|
||
|
|
||
| def walk_source_paths(root: Path) -> list[str]: | ||
| paths = [] | ||
| for path in root.rglob("*"): | ||
| if path.is_dir() or should_skip(path): | ||
| rel = path.relative_to(root) | ||
| rel_name = rel.as_posix() | ||
| if any(part in SKIP_WALK_DIRS for part in rel.parts): | ||
| continue | ||
| if any(rel_name.startswith(prefix) for prefix in SKIP_WALK_PREFIXES): | ||
| continue | ||
| if path.is_file(): | ||
| paths.append(rel_name) | ||
| return sorted(paths) | ||
|
|
||
|
|
||
| def scan_paths(root: Path, paths: list[str]) -> list[str]: | ||
| findings = [] | ||
| for rel_path in sorted(paths): | ||
| reason = denylisted_path_reason(rel_path) | ||
| if reason: | ||
| findings.append(f"{rel_path}: {reason}") | ||
| path = root / rel_path | ||
| if not path.is_file() or should_skip_content(path): | ||
| continue | ||
| if path.suffix not in TEXT_SUFFIXES and path.name != ".gitignore": | ||
| continue | ||
|
|
@@ -46,6 +139,12 @@ def scan(root: Path) -> list[str]: | |
| return findings | ||
|
|
||
|
|
||
def scan(root: Path) -> list[str]:
    """Scan *root* and return the findings produced by scan_paths.

    The path list comes from git_tracked_paths; when that returns None the
    list from walk_source_paths is used instead.
    """
    paths = git_tracked_paths(root)
    if paths is None:
        paths = walk_source_paths(root)
    return scan_paths(root, paths)
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| root = Path(sys.argv[1] if len(sys.argv) > 1 else ".").resolve() | ||
| findings = scan(root) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import re | ||
| from collections.abc import Mapping, Sequence | ||
| from typing import Any | ||
|
|
||
| REDACTED = "[REDACTED]" | ||
| REDACTED_KEY = "[REDACTED_KEY]" | ||
|
|
||
| UNSAFE_TEXT_PATTERNS = [ | ||
| re.compile(pattern, re.IGNORECASE) | ||
| for pattern in [ | ||
| r"\banswer[_ -]?key\b", | ||
| r"\bhidden[_ -]?label(s)?\b", | ||
| r"\bprivate[_ -]?threshold\b", | ||
| r"\bprotected[_ -]?scorer[_ -]?config\b", | ||
| r"\bscorer[_ -]?config\b", | ||
| r"\bcanary\b", | ||
| r"\bCANARY_", | ||
| r"\bHONEY_", | ||
| r"\bhoney row\b", | ||
| r"\bsecret\b", | ||
| r"\btoken\b", | ||
| r"\bapi[_ -]?key\b", | ||
| r"\bexpected\s*=", | ||
| r"\bexpected\s*:", | ||
| r"\bcorrect answer\b", | ||
| r"\bprivate rubric\b", | ||
| r"\bcustomer[_ -]?private\b", | ||
| r"fixtures/private", | ||
| r"(^|/)private/", | ||
| r"\braw[_ -]?trace\b", | ||
| r"\braw[_ -]?diagnostics\b", | ||
| ] | ||
| ] | ||
|
|
||
|
|
||
def is_public_safe_text(text: str) -> bool:
    """Return True when no pattern in UNSAFE_TEXT_PATTERNS matches *text*."""
    for unsafe in UNSAFE_TEXT_PATTERNS:
        if unsafe.search(text):
            return False
    return True
|
|
||
|
|
||
def redact_text(text: str) -> str:
    """Return *text* unchanged if it is public-safe, otherwise the REDACTED marker.

    The whole string is replaced, not just the matching portion.
    """
    return text if is_public_safe_text(text) else REDACTED
|
|
||
|
|
||
def redact_obj(obj: Any) -> Any:
    """Return a copy of *obj* with unsafe strings and mapping keys redacted.

    * ``str`` values go through redact_text.
    * Mappings become plain dicts with stringified keys. A key that is not
      public-safe is replaced by REDACTED_KEY; further unsafe keys in the same
      mapping get a numbered placeholder ("[REDACTED_KEY]#2", ...), so their
      values no longer overwrite one another.
    * Tuples stay tuples, other sequences (except bytes/bytearray) become
      lists, each item redacted recursively.
    * Anything else is returned unchanged.
    """
    if isinstance(obj, str):
        return redact_text(obj)
    if isinstance(obj, Mapping):
        redacted: dict[str, Any] = {}
        unsafe_keys = 0
        for key, value in obj.items():
            key_text = str(key)
            if is_public_safe_text(key_text):
                safe_key = key_text
            else:
                unsafe_keys += 1
                # First unsafe key keeps the bare placeholder for backward
                # compatibility; later ones are numbered to stay distinct.
                safe_key = (
                    REDACTED_KEY
                    if unsafe_keys == 1
                    else f"{REDACTED_KEY}#{unsafe_keys}"
                )
            # NOTE(review): the value under a redacted key is only redacted by
            # its own content, not because its key was unsafe — confirm that
            # this is the intended policy.
            redacted[safe_key] = redact_obj(value)
        return redacted
    if isinstance(obj, tuple):
        return tuple(redact_obj(item) for item in obj)
    # A tuple of types works on every supported interpreter; the union form
    # inside isinstance() needs Python 3.10+ at runtime.
    if isinstance(obj, Sequence) and not isinstance(obj, (bytes, bytearray)):
        return [redact_obj(item) for item in obj]
    return obj
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new path gate uses substring checks (`if term in normalized`) for generic terms like `key` and `token`, so benign filenames such as `src/monkeypatch.py` or `docs/tokenization.md` will be flagged as leaks even when file contents are safe. Because `make leak-check` runs this script over tracked files, this can block normal repo changes with false positives. Match on path components/word boundaries (or stricter patterns) instead of raw substring inclusion.
Useful? React with 👍 / 👎.