Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion graphify/detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,24 @@ class FileType(str, Enum):
re.compile(r'(aws_credentials|gcloud_credentials|service.account)', re.IGNORECASE),
]

_INLINE_SECRET_PATTERNS = [
re.compile(r"sk-(live|test|proj)-[A-Za-z0-9_-]{12,}", re.IGNORECASE),
re.compile(r"gh[pousr]_[A-Za-z0-9]{20,}", re.IGNORECASE),
re.compile(r"AIza[0-9A-Za-z\-_]{20,}", re.IGNORECASE),
re.compile(r"xox[baprs]-[A-Za-z0-9-]{20,}", re.IGNORECASE),
re.compile(r"-----BEGIN [A-Z ]*PRIVATE KEY-----"),
re.compile(
r"""(?ix)
\b(api[_-]?key|secret|token|password|passwd|bearer)\b
[^=\n:]{0,32}
(?:=|:)
[ \t]*["']?[A-Za-z0-9_\-./+=]{12,}["']?
"""
),
]

_INLINE_SECRET_SCAN_BYTES = 128 * 1024

# Signals that a .md/.txt file is actually a converted academic paper
_PAPER_SIGNALS = [
re.compile(r'\barxiv\b', re.IGNORECASE),
Expand All @@ -65,6 +83,20 @@ def _is_sensitive(path: Path) -> bool:
return any(p.search(name) or p.search(full) for p in _SENSITIVE_PATTERNS)


def _contains_secret_content(path: Path) -> bool:
    """Return True if file contents look like an embedded credential.

    graphify is designed to feed matching files into LLM extraction. Path-based
    filtering is not enough because secrets often live in innocuous filenames
    such as ``settings.ts`` or generated markdown memory entries.

    Only the first ``_INLINE_SECRET_SCAN_BYTES`` of the file are examined.
    """
    try:
        # Read only the scan window. The previous read_text()[:N] approach
        # decoded the ENTIRE file into memory just to slice off the prefix,
        # which made detect() load arbitrarily large files end-to-end.
        with path.open("r", errors="ignore") as fh:
            text = fh.read(_INLINE_SECRET_SCAN_BYTES)
    except Exception:
        # Unreadable files (permissions, weird device nodes, decode failures)
        # are deliberately treated as "no secret found" so the directory walk
        # never crashes; the path-based sensitive filter still applies.
        return False
    return any(pattern.search(text) for pattern in _INLINE_SECRET_PATTERNS)


def _looks_like_paper(path: Path) -> bool:
"""Heuristic: does this text file read like an academic paper?"""
try:
Expand Down Expand Up @@ -397,7 +429,7 @@ def detect(root: Path, *, follow_symlinks: bool = False) -> dict:
continue
if _is_ignored(p, root, ignore_patterns):
continue
if _is_sensitive(p):
if _is_sensitive(p) or _contains_secret_content(p):
skipped_sensitive.append(str(p))
continue
ftype = classify_file(p)
Expand Down
38 changes: 37 additions & 1 deletion graphify/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,42 @@ def _safe_filename(url: str, suffix: str) -> str:
return name + suffix


_ANSWER_SECRET_PATTERNS = [
re.compile(r"sk-(live|test|proj)-[A-Za-z0-9_-]{12,}", re.IGNORECASE),
re.compile(r"gh[pousr]_[A-Za-z0-9]{20,}", re.IGNORECASE),
re.compile(r"AIza[0-9A-Za-z\-_]{20,}", re.IGNORECASE),
re.compile(r"xox[baprs]-[A-Za-z0-9-]{20,}", re.IGNORECASE),
re.compile(r"-----BEGIN [A-Z ]*PRIVATE KEY-----.*?-----END [A-Z ]*PRIVATE KEY-----", re.DOTALL),
re.compile(
r"""(?ix)
(
\b(?:api[_-]?key|secret|token|password|passwd|bearer)\b
[^=\n:]{0,32}
(?:=|:)
[ \t]*["']?
)
([A-Za-z0-9_\-./+=]{12,})
(["']?)
"""
),
]


def _redact_answer_secrets(answer: str) -> str:
"""Remove credential-like substrings before persisting query memory.

Query memory is re-ingested into future graph runs, so raw credentials must
never be written here even if an agent accidentally included them.
"""
redacted = answer
for pattern in _ANSWER_SECRET_PATTERNS[:-1]:
redacted = pattern.sub("[REDACTED_SECRET]", redacted)

assignment_pattern = _ANSWER_SECRET_PATTERNS[-1]
redacted = assignment_pattern.sub(r"\1[REDACTED_SECRET]\3", redacted)
return redacted


def _detect_url_type(url: str) -> str:
"""Classify the URL for targeted extraction."""
lower = url.lower()
Expand Down Expand Up @@ -273,7 +309,7 @@ def save_query_result(
"",
"## Answer",
"",
answer,
_redact_answer_secrets(answer),
]
if source_nodes:
body_lines += ["", "## Source Nodes", ""]
Expand Down
10 changes: 10 additions & 0 deletions tests/test_detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,3 +236,13 @@ def test_detect_video_not_in_words(tmp_path):
result = detect(tmp_path)
# Only video file present — total_words should be 0
assert result["total_words"] == 0


def test_detect_skips_inline_secret_content_in_benign_filename(tmp_path):
    # A credential embedded in a harmlessly named source file must still be
    # excluded from ingestion and reported as sensitive.
    secret_file = tmp_path / "settings.ts"
    secret_file.write_text('export const OPENAI_API_KEY = "sk-test-1234567890abcdef"\n')

    report = detect(tmp_path)

    assert report["files"]["code"] == []
    assert any("settings.ts" in entry for entry in report["skipped_sensitive"])
11 changes: 11 additions & 0 deletions tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,14 @@ def test_answer_in_body(tmp_path):
out = save_query_result("what is the answer?", answer, mem)
content = out.read_text()
assert answer in content


def test_answer_redacts_secret_like_values(tmp_path):
    memory_dir = tmp_path / "memory"
    answer = 'Use X-API-Key: sk-live-1234567890abcdef and token="abcd1234efgh5678"'

    result_path = save_query_result("how do I call the API?", answer, memory_dir)
    saved = result_path.read_text()

    # Neither raw secret may survive, and the marker proves the scrubbing
    # path actually ran rather than the secrets simply being absent.
    assert "sk-live-1234567890abcdef" not in saved
    assert 'token="abcd1234efgh5678"' not in saved
    assert "[REDACTED_SECRET]" in saved