Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion graphify/detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,24 @@ class FileType(str, Enum):
re.compile(r'(aws_credentials|gcloud_credentials|service.account)', re.IGNORECASE),
]

_INLINE_SECRET_PATTERNS = [
re.compile(r"sk-(live|test|proj)-[A-Za-z0-9_-]{12,}", re.IGNORECASE),
re.compile(r"gh[pousr]_[A-Za-z0-9]{20,}", re.IGNORECASE),
re.compile(r"AIza[0-9A-Za-z\-_]{20,}", re.IGNORECASE),
re.compile(r"xox[baprs]-[A-Za-z0-9-]{20,}", re.IGNORECASE),
re.compile(r"-----BEGIN [A-Z ]*PRIVATE KEY-----"),
re.compile(
r"""(?ix)
\b(api[_-]?key|secret|token|password|passwd|bearer)\b
[^=\n:]{0,32}
(?:=|:)
[ \t]*["']?[A-Za-z0-9_\-./+=]{12,}["']?
"""
),
]

_INLINE_SECRET_SCAN_BYTES = 128 * 1024

# Signals that a .md/.txt file is actually a converted academic paper
_PAPER_SIGNALS = [
re.compile(r'\barxiv\b', re.IGNORECASE),
Expand All @@ -65,6 +83,20 @@ def _is_sensitive(path: Path) -> bool:
return any(p.search(name) or p.search(full) for p in _SENSITIVE_PATTERNS)


def _contains_secret_content(path: Path) -> bool:
    """Return True if file contents look like an embedded credential.

    graphify is designed to feed matching files into LLM extraction. Path-based
    filtering is not enough because secrets often live in innocuous filenames
    such as ``settings.ts`` or generated markdown memory entries.

    Only the first ``_INLINE_SECRET_SCAN_BYTES`` of the file are examined.
    """
    try:
        # Read only the scan window. The previous read_text()[:N] approach
        # decoded the ENTIRE file into memory just to slice off the prefix,
        # which made detect() load arbitrarily large files end-to-end.
        with path.open("r", errors="ignore") as fh:
            text = fh.read(_INLINE_SECRET_SCAN_BYTES)
    except Exception:
        # Unreadable files (permissions, weird device nodes, decode failures)
        # are deliberately treated as "no secret found" so the directory walk
        # never crashes; the path-based sensitive filter still applies.
        return False
    return any(pattern.search(text) for pattern in _INLINE_SECRET_PATTERNS)


def _looks_like_paper(path: Path) -> bool:
"""Heuristic: does this text file read like an academic paper?"""
try:
Expand Down Expand Up @@ -397,7 +429,7 @@ def detect(root: Path, *, follow_symlinks: bool = False) -> dict:
continue
if _is_ignored(p, root, ignore_patterns):
continue
if _is_sensitive(p):
if _is_sensitive(p) or _contains_secret_content(p):
skipped_sensitive.append(str(p))
continue
ftype = classify_file(p)
Expand Down
38 changes: 37 additions & 1 deletion graphify/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,42 @@ def _safe_filename(url: str, suffix: str) -> str:
return name + suffix


_ANSWER_SECRET_PATTERNS = [
re.compile(r"sk-(live|test|proj)-[A-Za-z0-9_-]{12,}", re.IGNORECASE),
re.compile(r"gh[pousr]_[A-Za-z0-9]{20,}", re.IGNORECASE),
re.compile(r"AIza[0-9A-Za-z\-_]{20,}", re.IGNORECASE),
re.compile(r"xox[baprs]-[A-Za-z0-9-]{20,}", re.IGNORECASE),
re.compile(r"-----BEGIN [A-Z ]*PRIVATE KEY-----.*?-----END [A-Z ]*PRIVATE KEY-----", re.DOTALL),
re.compile(
r"""(?ix)
(
\b(?:api[_-]?key|secret|token|password|passwd|bearer)\b
[^=\n:]{0,32}
(?:=|:)
[ \t]*["']?
)
([A-Za-z0-9_\-./+=]{12,})
(["']?)
"""
),
]


def _redact_answer_secrets(answer: str) -> str:
"""Remove credential-like substrings before persisting query memory.

Query memory is re-ingested into future graph runs, so raw credentials must
never be written here even if an agent accidentally included them.
"""
redacted = answer
for pattern in _ANSWER_SECRET_PATTERNS[:-1]:
redacted = pattern.sub("[REDACTED_SECRET]", redacted)

assignment_pattern = _ANSWER_SECRET_PATTERNS[-1]
redacted = assignment_pattern.sub(r"\1[REDACTED_SECRET]\3", redacted)
return redacted


def _detect_url_type(url: str) -> str:
"""Classify the URL for targeted extraction."""
lower = url.lower()
Expand Down Expand Up @@ -273,7 +309,7 @@ def save_query_result(
"",
"## Answer",
"",
answer,
_redact_answer_secrets(answer),
]
if source_nodes:
body_lines += ["", "## Source Nodes", ""]
Expand Down
10 changes: 10 additions & 0 deletions tests/test_detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,3 +236,13 @@ def test_detect_video_not_in_words(tmp_path):
result = detect(tmp_path)
# Only video file present — total_words should be 0
assert result["total_words"] == 0


def test_detect_skips_inline_secret_content_in_benign_filename(tmp_path):
    # A credential embedded in a harmlessly named source file must still be
    # excluded from ingestion and reported as sensitive.
    secret_file = tmp_path / "settings.ts"
    secret_file.write_text('export const OPENAI_API_KEY = "sk-test-1234567890abcdef"\n')

    report = detect(tmp_path)

    assert report["files"]["code"] == []
    assert any("settings.ts" in entry for entry in report["skipped_sensitive"])
11 changes: 11 additions & 0 deletions tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,14 @@ def test_answer_in_body(tmp_path):
out = save_query_result("what is the answer?", answer, mem)
content = out.read_text()
assert answer in content


def test_answer_redacts_secret_like_values(tmp_path):
    memory_dir = tmp_path / "memory"
    answer = 'Use X-API-Key: sk-live-1234567890abcdef and token="abcd1234efgh5678"'

    result_path = save_query_result("how do I call the API?", answer, memory_dir)
    saved = result_path.read_text()

    # Neither raw secret may survive, and the marker proves the scrubbing
    # path actually ran rather than the secrets simply being absent.
    assert "sk-live-1234567890abcdef" not in saved
    assert 'token="abcd1234efgh5678"' not in saved
    assert "[REDACTED_SECRET]" in saved