From 08906c5017de838db658b35c2fee7ddf3d832819 Mon Sep 17 00:00:00 2001
From: "Aaron K. Clark" <cryptojones@owasp.org>
Date: Wed, 20 May 2026 16:51:43 -0500
Subject: [PATCH] reliability(interview): atomic save + graceful recovery from
 corrupt resume
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`Interview.save()` previously did `answers_path.write_text(...)` which
opens, truncates, and writes. If the process is killed mid-write —
SIGINT from Ctrl-C, OOM, power loss — the file on disk is left in a
half-written state. The next `socrates init --resume` then calls
`json.loads()` on the partial JSON and crashes with a stacktrace:

  json.decoder.JSONDecodeError: Expecting ',' delimiter ...
  (8 lines of traceback)

Operator's only escape is to delete .socrates-answers.json and start
over — losing every answer that *did* successfully land before the
interrupt.

Fix is two-part:

1. Atomic save: write to `<file>.tmp` then `os.replace` onto the final
   path. On POSIX (and Windows ≥ 3.3 via os.replace) the rename is
   atomic — the file is either fully old or fully new, never partial.
   Even if the tempfile write is interrupted, the real answers file is
   untouched. Cleanup unlinks the tempfile on success or failure.

2. Resume recovery: if --resume is passed and the file IS corrupt
   (despite #1, e.g. a pre-fix file or out-of-process tampering),
   warn the operator on stderr and start with empty answers instead
   of crashing. The corrupt file gets overwritten cleanly on the
   first answer.

Tests added (4):
- save() leaves no .tmp stranded
- save() with simulated rename failure: pre-existing file untouched,
  no tempfile leaked
- load() with corrupt JSON: warns, returns empty answers, no raise
- load() with permission-denied OSError: same graceful path

151/151 tests pass; ruff + mypy clean.
---
 src/socrates120x/interview.py | 54 ++++++++++++++++++++-
 tests/test_interview.py       | 91 +++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+), 2 deletions(-)
diff --git a/src/socrates120x/interview.py b/src/socrates120x/interview.py
index 723ad11..d047b63 100644
--- a/src/socrates120x/interview.py
+++ b/src/socrates120x/interview.py
@@ -11,7 +11,10 @@
 
 from __future__ import annotations
 
+import contextlib
 import json
+import os
+import sys
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
@@ -30,6 +33,31 @@
 __all__ = ["Interview", "Question", "QUESTIONS", "is_interactive"]
 
 
+def _atomic_write_text(path: Path, text: str) -> None:
+    """Write *text* to *path* via a same-directory tempfile + rename.
+
+    A naked path.write_text() opens, truncates, and writes — if the process
+    is killed mid-write (Ctrl-C, OOM, power loss) the file is left
+    truncated or partially written, which then breaks the next
+    `socrates init --resume` with a JSONDecodeError stacktrace.
+    Tempfile + rename is atomic on POSIX (and on Windows since Python 3.3
+    via os.replace) so the file on disk is either the old contents or
+    the new contents — never half-and-half.
+    """
+    tmp = path.with_name(path.name + ".tmp")
+    try:
+        tmp.write_text(text)
+        # os.replace is atomic across filesystems on the same device; if the
+        # tempfile lives in the same dir as the target (which it does here),
+        # we're always on the same device.
+        os.replace(tmp, path)
+    finally:
+        # If anything went wrong, leave a clean directory — don't strand the
+        # tempfile for the next run to wonder about.
+        with contextlib.suppress(FileNotFoundError):
+            tmp.unlink()
+
+
 # The full init interview. Order matters — earlier answers prime the user for
 # later ones (decisions before risks, risks before open questions, etc.).
 QUESTIONS: tuple[Question, ...] = (
@@ -193,11 +221,33 @@ class Interview:
     questions: tuple[Question, ...] = field(default_factory=lambda: QUESTIONS)
 
     def load(self) -> None:
-        if self.answers_path.exists() and self.resume:
+        """Load answers from disk if --resume was passed.
+
+        A previous run that was killed mid-save could have left a corrupted
+        file. Instead of blowing up with a JSONDecodeError stacktrace, warn
+        the operator and start fresh — they'd have to re-answer questions
+        either way, and the alternative is unrecoverable from the CLI.
+        """
+        if not (self.answers_path.exists() and self.resume):
+            return
+        try:
             self.answers = json.loads(self.answers_path.read_text())
+        except (json.JSONDecodeError, OSError) as e:
+            print(
+                f"warning: could not read {self.answers_path}: {e}\n"
+                f"  Starting interview from scratch. The corrupt file will be "
+                f"overwritten on the first answer.",
+                file=sys.stderr,
+            )
+            self.answers = {}
 
     def save(self) -> None:
-        self.answers_path.write_text(json.dumps(self.answers, indent=2) + "\n")
+        # Atomic write — see _atomic_write_text. Without this, Ctrl-C during
+        # save() leaves a truncated file that crashes the next --resume.
+        _atomic_write_text(
+            self.answers_path,
+            json.dumps(self.answers, indent=2) + "\n",
+        )
 
     def run(
         self,
diff --git a/tests/test_interview.py b/tests/test_interview.py
index e7b1ad5..c4068e0 100644
--- a/tests/test_interview.py
+++ b/tests/test_interview.py
@@ -120,3 +120,94 @@ def fake_editor_run(cmd: list[str], check: bool = True) -> Any:  # noqa: ARG001
 
     # business_goal is a multiline required question — should have come from the editor.
     assert iv.answers["business_goal"] == "real answer line one\nreal answer line two"
+
+
+# ---------------------------------------------------------------------------
+# Atomic save + corrupted resume recovery (reliability/interview-atomic-save)
+# ---------------------------------------------------------------------------
+
+
+def test_save_is_atomic_no_tempfile_left_behind(tmp_path: Path) -> None:
+    """save() must not leave behind the .tmp file used for the atomic rename."""
+    iv = Interview(answers_path=tmp_path / "answers.json", project_name="p")
+    iv.answers = {"k": "v"}
+    iv.save()
+    assert (tmp_path / "answers.json").exists()
+    assert not (tmp_path / "answers.json.tmp").exists()
+
+
+def test_save_does_not_leave_partial_file_on_failure(
+    tmp_path: Path, monkeypatch: Any
+) -> None:
+    """If the rename step fails, the final file should not exist in a partial
+    state. (The tempfile may exist but the target should be untouched.)"""
+    answers_path = tmp_path / "answers.json"
+    # Seed with a known good file.
+    answers_path.write_text('{"old": "value"}\n')
+
+    iv = Interview(answers_path=answers_path, project_name="p")
+    iv.answers = {"new": "value"}
+
+    # Make os.replace fail so we hit the cleanup path.
+    import socrates120x.interview as iv_mod
+    real_replace = iv_mod.os.replace
+    def boom(src: object, dst: object) -> None:
+        raise OSError("simulated replace failure")
+    monkeypatch.setattr(iv_mod.os, "replace", boom)
+
+    import contextlib  # noqa: PLC0415
+    with contextlib.suppress(OSError):
+        iv.save()  # expected to fail at the rename; verify state AFTER
+
+    # The pre-existing file must NOT have been clobbered by the failed save.
+    assert answers_path.read_text() == '{"old": "value"}\n'
+    # And no tempfile should be left behind to confuse the next run.
+    assert not (tmp_path / "answers.json.tmp").exists()
+    # Restore (paranoia).
+    monkeypatch.setattr(iv_mod.os, "replace", real_replace)
+
+
+def test_resume_with_corrupt_answers_file_warns_and_starts_fresh(
+    tmp_path: Path, capsys: Any
+) -> None:
+    """A previous Ctrl-C during save could have left a truncated/invalid
+    JSON file. Re-running with --resume must warn and start fresh
+    instead of crashing with JSONDecodeError."""
+    answers_path = tmp_path / "answers.json"
+    # Simulate a half-written file (the kind a SIGINT mid-write would leave).
+    answers_path.write_text('{"k": "v"')  # missing closing brace
+
+    iv = Interview(
+        answers_path=answers_path,
+        project_name="recoverable",
+        resume=True,
+    )
+    iv.load()  # should NOT raise
+    err = capsys.readouterr().err
+    assert "warning" in err.lower()
+    assert "answers.json" in err
+    # And answers should be empty so the interview can proceed from scratch.
+    assert iv.answers == {}
+
+
+def test_resume_with_unreadable_answers_file_warns_and_starts_fresh(
+    tmp_path: Path, capsys: Any, monkeypatch: Any
+) -> None:
+    """An OSError on read (e.g., permission denied) should also warn rather
+    than crash."""
+    answers_path = tmp_path / "answers.json"
+    answers_path.write_text('{"k": "v"}')
+
+    # Monkeypatch read_text to raise OSError to simulate permission denied.
+    real_read = Path.read_text
+    def boom(self: Path, *a: object, **kw: object) -> str:
+        if self == answers_path:
+            raise OSError("simulated permission denied")
+        return real_read(self, *a, **kw)
+    monkeypatch.setattr(Path, "read_text", boom)
+
+    iv = Interview(answers_path=answers_path, project_name="p", resume=True)
+    iv.load()  # should NOT raise
+    err = capsys.readouterr().err
+    assert "warning" in err.lower()
+    assert iv.answers == {}