Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions src/reviewer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,6 @@ def cmd_review(args: argparse.Namespace) -> None:
method = args.method
print(f"Running method: {method}...")

if args.skip_nonsubstantial == "true":
skip = True
else:
skip = False

reasoning = getattr(args, "reasoning_effort", None)

if method == "zero_shot":
Expand All @@ -117,7 +112,6 @@ def cmd_review(args: argparse.Namespace) -> None:
slug, content,
model=args.model,
reasoning_effort=reasoning,
skip_nonsubstantial=skip,
ocr=was_ocr,
)
result = full if method == "progressive_full" else consolidated
Expand Down Expand Up @@ -521,12 +515,6 @@ def main() -> None:
default=None,
help="Reasoning effort level (default: adaptive/auto)",
)
review_parser.add_argument(
"--skip-nonsubstantial",
choices=["true", "false"],
default="false",
help="Skip non-substantial passages (default: false)",
)
review_parser.add_argument(
"--ocr",
choices=["mistral", "deepseek", "marker", "pymupdf"],
Expand Down
25 changes: 22 additions & 3 deletions src/reviewer/method_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,28 @@
from .client import chat
from .models import ReviewResult
from .prompts import DEEP_CHECK_PROMPT, OCR_CAVEAT, OVERALL_FEEDBACK_PROMPT
from .utils import count_tokens, locate_comments_in_window, parse_comments_from_list, split_into_paragraphs
from .utils import count_tokens, locate_comments_in_window, parse_comments_from_list


def split_into_paragraphs(text: str, min_chars: int = 100) -> list[str]:
    """Split *text* on blank lines into paragraphs.

    A paragraph shorter than *min_chars* is folded into the following
    paragraph (joined with a blank line); a trailing short remainder is
    appended to the last emitted paragraph, or returned on its own when
    nothing else was emitted.
    """
    result: list[str] = []
    pending = ""
    for piece in (seg.strip() for seg in text.split("\n\n")):
        if not piece:
            continue
        # Prepend any short text carried over from earlier paragraphs.
        merged = f"{pending}\n\n{piece}" if pending else piece
        if len(merged) < min_chars:
            pending = merged
        else:
            result.append(merged)
            pending = ""
    if pending:
        # Leftover short tail: attach to the previous paragraph if any.
        if result:
            result[-1] += "\n\n" + pending
        else:
            result.append(pending)
    return result


def merge_into_chunks(
Expand Down Expand Up @@ -45,10 +66,8 @@ def get_chunk_window_context(
"""Get surrounding passages as context (asymmetric: more before, less after)."""
before = window + 2
after = max(1, window - 1)

start = max(0, chunk_idx - before)
end = min(len(chunks), chunk_idx + after + 1)

context_parts = []
for i in range(start, end):
_, text = chunks[i]
Expand Down
49 changes: 35 additions & 14 deletions src/reviewer/method_progressive.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

Processes the paper sequentially, maintaining a running summary of definitions,
equations, theorems, and key claims. For each passage:
1. (Optional) Pre-filter to skip non-substantial content
1. (Optional) Pre-filter to skip non-technical content
2. Deep-check: running summary + window context + passage → find errors
3. Summary update: current summary + passage → updated summary
4. Post-hoc consolidation: one final call to deduplicate and prune low-confidence issues
Expand All @@ -16,19 +16,40 @@
from .models import ReviewResult
from .prompts import (
CONSOLIDATION_PROMPT,
DEEP_CHECK_PROMPT,
DEEP_CHECK_PROGRESSIVE_PROMPT as DEEP_CHECK_PROMPT,
OCR_CAVEAT,
OVERALL_FEEDBACK_PROMPT,
SUMMARY_UPDATE_PROMPT,
SUBSTANTIAL_FILTER_PROMPT,
TECHNICAL_FILTER_PROMPT,
)
from .utils import count_tokens, locate_comments_in_window, parse_comments_from_list, split_into_paragraphs
from .utils import count_tokens, locate_comments_in_window, parse_comments_from_list


# ---------------------------------------------------------------------------
# Paragraph / passage helpers
# ---------------------------------------------------------------------------

def split_into_paragraphs(text: str, min_chars: int = 100) -> list[str]:
    """Break a document into blank-line-separated paragraphs.

    Paragraphs below *min_chars* characters accumulate and are prepended
    (blank-line joined) to the next paragraph; a short remainder at the
    end is attached to the last paragraph produced, or stands alone when
    the whole document was short.
    """
    chunks: list[str] = []
    buffer: list[str] = []
    for part in text.split("\n\n"):
        part = part.strip()
        if not part:
            continue
        buffer.append(part)
        candidate = "\n\n".join(buffer)
        # Emit once the accumulated text is long enough.
        if len(candidate) >= min_chars:
            chunks.append(candidate)
            buffer = []
    if buffer:
        leftover = "\n\n".join(buffer)
        if chunks:
            chunks[-1] = chunks[-1] + "\n\n" + leftover
        else:
            chunks.append(leftover)
    return chunks


def merge_into_passages(
paragraphs: list[str],
target_chars: int = 8000,
Expand Down Expand Up @@ -114,14 +135,14 @@ def update_running_summary(
return updated


def is_substantial_passage(
def is_technical_passage(
passage_text: str,
model: str,
result: ReviewResult,
reasoning_effort: str | None = None,
) -> bool:
"""Use the model to decide if a passage has substantial content worth checking."""
prompt = SUBSTANTIAL_FILTER_PROMPT.format(passage=passage_text[:2000])
"""Use the model to decide if a passage has technical content worth checking."""
prompt = TECHNICAL_FILTER_PROMPT.format(passage=passage_text[:2000])
response, usage = chat(
messages=[{"role": "user", "content": prompt}],
model=model,
Expand Down Expand Up @@ -185,14 +206,14 @@ def review_progressive(
document_content: str,
model: str = "anthropic/claude-opus-4-6",
reasoning_effort: str | None = None,
skip_nonsubstantial: bool = False,
skip_nontechnical: bool = False,
window_size: int = 3,
ocr: bool = False,
) -> tuple[ReviewResult, ReviewResult]:
"""Review a paper using progressive summary approach.

Processes the paper sequentially. For each passage:
1. (Optional) Pre-filter non-substantial content
1. (Optional) Pre-filter non-technical content
2. Deep-check with running summary + window context
3. Update the running summary
Then consolidate all comments in a final pass.
Expand Down Expand Up @@ -222,10 +243,10 @@ def review_progressive(
_, passage_text = passages[idx]

# Step 0: Optional pre-filter
if skip_nonsubstantial:
if not is_substantial_passage(passage_text, model, result, reasoning_effort):
if skip_nontechnical:
if not is_technical_passage(passage_text, model, result, reasoning_effort):
skipped += 1
print(f" Passage {idx+1}/{len(passages)}: SKIPPED (non-substantial)")
print(f" Passage {idx+1}/{len(passages)}: SKIPPED (non-technical)")
# Still update summary even for skipped passages (may have definitions)
running_summary = update_running_summary(
current_summary=running_summary,
Expand Down Expand Up @@ -296,8 +317,8 @@ def review_progressive(
max_summary_tokens=max_summary_tokens,
)

if skip_nonsubstantial:
print(f" Skipped {skipped}/{len(passages)} non-substantial passages")
if skip_nontechnical:
print(f" Skipped {skipped}/{len(passages)} non-technical passages")

# Generate overall feedback
paper_start = document_content[:8000]
Expand Down
4 changes: 2 additions & 2 deletions src/reviewer/method_zero_shot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from .client import chat
from .models import ReviewResult
from .prompts import ZERO_SHOT_CHUNK_PROMPT, OCR_CAVEAT, ZERO_SHOT_PROMPT
from .prompts import LARGE_PAPER_CHUNK_PROMPT, OCR_CAVEAT, ZERO_SHOT_PROMPT
from .utils import assign_paragraph_indices, chunk_text, count_tokens, parse_review_response

MAX_TOKENS_SINGLE = 100_000 # use single prompt if paper fits
Expand Down Expand Up @@ -44,7 +44,7 @@ def review_zero_shot(
overall_parts = []
for i, chunk in enumerate(chunks):
ocr_caveat = OCR_CAVEAT if ocr else ""
prompt = ZERO_SHOT_CHUNK_PROMPT.format(
prompt = LARGE_PAPER_CHUNK_PROMPT.format(
chunk_num=i + 1,
total_chunks=len(chunks),
chunk_text=chunk,
Expand Down
Loading