From 8a30150b4cbcb7108fa2496fa6819d8d3b618d51 Mon Sep 17 00:00:00 2001 From: Dang Nguyen <38245908+dangng2004@users.noreply.github.com> Date: Sat, 25 Apr 2026 12:32:45 -0500 Subject: [PATCH] Revert "Improve review prompts and add --skip-nonsubstantial flag" --- src/reviewer/cli.py | 12 -- src/reviewer/method_local.py | 25 +++- src/reviewer/method_progressive.py | 49 +++++-- src/reviewer/method_zero_shot.py | 4 +- src/reviewer/prompts.py | 210 +++++++++++++++++++---------- 5 files changed, 196 insertions(+), 104 deletions(-) diff --git a/src/reviewer/cli.py b/src/reviewer/cli.py index 5efd8ed..5743f05 100644 --- a/src/reviewer/cli.py +++ b/src/reviewer/cli.py @@ -95,11 +95,6 @@ def cmd_review(args: argparse.Namespace) -> None: method = args.method print(f"Running method: {method}...") - if args.skip_nonsubstantial == "true": - skip = True - else: - skip = False - reasoning = getattr(args, "reasoning_effort", None) if method == "zero_shot": @@ -117,7 +112,6 @@ def cmd_review(args: argparse.Namespace) -> None: slug, content, model=args.model, reasoning_effort=reasoning, - skip_nonsubstantial=skip, ocr=was_ocr, ) result = full if method == "progressive_full" else consolidated @@ -521,12 +515,6 @@ def main() -> None: default=None, help="Reasoning effort level (default: adaptive/auto)", ) - review_parser.add_argument( - "--skip-nonsubstantial", - choices=["true", "false"], - default="false", - help="Skip non-substantial passages (default: false)", - ) review_parser.add_argument( "--ocr", choices=["mistral", "deepseek", "marker", "pymupdf"], diff --git a/src/reviewer/method_local.py b/src/reviewer/method_local.py index d5f7113..1c0d0c7 100644 --- a/src/reviewer/method_local.py +++ b/src/reviewer/method_local.py @@ -7,7 +7,28 @@ from .client import chat from .models import ReviewResult from .prompts import DEEP_CHECK_PROMPT, OCR_CAVEAT, OVERALL_FEEDBACK_PROMPT -from .utils import count_tokens, locate_comments_in_window, parse_comments_from_list, split_into_paragraphs +from .utils import count_tokens, locate_comments_in_window, parse_comments_from_list + + +def split_into_paragraphs(text: str, min_chars: int = 100) -> list[str]: + """Split document into paragraphs, merging short ones with the next.""" + raw = [p.strip() for p in text.split("\n\n") if p.strip()] + paragraphs: list[str] = [] + carry = "" + for p in raw: + if carry: + p = carry + "\n\n" + p + carry = "" + if len(p) < min_chars: + carry = p + else: + paragraphs.append(p) + if carry: + if paragraphs: + paragraphs[-1] = paragraphs[-1] + "\n\n" + carry + else: + paragraphs.append(carry) + return paragraphs def merge_into_chunks( @@ -45,10 +66,8 @@ def get_chunk_window_context( """Get surrounding passages as context (asymmetric: more before, less after).""" before = window + 2 after = max(1, window - 1) - start = max(0, chunk_idx - before) end = min(len(chunks), chunk_idx + after + 1) - context_parts = [] for i in range(start, end): _, text = chunks[i] diff --git a/src/reviewer/method_progressive.py b/src/reviewer/method_progressive.py index e0012b0..31fb947 100644 --- a/src/reviewer/method_progressive.py +++ b/src/reviewer/method_progressive.py @@ -2,7 +2,7 @@ Processes the paper sequentially, maintaining a running summary of definitions, equations, theorems, and key claims. For each passage: - 1. (Optional) Pre-filter to skip non-substantial content + 1. (Optional) Pre-filter to skip non-technical content 2. Deep-check: running summary + window context + passage → find errors 3. Summary update: current summary + passage → updated summary 4. Post-hoc consolidation: one final call to deduplicate and prune low-confidence issues @@ -16,19 +16,40 @@ from .models import ReviewResult from .prompts import ( CONSOLIDATION_PROMPT, - DEEP_CHECK_PROMPT, + DEEP_CHECK_PROGRESSIVE_PROMPT as DEEP_CHECK_PROMPT, OCR_CAVEAT, OVERALL_FEEDBACK_PROMPT, SUMMARY_UPDATE_PROMPT, - SUBSTANTIAL_FILTER_PROMPT, + TECHNICAL_FILTER_PROMPT, ) -from .utils import count_tokens, locate_comments_in_window, parse_comments_from_list, split_into_paragraphs +from .utils import count_tokens, locate_comments_in_window, parse_comments_from_list # --------------------------------------------------------------------------- # Paragraph / passage helpers # --------------------------------------------------------------------------- +def split_into_paragraphs(text: str, min_chars: int = 100) -> list[str]: + """Split document into paragraphs, merging short ones with the next.""" + raw = [p.strip() for p in text.split("\n\n") if p.strip()] + paragraphs: list[str] = [] + carry = "" + for p in raw: + if carry: + p = carry + "\n\n" + p + carry = "" + if len(p) < min_chars: + carry = p + else: + paragraphs.append(p) + if carry: + if paragraphs: + paragraphs[-1] = paragraphs[-1] + "\n\n" + carry + else: + paragraphs.append(carry) + return paragraphs + + def merge_into_passages( paragraphs: list[str], target_chars: int = 8000, @@ -114,14 +135,14 @@ def update_running_summary( return updated -def is_substantial_passage( +def is_technical_passage( passage_text: str, model: str, result: ReviewResult, reasoning_effort: str | None = None, ) -> bool: - """Use the model to decide if a passage has substantial content worth checking.""" - prompt = SUBSTANTIAL_FILTER_PROMPT.format(passage=passage_text[:2000]) + """Use the model to decide if a passage has technical content worth checking.""" + prompt = TECHNICAL_FILTER_PROMPT.format(passage=passage_text[:2000]) response, usage = chat( messages=[{"role": "user", "content": prompt}], model=model, @@ -185,14 +206,14 @@ def review_progressive( document_content: str, model: str = "anthropic/claude-opus-4-6", reasoning_effort: str | None = None, - skip_nonsubstantial: bool = False, + skip_nontechnical: bool = False, window_size: int = 3, ocr: bool = False, ) -> tuple[ReviewResult, ReviewResult]: """Review a paper using progressive summary approach. Processes the paper sequentially. For each passage: - 1. (Optional) Pre-filter non-substantial content + 1. (Optional) Pre-filter non-technical content 2. Deep-check with running summary + window context 3. Update the running summary Then consolidate all comments in a final pass. @@ -222,10 +243,10 @@ def review_progressive( _, passage_text = passages[idx] # Step 0: Optional pre-filter - if skip_nonsubstantial: - if not is_substantial_passage(passage_text, model, result, reasoning_effort): + if skip_nontechnical: + if not is_technical_passage(passage_text, model, result, reasoning_effort): skipped += 1 - print(f" Passage {idx+1}/{len(passages)}: SKIPPED (non-substantial)") + print(f" Passage {idx+1}/{len(passages)}: SKIPPED (non-technical)") # Still update summary even for skipped passages (may have definitions) running_summary = update_running_summary( current_summary=running_summary, @@ -296,8 +317,8 @@ def review_progressive( max_summary_tokens=max_summary_tokens, ) - if skip_nonsubstantial: - print(f" Skipped {skipped}/{len(passages)} non-substantial passages") + if skip_nontechnical: + print(f" Skipped {skipped}/{len(passages)} non-technical passages") # Generate overall feedback paper_start = document_content[:8000] diff --git a/src/reviewer/method_zero_shot.py b/src/reviewer/method_zero_shot.py index 3c232bc..4cc30c7 100644 --- a/src/reviewer/method_zero_shot.py +++ b/src/reviewer/method_zero_shot.py @@ -4,7 +4,7 @@ from .client import chat from .models import ReviewResult -from .prompts import ZERO_SHOT_CHUNK_PROMPT, OCR_CAVEAT, ZERO_SHOT_PROMPT +from .prompts import LARGE_PAPER_CHUNK_PROMPT, OCR_CAVEAT, ZERO_SHOT_PROMPT from .utils import assign_paragraph_indices, chunk_text, count_tokens, parse_review_response MAX_TOKENS_SINGLE = 100_000 # use single prompt if paper fits @@ -44,7 +44,7 @@ def review_zero_shot( overall_parts = [] for i, chunk in enumerate(chunks): ocr_caveat = OCR_CAVEAT if ocr else "" - prompt = ZERO_SHOT_CHUNK_PROMPT.format( + prompt = LARGE_PAPER_CHUNK_PROMPT.format( chunk_num=i + 1, total_chunks=len(chunks), chunk_text=chunk, diff --git a/src/reviewer/prompts.py b/src/reviewer/prompts.py index 7c8018d..38ddd69 100644 --- a/src/reviewer/prompts.py +++ b/src/reviewer/prompts.py @@ -5,42 +5,52 @@ REVIEWER_PREAMBLE = """\ You are a thoughtful reviewer checking a passage from an academic paper. \ Today's date is {current_date}. \ -Engage with the material in DETAIL. For each potential issue, first try to understand the authors' \ +Engage deeply with the material. For each potential issue, first try to understand the authors' \ intent and check whether your concern is resolved by context before flagging it.""" CHECK_CRITERIA = """\ Check for: -1. Mathematical correctness (e.g. wrong formulas, sign errors, missing factors, incorrect derivations, subscript or index errors) -2. Notation inconsistencies (e.g. symbols used differently than defined, undefined notation) -3. Definition/Theorem inconsistencies (e.g. statements that contradict formal definitions/theorems) -4. Numerical inconsistencies (e.g. stated values contradict what can be derived from definitions, tables, or other sections) -5. Insufficient justification (e.g. skipped non-trivial step in derivation) -6. Overclaiming (e.g. statements that claim more than the evidence supports) -7. Ambiguity (e.g. lack of detail/specification that could lead reader to incorrect conclusions)""" +1. Mathematical / formula errors: wrong formulas, sign errors, missing factors, incorrect derivations, subscript or index errors +2. Notation inconsistencies: symbols used in a way that contradicts their earlier definition +3. Inconsistency between text and formal definitions: prose says one thing but the equation says another +4. Parameter / numerical inconsistencies: stated values contradict what can be derived from definitions or tables elsewhere +5. Insufficient justification: a key derivation step is skipped where the result is non-trivial +6. Questionable claims: statements that overstate what has actually been shown +7. Ambiguity that could mislead: flag only if a careful reader could reasonably reach an incorrect conclusion +8. Underspecified methods: an algorithm, procedure, or modification is described too vaguely for a reader to reproduce — key choices, boundary conditions, or parameter settings are left implicit""" EXPLANATION_STYLE = """\ -For each issue, state precisely what is correct, as well as what is wrong and why. Quote the exact text, explain the specific error, and if relevant, show what the correct version should be. Do not flag issues that can be resolved from context. Reference standard results or conventions in the field when relevant.""" +For each issue, write like a careful reader thinking aloud. Describe what initially confused or \ +concerned you, what you checked to resolve it, and what specifically remains problematic. \ +Acknowledge what the authors got right before noting the issue. Reference standard results \ +or conventions in the field when relevant.""" LENIENCY_RULES = """\ Be lenient with: -- Introductory and overview sections -- Forward references (e.g. notation/claims that may be defined/justified later in the paper) -- Informal prose (e.g. conceptual descriptions for the purpose of intuition)""" +- Introductory and overview sections, which intentionally simplify or gloss over details +- Forward references — symbols or claims that may be defined or justified later in the paper +- Informal prose that paraphrases a formal result without repeating every qualifier""" OCR_CAVEAT = """\ -NOTE: This text was extracted via OCR and may contain notation errors. If a symbol appears inconsistent with surrounding usage, consider whether it is an OCR misread before flagging it.""" +NOTE: This text was extracted from a PDF via OCR. While automatic corrections \ +have been applied, some notation errors may remain. If you spot a symbol that \ +appears inconsistent with surrounding usage (e.g. a variable that appears once \ +with a different letter than everywhere else), consider whether it is an OCR \ +misread rather than an author error. Flag it only if it would be a real issue \ +even assuming the most plausible intended symbol.""" DO_NOT_FLAG_BASE = """\ Do NOT flag: -- Standard field conventions and notational shorthands (e.g. dropping summation bounds, overloading common symbols) -- Formatting, typesetting, capitalization -- References to content that exists elsewhere in the paper but isn't shown in the current context -- Issues resolvable by a competent reader through basic inference -- Stylistic or presentation preferences that don't affect correctness""" +- Formatting, typesetting, or capitalization issues +- References to equations or sections not shown in the context (they exist elsewhere) +- Trivial observations that any reader in the field would immediately resolve""" DO_NOT_FLAG_CHUNKED = DO_NOT_FLAG_BASE.rstrip() + """ - Incomplete text at passage boundaries""" +DO_NOT_FLAG_PROGRESSIVE = DO_NOT_FLAG_CHUNKED.rstrip() + """ +- Notation not yet in the summary — it may be introduced later""" + JSON_ARRAY_OUTPUT = """\ Return ONLY a JSON array (can be []). Each item: - "title": concise title of the issue @@ -49,13 +59,29 @@ - "type": "technical" or "logical" """ +JSON_OBJECT_OUTPUT = """\ +Return a JSON object with this structure: +{{{{ + "overall_feedback": "{feedback_desc}", + "comments": [ + {{{{ + "title": "short descriptive title of the issue", + "quote": "the exact verbatim text from the paper containing the issue (copy it exactly, preserving LaTeX)", + "explanation": "deep reasoning — what you initially thought, whether context resolves it, and what specifically remains problematic", + "type": "technical" or "logical" + }}}} + ] +}}}} + +Return ONLY the JSON object{empty_note}. No other text.""" + + # ── Deep-check prompt (used by local and progressive methods) ─────────────── DEEP_CHECK_PROMPT = f"""{REVIEWER_PREAMBLE} {{ocr_caveat}} - -CONTEXT: +FULL PAPER CONTEXT (relevant sections): {{context}} --- @@ -75,18 +101,16 @@ {JSON_ARRAY_OUTPUT}""" - -# ── Zero-shot prompts ─────────────────────────────────────────────────────── - -ZERO_SHOT_PROMPT = f"""{REVIEWER_PREAMBLE} +DEEP_CHECK_PROGRESSIVE_PROMPT = f"""{REVIEWER_PREAMBLE} {{ocr_caveat}} +FULL PAPER CONTEXT (relevant sections): +{{context}} --- -PAPER: - -{{paper_text}} +PASSAGE TO CHECK: +{{passage}} --- @@ -96,36 +120,73 @@ {LENIENCY_RULES} +{DO_NOT_FLAG_PROGRESSIVE} + +{JSON_ARRAY_OUTPUT}""" + + +# ── Zero-shot prompts ─────────────────────────────────────────────────────── + +ZERO_SHOT_PROMPT = f"""\ +You are a thoughtful reviewer reading the following academic paper. \ +Today's date is {{current_date}}. \ +Engage deeply with the material. For each potential issue, first try to understand the authors' \ +intent and check whether your concern is resolved by context before flagging it. + +Carefully check for: +1. Mathematical / formula errors: wrong formulas, sign errors, missing factors, incorrect derivations, subscript or index errors +2. Notation inconsistencies: symbols used in a way that contradicts their earlier definition +3. Inconsistency between text and formal definitions: prose says one thing but the equation says another +4. Parameter / numerical inconsistencies: stated values contradict what can be derived from definitions or tables elsewhere +5. Insufficient justification: a key derivation step is skipped where the result is non-trivial +6. Questionable claims: statements that overstate what has actually been shown +7. Ambiguity that could mislead: flag only if a careful reader could reasonably reach an incorrect conclusion +8. Underspecified methods: an algorithm, procedure, or modification is described too vaguely for a reader to reproduce — key choices, boundary conditions, or parameter settings are left implicit + +{EXPLANATION_STYLE} + +{LENIENCY_RULES} + {DO_NOT_FLAG_BASE} Return a JSON object with this structure: {{{{ - "overall_feedback": "one paragraph high-level assessment of the paper's quality and main issues", + "overall_feedback": "One paragraph high-level assessment of the paper's quality and main issues", "comments": [ {{{{ - "title": "concise title of the issue", - "quote": "exact verbatim text from the paper (preserving LaTeX)", - "explanation": "precise explanation of what is wrong and why", + "title": "short descriptive title of the issue", + "quote": "the exact verbatim text from the paper containing the issue (copy it exactly, preserving LaTeX)", + "explanation": "deep reasoning — what you initially thought, whether context resolves it, and what specifically remains problematic", "type": "technical" or "logical" }}}} ] }}}} -Return ONLY the JSON object. No other text.""" - -ZERO_SHOT_CHUNK_PROMPT = f"""{REVIEWER_PREAMBLE} +Return ONLY the JSON object, no other text. {{ocr_caveat}} - --- -PASSAGE TO CHECK: - -{{chunk_text}} +PAPER: ---- +{{paper_text}} +""" -{CHECK_CRITERIA} +LARGE_PAPER_CHUNK_PROMPT = f"""\ +You are a thoughtful reviewer checking a section of an academic paper. \ +Today's date is {{current_date}}. \ +Engage deeply with the material. For each potential issue, first try to understand the authors' \ +intent and check whether your concern is resolved by context before flagging it. + +Carefully check for: +1. Mathematical / formula errors: wrong formulas, sign errors, missing factors, incorrect derivations, subscript or index errors +2. Notation inconsistencies: symbols used in a way that contradicts their earlier definition +3. Inconsistency between text and formal definitions: prose says one thing but the equation says another +4. Parameter / numerical inconsistencies: stated values contradict what can be derived from definitions or tables elsewhere +5. Insufficient justification: a key derivation step is skipped where the result is non-trivial +6. Questionable claims: statements that overstate what has actually been shown +7. Ambiguity that could mislead: flag only if a careful reader could reasonably reach an incorrect conclusion +8. Underspecified methods: an algorithm, procedure, or modification is described too vaguely for a reader to reproduce — key choices, boundary conditions, or parameter settings are left implicit {EXPLANATION_STYLE} @@ -133,26 +194,34 @@ {DO_NOT_FLAG_CHUNKED} -Return a JSON object with this structure: +Return a JSON object: {{{{ "overall_feedback": "brief assessment of this section", "comments": [ {{{{ - "title": "concise title of the issue", - "quote": "exact verbatim text from the paper (preserving LaTeX)", - "explanation": "precise explanation of what is wrong and why", + "title": "short descriptive title of the issue", + "quote": "the exact verbatim text from the paper containing the issue (copy it exactly, preserving LaTeX)", + "explanation": "deep reasoning — what you initially thought, whether context resolves it, and what specifically remains problematic", "type": "technical" or "logical" }}}} ] }}}} -Return ONLY the JSON object. No other text.""" +Return ONLY the JSON object (comments can be [] if no issues found). No other text. + +{{ocr_caveat}} +--- + +SECTION {{chunk_num}} of {{total_chunks}}: + +{{chunk_text}} +""" # ── Progressive-only prompts ──────────────────────────────────────────────── SUMMARY_UPDATE_PROMPT = """\ -You are maintaining a concise running summary of an academic paper. \ +You are maintaining a concise running summary of an academic paper's key technical content. \ This summary will be used as context when reviewing later sections of the paper. CURRENT SUMMARY: @@ -160,7 +229,7 @@ --- -NEW PASSAGE: +NEW PASSAGE (section {passage_idx} of {total_passages}): {passage_text} --- @@ -168,36 +237,42 @@ Update the summary to incorporate any NEW information from this passage. \ Keep the summary structured and concise. Include: -1. **Notation and Definitions**: Any new symbols, variables, or terms defined +1. **Notation & Definitions**: Any new symbols, variables, or terms defined 2. **Key Equations**: Important equations or formulas introduced (write them out, preserving LaTeX) -3. **Theorems and Propositions**: Statements of theorems, lemmas, corollaries (brief statement, not proof) +3. **Theorems & Propositions**: Statements of theorems, lemmas, corollaries (brief statement, not proof) 4. **Assumptions**: Any stated assumptions or conditions 5. **Key Claims**: Important results or conclusions established Rules: -- PRESERVE ALL existing summary content unless it is superseded by new information +- PRESERVE all existing summary content unless it is superseded by new information +- ADD new items from the passage - Do NOT include commentary, proof details, or experimental results - Do NOT include information not in the passage or existing summary -- Keep entries brief (one line per item where possible) +- Keep entries brief — one line per item where possible - If the passage contains no new definitions, equations, or key claims, return the summary unchanged -Return the updated summary directly.""" +Return the updated summary directly (no JSON, no code fences).""" -SUBSTANTIAL_FILTER_PROMPT = """\ -Does this passage from an academic paper contain substantial content worth checking for errors? \ -Substantial content includes: equations, derivations, definitions, theorems, proofs, algorithms, \ -logical reasoning/claims. +TECHNICAL_FILTER_PROMPT = """\ +Does this passage from an academic paper contain technical content worth checking for errors? \ +Technical content includes: equations, proofs, derivations, theorems, algorithms, \ +specific quantitative claims, or formal definitions. -Non-substantial content includes: introductions, citations, acknowledgments, author bios. +Non-technical content includes: introductions, related work surveys, acknowledgments, \ +reference lists, author bios, general motivation, or high-level overviews without formal claims. -PASSAGE TO CHECK: +PASSAGE: {passage} Answer with ONLY "yes" or "no".""" CONSOLIDATION_PROMPT = """\ You are reviewing the complete list of issues found in an academic paper. \ -Your job is to consolidate this list by removing duplicates. If multiple issues flag the SAME underlying problem, keep the most detailed and well-explained one and remove the others. +Your job is to consolidate this list: remove duplicates and merge closely related issues. + +Remove issues that: +- Flag the same underlying problem as another issue (keep the better-explained one) +- Flag standard conventions, notational shorthands, or well-known results ISSUES FOUND: {issues_json} @@ -209,20 +284,9 @@ # ── Overall feedback (shared by local and progressive) ────────────────────── OVERALL_FEEDBACK_PROMPT = """\ -You are an expert academic reviewer. Based on the beginning of the paper below, write one paragraph of high-level feedback that: -- Identifies the paper's strongest contributions -- Raises 3-5 major thematic critiques (reference specific passages) -- Identifies conceptual tensions or unresolved contradictions in the paper's core argument -- If empirical, evaluates whether the experimental design, baselines, and metrics are appropriate and whether the results actually support the conclusions - -Be direct and specific. Read like a domain expert, not a general-purpose critic. Do NOT summarize the paper. Do NOT list individual errors. - -Things to consider: -- Scope of claims: are the results narrow but presented as broadly applicable? -- Mising baselines or comparisons: what obvious alternative approaches are not discussed? -- Assumptions: are the key assumptions stated clearly, and are the realistic? -- Internal consistency: do the different parts of the paper (theory, experiments, conclusions) tell a coherent story? -- Unresolved ideas: what are the most important questions the paper doesn't address? +You are an expert academic reviewer. Based on the beginning of the paper below, \ +write one paragraph of high-level feedback on the paper's quality, clarity, \ +and most significant issues. PAPER (first 8000 characters): {paper_start}