diff --git a/.gitignore b/.gitignore index 3630098..f514b74 100644 --- a/.gitignore +++ b/.gitignore @@ -1,43 +1,2 @@ -# Python -.venv/ -__pycache__/ -*.pyc -*.pyo -.python-version -build/ -dist/ -*.egg-info/ - - -# Environment -.env - -# Results -/results/ -benchmarks/results/ -data/raw_html/ -benchmarks/data/raw_html/ - -# Review output -review_results/ - -# Perturbation output -perturbation_results/ - -# Jupyter -.ipynb_checkpoints/ - -# IDE -.idea/ -.vscode/ - -# macOS -.DS_Store -.claude/ - -*.log -*.zip - -configs/ -shell_scripts/ -benchmarks/perturbation/data/ \ No newline at end of file +# Created by venv; see https://docs.python.org/3/library/venv.html +* diff --git a/benchmarks/experimental_perturbations/experimental.py b/benchmarks/experimental_perturbations/experimental.py new file mode 100644 index 0000000..184199b --- /dev/null +++ b/benchmarks/experimental_perturbations/experimental.py @@ -0,0 +1,157 @@ +import reviewer.prompts as p +import json +from pathlib import Path + +from reviewer.parsers import parse_document +from reviewer.method_progressive import review_progressive, review_zero_shot + +# ── Zero-shot prompts (EDITED) ─────────────────────────────────────────────────────── + +p.ZERO_SHOT_PROMPT = f"""{p.REVIEWER_PREAMBLE} + +{{ocr_caveat}} + +--- + +PAPER: + +{{paper_text}} + +--- + +Check specifically for EXPERIMENTAL errors. This includes poor experimental design, instances of p-hacking, and incorrect interpretation of data/results. + +{p.EXPLANATION_STYLE} + +{p.LENIENCY_RULES} + +{p.DO_NOT_FLAG_BASE} + +Return a JSON object with this structure: +{{{{ + "overall_feedback": "one paragraph high-level assessment of the paper's quality and main issues", + "comments": [ + {{{{ + "title": "concise title of the issue", + "quote": "exact verbatim text from the paper (preserving LaTeX)", + "explanation": "precise explanation of what is wrong and why", + "type": "technical" or "logical" + }}}} + ] +}}}} + +Return ONLY the JSON object. No other text.""" + +p.ZERO_SHOT_CHUNK_PROMPT = f"""{p.REVIEWER_PREAMBLE} + +{{ocr_caveat}} + +--- + +PASSAGE TO CHECK: + +{{chunk_text}} + +--- + +Check specifically for EXPERIMENTAL errors. This includes poor experimental design, instances of p-hacking, and incorrect interpretation of data/results. + +{p.EXPLANATION_STYLE} + +{p.LENIENCY_RULES} + +{p.DO_NOT_FLAG_CHUNKED} + +Return a JSON object with this structure: +{{{{ + "overall_feedback": "brief assessment of this section", + "comments": [ + {{{{ + "title": "concise title of the issue", + "quote": "exact verbatim text from the paper (preserving LaTeX)", + "explanation": "precise explanation of what is wrong and why", + "type": "technical" or "logical" + }}}} + ] +}}}} + +Return ONLY the JSON object. No other text.""" + +# ── Progressive prompt (EDITED) ─────────────────────────────────────────────────────── + +p.DEEP_CHECK_PROMPT = f"""{p.REVIEWER_PREAMBLE} + +{{ocr_caveat}} + +CONTEXT: +{{context}} + +--- + +PASSAGE TO CHECK: +{{passage}} + +--- + +Check specifically for EXPERIMENTAL errors. This includes poor experimental design, instances of p-hacking, and incorrect interpretation of data/results. + +{p.EXPLANATION_STYLE} + +{p.LENIENCY_RULES} + +{p.DO_NOT_FLAG_CHUNKED} + +{p.JSON_ARRAY_OUTPUT}""" + + +def review_experimental(perturbations_dir, output_dir, method): + output_dir.mkdir(parents=True, exist_ok=True) + + for category_dir in perturbations_dir.iterdir(): + if not category_dir.is_dir() or category_dir.name.startswith("."): + continue + + for paper_dir in (category_dir / "all").iterdir(): + if not paper_dir.is_dir(): + continue + + md_files = list((paper_dir / "experimental").glob("*recorrupted.md")) + if not md_files: + continue + md_file = md_files[0] + + slug = paper_dir.name + output_path = output_dir / method / f"{slug}.json" + if output_path.exists(): + print(f" Skipping {slug} (already done)") + continue + + print(f"Reviewing {slug}...") + text = md_file.read_text() + + if method == "progressive": + consolidated, _ = review_progressive( + paper_slug=slug, + document_content=text, + model="anthropic/claude-opus-4-6", + reasoning_effort=None, + skip_nonsubstantial=False, + window_size=3, + ocr=False, + ) + with open(output_path, "w") as f: + json.dump(consolidated.to_dict(), f, indent=2) + elif method == "zero_shot": + result = review_zero_shot( + paper_slug=slug, + document_content=text, + model="anthropic/claude-opus-4-6", + reasoning_effort=None, + ocr= False + ) + +if __name__ == "__main__": + perturbations_dir = Path("./perturbation_results") + output_dir = Path("./experimental_comments") + method = "zero_shot" + review_experimental(perturbations_dir, output_dir, method) \ No newline at end of file diff --git a/benchmarks/results_iclr_coarse/analysis.py b/benchmarks/results_iclr_coarse/analysis.py new file mode 100644 index 0000000..559f84e --- /dev/null +++ b/benchmarks/results_iclr_coarse/analysis.py @@ -0,0 +1,550 @@ +import json +from pathlib import Path +from collections import defaultdict +from rapidfuzz import fuzz +import numpy as np +import matplotlib.pyplot as plt +from matplotlib_venn import venn2, venn2_circles, venn3, venn3_circles + +from sentence_transformers import SentenceTransformer +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.cluster import KMeans +from sklearn.manifold import TSNE +from collections import Counter + +model_dict = { + 'deepseek-v4-flash': 'DeepSeek-V4-Flash', + 'gemini-3.1-flash-lite-preview': 'Gemini-3.1-Flash-Lite', + 'glm-4.7-flash': 'GLM-4.7-Flash', + 'qwen3.6-35b-a3b': 'Qwen3.6-35B-A3B' +} + +def load(path): + return json.loads(Path(path).read_text()) + +def method_key(folder_name, model): + prefix = {"coarse": "coarse", "progressive": "progressive", "zero_shot": "zero_shot"} + return f"{prefix[folder_name]}__{model}" + +''' +folders = {method: folder path} +models = [models] +total_papers = # of papers + +Ex: +FOLDERS = { + "coarse": "./coarse_v2/", + "progressive": "./scaleup_v2_progressive/", + "zero_shot": "./scaleup_v2_zero_shot/" +} + +MODELS = ["deepseek-v4-flash", "gemini-3.1-flash-lite-preview", "glm-4.7-flash", "qwen3.6-35b-a3b"] + +TOTAL_PAPERS = len(list(Path(FOLDERS["coarse"]).glob("*.json"))) +''' + + + +# VOLUME + +def volume_dicts(folders, models, total_papers): + volume = {} # { slug -> { "coarse/deepseek": 5, "progressive/deepseek": 3, ... } } + + for folder_name, folder_path in folders.items(): + for p in Path(folder_path).glob("*.json"): + slug = p.stem + d = load(p) + if slug not in volume: + volume[slug] = defaultdict(dict) + for model in models: + key = method_key(folder_name, model) + comments = d.get("methods", {}).get(key, {}).get("comments", []) + volume[slug][model][folder_name] = len(comments) + + highest_volume = defaultdict(dict) + average_volume = defaultdict(dict) + + for _, models_volume in volume.items(): + for model, counts in models_volume.items(): + highest = max(counts, key=counts.get) + if highest not in highest_volume[model]: + highest_volume[model][highest] = 0 + highest_volume[model][highest] += 1 + + for method, number in counts.items(): + if method not in average_volume[model]: + average_volume[model][method] = 0 + average_volume[model][method] += number / total_papers + + print(f"Average number of comments per paper:\n") + print(f"{'Model':<40} {'Coarse':>10} {'Progressive':>12} {'Zero Shot':>11} {'Winner':>12}") + print("-" * 90) + for model, counts in average_volume.items(): + coarse = counts.get('coarse', 0) + prog = counts.get('progressive', 0) + zero_shot = counts.get('zero_shot', 0) + winner = max(counts, key=counts.get) + print(f"{model:<40} {coarse:>10.2f} {prog:>12.2f} {zero_shot:>11.2f} {winner:>12}") + + return volume, highest_volume, average_volume + + + +# Comment Overlap (Coarse, Progressive) + +def get_papers(folders): + first_folder = next(iter(folders.values())) + if not first_folder: + return None + + papers = [] + for p in list(Path(first_folder).glob("*.json")): + papers.append(p.stem) + + return papers + +def overlap_cp(folders, models, total_papers): + overlap_ind = defaultdict(lambda: defaultdict(dict)) + overlap_total = defaultdict(lambda: {"both_total": 0, "only_c_total": 0, "only_p_total": 0}) + overlap_avg = defaultdict(lambda: {"both_avg": 0, "only_c_avg": 0, "only_p_avg": 0, "jaccard_sim_avg": 0}) + + papers = get_papers(folders) + if not papers: + return None + + temp_jaccard_sim = defaultdict(int) + temp_count = defaultdict(int) + for stem in papers: + coarse_data = load(Path(folders["coarse"]) / (stem + ".json")) + prog_data = load(Path(folders["progressive"]) / (stem + ".json")) + + def para_set(d, method_key): + comments = d.get("methods", {}).get(method_key, {}).get("comments", []) + return {c["paragraph_index"] for c in comments if "paragraph_index" in c} + + for model in models: + coarse_paras = para_set(coarse_data, method_key("coarse", model)) + prog_paras = para_set(prog_data, method_key("progressive", model)) + + both_idx = coarse_paras & prog_paras + only_c_idx = coarse_paras - prog_paras + only_p_idx = prog_paras - coarse_paras + + both_num = len(coarse_paras & prog_paras) + only_c_num = len(coarse_paras - prog_paras) + only_p_num = len(prog_paras - coarse_paras) + total_num = both_num + only_c_num + only_p_num + + overlap_ind[model][stem]["both_idx"] = both_idx + overlap_ind[model][stem]["only_c_idx"] = only_c_idx + overlap_ind[model][stem]["only_p_idx"] = only_p_idx + overlap_ind[model][stem]["both_num"] = both_num + overlap_ind[model][stem]["only_c_num"] = only_c_num + overlap_ind[model][stem]["only_p_num"] = only_p_num + overlap_ind[model][stem]["both_pct"] = both_num / total_num if total_num != 0 else None + overlap_ind[model][stem]["only_c_pct"] = only_c_num / total_num if total_num != 0 else None + overlap_ind[model][stem]["only_p_pct"] = only_p_num / total_num if total_num != 0 else None + overlap_ind[model][stem]["jaccard_sim"] = both_num / total_num if total_num != 0 else None + + overlap_total[model]["both_total"] += both_num + overlap_total[model]["only_c_total"] += only_c_num + overlap_total[model]["only_p_total"] += only_p_num + + temp_jaccard_sim[model] += both_num / total_num if total_num != 0 else 0 + temp_count[model] += 1 if total_num != 0 else 0 + + for model in models: + overlap_avg[model]["both_avg"] = overlap_total[model]["both_total"] / total_papers + overlap_avg[model]["only_c_avg"] = overlap_total[model]["only_c_total"] / total_papers + overlap_avg[model]["only_p_avg"] = overlap_total[model]["only_p_total"] / total_papers + overlap_avg[model]["jaccard_sim_avg"] = temp_jaccard_sim[model] / temp_count[model] + + print(f"Average overlap per paper:\n") + print(f"{'Model':<35} {'Both':>8} {'Only C':>8} {'Only P':>8} {'Jaccard':>8}") + print("-" * 71) + for model, counts in overlap_avg.items(): + print(f"{model:<35} {counts['both_avg']:>8.2f} {counts['only_c_avg']:>8.2f} {counts['only_p_avg']:>8.2f} {counts['jaccard_sim_avg']:>8.3f}") + + plot_overlap_cp(overlap_avg) + + return overlap_ind, overlap_total, overlap_avg + +def plot_overlap_cp(overlap_avg): + COLORS = ["#2196F3", "#E53935"] # blue, red + + fig, axes = plt.subplots(2, 2, figsize=(14, 11), dpi=400) + axes = axes.flatten() + + for i, (model, counts) in enumerate(overlap_avg.items()): + only_c = round(counts["only_c_avg"], 2) + only_p = round(counts["only_p_avg"], 2) + both = round(counts["both_avg"], 2) + + v = venn2( + subsets=(only_c, only_p, both), + set_labels=("Coarse", "Progressive"), + ax=axes[i], + set_colors=COLORS, + alpha=0.15, + ) + + c = venn2_circles( + subsets=(only_c, only_p, both), + ax=axes[i], + linewidth=2.0, + ) + + for circle, color in zip(c, COLORS): + circle.set_edgecolor(color) + circle.set_linewidth(2.0) + + for label_id in ["10", "01", "11"]: + lbl = v.get_label_by_id(label_id) + if lbl: + lbl.set_fontsize(15) + lbl.set_color("black") + lbl.set_fontweight("normal") + lbl.set_ha("center") + + for set_label in v.set_labels: + if set_label: + set_label.set_fontsize(15) + set_label.set_color("black") + + axes[i].set_title(f"{model_dict.get(model, model)}\nJaccard Similarity: {counts['jaccard_sim_avg']:.3f}", + fontsize=15, fontweight="bold", pad=10) + + plt.tight_layout() + plt.subplots_adjust(hspace=0.2, wspace=0.1) + plt.savefig("./venn_cp.png", dpi=400, bbox_inches="tight") + + + +# Comment Overlap (Coarse, Progressive, Zero Shot) + +def overlap_all(folders, models, total_papers): + overlap_ind = defaultdict(lambda: defaultdict(dict)) + overlap_total = defaultdict(lambda: { + "all_total": 0, "only_c_total": 0, "only_p_total": 0, "only_z_total": 0, + "only_c_p_total": 0, "only_c_z_total": 0, "only_p_z_total": 0, + }) + overlap_avg = defaultdict(lambda: { + "all_avg": 0, "only_c_avg": 0, "only_p_avg": 0, "only_z_avg": 0, + "only_c_p_avg": 0, "only_c_z_avg": 0, "only_p_z_avg": 0, "jaccard_sim_avg": 0, + }) + + papers = get_papers(folders) + if not papers: + return None + + def para_set(d, mk): + comments = d.get("methods", {}).get(mk, {}).get("comments", []) + return {c["paragraph_index"] for c in comments if "paragraph_index" in c} + + temp_jaccard_sim = defaultdict(int) + temp_count = defaultdict(int) + + for stem in papers: + coarse_data = load(Path(folders["coarse"]) / (stem + ".json")) + prog_data = load(Path(folders["progressive"]) / (stem + ".json")) + zero_data = load(Path(folders["zero_shot"]) / (stem + ".json")) + + for model in models: + coarse_paras = para_set(coarse_data, method_key("coarse", model)) + prog_paras = para_set(prog_data, method_key("progressive", model)) + zero_paras = para_set(zero_data, method_key("zero_shot", model)) + + all_idx = coarse_paras & prog_paras & zero_paras + only_c_idx = coarse_paras - prog_paras - zero_paras + only_p_idx = prog_paras - coarse_paras - zero_paras + only_z_idx = zero_paras - coarse_paras - prog_paras + only_c_p_idx = (coarse_paras & prog_paras) - zero_paras + only_c_z_idx = (coarse_paras & zero_paras) - prog_paras + only_p_z_idx = (prog_paras & zero_paras) - coarse_paras + + all_num = len(all_idx) + only_c_num = len(only_c_idx) + only_p_num = len(only_p_idx) + only_z_num = len(only_z_idx) + only_c_p_num = len(only_c_p_idx) + only_c_z_num = len(only_c_z_idx) + only_p_z_num = len(only_p_z_idx) + total_num = all_num + only_c_num + only_p_num + only_z_num + only_c_p_num + only_c_z_num + only_p_z_num + + overlap_ind[model][stem]["all_idx"] = all_idx + overlap_ind[model][stem]["only_c_idx"] = only_c_idx + overlap_ind[model][stem]["only_p_idx"] = only_p_idx + overlap_ind[model][stem]["only_z_idx"] = only_z_idx + overlap_ind[model][stem]["only_c_p_idx"] = only_c_p_idx + overlap_ind[model][stem]["only_c_z_idx"] = only_c_z_idx + overlap_ind[model][stem]["only_p_z_idx"] = only_p_z_idx + overlap_ind[model][stem]["all_num"] = all_num + overlap_ind[model][stem]["only_c_num"] = only_c_num + overlap_ind[model][stem]["only_p_num"] = only_p_num + overlap_ind[model][stem]["only_z_num"] = only_z_num + overlap_ind[model][stem]["only_c_p_num"] = only_c_p_num + overlap_ind[model][stem]["only_c_z_num"] = only_c_z_num + overlap_ind[model][stem]["only_p_z_num"] = only_p_z_num + overlap_ind[model][stem]["jaccard_sim"] = all_num / total_num if total_num != 0 else None + + overlap_total[model]["all_total"] += all_num + overlap_total[model]["only_c_total"] += only_c_num + overlap_total[model]["only_p_total"] += only_p_num + overlap_total[model]["only_z_total"] += only_z_num + overlap_total[model]["only_c_p_total"] += only_c_p_num + overlap_total[model]["only_c_z_total"] += only_c_z_num + overlap_total[model]["only_p_z_total"] += only_p_z_num + + temp_jaccard_sim[model] += all_num / total_num if total_num != 0 else 0 + temp_count[model] += 1 if total_num != 0 else 0 + + for model in models: + overlap_avg[model]["all_avg"] = overlap_total[model]["all_total"] / total_papers + overlap_avg[model]["only_c_avg"] = overlap_total[model]["only_c_total"] / total_papers + overlap_avg[model]["only_p_avg"] = overlap_total[model]["only_p_total"] / total_papers + overlap_avg[model]["only_z_avg"] = overlap_total[model]["only_z_total"] / total_papers + overlap_avg[model]["only_c_p_avg"] = overlap_total[model]["only_c_p_total"] / total_papers + overlap_avg[model]["only_c_z_avg"] = overlap_total[model]["only_c_z_total"] / total_papers + overlap_avg[model]["only_p_z_avg"] = overlap_total[model]["only_p_z_total"] / total_papers + overlap_avg[model]["jaccard_sim_avg"] = temp_jaccard_sim[model] / temp_count[model] if temp_count[model] else 0 + + print(f"Average 3-way overlap per paper:\n") + print(f"{'Model':<35} {'All':>6} {'Only C':>8} {'Only P':>8} {'Only Z':>8} {'C∩P':>6} {'C∩Z':>6} {'P∩Z':>6} {'Jaccard':>8}") + print("-" * 100) + for model, counts in overlap_avg.items(): + print(f"{model:<35} " + f"{counts['all_avg']:>6.2f} " + f"{counts['only_c_avg']:>8.2f} " + f"{counts['only_p_avg']:>8.2f} " + f"{counts['only_z_avg']:>8.2f} " + f"{counts['only_c_p_avg']:>6.2f} " + f"{counts['only_c_z_avg']:>6.2f} " + f"{counts['only_p_z_avg']:>6.2f} " + f"{counts['jaccard_sim_avg']:>8.3f}") + + plot_overlap_all(overlap_avg) + + return overlap_ind, overlap_total, overlap_avg + + +def plot_overlap_all(overlap_avg): + COLORS = ["#2196F3", "#E53935", "#43A047"] # blue, red, green + + fig, axes = plt.subplots(2, 2, figsize=(14, 11), dpi=400) + axes = axes.flatten() + + for i, (model, counts) in enumerate(overlap_avg.items()): + only_c = round(counts["only_c_avg"], 2) + only_p = round(counts["only_p_avg"], 2) + only_z = round(counts["only_z_avg"], 2) + only_cp = round(counts["only_c_p_avg"], 2) + only_cz = round(counts["only_c_z_avg"], 2) + only_pz = round(counts["only_p_z_avg"], 2) + all_three = round(counts["all_avg"], 2) + + v = venn3( + subsets=(only_c, only_p, only_cp, only_z, only_cz, only_pz, all_three), + set_labels=("Coarse", "Progressive", "Zero Shot"), + ax=axes[i], + set_colors=COLORS, + alpha=0.15, + ) + + c = venn3_circles( + subsets=(only_c, only_p, only_cp, only_z, only_cz, only_pz, all_three), + ax=axes[i], + linewidth=2.0, + ) + + for circle, color in zip(c, COLORS): + circle.set_edgecolor(color) + circle.set_linewidth(2.0) + + for label_id in ["100", "010", "110", "001", "101", "011", "111"]: + lbl = v.get_label_by_id(label_id) + if lbl: + lbl.set_fontsize(13) + lbl.set_color("black") + lbl.set_fontweight("normal") + lbl.set_ha("center") + + for set_label in v.set_labels: + if set_label: + set_label.set_fontsize(13) + set_label.set_color("black") + + axes[i].set_title(f"{model_dict.get(model, model)}\nJaccard Similarity: {counts['jaccard_sim_avg']:.3f}", + fontsize=15, fontweight="bold", pad=10) + + plt.tight_layout() + plt.subplots_adjust(hspace=0.2, wspace=0.1) + plt.savefig("./venn_all.png", dpi=400, bbox_inches="tight") + + + +# Cluster Analysis + +def cluster_cp(folders, models): + # 1. Collect comments + texts = [] + labels = [] # "coarse" or "progressive" + models_tag = [] + + papers = get_papers(folders) + if not papers: + return None + + for model in models: + for stem in papers: + coarse_data = load(Path(folders["coarse"]) / (stem + ".json")) + prog_data = load(Path(folders["progressive"]) / (stem + ".json")) + + for c in coarse_data.get("methods", {}).get(method_key("coarse", model), {}).get("comments", []): + texts.append(c.get("title", "") + " " + c.get("explanation", "")) + labels.append("coarse") + models_tag.append(model) + + for p in prog_data.get("methods", {}).get(method_key("progressive", model), {}).get("comments", []): + texts.append(p.get("title", "") + " " + p.get("explanation", "")) + labels.append("progressive") + models_tag.append(model) + + print(f"Total comments: {len(texts)} (coarse: {labels.count('coarse')}, progressive: {labels.count('progressive')})") + + # 2. Embed and cluster + model_emb = SentenceTransformer("all-MiniLM-L6-v2") + X = model_emb.encode(texts, show_progress_bar=True) + + N_CLUSTERS = 10 + km = KMeans(n_clusters=N_CLUSTERS, random_state=42) + cluster_ids = km.fit_predict(X) + + # 3. Results + + # fit TF-IDF on clustered comments for keywords + tfidf = TfidfVectorizer(max_features=10000, stop_words="english") + X_tfidf = tfidf.fit_transform(texts) + terms = tfidf.get_feature_names_out() + + for cluster_id in range(N_CLUSTERS): + indices = np.where(cluster_ids == cluster_id)[0] + method_counts = Counter(labels[i] for i in indices) + total = len(indices) + + # find 5 comments closest to the cluster centroid + centroid = km.cluster_centers_[cluster_id] + distances = np.linalg.norm(X[indices] - centroid, axis=1) + closest = indices[np.argsort(distances)[:5]] + + # average TF-IDF score across all docs in this cluster + cluster_tfidf = X_tfidf[indices].mean(axis=0) + cluster_tfidf = np.asarray(cluster_tfidf).flatten() + top_keywords = [terms[j] for j in cluster_tfidf.argsort()[-15:][::-1]] + + print(f"\nCluster {cluster_id} ({total} comments)") + print(f" coarse: {method_counts['coarse']} ({method_counts['coarse']/total*100:.0f}%) " + f"progressive: {method_counts['progressive']} ({method_counts['progressive']/total*100:.0f}%)") + print(f" Keywords: {', '.join(top_keywords)}") + print(f" Most representative comments:") + for i in closest: + print(f" [{labels[i]:12s}] {texts[i][:100]}") + + +def cluster_all(folders, models): + texts = [] + labels = [] + models_tag = [] + + papers = get_papers(folders) + if not papers: + return None + + for model in models: + for stem in papers: + coarse_data = load(Path(folders["coarse"]) / (stem + ".json")) + prog_data = load(Path(folders["progressive"]) / (stem + ".json")) + zero_data = load(Path(folders["zero_shot"]) / (stem + ".json")) + + for c in coarse_data.get("methods", {}).get(method_key("coarse", model), {}).get("comments", []): + texts.append(c.get("title", "") + " " + c.get("explanation", "")) + labels.append("coarse") + models_tag.append(model) + + for p in prog_data.get("methods", {}).get(method_key("progressive", model), {}).get("comments", []): + texts.append(p.get("title", "") + " " + p.get("explanation", "")) + labels.append("progressive") + models_tag.append(model) + + for z in zero_data.get("methods", {}).get(method_key("zero_shot", model), {}).get("comments", []): + texts.append(z.get("title", "") + " " + z.get("explanation", "")) + labels.append("zero_shot") + models_tag.append(model) + + print(f"Total comments: {len(texts)} (coarse: {labels.count('coarse')}, progressive: {labels.count('progressive')}, zero_shot: {labels.count('zero_shot')})") + + model_emb = SentenceTransformer("all-MiniLM-L6-v2") + X = model_emb.encode(texts, show_progress_bar=True) + + N_CLUSTERS = 10 + km = KMeans(n_clusters=N_CLUSTERS, random_state=42) + cluster_ids = km.fit_predict(X) + + tfidf = TfidfVectorizer(max_features=10000, stop_words="english") + X_tfidf = tfidf.fit_transform(texts) + terms = tfidf.get_feature_names_out() + + for cluster_id in range(N_CLUSTERS): + indices = np.where(cluster_ids == cluster_id)[0] + method_counts = Counter(labels[i] for i in indices) + total = len(indices) + + centroid = km.cluster_centers_[cluster_id] + distances = np.linalg.norm(X[indices] - centroid, axis=1) + closest = indices[np.argsort(distances)[:5]] + + cluster_tfidf = np.asarray(X_tfidf[indices].mean(axis=0)).flatten() + top_keywords = [terms[j] for j in cluster_tfidf.argsort()[-15:][::-1]] + + print(f"\nCluster {cluster_id} ({total} comments)") + print(f" coarse: {method_counts['coarse']} ({method_counts['coarse']/total*100:.0f}%)") + print(f" progressive: {method_counts['progressive']} ({method_counts['progressive']/total*100:.0f}%)") + print(f" zero_shot: {method_counts['zero_shot']} ({method_counts['zero_shot']/total*100:.0f}%)") + print(f" Keywords: {', '.join(top_keywords)}") + print(f" Most representative comments:") + for i in closest: + print(f" [{labels[i]:12s}] {texts[i][:100]}") + + +if __name__ == "__main__": + FOLDERS = { + "coarse": "./coarse_v2/", + "progressive": "./scaleup_v2_progressive/", + "zero_shot": "./scaleup_v2_zero_shot/", + } + MODELS = ["deepseek-v4-flash", "gemini-3.1-flash-lite-preview", "glm-4.7-flash", "qwen3.6-35b-a3b"] + TOTAL_PAPERS = len(list(Path(FOLDERS["coarse"]).glob("*.json"))) + + print("=" * 90) + print("VOLUME") + print("=" * 90) + volume_dicts(FOLDERS, MODELS, TOTAL_PAPERS) + + print("\n" + "=" * 90) + print("2-WAY OVERLAP (Coarse vs. Progressive)") + print("=" * 90) + overlap_cp(FOLDERS, MODELS, TOTAL_PAPERS) + + print("\n" + "=" * 90) + print("3-WAY OVERLAP (Coarse, Progressive, Zero Shot)") + print("=" * 90) + overlap_all(FOLDERS, MODELS, TOTAL_PAPERS) + + print("\n" + "=" * 90) + print("CLUSTERING (Coarse + Progressive)") + print("=" * 90) + cluster_cp(FOLDERS, MODELS) + + print("\n" + "=" * 90) + print("CLUSTERING (Coarse + Progressive + Zero Shot)") + print("=" * 90) + cluster_all(FOLDERS, MODELS) diff --git a/benchmarks/results_iclr_coarse/venn_all.png b/benchmarks/results_iclr_coarse/venn_all.png new file mode 100644 index 0000000..f0b8165 Binary files /dev/null and b/benchmarks/results_iclr_coarse/venn_all.png differ diff --git a/benchmarks/results_iclr_coarse/venn_cp.png b/benchmarks/results_iclr_coarse/venn_cp.png new file mode 100644 index 0000000..4809e2b Binary files /dev/null and b/benchmarks/results_iclr_coarse/venn_cp.png differ diff --git a/src/reviewer/prompts.py b/src/reviewer/prompts.py index 7c8018d..6ca28f3 100644 --- a/src/reviewer/prompts.py +++ b/src/reviewer/prompts.py @@ -227,3 +227,4 @@ PAPER (first 8000 characters): {paper_start} """ +