Commit aef834c

Polish language, add GitHub star counts, remove AI-style dashes
- Rewrite EN/ZH dict: remove excess em dashes, more natural phrasing
- Polish paper abstracts and blog excerpts in both languages
- Add star counts to each paper card (37/25/113/71/19) with star icon
- Star badge links to GitHub repo for one-click starring
- Related repos also show star counts inline

Made-with: Cursor
1 parent 7460a6c commit aef834c

3 files changed: 68 additions & 55 deletions

src/components/PaperCard.tsx

Lines changed: 26 additions & 3 deletions
@@ -44,7 +44,20 @@ export function PaperCard({ paper, index = 0 }: { paper: Paper; index?: number }
         ))}
       </div>
 
-      <div className="mt-5 flex flex-wrap gap-3">
+      <div className="mt-5 flex flex-wrap items-center gap-3">
+        {paper.github && paper.stars != null && (
+          <a
+            href={paper.github}
+            target="_blank"
+            rel="noopener noreferrer"
+            className="inline-flex items-center gap-1.5 rounded-full border border-border bg-muted/50 px-3 py-1 text-xs font-medium text-foreground transition-colors hover:border-accent/40 hover:bg-accent/10"
+          >
+            <svg className="h-3.5 w-3.5" fill="currentColor" viewBox="0 0 24 24">
+              <path d="M12 .587l3.668 7.568 8.332 1.151-6.064 5.828 1.48 8.279L12 19.771l-7.416 3.642 1.48-8.279L0 9.306l8.332-1.151z" />
+            </svg>
+            {paper.stars}
+          </a>
+        )}
         {paper.arxiv && (
           <a
             href={paper.arxiv}
@@ -148,8 +161,18 @@ export function PaperCard({ paper, index = 0 }: { paper: Paper; index?: number }
             <svg className="mt-0.5 h-4 w-4 shrink-0 text-muted-foreground" fill="currentColor" viewBox="0 0 24 24">
               <path d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z" />
             </svg>
-            <div>
-              <div className="text-sm font-medium text-foreground">{repo.name}</div>
+            <div className="flex-1">
+              <div className="flex items-center gap-2">
+                <span className="text-sm font-medium text-foreground">{repo.name}</span>
+                {repo.stars != null && (
+                  <span className="inline-flex items-center gap-0.5 text-xs text-muted-foreground">
+                    <svg className="h-3 w-3" fill="currentColor" viewBox="0 0 24 24">
+                      <path d="M12 .587l3.668 7.568 8.332 1.151-6.064 5.828 1.48 8.279L12 19.771l-7.416 3.642 1.48-8.279L0 9.306l8.332-1.151z" />
+                    </svg>
+                    {repo.stars}
+                  </span>
+                )}
+              </div>
               <div className="text-xs text-muted-foreground">{repo.description}</div>
             </div>
           </a>
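
Two details worth noting in the badge markup above: the guard is paper.stars != null rather than a truthiness check, so a repository with zero stars would still render its badge, and the count is displayed raw. If any repo grows past four digits, a compact formatter could be swapped in. A minimal TypeScript sketch (formatStars is a hypothetical helper, not part of this commit):

// Hypothetical helper, not in this commit: compact star-count display.
// formatStars(999) -> "999", formatStars(1234) -> "1.2k", formatStars(2000) -> "2k"
function formatStars(count: number): string {
  if (count < 1000) return String(count);
  return `${(count / 1000).toFixed(1).replace(/\.0$/, "")}k`;
}

The badge body would then render {formatStars(paper.stars)} in place of the bare {paper.stars}.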

src/data/papers.ts

Lines changed: 13 additions & 7 deletions
@@ -14,9 +14,10 @@ export interface Paper {
   project?: string;
   huggingface?: string;
   doi?: string;
+  stars?: number;
   tags: string[];
   featured?: boolean;
-  relatedRepos?: { name: string; url: string; description: string }[];
+  relatedRepos?: { name: string; url: string; description: string; stars?: number }[];
 }
 
 export const papers: Paper[] = [
@@ -57,6 +58,7 @@ export const papers: Paper[] = [
       "LLMEval-Fair addresses robustness and fairness concerns in LLM evaluation through a 30-month longitudinal study. Built on a proprietary bank of 220,000 graduate-level questions across 13 academic disciplines, it dynamically samples unseen test sets for each evaluation run. Its automated pipeline ensures integrity via contamination-resistant data curation, a novel anti-cheating architecture, and a calibrated LLM-as-a-judge process achieving 90% agreement with human experts. A study of nearly 60 leading models reveals performance ceilings and exposes data contamination vulnerabilities undetectable by static benchmarks.",
     arxiv: "https://arxiv.org/abs/2508.05452",
     github: "https://github.com/llmeval/LLMEval-Fair",
+    stars: 37,
     project: "http://llmeval.com/",
     tags: ["evaluation", "fairness", "robustness", "generative QA", "longitudinal study"],
     featured: true,
@@ -87,11 +89,12 @@ export const papers: Paper[] = [
     authorNotes: "* Equal Contribution, † Corresponding Author",
     venue: "EMNLP 2025 Findings",
     year: 2025,
-    abstractZh: "LLMEval-Med 提出了一个全面的、经医生验证的基准,用于评估大语言模型在真实临床任务上的表现。涵盖五个核心医学领域——医学知识、医学语言理解、医学推理、医学伦理与安全、医学文本生成——包含2,996道来自真实电子病历和专家设计临床场景的题目。引入结合专家清单的LLM-as-Judge自动评测流水线,通过人机一致性分析验证。评测了13个大模型(专用医学模型、开源模型和闭源模型)。",
+    abstractZh: "LLMEval-Med 提出一个全面的、经医生验证的基准,用于评估大语言模型在真实临床任务上的表现。涵盖五个核心医学领域(医学知识、医学语言理解、医学推理、医学伦理与安全、医学文本生成),包含2,996道来自真实电子病历和专家设计临床场景的题目。引入结合专家清单的LLM-as-Judge自动评测流水线,通过人机一致性分析验证。评测了13个大模型。",
     abstract:
-      "LLMEval-Med presents a comprehensive, physician-validated benchmark for evaluating LLMs on real-world clinical tasks. It covers five core medical areas — Medical Knowledge, Medical Language Understanding, Medical Reasoning, Medical Ethics and Safety, and Medical Text Generation — with 2,996 questions created from real-world electronic health records and expert-designed clinical scenarios. The work introduces an automated evaluation pipeline incorporating expert-developed checklists into an LLM-as-Judge framework, validated through human-machine agreement analysis. 13 LLMs across three categories (specialized medical, open-source, and closed-source) are evaluated.",
+      "LLMEval-Med is a physician-validated benchmark for evaluating LLMs on real-world clinical tasks. It covers five core medical areas (Medical Knowledge, Language Understanding, Reasoning, Ethics & Safety, Text Generation) with 2,996 questions from real electronic health records and expert-designed clinical scenarios. An automated evaluation pipeline with expert-developed checklists is validated through human-machine agreement analysis. 13 LLMs across specialized, open-source, and closed-source categories are evaluated.",
     arxiv: "https://arxiv.org/abs/2506.04078",
     github: "https://github.com/llmeval/LLMEval-Med",
+    stars: 25,
     huggingface: "https://huggingface.co/datasets/HuayuSha/LLMeval-Med",
     tags: ["medical", "clinical", "physician validation", "LLM-as-Judge"],
     featured: true,
@@ -113,9 +116,9 @@ export const papers: Paper[] = [
     authorNotes: "* Equal Contribution, † Corresponding Author",
     venue: "AAAI 2024",
     year: 2024,
-    abstractZh: "本文聚焦大语言模型评测的第三个关键问题——\"如何评测\"——通过对比人工评测和自动评测中的多种评价标准、评估者类型、评分方法和排序系统。利用现场专家、众包标注员、公众志愿者和GPT-4,评测了20个大模型。共2,186人参与,生成243,337条人工标注和57,511条自动评测结果。论文提出LLMEval数据集(包含LLMEval-1和LLMEval-2两期评测数据)并得出10条结论。",
+    abstractZh: "本文聚焦大语言模型评测的第三个关键问题「如何评测」,通过对比人工和自动评测中的多种评价标准、评估者类型、评分方法和排序系统。利用现场专家、众包标注员、公众志愿者和GPT-4评测了20个大模型。共2,186人参与,生成243,337条人工标注和57,511条自动评测结果。论文提出LLMEval数据集(包含两期评测数据)并得出10条结论。",
     abstract:
-      "This paper addresses the third crucial question in LLM evaluation — \"how to evaluate\" — by analyzing evaluation methods through comparing various criteria with both manual and automatic evaluation. Utilizing onsite staff, crowd-sourcing workers, public annotators, and GPT-4 with different scoring methods and ranking systems, we evaluate 20 LLMs. A total of 2,186 individuals participated, generating 243,337 manual annotations and 57,511 automatic evaluation results. The paper proposes the LLMEval dataset (comprising data from both LLMEval-1 and LLMEval-2 evaluation rounds) and draws 10 conclusions providing insights for future LLM evaluation practices.",
+      "This paper tackles the third crucial question in LLM evaluation: how to evaluate. We compare various criteria with both manual and automatic evaluation, utilizing onsite staff, crowd-sourcing workers, public annotators, and GPT-4 across different scoring and ranking systems. 20 LLMs are evaluated with 2,186 participants generating 243,337 manual annotations and 57,511 automated results. The paper proposes the LLMEval dataset (from LLMEval-1 and LLMEval-2 rounds) and draws 10 conclusions for future evaluation practices.",
     arxiv: "https://arxiv.org/abs/2312.07398",
     doi: "https://doi.org/10.1609/aaai.v38i17.29934",
     tags: ["evaluation methodology", "crowdsourcing", "annotation", "scoring", "ranking"],
@@ -124,12 +127,14 @@ export const papers: Paper[] = [
       {
         name: "LLMEval-1",
         url: "https://github.com/llmeval/LLMEval-1",
-        description: "Phase I dataset — 17 categories, 453 questions, 2,186 annotators for Chinese LLM evaluation",
+        stars: 113,
+        description: "Phase I dataset: 17 categories, 453 questions, 2,186 annotators for Chinese LLM evaluation",
       },
       {
         name: "LLMEval-2",
         url: "https://github.com/llmeval/LLMEval-2",
-        description: "Phase II dataset — professional domain evaluation across 12 academic disciplines, 480 questions",
+        stars: 71,
+        description: "Phase II dataset: professional domain evaluation across 12 academic disciplines, 480 questions",
       },
     ],
   },
@@ -144,6 +149,7 @@ export const papers: Paper[] = [
     abstract:
       "This evaluation utilizes the 2024 Chinese National College Entrance Examination (Gaokao) mathematics papers as a benchmark for large language models. Fresh exam questions with high originality and confidentiality make them an excellent test set. The evaluation covers both New Paper I and New Paper II, testing models with both LaTeX and escape-character formatted prompts to reveal sensitivity to prompt formatting in mathematical contexts.",
     github: "https://github.com/llmeval/Llmeval-Gaokao2024-Math",
+    stars: 19,
     tags: ["mathematics", "Gaokao", "prompt format"],
     featured: false,
   },
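
The counts added above (37, 25, 113, 71, 19) are hardcoded snapshots, so they will drift as the repos gain stars. One way to refresh them is the GitHub REST API: a GET to https://api.github.com/repos/{owner}/{repo} returns JSON that includes a stargazers_count field. A rough sketch of a refresh script, assuming Node 18+ with global fetch (the script and its structure are illustrative, not part of this commit):

// Hypothetical refresh script; prints current counts to paste back into papers.ts.
const repos = [
  "llmeval/LLMEval-Fair",
  "llmeval/LLMEval-Med",
  "llmeval/LLMEval-1",
  "llmeval/LLMEval-2",
  "llmeval/Llmeval-Gaokao2024-Math",
];

async function fetchStars(repo: string): Promise<number | null> {
  const res = await fetch(`https://api.github.com/repos/${repo}`, {
    headers: { Accept: "application/vnd.github+json" },
  });
  if (!res.ok) return null; // rate-limited, renamed, or temporarily unavailable
  const data = (await res.json()) as { stargazers_count: number };
  return data.stargazers_count;
}

async function main() {
  for (const repo of repos) {
    console.log(repo, await fetchStars(repo));
  }
}

main();

Unauthenticated requests are rate-limited (60 per hour per IP at the time of writing), which is plenty for five repos.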
