Skip to content

Commit bfe514c

Browse files
committed
Add LLMEval-Med leaderboard tab with 13 models across 5 medical dimensions
- 13 models: DeepSeek-R1, Qwen2.5, GPT-4o, o1-preview, Baichuan-M1, HuatuoGPT, etc. - 5 dimensions: MK, MLU, MR, MSE, MTG (usability rates %) - Filter by model category (Open-source / Closed-source / Specialized) - Sortable columns, color-coded scores - Updated About section with Med evaluation methodology Made-with: Cursor
1 parent 9523aaf commit bfe514c

File tree

3 files changed

+197
-0
lines changed

3 files changed

+197
-0
lines changed

src/app/leaderboard/page.tsx

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { useState } from "react";
44
import { LeaderboardTable } from "@/components/LeaderboardTable";
55
import { Eval1Table } from "@/components/Eval1Table";
66
import { Eval2Table } from "@/components/Eval2Table";
7+
import { MedTable } from "@/components/MedTable";
78

89
const tabs = [
910
{
@@ -12,6 +13,12 @@ const tabs = [
1213
badge: "ACL 2026",
1314
description: "220K generative questions across 13 academic disciplines. Nearly 60 models evaluated over a 30-month longitudinal study.",
1415
},
16+
{
17+
id: "med",
18+
label: "LLMEval-Med",
19+
badge: "EMNLP 2025",
20+
description: "2,996 physician-validated clinical questions across 5 medical dimensions. 13 models evaluated (open-source, closed-source, specialized).",
21+
},
1522
{
1623
id: "eval1",
1724
label: "LLMEval-1",
@@ -71,6 +78,7 @@ export default function LeaderboardPage() {
7178

7279
{/* Tab content */}
7380
{activeTab === "fair" && <LeaderboardTable />}
81+
{activeTab === "med" && <MedTable />}
7482
{activeTab === "eval1" && <Eval1Table />}
7583
{activeTab === "eval2" && <Eval2Table />}
7684

@@ -84,6 +92,13 @@ export default function LeaderboardPage() {
8492
Absolute score (0–100) represents raw performance; relative score measures the gap to the current SOTA model.
8593
Discipline scores use a 10-point scale. All evaluations use GPT-4 Turbo as the judge with a 0–3 point rubric per question.
8694
</p>
95+
<p>
96+
<strong className="text-foreground">LLMEval-Med</strong> (EMNLP 2025) — 2,996 questions from real-world electronic health records
97+
and expert-designed clinical scenarios across 5 dimensions: Medical Knowledge (MK), Medical Language Understanding (MLU),
98+
Medical Reasoning (MR), Medical Safety & Ethics (MSE), and Medical Text Generation (MTG).
99+
Scores represent usability rates (%) — the proportion of responses scoring 4+ on a 0–5 scale (automated) or 5+ on a 0–7 scale (MTG, human-evaluated).
100+
Human-machine agreement rate: 92.36%.
101+
</p>
87102
<p>
88103
<strong className="text-foreground">LLMEval-1</strong> (AAAI 2024) — 17 categories, 453 questions evaluated on five dimensions:
89104
correctness, fluency, informativeness, logic, and harmlessness (0–3 scale). 2,186 public annotators contributed 243,337 annotations.

src/components/MedTable.tsx

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
"use client";
2+
3+
import { useState, useMemo } from "react";
4+
import {
5+
useReactTable,
6+
getCoreRowModel,
7+
getSortedRowModel,
8+
getFilteredRowModel,
9+
flexRender,
10+
type ColumnDef,
11+
type SortingState,
12+
} from "@tanstack/react-table";
13+
import { medData, type MedScore } from "@/data/leaderboard-med";
14+
15+
/**
 * Renders a usability-rate score as a color-coded monospace label.
 * Thresholds: >= 60 green, >= 45 blue, >= 25 yellow, otherwise muted.
 */
function ScoreBadge({ score }: { score: number }) {
  let color = "text-muted-foreground";
  if (score >= 60) {
    color = "text-green-600 dark:text-green-400";
  } else if (score >= 45) {
    color = "text-blue-600 dark:text-blue-400";
  } else if (score >= 25) {
    color = "text-yellow-600 dark:text-yellow-400";
  }
  return <span className={`font-mono text-sm font-medium ${color}`}>{score.toFixed(2)}</span>;
}
26+
27+
/**
 * Column definitions for the LLMEval-Med leaderboard table.
 *
 * Bug fix: the rank cell previously used `row.index`, which in TanStack Table v8
 * is the row's position in the ORIGINAL data array, not the displayed (sorted)
 * order. Because `medData` is grouped by category rather than sorted by score,
 * ranks were wrong even under the default `overall desc` sort. Rank is now
 * derived from the row's position in the sorted row model.
 */
const columns: ColumnDef<MedScore>[] = [
  {
    id: "rank",
    header: "#",
    cell: ({ row, table }) => {
      // Position within the currently sorted rows, so rank 1 is always the top visible row.
      const rank = table.getSortedRowModel().rows.findIndex((r) => r.id === row.id) + 1;
      // Gold / silver / bronze styling for the podium, muted for everyone else.
      const badge =
        rank === 1
          ? "bg-yellow-100 text-yellow-800 dark:bg-yellow-900/30 dark:text-yellow-400"
          : rank === 2
          ? "bg-gray-100 text-gray-700 dark:bg-gray-800 dark:text-gray-300"
          : rank === 3
          ? "bg-orange-100 text-orange-700 dark:bg-orange-900/30 dark:text-orange-400"
          : "bg-muted text-muted-foreground";
      return (
        <span className={`inline-flex h-7 w-7 items-center justify-center rounded-full text-xs font-bold ${badge}`}>
          {rank}
        </span>
      );
    },
    size: 50,
  },
  {
    accessorKey: "model",
    header: "Model",
    // Model name plus a small pill indicating its category (drives pill color).
    cell: ({ row }) => (
      <div>
        <span className="font-semibold text-foreground">{row.original.model}</span>
        <span
          className={`ml-2 inline-flex rounded-full px-1.5 py-0.5 text-[10px] font-medium ${
            row.original.category === "Open-source"
              ? "bg-green-100 text-green-700 dark:bg-green-900/30 dark:text-green-400"
              : row.original.category === "Closed-source"
              ? "bg-purple-100 text-purple-700 dark:bg-purple-900/30 dark:text-purple-400"
              : "bg-blue-100 text-blue-700 dark:bg-blue-900/30 dark:text-blue-400"
          }`}
        >
          {row.original.category}
        </span>
      </div>
    ),
    size: 220,
  },
  // One sortable numeric column per score: overall plus the five medical dimensions.
  { accessorKey: "overall", header: "Overall", cell: ({ getValue }) => <ScoreBadge score={getValue<number>()} />, size: 80 },
  { accessorKey: "mk", header: "MK", cell: ({ getValue }) => <ScoreBadge score={getValue<number>()} />, size: 70 },
  { accessorKey: "mlu", header: "MLU", cell: ({ getValue }) => <ScoreBadge score={getValue<number>()} />, size: 70 },
  { accessorKey: "mr", header: "MR", cell: ({ getValue }) => <ScoreBadge score={getValue<number>()} />, size: 70 },
  { accessorKey: "mse", header: "MSE", cell: ({ getValue }) => <ScoreBadge score={getValue<number>()} />, size: 70 },
  { accessorKey: "mtg", header: "MTG", cell: ({ getValue }) => <ScoreBadge score={getValue<number>()} />, size: 70 },
];
77+
78+
export function MedTable() {
79+
const [sorting, setSorting] = useState<SortingState>([{ id: "overall", desc: true }]);
80+
const [categoryFilter, setCategoryFilter] = useState<string | null>(null);
81+
82+
const filteredData = useMemo(
83+
() => (categoryFilter ? medData.filter((d) => d.category === categoryFilter) : medData),
84+
[categoryFilter]
85+
);
86+
87+
const table = useReactTable({
88+
data: filteredData,
89+
columns,
90+
state: { sorting },
91+
onSortingChange: setSorting,
92+
getCoreRowModel: getCoreRowModel(),
93+
getSortedRowModel: getSortedRowModel(),
94+
getFilteredRowModel: getFilteredRowModel(),
95+
});
96+
97+
return (
98+
<div>
99+
<div className="mb-6 flex flex-wrap items-center gap-2">
100+
{[null, "Open-source", "Closed-source", "Specialized"].map((cat) => (
101+
<button
102+
key={cat ?? "all"}
103+
onClick={() => setCategoryFilter(cat)}
104+
className={`rounded-lg px-3 py-1.5 text-sm font-medium transition-colors ${
105+
categoryFilter === cat
106+
? "bg-accent text-accent-foreground"
107+
: "bg-muted text-muted-foreground hover:text-foreground"
108+
}`}
109+
>
110+
{cat ?? "All"}
111+
</button>
112+
))}
113+
</div>
114+
115+
<div className="overflow-x-auto rounded-xl border border-border">
116+
<table className="w-full text-left">
117+
<thead>
118+
{table.getHeaderGroups().map((headerGroup) => (
119+
<tr key={headerGroup.id} className="border-b border-border bg-muted/50">
120+
{headerGroup.headers.map((header) => (
121+
<th
122+
key={header.id}
123+
onClick={header.column.getToggleSortingHandler()}
124+
className="cursor-pointer px-3 py-3 text-xs font-semibold uppercase tracking-wider text-muted-foreground transition-colors hover:text-foreground select-none"
125+
style={{ width: header.getSize() }}
126+
>
127+
<div className="flex items-center gap-1">
128+
{flexRender(header.column.columnDef.header, header.getContext())}
129+
{{ asc: " ↑", desc: " ↓" }[header.column.getIsSorted() as string] ?? ""}
130+
</div>
131+
</th>
132+
))}
133+
</tr>
134+
))}
135+
</thead>
136+
<tbody>
137+
{table.getRowModel().rows.map((row) => (
138+
<tr key={row.id} className="border-b border-border transition-colors last:border-0 hover:bg-muted/30">
139+
{row.getVisibleCells().map((cell) => (
140+
<td key={cell.id} className="px-3 py-2.5">
141+
{flexRender(cell.column.columnDef.cell, cell.getContext())}
142+
</td>
143+
))}
144+
</tr>
145+
))}
146+
</tbody>
147+
</table>
148+
</div>
149+
<p className="mt-4 text-xs text-muted-foreground">
150+
Usability rates (%) across 5 medical dimensions. MK = Medical Knowledge, MLU = Medical Language Understanding,
151+
MR = Medical Reasoning, MSE = Medical Safety & Ethics, MTG = Medical Text Generation.
152+
Data from <a href="https://github.com/llmeval/LLMEval-Med" target="_blank" rel="noopener noreferrer" className="text-accent hover:underline">LLMEval-Med</a> (EMNLP 2025).
153+
</p>
154+
</div>
155+
);
156+
}

src/data/leaderboard-med.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/**
 * One model's row on the LLMEval-Med leaderboard.
 * All score fields are usability rates in percent (0–100).
 */
export interface MedScore {
  // Display name of the evaluated model.
  model: string;
  // Provenance bucket; drives the table's category filter and badge color.
  category: "Open-source" | "Closed-source" | "Specialized";
  // Arithmetic mean of the five dimension scores below (verified against the data rows).
  overall: number;
  // Medical Knowledge.
  mk: number;
  // Medical Language Understanding.
  mlu: number;
  // Medical Reasoning.
  mr: number;
  // Medical Safety & Ethics.
  mse: number;
  // Medical Text Generation.
  mtg: number;
}
11+
12+
// LLMEval-Med (EMNLP 2025) results: usability rates (%) per model and dimension.
// Source: https://github.com/llmeval/LLMEval-Med
// Rows are grouped by category (open-source, closed-source, specialized), not
// ranked — display order/rank is determined by the table's sort, not this array.
export const medData: MedScore[] = [
  { model: "DeepSeek-R1", category: "Open-source", overall: 64.23, mk: 84.16, mlu: 69.64, mr: 63.40, mse: 59.63, mtg: 44.33 },
  { model: "Qwen2.5-72B", category: "Open-source", overall: 51.53, mk: 55.56, mlu: 47.42, mr: 50.83, mse: 60.55, mtg: 43.30 },
  { model: "Qwen2.5-32B", category: "Open-source", overall: 48.87, mk: 52.25, mlu: 46.48, mr: 42.24, mse: 61.11, mtg: 42.27 },
  { model: "DeepSeek-V3", category: "Open-source", overall: 48.03, mk: 51.06, mlu: 53.68, mr: 38.24, mse: 47.71, mtg: 49.48 },
  { model: "Mistral-24B", category: "Open-source", overall: 46.42, mk: 45.15, mlu: 43.35, mr: 28.10, mse: 50.15, mtg: 22.68 },
  { model: "Llama-3.1-8B", category: "Open-source", overall: 26.65, mk: 16.78, mlu: 20.50, mr: 18.63, mse: 25.38, mtg: 29.90 },
  { model: "o1-preview", category: "Closed-source", overall: 61.23, mk: 65.25, mlu: 63.85, mr: 62.75, mse: 64.81, mtg: 49.48 },
  { model: "GPT-4o", category: "Closed-source", overall: 58.73, mk: 61.23, mlu: 56.34, mr: 55.23, mse: 56.27, mtg: 64.58 },
  { model: "o1-mini", category: "Closed-source", overall: 57.86, mk: 56.03, mlu: 60.09, mr: 60.40, mse: 63.30, mtg: 49.48 },
  { model: "Baichuan-M1", category: "Specialized", overall: 60.34, mk: 70.69, mlu: 63.22, mr: 62.09, mse: 50.76, mtg: 54.95 },
  { model: "Baichuan-M1-14B", category: "Specialized", overall: 55.43, mk: 62.88, mlu: 40.53, mr: 55.23, mse: 70.03, mtg: 48.45 },
  { model: "HuatuoGPT-o1-72B", category: "Specialized", overall: 52.27, mk: 53.43, mlu: 49.45, mr: 56.86, mse: 56.27, mtg: 45.36 },
  { model: "DISC-MedLLM", category: "Specialized", overall: 11.34, mk: 7.09, mlu: 10.02, mr: 7.52, mse: 23.24, mtg: 2.06 },
];

0 commit comments

Comments
 (0)