forked from DinoFazlic/gnn-ddi
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathupdate_leaderboard.py
More file actions
369 lines (297 loc) · 14 KB
/
update_leaderboard.py
File metadata and controls
369 lines (297 loc) · 14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
"""
Leaderboard Update Script for GNN Molecular Graph Classification Challenge
==========================================================================
This script updates the leaderboard.md file with new submission scores
and optional efficiency metrics.
Usage:
python update_leaderboard.py <submission_file> [--efficiency <score>] [--params <count>] [--time <ms>]
Environment Variables:
ACTOR: GitHub username of the participant (set by GitHub Actions)
The script:
1. Computes the score for the submission
2. Updates the participant's entry in leaderboard.md (keeping best score)
3. Optionally records efficiency metrics (inference time, parameters)
4. Sorts the leaderboard by score (descending)
Leaderboard columns:
- Rank: Position based on Macro F1 Score
- Participant: GitHub username or team name
- Macro-F1: Primary metric for ranking
- Efficiency: F1² / (log₁₀(time_ms) × log₁₀(params))
- Params: Model parameter count
- Time (ms): Average inference time per batch
- Last Updated: Submission date
"""
import os
import sys
import re
import math
import argparse
from typing import Dict, List, Any, Optional
import pandas as pd
from datetime import datetime
from sklearn.metrics import f1_score
def load_leaderboard(leaderboard_path: str) -> List[Dict[str, Any]]:
"""
Load the current leaderboard from markdown file.
Supports both old format (4 columns) and new format (7 columns with efficiency).
Returns:
list: List of entry dictionaries
"""
entries = []
if not os.path.exists(leaderboard_path):
return entries
with open(leaderboard_path, 'r') as f:
lines = f.readlines()
# Skip header lines (title, description, table header)
in_table = False
num_columns = 0
for line in lines:
line = line.strip()
# Detect table start and column count
if line.startswith('| Rank'):
in_table = True
num_columns = len([p for p in line.split('|') if p.strip()])
continue
# Skip separator line
if line.startswith('|---') or line.startswith('| ---'):
continue
# Parse table rows
if in_table and line.startswith('|'):
parts = [p.strip() for p in line.strip('|').split('|')]
if len(parts) >= 3:
try:
participant = parts[1].strip().strip('*') # Remove italic markers
score = float(parts[2].strip())
entry = {
'participant': participant,
'score': score,
'efficiency': None,
'params': None,
'time_ms': None,
'cliff_accuracy': None,
'date': ''
}
# Parse extended format (with efficiency metrics)
if num_columns >= 7 and len(parts) >= 6:
# Format: Rank | Participant | Macro-F1 | Efficiency | Params | Time | Cliff Acc | Date
eff_str = parts[3].strip()
if eff_str and eff_str != '-':
entry['efficiency'] = float(eff_str)
params_str = parts[4].strip().replace(',', '').replace('K', '000').replace('M', '000000')
if params_str and params_str != '-':
entry['params'] = int(float(params_str))
time_str = parts[5].strip()
if time_str and time_str != '-':
entry['time_ms'] = float(time_str)
# Cliff accuracy column (index 6) if present
if len(parts) > 6:
cliff_str = parts[6].strip()
if cliff_str and cliff_str != '-':
try:
entry['cliff_accuracy'] = float(cliff_str)
except ValueError:
pass
entry['date'] = parts[7].strip() if len(parts) > 7 else (parts[6].strip() if len(parts) > 6 and not parts[6].strip().replace('.', '').isdigit() else '')
else:
# Old format: Rank | Participant | Score | Date
entry['date'] = parts[3].strip() if len(parts) > 3 else ''
entries.append(entry)
except (ValueError, IndexError):
continue
return entries
def format_params(params: Optional[int]) -> str:
"""Format parameter count with K/M suffix."""
if params is None:
return '-'
if params >= 1_000_000:
return f"{params / 1_000_000:.1f}M"
elif params >= 1_000:
return f"{params / 1_000:.1f}K"
return str(params)
def save_leaderboard(leaderboard_path: str, entries: List[Dict[str, Any]]) -> None:
"""
Save the leaderboard to markdown file with extended format.
Args:
leaderboard_path: Path to leaderboard.md
entries: List of entry dictionaries
"""
# Sort by score (descending)
entries.sort(key=lambda x: x['score'], reverse=True)
with open(leaderboard_path, 'w') as f:
# Header
f.write("# 🏆 Leaderboard\n\n")
f.write("Competition: **GNN Molecular Graph Classification Challenge**\n\n")
f.write("Primary Metric: **Macro F1 Score** (higher is better)\n\n")
f.write("Efficiency Metric: $\\text{Efficiency} = \\frac{F_1^2}{\\log_{10}(\\text{time}_{ms}) \\times \\log_{10}(\\text{params})}$\n\n")
f.write("---\n\n")
# Table header (extended format)
f.write("| Rank | Participant | Macro-F1 | Efficiency | Params | Time (ms) | Cliff Acc | Last Updated |\n")
f.write("|------|-------------|----------|------------|--------|-----------|-----------|-----------------|\n")
# Table rows
for i, entry in enumerate(entries, 1):
# Add medal emojis for top 3
if i == 1:
rank_str = "🥇 1"
elif i == 2:
rank_str = "🥈 2"
elif i == 3:
rank_str = "🥉 3"
else:
rank_str = str(i)
# Mark baseline entries
participant = entry['participant']
if 'baseline' in participant.lower():
participant = f"*{participant}*"
# Format efficiency metrics
eff_str = f"{entry['efficiency']:.4f}" if entry.get('efficiency') else '-'
params_str = format_params(entry.get('params'))
time_str = f"{entry['time_ms']:.1f}" if entry.get('time_ms') else '-'
cliff_str = f"{entry['cliff_accuracy']:.4f}" if entry.get('cliff_accuracy') is not None else '-'
f.write(f"| {rank_str} | {participant} | {entry['score']:.4f} | {eff_str} | {params_str} | {time_str} | {cliff_str} | {entry['date']} |\n")
# Footer
f.write("\n---\n\n")
f.write("### Legend\n\n")
f.write("- **Macro-F1**: Primary ranking metric (harmonic mean of class-wise F1 scores)\n")
f.write("- **Efficiency**: Higher is better - rewards both accuracy and computational efficiency\n")
f.write("- **Params**: Total number of trainable parameters\n")
f.write("- **Time (ms)**: Average inference time per batch\n")
f.write("- **Cliff Acc**: MMP-OOD Pairwise Cliff Accuracy — fraction of activity-cliff pairs correctly ranked\n\n")
f.write("*Italic entries are baseline models provided by organizers.*\n\n")
f.write(f"*Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M UTC')}*\n")
def compute_submission_score(submission_file: str, truth_file: str) -> float:
"""
Compute the macro F1 score for a submission.
Args:
submission_file: Path to submission CSV
truth_file: Path to ground truth CSV
Returns:
float: Macro F1 score
"""
submission_df = pd.read_csv(submission_file)
truth_df = pd.read_csv(truth_file)
# Merge on ID
merged = truth_df.merge(submission_df, on='id', suffixes=('_true', '_pred'))
y_true = merged['target_true'].values
y_pred = merged['target_pred'].values
return f1_score(y_true, y_pred, average='macro')
def compute_efficiency_score(f1: float, time_ms: float, params: int) -> float:
"""
Compute efficiency score.
Efficiency = F1² / (log₁₀(time_ms) × log₁₀(params))
Args:
f1: Macro F1 score
time_ms: Inference time in milliseconds
params: Number of model parameters
Returns:
Efficiency score (higher is better)
"""
if f1 <= 0 or time_ms <= 0 or params <= 0:
return 0.0
time_ms = max(time_ms, 0.1)
params = max(params, 100)
log_time = math.log10(time_ms)
log_params = math.log10(params)
denominator = log_time * log_params
if denominator <= 0:
denominator = max(log_params, 1.0)
return (f1 ** 2) / denominator
def main():
parser = argparse.ArgumentParser(
description='Update leaderboard with submission score and efficiency metrics'
)
parser.add_argument('submission_file', type=str, help='Path to submission CSV')
parser.add_argument('--efficiency', type=float, default=None, help='Pre-computed efficiency score')
parser.add_argument('--params', type=int, default=None, help='Model parameter count')
parser.add_argument('--time', type=float, default=None, help='Inference time in ms')
parser.add_argument('--cliff-acc', type=float, default=None, help='MMP-OOD cliff accuracy')
parser.add_argument('--participant', type=str, default=None, help='Override participant name')
args = parser.parse_args()
submission_file = args.submission_file
# Get participant name from args, environment, or filename
participant = args.participant or os.environ.get('ACTOR')
if not participant:
# Extract from filename (e.g., "alice.csv" -> "alice")
participant = os.path.basename(submission_file).replace('.csv', '')
# File paths
script_dir = os.path.dirname(os.path.abspath(__file__))
leaderboard_path = os.path.join(script_dir, 'leaderboard.md')
truth_file = os.path.join(script_dir, 'data', 'test_labels.csv')
print(f"Updating leaderboard for participant: {participant}")
print(f"Submission file: {submission_file}")
# Check if ground truth exists
if not os.path.exists(truth_file):
print(f"Error: Ground truth file not found: {truth_file}")
sys.exit(1)
# Compute score
try:
score = compute_submission_score(submission_file, truth_file)
print(f"Computed score: {score:.4f}")
except Exception as e:
print(f"Error computing score: {e}")
sys.exit(1)
# Compute efficiency if params and time provided
efficiency = args.efficiency
params = args.params
time_ms = args.time
cliff_acc = args.cliff_acc
if efficiency is None and params and time_ms:
efficiency = compute_efficiency_score(score, time_ms, params)
print(f"Computed efficiency: {efficiency:.4f}")
# Load current leaderboard
entries = load_leaderboard(leaderboard_path)
print(f"Current leaderboard has {len(entries)} entries")
# Check if participant already exists
existing_idx = None
for i, entry in enumerate(entries):
if entry['participant'].lower().strip('*') == participant.lower():
existing_idx = i
break
current_date = datetime.now().strftime('%Y-%m-%d')
if existing_idx is not None:
# Update existing entry if new score is better
old_score = entries[existing_idx]['score']
if score > old_score:
print(f"Updating score: {old_score:.4f} -> {score:.4f}")
entries[existing_idx]['score'] = score
entries[existing_idx]['date'] = current_date
# Update efficiency metrics
if efficiency is not None:
entries[existing_idx]['efficiency'] = efficiency
if params is not None:
entries[existing_idx]['params'] = params
if time_ms is not None:
entries[existing_idx]['time_ms'] = time_ms
if cliff_acc is not None:
entries[existing_idx]['cliff_accuracy'] = cliff_acc
else:
print(f"Keeping existing score: {old_score:.4f} (new score: {score:.4f})")
# Still update efficiency if not set and now provided
if entries[existing_idx].get('efficiency') is None and efficiency is not None:
entries[existing_idx]['efficiency'] = efficiency
entries[existing_idx]['params'] = params
entries[existing_idx]['time_ms'] = time_ms
if entries[existing_idx].get('cliff_accuracy') is None and cliff_acc is not None:
entries[existing_idx]['cliff_accuracy'] = cliff_acc
else:
# Add new entry
print(f"Adding new entry for {participant}")
entries.append({
'participant': participant,
'score': score,
'efficiency': efficiency,
'params': params,
'time_ms': time_ms,
'cliff_accuracy': cliff_acc,
'date': current_date
})
# Save updated leaderboard
save_leaderboard(leaderboard_path, entries)
print(f"Leaderboard updated successfully!")
# Print current top 5
entries.sort(key=lambda x: x['score'], reverse=True)
print("\nTop 5 on leaderboard:")
for i, entry in enumerate(entries[:5], 1):
eff_str = f", eff={entry['efficiency']:.3f}" if entry.get('efficiency') else ""
print(f" {i}. {entry['participant']}: {entry['score']:.4f}{eff_str}")
if __name__ == "__main__":
main()