diff --git a/data/leaderboard.csv b/data/leaderboard.csv index fe8ad8d..e4b6140 100644 --- a/data/leaderboard.csv +++ b/data/leaderboard.csv @@ -1,9 +1,9 @@ Model,Count,End-to-End % Solved,Flag Success Count,Subtask Challenge Count,Subtask-Guided # Solved,Subtask-Guided % Solved,Sum of Subtask Run Scores,Subtasks % Solved,FST Standard,FST Subtask -GPT-4o,40,12.5,5,40,7,17.5,11.8,14,0:11,0:52 -OpenAI o1-preview,40,10,4,40,4,10,18.2,45.6,0:11,0:11 -Claude 3 Opus,40,10,4,40,5,12.5,14.4,17,0:11,0:11 -Claude 3.5 Sonnet,40,17.5,7,40,6,15,17.8,20.6,0:11,0:11 -Mixtral 8x22b Instruct,40,7.5,3,40,2,5,6.1,5.3,0:09,0:07 -Gemini 1.5 Pro,40,7.5,3,40,2,5,4.5,3,0:09,0:06 +GPT-4o,40,12.5,5,40,7,17.5,11.8,14.0,0:11,0:52 +OpenAI o1-preview,40,10,4,40,5,12.5,18.2,45.6,0:11,1:18 +Claude 3 Opus,40,10,4,40,5,12.5,14.4,17.0,0:11,0:11 +Claude 3.5 Sonnet,40,17.5,7,40,6,15.0,17.8,20.6,0:11,0:11 +Mixtral 8x22b Instruct,40,7.5,3,40,2,5.0,6.1,5.3,0:09,0:07 +Gemini 1.5 Pro,40,7.5,3,40,2,5.0,4.5,3.0,0:09,0:06 Llama 3 70b Chat,40,5,2,40,3,7.5,3.2,4.3,0:09,0:11 -Llama 3.1 405B Instruct,40,7.5,3,40,6,15,8.2,6.7,0:09,0:11 +Llama 3.1 405B Instruct,40,7.5,3,40,6,15.0,8.2,6.7,0:09,0:11 \ No newline at end of file