Spaces:
Running
Running
Yuxuan-Zhang-Dexter
committed on
Commit
·
9caee78
1
Parent(s):
dafeb92
update leaderboard
Browse files
- assets/news.json +1 -1
- leaderboard_utils.py +2 -0
- rank_data_03_25_2025.json +0 -29
assets/news.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
{
|
2 |
"news": [
|
3 |
{
|
4 |
-
"date": "2025-04-
|
5 |
"video_link": "https://www.youtube.com/watch?v=NB1-5aKV9v4",
|
6 |
"twitter_text": "Zero-Shot AI Gaming Showdown: O3 Multi-Modal Might Sweeps Sokoban & 2048, Lands Top-2 in Phoenix Wright & Candy Crush",
|
7 |
"twitter_link": "https://x.com/haoailab"
|
|
|
1 |
{
|
2 |
"news": [
|
3 |
{
|
4 |
+
"date": "2025-04-24",
|
5 |
"video_link": "https://www.youtube.com/watch?v=NB1-5aKV9v4",
|
6 |
"twitter_text": "Zero-Shot AI Gaming Showdown: O3 Multi-Modal Might Sweeps Sokoban & 2048, Lands Top-2 in Phoenix Wright & Candy Crush",
|
7 |
"twitter_link": "https://x.com/haoailab"
|
leaderboard_utils.py
CHANGED
@@ -25,6 +25,8 @@ def get_organization(model_name):
|
|
25 |
return "deepseek"
|
26 |
elif "llama" in m:
|
27 |
return "meta"
|
|
|
|
|
28 |
else:
|
29 |
return "unknown"
|
30 |
|
|
|
25 |
return "deepseek"
|
26 |
elif "llama" in m:
|
27 |
return "meta"
|
28 |
+
elif "grok" in m:
|
29 |
+
return "xai"
|
30 |
else:
|
31 |
return "unknown"
|
32 |
|
rank_data_03_25_2025.json
CHANGED
@@ -126,13 +126,6 @@
|
|
126 |
"time": "",
|
127 |
"rank": 11
|
128 |
},
|
129 |
-
{
|
130 |
-
"model": "grok3-beta",
|
131 |
-
"score": 128,
|
132 |
-
"steps": "",
|
133 |
-
"time": "",
|
134 |
-
"rank": 12
|
135 |
-
},
|
136 |
{
|
137 |
"model": "claude-3-5-sonnet-20241022",
|
138 |
"score": 64,
|
@@ -280,13 +273,6 @@
|
|
280 |
"steps": 25,
|
281 |
"rank": 9
|
282 |
},
|
283 |
-
{
|
284 |
-
"model": "grok-3-beta",
|
285 |
-
"score_runs": "11",
|
286 |
-
"average_score": 11,
|
287 |
-
"steps": 25,
|
288 |
-
"rank": 10
|
289 |
-
},
|
290 |
{
|
291 |
"model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
|
292 |
"score_runs": "6;0;0",
|
@@ -407,12 +393,6 @@
|
|
407 |
"steps": "[9]; [47]; [64]",
|
408 |
"rank": 14
|
409 |
},
|
410 |
-
{
|
411 |
-
"model": "grok-3-beta",
|
412 |
-
"levels_cracked": "0",
|
413 |
-
"steps": "",
|
414 |
-
"rank": 15
|
415 |
-
},
|
416 |
{
|
417 |
"model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
|
418 |
"levels_cracked": "0;0;0",
|
@@ -514,15 +494,6 @@
|
|
514 |
"score": 1,
|
515 |
"note": "failed to present evidence"
|
516 |
},
|
517 |
-
{
|
518 |
-
"model": "grok-3-beta",
|
519 |
-
"levels_cracked": "0",
|
520 |
-
"lives_left": "0",
|
521 |
-
"cracked_details": "1:1/5",
|
522 |
-
"rank": 11,
|
523 |
-
"score": 1,
|
524 |
-
"note": "failed to present evidence"
|
525 |
-
},
|
526 |
{
|
527 |
"model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
|
528 |
"levels_cracked": "0",
|
|
|
126 |
"time": "",
|
127 |
"rank": 11
|
128 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
{
|
130 |
"model": "claude-3-5-sonnet-20241022",
|
131 |
"score": 64,
|
|
|
273 |
"steps": 25,
|
274 |
"rank": 9
|
275 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
276 |
{
|
277 |
"model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
|
278 |
"score_runs": "6;0;0",
|
|
|
393 |
"steps": "[9]; [47]; [64]",
|
394 |
"rank": 14
|
395 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
396 |
{
|
397 |
"model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
|
398 |
"levels_cracked": "0;0;0",
|
|
|
494 |
"score": 1,
|
495 |
"note": "failed to present evidence"
|
496 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
497 |
{
|
498 |
"model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
|
499 |
"levels_cracked": "0",
|