Yuxuan-Zhang-Dexter committed on
Commit
9caee78
·
1 Parent(s): dafeb92

update leaderboard

Browse files
assets/news.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "news": [
3
  {
4
- "date": "2025-04-23",
5
  "video_link": "https://www.youtube.com/watch?v=NB1-5aKV9v4",
6
  "twitter_text": "Zero-Shot AI Gaming Showdown: O3 Multi-Modal Might Sweeps Sokoban & 2048, Lands Top-2 in Phoenix Wright & Candy Crush",
7
  "twitter_link": "https://x.com/haoailab"
 
1
  {
2
  "news": [
3
  {
4
+ "date": "2025-04-24",
5
  "video_link": "https://www.youtube.com/watch?v=NB1-5aKV9v4",
6
  "twitter_text": "Zero-Shot AI Gaming Showdown: O3 Multi-Modal Might Sweeps Sokoban & 2048, Lands Top-2 in Phoenix Wright & Candy Crush",
7
  "twitter_link": "https://x.com/haoailab"
leaderboard_utils.py CHANGED
@@ -25,6 +25,8 @@ def get_organization(model_name):
25
  return "deepseek"
26
  elif "llama" in m:
27
  return "meta"
 
 
28
  else:
29
  return "unknown"
30
 
 
25
  return "deepseek"
26
  elif "llama" in m:
27
  return "meta"
28
+ elif "grok" in m:
29
+ return "xai"
30
  else:
31
  return "unknown"
32
 
rank_data_03_25_2025.json CHANGED
@@ -126,13 +126,6 @@
126
  "time": "",
127
  "rank": 11
128
  },
129
- {
130
- "model": "grok3-beta",
131
- "score": 128,
132
- "steps": "",
133
- "time": "",
134
- "rank": 12
135
- },
136
  {
137
  "model": "claude-3-5-sonnet-20241022",
138
  "score": 64,
@@ -280,13 +273,6 @@
280
  "steps": 25,
281
  "rank": 9
282
  },
283
- {
284
- "model": "grok-3-beta",
285
- "score_runs": "11",
286
- "average_score": 11,
287
- "steps": 25,
288
- "rank": 10
289
- },
290
  {
291
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
292
  "score_runs": "6;0;0",
@@ -407,12 +393,6 @@
407
  "steps": "[9]; [47]; [64]",
408
  "rank": 14
409
  },
410
- {
411
- "model": "grok-3-beta",
412
- "levels_cracked": "0",
413
- "steps": "",
414
- "rank": 15
415
- },
416
  {
417
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
418
  "levels_cracked": "0;0;0",
@@ -514,15 +494,6 @@
514
  "score": 1,
515
  "note": "failed to present evidence"
516
  },
517
- {
518
- "model": "grok-3-beta",
519
- "levels_cracked": "0",
520
- "lives_left": "0",
521
- "cracked_details": "1:1/5",
522
- "rank": 11,
523
- "score": 1,
524
- "note": "failed to present evidence"
525
- },
526
  {
527
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
528
  "levels_cracked": "0",
 
126
  "time": "",
127
  "rank": 11
128
  },
 
 
 
 
 
 
 
129
  {
130
  "model": "claude-3-5-sonnet-20241022",
131
  "score": 64,
 
273
  "steps": 25,
274
  "rank": 9
275
  },
 
 
 
 
 
 
 
276
  {
277
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
278
  "score_runs": "6;0;0",
 
393
  "steps": "[9]; [47]; [64]",
394
  "rank": 14
395
  },
 
 
 
 
 
 
396
  {
397
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
398
  "levels_cracked": "0;0;0",
 
494
  "score": 1,
495
  "note": "failed to present evidence"
496
  },
 
 
 
 
 
 
 
 
 
497
  {
498
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
499
  "levels_cracked": "0",