Yuxuan-Zhang-Dexter committed on
Commit
5d62091
·
1 Parent(s): d44f890

update grok-3-mini data

Browse files
assets/model_color.json CHANGED
@@ -16,6 +16,7 @@
16
  "o3-2025-04-16": "#26C6DA",
17
  "o4-mini-2025-04-16": "#00ACC1",
18
  "grok-3-beta": "#FF7043",
 
19
  "deepseek-v3": "#FFC107",
20
  "deepseek-r1": "#FFA000",
21
  "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA"
 
16
  "o3-2025-04-16": "#26C6DA",
17
  "o4-mini-2025-04-16": "#00ACC1",
18
  "grok-3-beta": "#FF7043",
19
+ "grok-3-mini-beta": "#FF8A65",
20
  "deepseek-v3": "#FFC107",
21
  "deepseek-r1": "#FFA000",
22
  "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA"
assets/news.json CHANGED
@@ -1,5 +1,11 @@
1
  {
2
  "news": [
 
 
 
 
 
 
3
  {
4
  "date": "2025-04-24",
5
  "video_link": "https://www.youtube.com/watch?v=NB1-5aKV9v4",
 
1
  {
2
  "news": [
3
+ {
4
+ "date": "2025-04-28",
5
+ "video_link": "https://www.youtube.com/watch?v=OEQRhBKYxIE",
6
+ "twitter_text": "Grok-3-mini-beta Joins the Battle: Outperforms Gemini 2.5 Flash, Challenges O3-mini Across Games — Full Grok-3-beta Power Yet to Come. 🚀",
7
+ "twitter_link": "https://x.com/haoailab"
8
+ },
9
  {
10
  "date": "2025-04-24",
11
  "video_link": "https://www.youtube.com/watch?v=NB1-5aKV9v4",
data_visualization.py CHANGED
@@ -364,7 +364,7 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
364
 
365
  is_highlighted = highlight_models and player in highlight_models
366
  color = 'red' if is_highlighted else MODEL_COLORS.get(player, '#808080')
367
- fillcolor = 'rgba(255, 0, 0, 0.3)' if is_highlighted else hex_to_rgba(color, 0.2)
368
 
369
  r = [row[f"norm_{col}"] for col in game_cols]
370
 
@@ -377,8 +377,8 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
377
  mode='lines+markers',
378
  fill='toself',
379
  name=display_name, # Use lowercase name in legend
380
- line=dict(color=color, width=4 if is_highlighted else 2),
381
- marker=dict(color=color),
382
  fillcolor=fillcolor,
383
  opacity=1.0 if is_highlighted else 0.7,
384
  hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
@@ -604,4 +604,4 @@ def create_player_radar_chart(rank_data, player_name):
604
 
605
 
606
  def save_visualization(fig, filename):
607
- fig.write_image(filename)
 
364
 
365
  is_highlighted = highlight_models and player in highlight_models
366
  color = 'red' if is_highlighted else MODEL_COLORS.get(player, '#808080')
367
+ fillcolor = 'rgba(255, 0, 0, 0.4)' if is_highlighted else hex_to_rgba(color, 0.2)
368
 
369
  r = [row[f"norm_{col}"] for col in game_cols]
370
 
 
377
  mode='lines+markers',
378
  fill='toself',
379
  name=display_name, # Use lowercase name in legend
380
+ line=dict(color=color, width=6 if is_highlighted else 2),
381
+ marker=dict(color=color, size=10 if is_highlighted else 6),
382
  fillcolor=fillcolor,
383
  opacity=1.0 if is_highlighted else 0.7,
384
  hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
 
604
 
605
 
606
  def save_visualization(fig, filename):
607
+ fig.write_image(filename)
rank_data_03_25_2025.json CHANGED
@@ -50,25 +50,32 @@
50
  "runs": 1,
51
  "results": [
52
  {
53
- "model": "o3-2025-04-16",
54
  "score": 256,
55
- "steps": 108,
56
- "time": "58:09",
57
  "rank": 1
58
  },
59
  {
60
- "model": "claude-3-7-sonnet-20250219(thinking)",
61
  "score": 256,
62
- "steps": 114,
63
- "time": ">200",
64
- "rank": 2
65
  },
66
  {
67
  "model": "o1-2024-12-17",
68
  "score": 256,
69
  "steps": 116,
70
  "time": ">200",
71
- "rank": 3
 
 
 
 
 
 
 
72
  },
73
  {
74
  "model": "claude-3-7-sonnet-20250219",
@@ -231,47 +238,54 @@
231
  "steps": 25,
232
  "rank": 3
233
  },
 
 
 
 
 
 
 
234
  {
235
  "model": "o1-2024-12-17",
236
  "score_runs": "96;114;83",
237
  "average_score": 97.67,
238
  "steps": 25,
239
- "rank": 4
240
  },
241
  {
242
  "model": "deepseek-r1",
243
  "score_runs": "62;108;105",
244
  "average_score": 91.67,
245
  "steps": 25,
246
- "rank": 5
247
  },
248
  {
249
  "model": "gemini-2.5-flash-preview-04-17",
250
  "score_runs": "59",
251
  "average_score": 59,
252
  "steps": 25,
253
- "rank": 6
254
  },
255
  {
256
  "model": "gemini-2.5-pro-exp-03-25",
257
  "score_runs": "50;36;68",
258
  "average_score": 51.33,
259
  "steps": 25,
260
- "rank": 7
261
  },
262
  {
263
  "model": "claude-3-7-sonnet-20250219(thinking)",
264
  "score_runs": "36;46;24",
265
  "average_score": 35.33,
266
  "steps": 25,
267
- "rank": 8
268
  },
269
  {
270
  "model": "gemini-2.0-flash-thinking-exp-1219",
271
  "score_runs": "0;15;39",
272
  "average_score": 18,
273
  "steps": 25,
274
- "rank": 9
275
  },
276
  {
277
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
@@ -312,92 +326,98 @@
312
  "steps": "[16, 40, 59, 110]",
313
  "rank": 1
314
  },
 
 
 
 
 
 
315
  {
316
  "model": "o3-mini-2025-01-31(medium)",
317
  "levels_cracked": "2; 3; 2",
318
  "steps": "[17,52,68];[24,58,78,91];[19,44,64]",
319
- "rank": 2
320
  },
321
  {
322
  "model": "gemini-2.5-pro-exp-03-25",
323
  "levels_cracked": "2;2;3",
324
  "steps": "[23, 46, 79]; [20,50,77]; [26,95,125,175]",
325
- "rank": 3
326
  },
327
  {
328
  "model": "gemini-2.5-flash-preview-04-17",
329
  "levels_cracked": "2",
330
  "steps": "[24, 50, 60]",
331
- "rank": 4
332
  },
333
  {
334
  "model": "o4-mini-2025-04-16",
335
  "levels_cracked": "2",
336
  "steps": "",
337
- "rank": 5
338
  },
339
  {
340
  "model": "claude-3-7-sonnet-20250219(thinking)",
341
  "levels_cracked": "1; 2; 0",
342
  "steps": "[17,35];[15,40,43];[4]",
343
- "rank": 6
344
  },
345
  {
346
  "model": "o1-2024-12-17",
347
  "levels_cracked": "1; 1; 1",
348
  "steps": null,
349
- "rank": 7
350
  },
351
  {
352
  "model": "deepseek-r1",
353
  "levels_cracked": "1; 0; 1",
354
  "steps": "[19,42];[13];[19,36]",
355
  "note": "stuck",
356
- "rank": 8
357
  },
358
  {
359
  "model": "o1-mini-2024-09-12",
360
  "levels_cracked": "0;1;0",
361
  "steps": null,
362
- "rank": 9
363
  },
364
  {
365
  "model": "gemini-2.0-flash-thinking-exp-1219",
366
  "levels_cracked": "0; 0; 0",
367
  "steps": "[23]; [14]; [14]",
368
- "rank": 10
369
  },
370
  {
371
  "model": "gpt-4o-2024-11-20",
372
  "levels_cracked": "0; 0; 0",
373
  "steps": "[68];[105];[168]",
374
  "note": "stuck in a loop",
375
- "rank": 11
376
  },
377
  {
378
  "model": "claude-3-5-sonnet-20241022",
379
  "levels_cracked": "0; 0; 0",
380
  "steps": "[21]; [30]; [51]",
381
  "note": "stuck in a loop",
382
- "rank": 12
383
  },
384
  {
385
  "model": "deepseek-v3",
386
  "levels_cracked": "0; 0; 0",
387
  "steps": "[9]; [47]; [64]",
388
- "rank": 13
389
  },
390
  {
391
  "model": "gpt-4.1-2025-04-14",
392
  "levels_cracked": "0; 0; 0",
393
  "steps": "[9]; [47]; [64]",
394
- "rank": 14
395
  },
396
  {
397
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
398
  "levels_cracked": "0;0;0",
399
  "steps": "[5]",
400
- "rank": 16
401
  }
402
  ]
403
  },
@@ -440,12 +460,21 @@
440
  "score": 8,
441
  "note": "failed to present evidence"
442
  },
 
 
 
 
 
 
 
 
 
443
  {
444
  "model": "claude-3-5-sonnet-20241022",
445
  "levels_cracked": "1",
446
  "lives_left": "5, 5",
447
  "cracked_details": "1:1/8",
448
- "rank": 5,
449
  "score": 6,
450
  "note": "stuck in loop"
451
  },
@@ -454,7 +483,7 @@
454
  "levels_cracked": "1",
455
  "lives_left": "[4,5]",
456
  "cracked_details": "1: 1/8",
457
- "rank": 6,
458
  "score": 6,
459
  "note": "stuck in loop"
460
  },
@@ -463,7 +492,7 @@
463
  "levels_cracked": "0",
464
  "lives_left": "0",
465
  "cracked_details": "1: 4/5",
466
- "rank": 7,
467
  "score": 4,
468
  "note": "stuck in the last option section"
469
  },
@@ -472,7 +501,7 @@
472
  "levels_cracked": "0",
473
  "lives_left": "0",
474
  "cracked_details": "1: 4/5",
475
- "rank": 8,
476
  "score": 4,
477
  "note": "stuck in the last option section"
478
  },
@@ -481,7 +510,7 @@
481
  "levels_cracked": "0",
482
  "lives_left": "0",
483
  "cracked_details": "1: 4/5",
484
- "rank": 9,
485
  "score": 4,
486
  "note": "stuck in the 3rd evidence present"
487
  },
@@ -490,7 +519,7 @@
490
  "levels_cracked": "0",
491
  "lives_left": "0",
492
  "cracked_details": "1:1/5",
493
- "rank": 10,
494
  "score": 1,
495
  "note": "failed to present evidence"
496
  },
@@ -499,7 +528,7 @@
499
  "levels_cracked": "0",
500
  "lives_left": "0",
501
  "cracked_details": "0:0/5",
502
- "rank": 12,
503
  "score": 0,
504
  "note": "failed to present evidence"
505
  }
 
50
  "runs": 1,
51
  "results": [
52
  {
53
+ "model": "claude-3-7-sonnet-20250219(thinking)",
54
  "score": 256,
55
+ "steps": 114,
56
+ "time": ">200",
57
  "rank": 1
58
  },
59
  {
60
+ "model": "grok-3-mini-beta",
61
  "score": 256,
62
+ "steps": 108,
63
+ "time": "58:09",
64
+ "rank": 1
65
  },
66
  {
67
  "model": "o1-2024-12-17",
68
  "score": 256,
69
  "steps": 116,
70
  "time": ">200",
71
+ "rank": 1
72
+ },
73
+ {
74
+ "model": "o3-2025-04-16",
75
+ "score": 256,
76
+ "steps": 108,
77
+ "time": "58:09",
78
+ "rank": 1
79
  },
80
  {
81
  "model": "claude-3-7-sonnet-20250219",
 
238
  "steps": 25,
239
  "rank": 3
240
  },
241
+ {
242
+ "model": "grok-3-mini-beta",
243
+ "score_runs": "106",
244
+ "average_score": 106,
245
+ "steps": 25,
246
+ "rank": 4
247
+ },
248
  {
249
  "model": "o1-2024-12-17",
250
  "score_runs": "96;114;83",
251
  "average_score": 97.67,
252
  "steps": 25,
253
+ "rank": 5
254
  },
255
  {
256
  "model": "deepseek-r1",
257
  "score_runs": "62;108;105",
258
  "average_score": 91.67,
259
  "steps": 25,
260
+ "rank": 6
261
  },
262
  {
263
  "model": "gemini-2.5-flash-preview-04-17",
264
  "score_runs": "59",
265
  "average_score": 59,
266
  "steps": 25,
267
+ "rank": 7
268
  },
269
  {
270
  "model": "gemini-2.5-pro-exp-03-25",
271
  "score_runs": "50;36;68",
272
  "average_score": 51.33,
273
  "steps": 25,
274
+ "rank": 8
275
  },
276
  {
277
  "model": "claude-3-7-sonnet-20250219(thinking)",
278
  "score_runs": "36;46;24",
279
  "average_score": 35.33,
280
  "steps": 25,
281
+ "rank": 9
282
  },
283
  {
284
  "model": "gemini-2.0-flash-thinking-exp-1219",
285
  "score_runs": "0;15;39",
286
  "average_score": 18,
287
  "steps": 25,
288
+ "rank": 10
289
  },
290
  {
291
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
 
326
  "steps": "[16, 40, 59, 110]",
327
  "rank": 1
328
  },
329
+ {
330
+ "model": "grok-3-mini-beta",
331
+ "levels_cracked": "3",
332
+ "steps": "[14, 36, 55, 78]",
333
+ "rank": 2
334
+ },
335
  {
336
  "model": "o3-mini-2025-01-31(medium)",
337
  "levels_cracked": "2; 3; 2",
338
  "steps": "[17,52,68];[24,58,78,91];[19,44,64]",
339
+ "rank": 3
340
  },
341
  {
342
  "model": "gemini-2.5-pro-exp-03-25",
343
  "levels_cracked": "2;2;3",
344
  "steps": "[23, 46, 79]; [20,50,77]; [26,95,125,175]",
345
+ "rank": 4
346
  },
347
  {
348
  "model": "gemini-2.5-flash-preview-04-17",
349
  "levels_cracked": "2",
350
  "steps": "[24, 50, 60]",
351
+ "rank": 5
352
  },
353
  {
354
  "model": "o4-mini-2025-04-16",
355
  "levels_cracked": "2",
356
  "steps": "",
357
+ "rank": 6
358
  },
359
  {
360
  "model": "claude-3-7-sonnet-20250219(thinking)",
361
  "levels_cracked": "1; 2; 0",
362
  "steps": "[17,35];[15,40,43];[4]",
363
+ "rank": 7
364
  },
365
  {
366
  "model": "o1-2024-12-17",
367
  "levels_cracked": "1; 1; 1",
368
  "steps": null,
369
+ "rank": 8
370
  },
371
  {
372
  "model": "deepseek-r1",
373
  "levels_cracked": "1; 0; 1",
374
  "steps": "[19,42];[13];[19,36]",
375
  "note": "stuck",
376
+ "rank": 9
377
  },
378
  {
379
  "model": "o1-mini-2024-09-12",
380
  "levels_cracked": "0;1;0",
381
  "steps": null,
382
+ "rank": 10
383
  },
384
  {
385
  "model": "gemini-2.0-flash-thinking-exp-1219",
386
  "levels_cracked": "0; 0; 0",
387
  "steps": "[23]; [14]; [14]",
388
+ "rank": 11
389
  },
390
  {
391
  "model": "gpt-4o-2024-11-20",
392
  "levels_cracked": "0; 0; 0",
393
  "steps": "[68];[105];[168]",
394
  "note": "stuck in a loop",
395
+ "rank": 12
396
  },
397
  {
398
  "model": "claude-3-5-sonnet-20241022",
399
  "levels_cracked": "0; 0; 0",
400
  "steps": "[21]; [30]; [51]",
401
  "note": "stuck in a loop",
402
+ "rank": 13
403
  },
404
  {
405
  "model": "deepseek-v3",
406
  "levels_cracked": "0; 0; 0",
407
  "steps": "[9]; [47]; [64]",
408
+ "rank": 14
409
  },
410
  {
411
  "model": "gpt-4.1-2025-04-14",
412
  "levels_cracked": "0; 0; 0",
413
  "steps": "[9]; [47]; [64]",
414
+ "rank": 15
415
  },
416
  {
417
  "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
418
  "levels_cracked": "0;0;0",
419
  "steps": "[5]",
420
+ "rank": 17
421
  }
422
  ]
423
  },
 
460
  "score": 8,
461
  "note": "failed to present evidence"
462
  },
463
+ {
464
+ "model": "grok-3-mini-beta",
465
+ "levels_cracked": "1",
466
+ "lives_left": "[3, 0]",
467
+ "cracked_details": "2: 2/9",
468
+ "rank": 5,
469
+ "score": 7,
470
+ "note": "failed to present evidence"
471
+ },
472
  {
473
  "model": "claude-3-5-sonnet-20241022",
474
  "levels_cracked": "1",
475
  "lives_left": "5, 5",
476
  "cracked_details": "1:1/8",
477
+ "rank": 6,
478
  "score": 6,
479
  "note": "stuck in loop"
480
  },
 
483
  "levels_cracked": "1",
484
  "lives_left": "[4,5]",
485
  "cracked_details": "1: 1/8",
486
+ "rank": 7,
487
  "score": 6,
488
  "note": "stuck in loop"
489
  },
 
492
  "levels_cracked": "0",
493
  "lives_left": "0",
494
  "cracked_details": "1: 4/5",
495
+ "rank": 8,
496
  "score": 4,
497
  "note": "stuck in the last option section"
498
  },
 
501
  "levels_cracked": "0",
502
  "lives_left": "0",
503
  "cracked_details": "1: 4/5",
504
+ "rank": 9,
505
  "score": 4,
506
  "note": "stuck in the last option section"
507
  },
 
510
  "levels_cracked": "0",
511
  "lives_left": "0",
512
  "cracked_details": "1: 4/5",
513
+ "rank": 10,
514
  "score": 4,
515
  "note": "stuck in the 3rd evidence present"
516
  },
 
519
  "levels_cracked": "0",
520
  "lives_left": "0",
521
  "cracked_details": "1:1/5",
522
+ "rank": 11,
523
  "score": 1,
524
  "note": "failed to present evidence"
525
  },
 
528
  "levels_cracked": "0",
529
  "lives_left": "0",
530
  "cracked_details": "0:0/5",
531
+ "rank": 13,
532
  "score": 0,
533
  "note": "failed to present evidence"
534
  }