Spaces:
Running
Running
Yuxuan-Zhang-Dexter
commited on
Commit
·
5d62091
1
Parent(s):
d44f890
update grok-3-mini data
Browse files- assets/model_color.json +1 -0
- assets/news.json +6 -0
- data_visualization.py +4 -4
- rank_data_03_25_2025.json +64 -35
assets/model_color.json
CHANGED
@@ -16,6 +16,7 @@
|
|
16 |
"o3-2025-04-16": "#26C6DA",
|
17 |
"o4-mini-2025-04-16": "#00ACC1",
|
18 |
"grok-3-beta": "#FF7043",
|
|
|
19 |
"deepseek-v3": "#FFC107",
|
20 |
"deepseek-r1": "#FFA000",
|
21 |
"llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA"
|
|
|
16 |
"o3-2025-04-16": "#26C6DA",
|
17 |
"o4-mini-2025-04-16": "#00ACC1",
|
18 |
"grok-3-beta": "#FF7043",
|
19 |
+
"grok-3-mini-beta": "#FF8A65",
|
20 |
"deepseek-v3": "#FFC107",
|
21 |
"deepseek-r1": "#FFA000",
|
22 |
"llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA"
|
assets/news.json
CHANGED
@@ -1,5 +1,11 @@
|
|
1 |
{
|
2 |
"news": [
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
{
|
4 |
"date": "2025-04-24",
|
5 |
"video_link": "https://www.youtube.com/watch?v=NB1-5aKV9v4",
|
|
|
1 |
{
|
2 |
"news": [
|
3 |
+
{
|
4 |
+
"date": "2025-04-28",
|
5 |
+
"video_link": "https://www.youtube.com/watch?v=OEQRhBKYxIE",
|
6 |
+
"twitter_text": "Grok-3-mini-beta Joins the Battle: Outperforms Gemini 2.5 Flash, Challenges O3-mini Across Games — Full Grok-3-beta Power Yet to Come. 🚀",
|
7 |
+
"twitter_link": "https://x.com/haoailab"
|
8 |
+
},
|
9 |
{
|
10 |
"date": "2025-04-24",
|
11 |
"video_link": "https://www.youtube.com/watch?v=NB1-5aKV9v4",
|
data_visualization.py
CHANGED
@@ -364,7 +364,7 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
|
364 |
|
365 |
is_highlighted = highlight_models and player in highlight_models
|
366 |
color = 'red' if is_highlighted else MODEL_COLORS.get(player, '#808080')
|
367 |
-
fillcolor = 'rgba(255, 0, 0, 0.
|
368 |
|
369 |
r = [row[f"norm_{col}"] for col in game_cols]
|
370 |
|
@@ -377,8 +377,8 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
|
377 |
mode='lines+markers',
|
378 |
fill='toself',
|
379 |
name=display_name, # Use lowercase name in legend
|
380 |
-
line=dict(color=color, width=
|
381 |
-
marker=dict(color=color),
|
382 |
fillcolor=fillcolor,
|
383 |
opacity=1.0 if is_highlighted else 0.7,
|
384 |
hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
|
@@ -604,4 +604,4 @@ def create_player_radar_chart(rank_data, player_name):
|
|
604 |
|
605 |
|
606 |
def save_visualization(fig, filename):
|
607 |
-
fig.write_image(filename)
|
|
|
364 |
|
365 |
is_highlighted = highlight_models and player in highlight_models
|
366 |
color = 'red' if is_highlighted else MODEL_COLORS.get(player, '#808080')
|
367 |
+
fillcolor = 'rgba(255, 0, 0, 0.4)' if is_highlighted else hex_to_rgba(color, 0.2)
|
368 |
|
369 |
r = [row[f"norm_{col}"] for col in game_cols]
|
370 |
|
|
|
377 |
mode='lines+markers',
|
378 |
fill='toself',
|
379 |
name=display_name, # Use lowercase name in legend
|
380 |
+
line=dict(color=color, width=6 if is_highlighted else 2),
|
381 |
+
marker=dict(color=color, size=10 if is_highlighted else 6),
|
382 |
fillcolor=fillcolor,
|
383 |
opacity=1.0 if is_highlighted else 0.7,
|
384 |
hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
|
|
|
604 |
|
605 |
|
606 |
def save_visualization(fig, filename):
|
607 |
+
fig.write_image(filename)
|
rank_data_03_25_2025.json
CHANGED
@@ -50,25 +50,32 @@
|
|
50 |
"runs": 1,
|
51 |
"results": [
|
52 |
{
|
53 |
-
"model": "
|
54 |
"score": 256,
|
55 |
-
"steps":
|
56 |
-
"time": "
|
57 |
"rank": 1
|
58 |
},
|
59 |
{
|
60 |
-
"model": "
|
61 |
"score": 256,
|
62 |
-
"steps":
|
63 |
-
"time": "
|
64 |
-
"rank":
|
65 |
},
|
66 |
{
|
67 |
"model": "o1-2024-12-17",
|
68 |
"score": 256,
|
69 |
"steps": 116,
|
70 |
"time": ">200",
|
71 |
-
"rank":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
},
|
73 |
{
|
74 |
"model": "claude-3-7-sonnet-20250219",
|
@@ -231,47 +238,54 @@
|
|
231 |
"steps": 25,
|
232 |
"rank": 3
|
233 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
{
|
235 |
"model": "o1-2024-12-17",
|
236 |
"score_runs": "96;114;83",
|
237 |
"average_score": 97.67,
|
238 |
"steps": 25,
|
239 |
-
"rank":
|
240 |
},
|
241 |
{
|
242 |
"model": "deepseek-r1",
|
243 |
"score_runs": "62;108;105",
|
244 |
"average_score": 91.67,
|
245 |
"steps": 25,
|
246 |
-
"rank":
|
247 |
},
|
248 |
{
|
249 |
"model": "gemini-2.5-flash-preview-04-17",
|
250 |
"score_runs": "59",
|
251 |
"average_score": 59,
|
252 |
"steps": 25,
|
253 |
-
"rank":
|
254 |
},
|
255 |
{
|
256 |
"model": "gemini-2.5-pro-exp-03-25",
|
257 |
"score_runs": "50;36;68",
|
258 |
"average_score": 51.33,
|
259 |
"steps": 25,
|
260 |
-
"rank":
|
261 |
},
|
262 |
{
|
263 |
"model": "claude-3-7-sonnet-20250219(thinking)",
|
264 |
"score_runs": "36;46;24",
|
265 |
"average_score": 35.33,
|
266 |
"steps": 25,
|
267 |
-
"rank":
|
268 |
},
|
269 |
{
|
270 |
"model": "gemini-2.0-flash-thinking-exp-1219",
|
271 |
"score_runs": "0;15;39",
|
272 |
"average_score": 18,
|
273 |
"steps": 25,
|
274 |
-
"rank":
|
275 |
},
|
276 |
{
|
277 |
"model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
|
@@ -312,92 +326,98 @@
|
|
312 |
"steps": "[16, 40, 59, 110]",
|
313 |
"rank": 1
|
314 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
{
|
316 |
"model": "o3-mini-2025-01-31(medium)",
|
317 |
"levels_cracked": "2; 3; 2",
|
318 |
"steps": "[17,52,68];[24,58,78,91];[19,44,64]",
|
319 |
-
"rank":
|
320 |
},
|
321 |
{
|
322 |
"model": "gemini-2.5-pro-exp-03-25",
|
323 |
"levels_cracked": "2;2;3",
|
324 |
"steps": "[23, 46, 79]; [20,50,77]; [26,95,125,175]",
|
325 |
-
"rank":
|
326 |
},
|
327 |
{
|
328 |
"model": "gemini-2.5-flash-preview-04-17",
|
329 |
"levels_cracked": "2",
|
330 |
"steps": "[24, 50, 60]",
|
331 |
-
"rank":
|
332 |
},
|
333 |
{
|
334 |
"model": "o4-mini-2025-04-16",
|
335 |
"levels_cracked": "2",
|
336 |
"steps": "",
|
337 |
-
"rank":
|
338 |
},
|
339 |
{
|
340 |
"model": "claude-3-7-sonnet-20250219(thinking)",
|
341 |
"levels_cracked": "1; 2; 0",
|
342 |
"steps": "[17,35];[15,40,43];[4]",
|
343 |
-
"rank":
|
344 |
},
|
345 |
{
|
346 |
"model": "o1-2024-12-17",
|
347 |
"levels_cracked": "1; 1; 1",
|
348 |
"steps": null,
|
349 |
-
"rank":
|
350 |
},
|
351 |
{
|
352 |
"model": "deepseek-r1",
|
353 |
"levels_cracked": "1; 0; 1",
|
354 |
"steps": "[19,42];[13];[19,36]",
|
355 |
"note": "stuck",
|
356 |
-
"rank":
|
357 |
},
|
358 |
{
|
359 |
"model": "o1-mini-2024-09-12",
|
360 |
"levels_cracked": "0;1;0",
|
361 |
"steps": null,
|
362 |
-
"rank":
|
363 |
},
|
364 |
{
|
365 |
"model": "gemini-2.0-flash-thinking-exp-1219",
|
366 |
"levels_cracked": "0; 0; 0",
|
367 |
"steps": "[23]; [14]; [14]",
|
368 |
-
"rank":
|
369 |
},
|
370 |
{
|
371 |
"model": "gpt-4o-2024-11-20",
|
372 |
"levels_cracked": "0; 0; 0",
|
373 |
"steps": "[68];[105];[168]",
|
374 |
"note": "stuck in a loop",
|
375 |
-
"rank":
|
376 |
},
|
377 |
{
|
378 |
"model": "claude-3-5-sonnet-20241022",
|
379 |
"levels_cracked": "0; 0; 0",
|
380 |
"steps": "[21]; [30]; [51]",
|
381 |
"note": "stuck in a loop",
|
382 |
-
"rank":
|
383 |
},
|
384 |
{
|
385 |
"model": "deepseek-v3",
|
386 |
"levels_cracked": "0; 0; 0",
|
387 |
"steps": "[9]; [47]; [64]",
|
388 |
-
"rank":
|
389 |
},
|
390 |
{
|
391 |
"model": "gpt-4.1-2025-04-14",
|
392 |
"levels_cracked": "0; 0; 0",
|
393 |
"steps": "[9]; [47]; [64]",
|
394 |
-
"rank":
|
395 |
},
|
396 |
{
|
397 |
"model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
|
398 |
"levels_cracked": "0;0;0",
|
399 |
"steps": "[5]",
|
400 |
-
"rank":
|
401 |
}
|
402 |
]
|
403 |
},
|
@@ -440,12 +460,21 @@
|
|
440 |
"score": 8,
|
441 |
"note": "failed to present evidence"
|
442 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
443 |
{
|
444 |
"model": "claude-3-5-sonnet-20241022",
|
445 |
"levels_cracked": "1",
|
446 |
"lives_left": "5, 5",
|
447 |
"cracked_details": "1:1/8",
|
448 |
-
"rank":
|
449 |
"score": 6,
|
450 |
"note": "stuck in loop"
|
451 |
},
|
@@ -454,7 +483,7 @@
|
|
454 |
"levels_cracked": "1",
|
455 |
"lives_left": "[4,5]",
|
456 |
"cracked_details": "1: 1/8",
|
457 |
-
"rank":
|
458 |
"score": 6,
|
459 |
"note": "stuck in loop"
|
460 |
},
|
@@ -463,7 +492,7 @@
|
|
463 |
"levels_cracked": "0",
|
464 |
"lives_left": "0",
|
465 |
"cracked_details": "1: 4/5",
|
466 |
-
"rank":
|
467 |
"score": 4,
|
468 |
"note": "stuck in the last option section"
|
469 |
},
|
@@ -472,7 +501,7 @@
|
|
472 |
"levels_cracked": "0",
|
473 |
"lives_left": "0",
|
474 |
"cracked_details": "1: 4/5",
|
475 |
-
"rank":
|
476 |
"score": 4,
|
477 |
"note": "stuck in the last option section"
|
478 |
},
|
@@ -481,7 +510,7 @@
|
|
481 |
"levels_cracked": "0",
|
482 |
"lives_left": "0",
|
483 |
"cracked_details": "1: 4/5",
|
484 |
-
"rank":
|
485 |
"score": 4,
|
486 |
"note": "stuck in the 3rd evidence present"
|
487 |
},
|
@@ -490,7 +519,7 @@
|
|
490 |
"levels_cracked": "0",
|
491 |
"lives_left": "0",
|
492 |
"cracked_details": "1:1/5",
|
493 |
-
"rank":
|
494 |
"score": 1,
|
495 |
"note": "failed to present evidence"
|
496 |
},
|
@@ -499,7 +528,7 @@
|
|
499 |
"levels_cracked": "0",
|
500 |
"lives_left": "0",
|
501 |
"cracked_details": "0:0/5",
|
502 |
-
"rank":
|
503 |
"score": 0,
|
504 |
"note": "failed to present evidence"
|
505 |
}
|
|
|
50 |
"runs": 1,
|
51 |
"results": [
|
52 |
{
|
53 |
+
"model": "claude-3-7-sonnet-20250219(thinking)",
|
54 |
"score": 256,
|
55 |
+
"steps": 114,
|
56 |
+
"time": ">200",
|
57 |
"rank": 1
|
58 |
},
|
59 |
{
|
60 |
+
"model": "grok-3-mini-beta",
|
61 |
"score": 256,
|
62 |
+
"steps": 108,
|
63 |
+
"time": "58:09",
|
64 |
+
"rank": 1
|
65 |
},
|
66 |
{
|
67 |
"model": "o1-2024-12-17",
|
68 |
"score": 256,
|
69 |
"steps": 116,
|
70 |
"time": ">200",
|
71 |
+
"rank": 1
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"model": "o3-2025-04-16",
|
75 |
+
"score": 256,
|
76 |
+
"steps": 108,
|
77 |
+
"time": "58:09",
|
78 |
+
"rank": 1
|
79 |
},
|
80 |
{
|
81 |
"model": "claude-3-7-sonnet-20250219",
|
|
|
238 |
"steps": 25,
|
239 |
"rank": 3
|
240 |
},
|
241 |
+
{
|
242 |
+
"model": "grok-3-mini-beta",
|
243 |
+
"score_runs": "106",
|
244 |
+
"average_score": 106,
|
245 |
+
"steps": 25,
|
246 |
+
"rank": 4
|
247 |
+
},
|
248 |
{
|
249 |
"model": "o1-2024-12-17",
|
250 |
"score_runs": "96;114;83",
|
251 |
"average_score": 97.67,
|
252 |
"steps": 25,
|
253 |
+
"rank": 5
|
254 |
},
|
255 |
{
|
256 |
"model": "deepseek-r1",
|
257 |
"score_runs": "62;108;105",
|
258 |
"average_score": 91.67,
|
259 |
"steps": 25,
|
260 |
+
"rank": 6
|
261 |
},
|
262 |
{
|
263 |
"model": "gemini-2.5-flash-preview-04-17",
|
264 |
"score_runs": "59",
|
265 |
"average_score": 59,
|
266 |
"steps": 25,
|
267 |
+
"rank": 7
|
268 |
},
|
269 |
{
|
270 |
"model": "gemini-2.5-pro-exp-03-25",
|
271 |
"score_runs": "50;36;68",
|
272 |
"average_score": 51.33,
|
273 |
"steps": 25,
|
274 |
+
"rank": 8
|
275 |
},
|
276 |
{
|
277 |
"model": "claude-3-7-sonnet-20250219(thinking)",
|
278 |
"score_runs": "36;46;24",
|
279 |
"average_score": 35.33,
|
280 |
"steps": 25,
|
281 |
+
"rank": 9
|
282 |
},
|
283 |
{
|
284 |
"model": "gemini-2.0-flash-thinking-exp-1219",
|
285 |
"score_runs": "0;15;39",
|
286 |
"average_score": 18,
|
287 |
"steps": 25,
|
288 |
+
"rank": 10
|
289 |
},
|
290 |
{
|
291 |
"model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
|
326 |
"steps": "[16, 40, 59, 110]",
|
327 |
"rank": 1
|
328 |
},
|
329 |
+
{
|
330 |
+
"model": "grok-3-mini-beta",
|
331 |
+
"levels_cracked": "3",
|
332 |
+
"steps": "[14, 36, 55, 78]",
|
333 |
+
"rank": 2
|
334 |
+
},
|
335 |
{
|
336 |
"model": "o3-mini-2025-01-31(medium)",
|
337 |
"levels_cracked": "2; 3; 2",
|
338 |
"steps": "[17,52,68];[24,58,78,91];[19,44,64]",
|
339 |
+
"rank": 3
|
340 |
},
|
341 |
{
|
342 |
"model": "gemini-2.5-pro-exp-03-25",
|
343 |
"levels_cracked": "2;2;3",
|
344 |
"steps": "[23, 46, 79]; [20,50,77]; [26,95,125,175]",
|
345 |
+
"rank": 4
|
346 |
},
|
347 |
{
|
348 |
"model": "gemini-2.5-flash-preview-04-17",
|
349 |
"levels_cracked": "2",
|
350 |
"steps": "[24, 50, 60]",
|
351 |
+
"rank": 5
|
352 |
},
|
353 |
{
|
354 |
"model": "o4-mini-2025-04-16",
|
355 |
"levels_cracked": "2",
|
356 |
"steps": "",
|
357 |
+
"rank": 6
|
358 |
},
|
359 |
{
|
360 |
"model": "claude-3-7-sonnet-20250219(thinking)",
|
361 |
"levels_cracked": "1; 2; 0",
|
362 |
"steps": "[17,35];[15,40,43];[4]",
|
363 |
+
"rank": 7
|
364 |
},
|
365 |
{
|
366 |
"model": "o1-2024-12-17",
|
367 |
"levels_cracked": "1; 1; 1",
|
368 |
"steps": null,
|
369 |
+
"rank": 8
|
370 |
},
|
371 |
{
|
372 |
"model": "deepseek-r1",
|
373 |
"levels_cracked": "1; 0; 1",
|
374 |
"steps": "[19,42];[13];[19,36]",
|
375 |
"note": "stuck",
|
376 |
+
"rank": 9
|
377 |
},
|
378 |
{
|
379 |
"model": "o1-mini-2024-09-12",
|
380 |
"levels_cracked": "0;1;0",
|
381 |
"steps": null,
|
382 |
+
"rank": 10
|
383 |
},
|
384 |
{
|
385 |
"model": "gemini-2.0-flash-thinking-exp-1219",
|
386 |
"levels_cracked": "0; 0; 0",
|
387 |
"steps": "[23]; [14]; [14]",
|
388 |
+
"rank": 11
|
389 |
},
|
390 |
{
|
391 |
"model": "gpt-4o-2024-11-20",
|
392 |
"levels_cracked": "0; 0; 0",
|
393 |
"steps": "[68];[105];[168]",
|
394 |
"note": "stuck in a loop",
|
395 |
+
"rank": 12
|
396 |
},
|
397 |
{
|
398 |
"model": "claude-3-5-sonnet-20241022",
|
399 |
"levels_cracked": "0; 0; 0",
|
400 |
"steps": "[21]; [30]; [51]",
|
401 |
"note": "stuck in a loop",
|
402 |
+
"rank": 13
|
403 |
},
|
404 |
{
|
405 |
"model": "deepseek-v3",
|
406 |
"levels_cracked": "0; 0; 0",
|
407 |
"steps": "[9]; [47]; [64]",
|
408 |
+
"rank": 14
|
409 |
},
|
410 |
{
|
411 |
"model": "gpt-4.1-2025-04-14",
|
412 |
"levels_cracked": "0; 0; 0",
|
413 |
"steps": "[9]; [47]; [64]",
|
414 |
+
"rank": 15
|
415 |
},
|
416 |
{
|
417 |
"model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
|
418 |
"levels_cracked": "0;0;0",
|
419 |
"steps": "[5]",
|
420 |
+
"rank": 17
|
421 |
}
|
422 |
]
|
423 |
},
|
|
|
460 |
"score": 8,
|
461 |
"note": "failed to present evidence"
|
462 |
},
|
463 |
+
{
|
464 |
+
"model": "grok-3-mini-beta",
|
465 |
+
"levels_cracked": "1",
|
466 |
+
"lives_left": "[3, 0]",
|
467 |
+
"cracked_details": "2: 2/9",
|
468 |
+
"rank": 5,
|
469 |
+
"score": 7,
|
470 |
+
"note": "failed to present evidence"
|
471 |
+
},
|
472 |
{
|
473 |
"model": "claude-3-5-sonnet-20241022",
|
474 |
"levels_cracked": "1",
|
475 |
"lives_left": "5, 5",
|
476 |
"cracked_details": "1:1/8",
|
477 |
+
"rank": 6,
|
478 |
"score": 6,
|
479 |
"note": "stuck in loop"
|
480 |
},
|
|
|
483 |
"levels_cracked": "1",
|
484 |
"lives_left": "[4,5]",
|
485 |
"cracked_details": "1: 1/8",
|
486 |
+
"rank": 7,
|
487 |
"score": 6,
|
488 |
"note": "stuck in loop"
|
489 |
},
|
|
|
492 |
"levels_cracked": "0",
|
493 |
"lives_left": "0",
|
494 |
"cracked_details": "1: 4/5",
|
495 |
+
"rank": 8,
|
496 |
"score": 4,
|
497 |
"note": "stuck in the last option section"
|
498 |
},
|
|
|
501 |
"levels_cracked": "0",
|
502 |
"lives_left": "0",
|
503 |
"cracked_details": "1: 4/5",
|
504 |
+
"rank": 9,
|
505 |
"score": 4,
|
506 |
"note": "stuck in the last option section"
|
507 |
},
|
|
|
510 |
"levels_cracked": "0",
|
511 |
"lives_left": "0",
|
512 |
"cracked_details": "1: 4/5",
|
513 |
+
"rank": 10,
|
514 |
"score": 4,
|
515 |
"note": "stuck in the 3rd evidence present"
|
516 |
},
|
|
|
519 |
"levels_cracked": "0",
|
520 |
"lives_left": "0",
|
521 |
"cracked_details": "1:1/5",
|
522 |
+
"rank": 11,
|
523 |
"score": 1,
|
524 |
"note": "failed to present evidence"
|
525 |
},
|
|
|
528 |
"levels_cracked": "0",
|
529 |
"lives_left": "0",
|
530 |
"cracked_details": "0:0/5",
|
531 |
+
"rank": 13,
|
532 |
"score": 0,
|
533 |
"note": "failed to present evidence"
|
534 |
}
|