Spaces:

lmgame
/

game_arena_bench

Running

App Files Community

Yuxuan-Zhang-Dexter committed 20 days ago

Commit

f589e51

1 Parent(s): 93c11f0

update ace attorney game in the gradio app

Browse files

Files changed (8) hide show

app.py +46 -24
assets/game_video_link.json +3 -2
assets/model_color.json +8 -7
assets/news.json +6 -0
data_visualization.py +6 -2
leaderboard_utils.py +25 -3
rank_data_03_25_2025.json +78 -1
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -16,6 +16,7 @@ from leaderboard_utils import (
     get_candy_leaderboard,
     get_tetris_leaderboard,
     get_tetris_planning_leaderboard,
     get_combined_leaderboard,
     GAME_ORDER
 )
@@ -54,7 +55,8 @@ leaderboard_state = {
         "2048": True,
         "Candy Crash": True,
         "Tetris (complete)": True,
-        "Tetris (planning only)": True
     },
     "previous_details": {
         "Super Mario Bros": False,
@@ -62,7 +64,8 @@ leaderboard_state = {
         "2048": False,
         "Candy Crash": False,
         "Tetris (complete)": False,
-        "Tetris (planning only)": False
     }
 }
@@ -160,7 +163,8 @@ def update_leaderboard(mario_overall, mario_details,
                        _2048_overall, _2048_details,
                        candy_overall, candy_details,
                        tetris_overall, tetris_details,
-                       tetris_plan_overall, tetris_plan_details):
     global leaderboard_state
     # Convert current checkbox states to dictionary for easier comparison
@@ -170,7 +174,8 @@ def update_leaderboard(mario_overall, mario_details,
         "2048": _2048_overall,
         "Candy Crash": candy_overall,
         "Tetris (complete)": tetris_overall,
-        "Tetris (planning only)": tetris_plan_overall
     }
     current_details = {
@@ -179,7 +184,8 @@ def update_leaderboard(mario_overall, mario_details,
         "2048": _2048_details,
         "Candy Crash": candy_details,
         "Tetris (complete)": tetris_details,
-        "Tetris (planning only)": tetris_plan_details
     }
     # Find which game's state changed
@@ -235,12 +241,11 @@ def update_leaderboard(mario_overall, mario_details,
             leaderboard_state["previous_details"][changed_game] = False
             if leaderboard_state["current_game"] == changed_game:
                 leaderboard_state["current_game"] = None
-                # When exiting details view, reset to show all games
-                for game in current_overall.keys():
-                    current_overall[game] = True
-                    current_details[game] = False
-                    leaderboard_state["previous_overall"][game] = True
-                    leaderboard_state["previous_details"][game] = False
     # Special case: If all games are selected and we're trying to view details
     all_games_selected = all(current_overall.values()) and not any(current_details.values())
@@ -266,7 +271,8 @@ def update_leaderboard(mario_overall, mario_details,
         "2048": current_overall["2048"],
         "Candy Crash": current_overall["Candy Crash"],
         "Tetris (complete)": current_overall["Tetris (complete)"],
-        "Tetris (planning only)": current_overall["Tetris (planning only)"]
     }
     # Get the appropriate DataFrame and charts based on current state
@@ -282,8 +288,10 @@ def update_leaderboard(mario_overall, mario_details,
             df = get_candy_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Tetris (complete)":
             df = get_tetris_leaderboard(rank_data)
-        else:  # Tetris (planning only)
             df = get_tetris_planning_leaderboard(rank_data)
         # Format the DataFrame for display
         display_df = prepare_dataframe_for_display(df, leaderboard_state["current_game"])
@@ -303,21 +311,23 @@ def update_leaderboard(mario_overall, mario_details,
         chart = radar_chart
         group_bar_chart = radar_chart  # Use radar chart instead of bar chart
-    # Return exactly 16 values to match the expected outputs
     return (update_df_with_height(display_df), chart, radar_chart, radar_chart,
             current_overall["Super Mario Bros"], current_details["Super Mario Bros"],
             current_overall["Sokoban"], current_details["Sokoban"],
             current_overall["2048"], current_details["2048"],
             current_overall["Candy Crash"], current_details["Candy Crash"],
             current_overall["Tetris (complete)"], current_details["Tetris (complete)"],
-            current_overall["Tetris (planning only)"], current_details["Tetris (planning only)"])
 def update_leaderboard_with_time(time_point, mario_overall, mario_details,
                                sokoban_overall, sokoban_details,
                                _2048_overall, _2048_details,
                                candy_overall, candy_details,
                                tetris_overall, tetris_details,
-                               tetris_plan_overall, tetris_plan_details):
     # Load rank data for the selected time point
     global rank_data
     new_rank_data = load_rank_data(time_point)
@@ -330,7 +340,8 @@ def update_leaderboard_with_time(time_point, mario_overall, mario_details,
                             _2048_overall, _2048_details,
                             candy_overall, candy_details,
                             tetris_overall, tetris_details,
-                            tetris_plan_overall, tetris_plan_details)
 def get_initial_state():
     """Get the initial state for the leaderboard"""
@@ -342,7 +353,8 @@ def get_initial_state():
             "2048": True,
             "Candy Crash": True,
             "Tetris (complete)": True,
-            "Tetris (planning only)": True
         },
         "previous_details": {
             "Super Mario Bros": False,
@@ -350,7 +362,8 @@ def get_initial_state():
             "2048": False,
             "Candy Crash": False,
             "Tetris (complete)": False,
-            "Tetris (planning only)": False
         }
     }
@@ -364,7 +377,8 @@ def clear_filters():
         "2048": True,
         "Candy Crash": True,
         "Tetris (complete)": True,
-        "Tetris (planning only)": True
     }
     # Get the combined leaderboard and group bar chart
@@ -386,7 +400,8 @@ def clear_filters():
             True, False,  # 2048
             True, False,  # candy
             True, False,  # tetris
-            True, False)  # tetris plan
 def create_timeline_slider():
     """Create a custom timeline slider component"""
@@ -874,6 +889,10 @@ def build_app():
                         gr.Markdown("**📋 Tetris (planning)**")
                         tetris_plan_overall = gr.Checkbox(label="Tetris (planning) Score", value=True)
                         tetris_plan_details = gr.Checkbox(label="Tetris (planning) Details", value=False)
                 # Controls
                 with gr.Row():
@@ -899,7 +918,8 @@ def build_app():
                     "2048": True,
                     "Candy Crash": True,
                     "Tetris (complete)": True,
-                    "Tetris (planning only)": True
                 })
                 # Format the DataFrame for display
@@ -940,7 +960,8 @@ def build_app():
                     _2048_overall, _2048_details,
                     candy_overall, candy_details,
                     tetris_overall, tetris_details,
-                    tetris_plan_overall, tetris_plan_details
                 ]
                 # Update visualizations when checkboxes change
@@ -948,7 +969,8 @@ def build_app():
                     # Check if any details checkbox is selected
                     is_details_view = any([
                         checkbox_states[1], checkbox_states[3], checkbox_states[5],
-                        checkbox_states[7], checkbox_states[9], checkbox_states[11]
                     ])
                     # Update visibility of visualization blocks

     get_candy_leaderboard,
     get_tetris_leaderboard,
     get_tetris_planning_leaderboard,
+    get_ace_attorney_leaderboard,
     get_combined_leaderboard,
     GAME_ORDER
 )
         "2048": True,
         "Candy Crash": True,
         "Tetris (complete)": True,
+        "Tetris (planning only)": True,
+        "Ace Attorney": True
     },
     "previous_details": {
         "Super Mario Bros": False,
         "2048": False,
         "Candy Crash": False,
         "Tetris (complete)": False,
+        "Tetris (planning only)": False,
+        "Ace Attorney": False
     }
 }
                        _2048_overall, _2048_details,
                        candy_overall, candy_details,
                        tetris_overall, tetris_details,
+                       tetris_plan_overall, tetris_plan_details,
+                       ace_attorney_overall, ace_attorney_details):
     global leaderboard_state
     # Convert current checkbox states to dictionary for easier comparison
         "2048": _2048_overall,
         "Candy Crash": candy_overall,
         "Tetris (complete)": tetris_overall,
+        "Tetris (planning only)": tetris_plan_overall,
+        "Ace Attorney": ace_attorney_overall
     }
     current_details = {
         "2048": _2048_details,
         "Candy Crash": candy_details,
         "Tetris (complete)": tetris_details,
+        "Tetris (planning only)": tetris_plan_details,
+        "Ace Attorney": ace_attorney_details
     }
     # Find which game's state changed
             leaderboard_state["previous_details"][changed_game] = False
             if leaderboard_state["current_game"] == changed_game:
                 leaderboard_state["current_game"] = None
+                # When exiting details view, only reset the current game's state
+                current_overall[changed_game] = True
+                current_details[changed_game] = False
+                leaderboard_state["previous_overall"][changed_game] = True
+                leaderboard_state["previous_details"][changed_game] = False
     # Special case: If all games are selected and we're trying to view details
     all_games_selected = all(current_overall.values()) and not any(current_details.values())
         "2048": current_overall["2048"],
         "Candy Crash": current_overall["Candy Crash"],
         "Tetris (complete)": current_overall["Tetris (complete)"],
+        "Tetris (planning only)": current_overall["Tetris (planning only)"],
+        "Ace Attorney": current_overall["Ace Attorney"]
     }
     # Get the appropriate DataFrame and charts based on current state
             df = get_candy_leaderboard(rank_data)
         elif leaderboard_state["current_game"] == "Tetris (complete)":
             df = get_tetris_leaderboard(rank_data)
+        elif leaderboard_state["current_game"] == "Tetris (planning only)":
             df = get_tetris_planning_leaderboard(rank_data)
+        elif leaderboard_state["current_game"] == "Ace Attorney":
+            df = get_ace_attorney_leaderboard(rank_data)
         # Format the DataFrame for display
         display_df = prepare_dataframe_for_display(df, leaderboard_state["current_game"])
         chart = radar_chart
         group_bar_chart = radar_chart  # Use radar chart instead of bar chart
+    # Return exactly 18 values to match the expected outputs
     return (update_df_with_height(display_df), chart, radar_chart, radar_chart,
             current_overall["Super Mario Bros"], current_details["Super Mario Bros"],
             current_overall["Sokoban"], current_details["Sokoban"],
             current_overall["2048"], current_details["2048"],
             current_overall["Candy Crash"], current_details["Candy Crash"],
             current_overall["Tetris (complete)"], current_details["Tetris (complete)"],
+            current_overall["Tetris (planning only)"], current_details["Tetris (planning only)"],
+            current_overall["Ace Attorney"], current_details["Ace Attorney"])
 def update_leaderboard_with_time(time_point, mario_overall, mario_details,
                                sokoban_overall, sokoban_details,
                                _2048_overall, _2048_details,
                                candy_overall, candy_details,
                                tetris_overall, tetris_details,
+                               tetris_plan_overall, tetris_plan_details,
+                               ace_attorney_overall, ace_attorney_details):
     # Load rank data for the selected time point
     global rank_data
     new_rank_data = load_rank_data(time_point)
                             _2048_overall, _2048_details,
                             candy_overall, candy_details,
                             tetris_overall, tetris_details,
+                            tetris_plan_overall, tetris_plan_details,
+                            ace_attorney_overall, ace_attorney_details)
 def get_initial_state():
     """Get the initial state for the leaderboard"""
             "2048": True,
             "Candy Crash": True,
             "Tetris (complete)": True,
+            "Tetris (planning only)": True,
+            "Ace Attorney": True
         },
         "previous_details": {
             "Super Mario Bros": False,
             "2048": False,
             "Candy Crash": False,
             "Tetris (complete)": False,
+            "Tetris (planning only)": False,
+            "Ace Attorney": False
         }
     }
         "2048": True,
         "Candy Crash": True,
         "Tetris (complete)": True,
+        "Tetris (planning only)": True,
+        "Ace Attorney": True
     }
     # Get the combined leaderboard and group bar chart
             True, False,  # 2048
             True, False,  # candy
             True, False,  # tetris
+            True, False,  # tetris plan
+            True, False)  # ace attorney
 def create_timeline_slider():
     """Create a custom timeline slider component"""
                         gr.Markdown("**📋 Tetris (planning)**")
                         tetris_plan_overall = gr.Checkbox(label="Tetris (planning) Score", value=True)
                         tetris_plan_details = gr.Checkbox(label="Tetris (planning) Details", value=False)
+                    with gr.Column():
+                        gr.Markdown("**⚖️ Ace Attorney**")
+                        ace_attorney_overall = gr.Checkbox(label="Ace Attorney Score", value=True)
+                        ace_attorney_details = gr.Checkbox(label="Ace Attorney Details", value=False)
                 # Controls
                 with gr.Row():
                     "2048": True,
                     "Candy Crash": True,
                     "Tetris (complete)": True,
+                    "Tetris (planning only)": True,
+                    "Ace Attorney": True
                 })
                 # Format the DataFrame for display
                     _2048_overall, _2048_details,
                     candy_overall, candy_details,
                     tetris_overall, tetris_details,
+                    tetris_plan_overall, tetris_plan_details,
+                    ace_attorney_overall, ace_attorney_details
                 ]
                 # Update visualizations when checkboxes change
                     # Check if any details checkbox is selected
                     is_details_view = any([
                         checkbox_states[1], checkbox_states[3], checkbox_states[5],
+                        checkbox_states[7], checkbox_states[9], checkbox_states[11],
+                        checkbox_states[13]  # Ace Attorney details checkbox
                     ])
                     # Update visibility of visualization blocks

assets/game_video_link.json CHANGED Viewed

@@ -1,6 +1,7 @@
-{
     "sokoban": "https://www.youtube.com/watch?v=59enV32MBUE",
     "super_mario": "https://www.youtube.com/watch?v=nixMIJZYAgg",
     "2048": "https://www.youtube.com/watch?v=3aYDCSa3AWI",
-    "candy": "https://www.youtube.com/watch?v=b-Uyz3W4yIg"
 }

+{
     "sokoban": "https://www.youtube.com/watch?v=59enV32MBUE",
     "super_mario": "https://www.youtube.com/watch?v=nixMIJZYAgg",
     "2048": "https://www.youtube.com/watch?v=3aYDCSa3AWI",
+    "candy": "https://www.youtube.com/watch?v=b-Uyz3W4yIg",
+    "ace_attorney": "https://www.youtube.com/watch?v=q8PMW870yp8"
 }

assets/model_color.json CHANGED Viewed

@@ -1,17 +1,18 @@
 {
-    "claude-3-7-sonnet-20250219": "#4A90E2",
     "claude-3-7-sonnet-20250219(thinking)": "#2E5C8A",
     "claude-3-5-haiku-20241022": "#7FB5E6",
-    "claude-3-5-sonnet-20241022": "#1A4C7C",
     "gemini-2.0-flash": "#FF4081",
     "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
-    "gemini-2.5-pro-exp-03-25": "#FF80AB",
-    "gpt-4o-2024-11-20": "#00BFA5",
-    "gpt-4.5-preview-2025-02-27": "#00796B",
     "o1-2024-12-17": "#4DB6AC",
-    "o1-mini-2024-09-12": "#26A69A",
     "o3-mini-2025-01-31(medium)": "#80CBC4",
     "deepseek-v3": "#FFC107",
-    "deepseek-r1": "#FFA000",
     "Llama-4-Maverick-17B-128E-Instruct-FP8": "#8E24AA"
 }

 {
+    "claude-3-7-sonnet-20250219": "#4A90E2",
     "claude-3-7-sonnet-20250219(thinking)": "#2E5C8A",
     "claude-3-5-haiku-20241022": "#7FB5E6",
+    "claude-3-5-sonnet-20241022": "#1A4C7C",
     "gemini-2.0-flash": "#FF4081",
     "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
+    "gemini-2.5-pro-exp-03-25": "#FF80AB",
+    "gpt-4o-2024-11-20": "#00BFA5",
+    "gpt-4.5-preview-2025-02-27": "#00796B",
+    "gpt-4.1-2025-04-14": "#00897B",
     "o1-2024-12-17": "#4DB6AC",
+    "o1-mini-2024-09-12": "#26A69A",
     "o3-mini-2025-01-31(medium)": "#80CBC4",
     "deepseek-v3": "#FFC107",
+    "deepseek-r1": "#FFA000",
     "Llama-4-Maverick-17B-128E-Instruct-FP8": "#8E24AA"
 }

assets/news.json CHANGED Viewed

@@ -1,5 +1,11 @@
 {
     "news": [
         {
             "date": "2025-04-08",
             "video_link": "https://www.youtube.com/watch?v=yoEo2Bk7PGA",

 {
     "news": [
+        {
+            "date": "2025-04-15",
+            "video_link": "https://www.youtube.com/watch?v=q8PMW870yp8",
+            "twitter_text": "Ace Attorney AI Revolution: O1 & Gemini 2.5 Pro lead in courtroom reasoning, while GPT-4.1 matches older models. Cost analysis reveals Gemini 2.5 Pro's 6-15x efficiency over O1.",
+            "twitter_link": "https://x.com/haoailab"
+        },
         {
             "date": "2025-04-08",
             "video_link": "https://www.youtube.com/watch?v=yoEo2Bk7PGA",

data_visualization.py CHANGED Viewed

@@ -24,7 +24,8 @@ GAME_SCORE_COLUMNS = {
     "2048": "Score",
     "Candy Crash": "Average Score",
     "Tetris (complete)": "Score",
-    "Tetris (planning only)": "Score"
 }
 def get_model_prefix(name):
     return name.split('-')[0]
@@ -81,6 +82,9 @@ def create_horizontal_bar_chart(df, game_name):
     elif game_name in ["Tetris (complete)", "Tetris (planning only)"]:
         score_col = "Score"
         df_sorted = df.sort_values(by=score_col, ascending=True)
     else:
         return None
@@ -315,7 +319,7 @@ def hex_to_rgba(hex_color, alpha=0.2):
 def create_single_radar_chart(df, selected_games=None, highlight_models=None):
     if selected_games is None:
-        selected_games = ['Super Mario Bros', '2048', 'Candy Crash', 'Sokoban']
     # Format game names
     formatted_games = []

     "2048": "Score",
     "Candy Crash": "Average Score",
     "Tetris (complete)": "Score",
+    "Tetris (planning only)": "Score",
+    "Ace Attorney": "Score"
 }
 def get_model_prefix(name):
     return name.split('-')[0]
     elif game_name in ["Tetris (complete)", "Tetris (planning only)"]:
         score_col = "Score"
         df_sorted = df.sort_values(by=score_col, ascending=True)
+    elif game_name == "Ace Attorney":
+        score_col = "Score"
+        df_sorted = df.sort_values(by=score_col, ascending=True)
     else:
         return None
 def create_single_radar_chart(df, selected_games=None, highlight_models=None):
     if selected_games is None:
+        selected_games = ['Super Mario Bros', '2048', 'Candy Crash', 'Sokoban', 'Ace Attorney']
     # Format game names
     formatted_games = []

leaderboard_utils.py CHANGED Viewed

@@ -9,7 +9,8 @@ GAME_ORDER = [
     "2048",
     "Candy Crash",
     "Tetris (complete)",
-    "Tetris (planning only)"
 ]
 def get_organization(model_name):
@@ -102,6 +103,21 @@ def get_tetris_planning_leaderboard(rank_data):
     df = df[["Player", "Organization", "Score", "Steps"]]
     return df
 def calculate_rank_and_completeness(rank_data, selected_games):
     # Dictionary to store DataFrames for each game
     game_dfs = {}
@@ -119,6 +135,8 @@ def calculate_rank_and_completeness(rank_data, selected_games):
         game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
     if selected_games.get("Tetris (planning only)"):
         game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
     # Get all unique players
     all_players = set()
@@ -165,10 +183,10 @@ def calculate_rank_and_completeness(rank_data, selected_games):
                     elif game == "Candy Crash":
                         player_score = df[df["Player"] == player]["Average Score"].iloc[0]
                         rank = len(df[df["Average Score"] > player_score]) + 1
-                    elif game == "Tetris (complete)":
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
-                    elif game == "Tetris (planning only)":
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
@@ -227,6 +245,8 @@ def get_combined_leaderboard(rank_data, selected_games):
         game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
     if selected_games.get("Tetris (planning only)"):
         game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
     # Get all unique players
     all_players = set()
@@ -263,6 +283,8 @@ def get_combined_leaderboard(rank_data, selected_games):
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Average Score"].iloc[0]
                     elif game in ["Tetris (complete)", "Tetris (planning only)"]:
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                 else:
                     player_data[f"{game} Score"] = 'n/a'

     "2048",
     "Candy Crash",
     "Tetris (complete)",
+    "Tetris (planning only)",
+    "Ace Attorney"
 ]
 def get_organization(model_name):
     df = df[["Player", "Organization", "Score", "Steps"]]
     return df
+def get_ace_attorney_leaderboard(rank_data):
+    data = rank_data.get("Ace Attorney", {}).get("results", [])
+    df = pd.DataFrame(data)
+    df = df.rename(columns={
+        "model": "Player",
+        "levels_cracked": "Levels Cracked",
+        "lives_left": "Lives Left",
+        "cracked_details": "Progress",
+        "score": "Score",
+        "note": "Notes"
+    })
+    df["Organization"] = df["Player"].apply(get_organization)
+    df = df[["Player", "Organization", "Levels Cracked", "Lives Left", "Progress", "Score", "Notes"]]
+    return df
 def calculate_rank_and_completeness(rank_data, selected_games):
     # Dictionary to store DataFrames for each game
     game_dfs = {}
         game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
     if selected_games.get("Tetris (planning only)"):
         game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
+    if selected_games.get("Ace Attorney"):
+        game_dfs["Ace Attorney"] = get_ace_attorney_leaderboard(rank_data)
     # Get all unique players
     all_players = set()
                     elif game == "Candy Crash":
                         player_score = df[df["Player"] == player]["Average Score"].iloc[0]
                         rank = len(df[df["Average Score"] > player_score]) + 1
+                    elif game in ["Tetris (complete)", "Tetris (planning only)"]:
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
+                    elif game == "Ace Attorney":
                         player_score = df[df["Player"] == player]["Score"].iloc[0]
                         rank = len(df[df["Score"] > player_score]) + 1
         game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
     if selected_games.get("Tetris (planning only)"):
         game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
+    if selected_games.get("Ace Attorney"):
+        game_dfs["Ace Attorney"] = get_ace_attorney_leaderboard(rank_data)
     # Get all unique players
     all_players = set()
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Average Score"].iloc[0]
                     elif game in ["Tetris (complete)", "Tetris (planning only)"]:
                         player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
+                    elif game == "Ace Attorney":
+                        player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
                 else:
                     player_data[f"{game} Score"] = 'n/a'

rank_data_03_25_2025.json CHANGED Viewed

@@ -236,7 +236,7 @@
                 "score_runs": "0;0;0",
                 "average_score": 0,
                 "steps": 25,
-                "rank":9
             },
             {
                 "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
@@ -320,5 +320,82 @@
                 "rank": 11
             }
         ]
     }
 }

                 "score_runs": "0;0;0",
                 "average_score": 0,
                 "steps": 25,
+                "rank": 9
             },
             {
                 "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
                 "rank": 11
             }
         ]
+    },
+    "Ace Attorney": {
+        "runs": 2,
+        "results": [
+            {
+                "model": "o1-2024-12-17",
+                "levels_cracked": "3; 3",
+                "lives_left": "[5, 3, 3, 0],[4, 5, 3, 0]",
+                "cracked_details": "4: 7/8",
+                "rank": 1,
+                "score": 26,
+                "note": "stuck at the end not present evidence"
+            },
+            {
+                "model": "gemini-2.5-pro-exp-03-25",
+                "levels_cracked": "2; 3",
+                "lives_left": "[5,5,0]; [5, 5, 4, 0]",
+                "cracked_details": "4: 0/8",
+                "rank": 2,
+                "score": 20,
+                "note": "failed to present evidence"
+            },
+            {
+                "model": "claude-3-7-sonnet-20250219(thinking)",
+                "levels_cracked": "1; 1",
+                "lives_left": "[3,0]; [5,0]",
+                "cracked_details": "2: 3/9",
+                "rank": 3,
+                "score": 8,
+                "note": "failed to present evidence"
+            },
+            {
+                "model": "claude-3-5-sonnet-20241022",
+                "levels_cracked": "1",
+                "lives_left": "5, 5",
+                "cracked_details": "1:1/8",
+                "rank": 4,
+                "score": 6,
+                "note": "stuck in loop"
+            },
+            {
+                "model": "gpt-4.1-2025-04-14",
+                "levels_cracked": "1",
+                "lives_left": "[4,5]",
+                "cracked_details": "1: 1/8",
+                "rank": 5,
+                "score": 6,
+                "note": "stuck in loop"
+            },
+            {
+                "model": "gemini-2.0-flash-thinking-exp-1219",
+                "levels_cracked": "0",
+                "lives_left": "0",
+                "cracked_details": "1: 4/5",
+                "rank": 6,
+                "score": 4,
+                "note": "stuck in the last option section"
+            },
+            {
+                "model": "deepseek-r1",
+                "levels_cracked": "0",
+                "lives_left": "0",
+                "cracked_details": "1: 4/5",
+                "rank": 7,
+                "score": 4,
+                "note": "stuck in the 3rd evidence present"
+            },
+            {
+                "model": "Llama-4-Maverick-17B-128E-Instruct-FP8",
+                "levels_cracked": "0",
+                "lives_left": "0",
+                "cracked_details": "0:0/5",
+                "rank": 8,
+                "score": 0,
+                "note": "failed to present evidence"
+            }
+        ]
     }
 }

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-gradio
 pandas>=2.0.0
 matplotlib>=3.7.0
 seaborn>=0.12.0

+gradio==5.23.3
 pandas>=2.0.0
 matplotlib>=3.7.0
 seaborn>=0.12.0