core_leaderboard

Running

Zachary Siegel committed on Sep 28, 2024

Commit

64319c0

1 Parent(s): b335ab8

added first agent to leaderboard

Files changed (2) hide show

app.py CHANGED Viewed

@@ -383,20 +383,20 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
             with gr.Row():
                 with gr.Column(scale=2):
                     Leaderboard(
-                        value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), ci_metrics=["Accuracy", "Total Cost"]),
                         select_columns=SelectColumns(
-                            default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified"],
                             cant_deselect=["Agent Name"],
                             label="Select Columns to Display:",
                         ),
-                        hide_columns=config.USACO_HIDE_COLUMNS,
-                        search_columns=config.USACO_SEARCH_COLUMNS,
                     )
                     # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
             with gr.Row():
                 gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
             with gr.Row():
-                scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
     # Will trigger autoscaling of plots when tabs are switched
     tabs.select(fn=None, inputs=None, outputs=None, js="""

             with gr.Row():
                 with gr.Column(scale=2):
                     Leaderboard(
+                        value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard'), ci_metrics=["Accuracy", "Total Cost"]),
                         select_columns=SelectColumns(
+                            default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
                             cant_deselect=["Agent Name"],
                             label="Select Columns to Display:",
                         ),
+                        hide_columns=config.COREBENCH_HIDE_COLUMNS,
+                        search_columns=config.COREBENCH_SEARCH_COLUMNS,
                     )
                     # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
             with gr.Row():
                 gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
             with gr.Row():
+                scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
     # Will trigger autoscaling of plots when tabs are switched
     tabs.select(fn=None, inputs=None, outputs=None, js="""

config.py CHANGED Viewed

@@ -6,15 +6,6 @@ TYPES = [
     "number"
 ]
-SWEBENCH_ON_LOAD_COLUMNS = [
-    "Agent Name",
-    "Accuracy",
-    "Total Cost",
-    "Runs",
-   ]
-SWEBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
-SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
 USACO_ON_LOAD_COLUMNS = [
     "Agent Name",
     "Accuracy",
@@ -24,14 +15,14 @@ USACO_ON_LOAD_COLUMNS = [
 USACO_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
 USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
-MLAGENTBENCH_ON_LOAD_COLUMNS = [
     "Agent Name",
-    "Overall Score",
     "Total Cost",
    ]
-MLAGENTBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
-MLAGENTBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Accuracy']
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),

     "number"
 ]
 USACO_ON_LOAD_COLUMNS = [
     "Agent Name",
     "Accuracy",
 USACO_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
 USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
+COREBENCH_ON_LOAD_COLUMNS = [
     "Agent Name",
+    "Accuracy",
     "Total Cost",
+    "Runs",
    ]
+COREBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
+COREBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),