Spaces:
Running
Running
Zachary Siegel
committed on
Commit
·
64319c0
1
Parent(s):
b335ab8
added first agent to leaderboard
Browse files
app.py
CHANGED
@@ -383,20 +383,20 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
383 |
with gr.Row():
|
384 |
with gr.Column(scale=2):
|
385 |
Leaderboard(
|
386 |
-
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), '
|
387 |
select_columns=SelectColumns(
|
388 |
-
default_selection=config.
|
389 |
cant_deselect=["Agent Name"],
|
390 |
label="Select Columns to Display:",
|
391 |
),
|
392 |
-
hide_columns=config.
|
393 |
-
search_columns=config.
|
394 |
)
|
395 |
# gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
396 |
with gr.Row():
|
397 |
gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
|
398 |
with gr.Row():
|
399 |
-
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), '
|
400 |
|
401 |
# Will trigger autoscaling of plots when tabs are switched
|
402 |
tabs.select(fn=None, inputs=None, outputs=None, js="""
|
|
|
383 |
with gr.Row():
|
384 |
with gr.Column(scale=2):
|
385 |
Leaderboard(
|
386 |
+
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard'), ci_metrics=["Accuracy", "Total Cost"]),
|
387 |
select_columns=SelectColumns(
|
388 |
+
default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
|
389 |
cant_deselect=["Agent Name"],
|
390 |
label="Select Columns to Display:",
|
391 |
),
|
392 |
+
hide_columns=config.COREBENCH_HIDE_COLUMNS,
|
393 |
+
search_columns=config.COREBENCH_SEARCH_COLUMNS,
|
394 |
)
|
395 |
# gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
|
396 |
with gr.Row():
|
397 |
gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
|
398 |
with gr.Row():
|
399 |
+
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
400 |
|
401 |
# Will trigger autoscaling of plots when tabs are switched
|
402 |
tabs.select(fn=None, inputs=None, outputs=None, js="""
|
config.py
CHANGED
@@ -6,15 +6,6 @@ TYPES = [
|
|
6 |
"number"
|
7 |
]
|
8 |
|
9 |
-
SWEBENCH_ON_LOAD_COLUMNS = [
|
10 |
-
"Agent Name",
|
11 |
-
"Accuracy",
|
12 |
-
"Total Cost",
|
13 |
-
"Runs",
|
14 |
-
]
|
15 |
-
SWEBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
|
16 |
-
SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
|
17 |
-
|
18 |
USACO_ON_LOAD_COLUMNS = [
|
19 |
"Agent Name",
|
20 |
"Accuracy",
|
@@ -24,14 +15,14 @@ USACO_ON_LOAD_COLUMNS = [
|
|
24 |
USACO_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
|
25 |
USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
|
26 |
|
27 |
-
|
28 |
"Agent Name",
|
29 |
-
"
|
30 |
"Total Cost",
|
|
|
31 |
]
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
|
36 |
NUMERIC_INTERVALS = {
|
37 |
"?": pd.Interval(-1, 0, closed="right"),
|
|
|
6 |
"number"
|
7 |
]
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
USACO_ON_LOAD_COLUMNS = [
|
10 |
"Agent Name",
|
11 |
"Accuracy",
|
|
|
15 |
USACO_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
|
16 |
USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
|
17 |
|
18 |
+
COREBENCH_ON_LOAD_COLUMNS = [
|
19 |
"Agent Name",
|
20 |
+
"Accuracy",
|
21 |
"Total Cost",
|
22 |
+
"Runs",
|
23 |
]
|
24 |
+
COREBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
|
25 |
+
COREBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
|
|
|
26 |
|
27 |
NUMERIC_INTERVALS = {
|
28 |
"?": pd.Interval(-1, 0, closed="right"),
|