Zachary Siegel committed on
Commit
64319c0
·
1 Parent(s): b335ab8

added first agent to leaderboard

Browse files
Files changed (2) hide show
  1. app.py +5 -5
  2. config.py +5 -14
app.py CHANGED
@@ -383,20 +383,20 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
383
  with gr.Row():
384
  with gr.Column(scale=2):
385
  Leaderboard(
386
- value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), ci_metrics=["Accuracy", "Total Cost"]),
387
  select_columns=SelectColumns(
388
- default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified"],
389
  cant_deselect=["Agent Name"],
390
  label="Select Columns to Display:",
391
  ),
392
- hide_columns=config.USACO_HIDE_COLUMNS,
393
- search_columns=config.USACO_SEARCH_COLUMNS,
394
  )
395
  # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
396
  with gr.Row():
397
  gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
398
  with gr.Row():
399
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
400
 
401
  # Will trigger autoscaling of plots when tabs are switched
402
  tabs.select(fn=None, inputs=None, outputs=None, js="""
 
383
  with gr.Row():
384
  with gr.Column(scale=2):
385
  Leaderboard(
386
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard'), ci_metrics=["Accuracy", "Total Cost"]),
387
  select_columns=SelectColumns(
388
+ default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
389
  cant_deselect=["Agent Name"],
390
  label="Select Columns to Display:",
391
  ),
392
+ hide_columns=config.COREBENCH_HIDE_COLUMNS,
393
+ search_columns=config.COREBENCH_SEARCH_COLUMNS,
394
  )
395
  # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
396
  with gr.Row():
397
  gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
398
  with gr.Row():
399
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
400
 
401
  # Will trigger autoscaling of plots when tabs are switched
402
  tabs.select(fn=None, inputs=None, outputs=None, js="""
config.py CHANGED
@@ -6,15 +6,6 @@ TYPES = [
6
  "number"
7
  ]
8
 
9
- SWEBENCH_ON_LOAD_COLUMNS = [
10
- "Agent Name",
11
- "Accuracy",
12
- "Total Cost",
13
- "Runs",
14
- ]
15
- SWEBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
16
- SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
17
-
18
  USACO_ON_LOAD_COLUMNS = [
19
  "Agent Name",
20
  "Accuracy",
@@ -24,14 +15,14 @@ USACO_ON_LOAD_COLUMNS = [
24
  USACO_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
25
  USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
26
 
27
- MLAGENTBENCH_ON_LOAD_COLUMNS = [
28
  "Agent Name",
29
- "Overall Score",
30
  "Total Cost",
 
31
  ]
32
- MLAGENTBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
33
- MLAGENTBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Accuracy']
34
-
35
 
36
  NUMERIC_INTERVALS = {
37
  "?": pd.Interval(-1, 0, closed="right"),
 
6
  "number"
7
  ]
8
 
 
 
 
 
 
 
 
 
 
9
  USACO_ON_LOAD_COLUMNS = [
10
  "Agent Name",
11
  "Accuracy",
 
15
  USACO_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
16
  USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
17
 
18
+ COREBENCH_ON_LOAD_COLUMNS = [
19
  "Agent Name",
20
+ "Accuracy",
21
  "Total Cost",
22
+ "Runs",
23
  ]
24
+ COREBENCH_SEARCH_COLUMNS = ['Total Cost', 'Agent Name']
25
+ COREBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name", 'Overall Score', 'Vectorization Score', 'Fathomnet Score', 'Feedback Score', 'House Price Score', 'Spaceship Titanic Score', 'AMP Parkinsons Disease Progression Prediction Score', 'CIFAR10 Score', 'IMDB Score']
 
26
 
27
  NUMERIC_INTERVALS = {
28
  "?": pd.Interval(-1, 0, closed="right"),