Zachary Siegel committed on
Commit
b335ab8
·
1 Parent(s): 2faf3bd

scaffold for core bench

Browse files
Files changed (1) hide show
  1. app.py +38 -67
app.py CHANGED
@@ -328,71 +328,58 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
328
  line-height: 1.6;
329
  color: #333;
330
  }
331
- .button {
332
- margin: 15px 0;
333
- padding: 10px 20px;
 
 
 
 
 
334
  font-size: 1em;
335
  font-weight: bold;
336
- color: #fff;
337
- background-color: #3498db;
338
  border: none;
339
  border-radius: 5px;
340
- text-decoration: none;
341
- display: inline-flex;
342
- align-items: center;
343
  transition: background-color 0.3s ease;
 
 
344
  }
345
- .button:hover {
346
- background-color: #2980b9;
347
  }
348
- .button img {
349
- margin-right: 8px;
350
- height: 20px;
351
  }
352
  </style>
353
 
354
- <div class="feature-row">
355
- <div class="feature-column">
356
- <div class="feature-keyword">Paper</div>
357
- <div class="feature-content">
358
- <a href="https://arxiv.org/abs/2409.11363" class="button">
359
- <img src="https://example.com/favicon-paper.png" alt="Paper Icon"> View Paper
360
- </a>
361
- </div>
362
- </div>
363
- <div class="feature-column">
364
- <div class="feature-keyword">Github</div>
365
- <div class="feature-content">
366
- <a href="https://github.com/siegelz/core-bench" class="button">
367
- <img src="https://example.com/favicon-github.png" alt="Github Icon"> View Github
368
- </a>
369
- </div>
370
- </div>
371
- <div class="feature-column">
372
- <div class="feature-keyword">Dataset</div>
373
- <div class="feature-content">
374
- <a href="https://huggingface.co/datasets/siegelz/core-bench" class="button">
375
- <img src="https://example.com/favicon-dataset.png" alt="Dataset Icon"> View Dataset
376
- </a>
377
- </div>
378
- </div>
379
  </div>
 
380
  </br>
381
  <h2 class="section-heading" id="leaderboards">Leaderboards</h2>
382
- <p>Select a benchmark to see the agent leaderboard. Verified results have been run by the HAL team:</p>
 
 
 
 
 
 
 
 
 
 
 
383
  """)
384
 
385
  with gr.Tabs() as tabs:
386
- with gr.Tab("CORE-Bench"):
387
- gr.Markdown("""
388
- CORE-Bench evaluates the ability of agents to computationally reproduce the results of published scientific papers. Agents are given the codebase of a paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. The benchmark has tasks at three difficulty levels:
389
-
390
- <b>CORE-Bench-Easy</b>: The agent is given the output of the code and must answer questions about the output without running any code. To answer questions, agents must navigate through the terminal output as well as files and figures generated by the code.
391
-
392
- <b>CORE-Bench-Medium</b>: The agent is given a Dockerfile and instructions on how to use the Dockerfile to fully reproduce the paper. This level mainly evaluates agents ability to use and interact with the terminal. The agent must then answer questions about the output of the code, as in the above level.
393
-
394
- <b>CORE-Bench-Hard</b>: The agent is given the codebase of the paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. This level is most akin to fully reproducing a paper and is the most realistic and challenging level.
395
- """)
396
  with gr.Row():
397
  with gr.Column(scale=2):
398
  Leaderboard(
@@ -405,25 +392,11 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
405
  hide_columns=config.USACO_HIDE_COLUMNS,
406
  search_columns=config.USACO_SEARCH_COLUMNS,
407
  )
408
- gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
409
  with gr.Row():
410
- gr.Markdown("### Accuracy vs. Cost for USACO agents")
411
  with gr.Row():
412
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
413
-
414
- gr.HTML('<div style="height: 30px;"></div>')
415
- gr.Markdown("## Task success heatmap")
416
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
417
- with gr.Row():
418
- task_success_heatmap = gr.Plot()
419
- demo.load(
420
- lambda: create_task_success_heatmap(
421
- preprocessor.get_task_success_data('usaco'),
422
- 'USACO'
423
- ),
424
- outputs=[task_success_heatmap]
425
- )
426
-
427
 
428
  # Will trigger autoscaling of plots when tabs are switched
429
  tabs.select(fn=None, inputs=None, outputs=None, js="""
@@ -435,8 +408,6 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
435
  """)
436
  gr.HTML("""<h2 class="section-heading" id="agent-submission">How to add an agent?</h2>""")
437
  gr.Markdown((Path(__file__).parent / "agent_submission.md").read_text())
438
- gr.HTML("""<h2 class="section-heading" id="benchmark-submission">How to add a benchmark?</h2>""")
439
- gr.Markdown((Path(__file__).parent / "benchmark_submission.md").read_text())
440
  gr.HTML("""<h2 class="section-heading" id="reproduction-guide">How can I run evaluations?</h2>""")
441
  gr.Markdown("""Coming soon...""")
442
 
 
328
  line-height: 1.6;
329
  color: #333;
330
  }
331
+ .button-container {
332
+ display: flex;
333
+ justify-content: center;
334
+ margin-top: 2px;
335
+ }
336
+ .button-container .button {
337
+ margin: 0 10px;
338
+ padding: 15px 25px;
339
  font-size: 1em;
340
  font-weight: bold;
341
+ color: #fff !important; /* Force white text color */
342
+ background-color: #3498db !important; /* Force background color */
343
  border: none;
344
  border-radius: 5px;
345
+ text-decoration: none !important; /* Force no underline */
346
+ text-align: center;
 
347
  transition: background-color 0.3s ease;
348
+ cursor: pointer;
349
+ height: 50px;
350
  }
351
+ .button-container .button:hover {
352
+ background-color: #2980b9 !important; /* Force hover color */
353
  }
354
+ .button:visited {
355
+ color: #fff; /* Keep text color white when link is visited */
 
356
  }
357
  </style>
358
 
359
+ <div class="button-container">
360
+ <a href="https://arxiv.org/abs/2409.11363" class="button">Paper</a>
361
+ <a href="https://github.com/siegelz/core-bench" class="button">Github</a>
362
+ <a href="https://huggingface.co/datasets/siegelz/core-bench" class="button">Dataset</a>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  </div>
364
+
365
  </br>
366
  <h2 class="section-heading" id="leaderboards">Leaderboards</h2>
367
+ <p>
368
+ CORE-Bench evaluates the ability of agents to computationally reproduce the results of published scientific papers. Agents are given the codebase of a paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. The benchmark has tasks at three difficulty levels:
369
+ </p>
370
+ <p>
371
+ <i><b>CORE-Bench-Hard:</b></i> The agent is given the codebase of the paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. This level is most akin to fully reproducing a paper and is the most realistic and challenging level.
372
+ </p>
373
+ <p>
374
+ <i><b>CORE-Bench-Medium:</b></i> The agent is given a Dockerfile and instructions on how to use the Dockerfile to fully reproduce the paper. This level mainly evaluates agents ability to use and interact with the terminal. The agent must then answer questions about the output of the code, as in the above level.
375
+ </p>
376
+ <p>
377
+ <i><b>CORE-Bench-Easy:</b></i> The agent is given the output of the code and must answer questions about the output without running any code. To answer questions, agents must navigate through the terminal output as well as files and figures generated by the code.
378
+ </p>
379
  """)
380
 
381
  with gr.Tabs() as tabs:
382
+ with gr.Tab("CORE-Bench-Hard"):
 
 
 
 
 
 
 
 
 
383
  with gr.Row():
384
  with gr.Column(scale=2):
385
  Leaderboard(
 
392
  hide_columns=config.USACO_HIDE_COLUMNS,
393
  search_columns=config.USACO_SEARCH_COLUMNS,
394
  )
395
+ # gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
396
  with gr.Row():
397
+ gr.Markdown("### Accuracy vs. Cost on CORE-Bench-Hard")
398
  with gr.Row():
399
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
 
401
  # Will trigger autoscaling of plots when tabs are switched
402
  tabs.select(fn=None, inputs=None, outputs=None, js="""
 
408
  """)
409
  gr.HTML("""<h2 class="section-heading" id="agent-submission">How to add an agent?</h2>""")
410
  gr.Markdown((Path(__file__).parent / "agent_submission.md").read_text())
 
 
411
  gr.HTML("""<h2 class="section-heading" id="reproduction-guide">How can I run evaluations?</h2>""")
412
  gr.Markdown("""Coming soon...""")
413