benediktstroebl committed on
Commit
e24146f
·
1 Parent(s): bb2348f

vis update

Browse files
Files changed (5) hide show
  1. app.py +221 -31
  2. css.css +48 -155
  3. header.md +3 -0
  4. scratch.py +38 -0
  5. utils/viz.py +15 -8
app.py CHANGED
@@ -18,6 +18,7 @@ import asyncio
18
  from apscheduler.schedulers.asyncio import AsyncIOScheduler
19
  import weave
20
  from utils.db import TracePreprocessor
 
21
 
22
  preprocessor = TracePreprocessor()
23
 
@@ -86,7 +87,7 @@ def update_task_details(benchmark_name, agent_name, task_id):
86
 
87
  summary = analysis.get('task_analysis', {})
88
 
89
- overview = f"## Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n"
90
  # overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
91
  # overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
92
  # overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"
@@ -157,9 +158,9 @@ def format_call_info(step, step_index):
157
  }}
158
  </style>
159
 
160
- <h2>Step {step_index + 1}: {analysis.get('headline', '')}</h2>
161
 
162
- <h3>Call Metadata</h3>
163
  <ul>
164
  <li><strong>Weave Task ID:</strong> {call_data['weave_task_id']}</li>
165
  <li><strong>Trace ID:</strong> {call_data['trace_id']}</li>
@@ -168,16 +169,16 @@ def format_call_info(step, step_index):
168
  <li><strong>Model:</strong> {call_data['inputs']['model']}</li>
169
  </ul>
170
 
171
- <h3>Inputs</h3>
172
  {format_json(call_data['inputs'])}
173
 
174
- <h3>Outputs</h3>
175
  {format_json(call_data['outputs'])}
176
 
177
- <h3>Usage</h3>
178
  {format_json(call_data['summary'])}
179
 
180
- <h3>Analysis</h3>
181
  <ul>
182
  <li><strong>Description:</strong> {analysis['description']}</li>
183
  <li><strong>Assessment:</strong> {analysis['assessment']}</li>
@@ -194,9 +195,9 @@ def update_failure_report(agent_name, benchmark_name):
194
  return "No failure report available for this agent.", None
195
 
196
  # Create overview of failure categories
197
- categories_overview = "## Failure Categories Overview\n\n"
198
  for category in failure_report['failure_categories']:
199
- categories_overview += f"### {category['category_name']}\n"
200
  categories_overview += f"{category['description']}\n\n"
201
 
202
  # Count tasks affected by each category
@@ -214,14 +215,171 @@ def update_failure_report(agent_name, benchmark_name):
214
 
215
  return categories_overview, chart
216
 
217
-
218
- with gr.Blocks() as demo:
219
- gr.Markdown("""
220
- # 🥇 Agent Leaderboard
221
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
  with gr.Tabs():
224
  with gr.Tab("USACO"):
 
225
  with gr.Row():
226
  with gr.Column(scale=2):
227
  Leaderboard(
@@ -234,10 +392,15 @@ with gr.Blocks() as demo:
234
  hide_columns=config.USACO_HIDE_COLUMNS,
235
  search_columns=config.USACO_SEARCH_COLUMNS,
236
  )
 
 
237
  with gr.Row():
238
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
239
 
240
- gr.Markdown("# Task Success Heatmap")
 
 
 
241
  with gr.Row():
242
  task_success_heatmap = gr.Plot()
243
  demo.load(
@@ -247,8 +410,10 @@ with gr.Blocks() as demo:
247
  ),
248
  outputs=[task_success_heatmap]
249
  )
250
-
251
- gr.Markdown("# Failure Report")
 
 
252
  with gr.Row():
253
  with gr.Column(scale=1):
254
  failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
@@ -269,7 +434,9 @@ with gr.Blocks() as demo:
269
  inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
270
  outputs=[failure_categories_overview, failure_categories_chart])
271
 
272
- gr.Markdown("# Agent Monitor")
 
 
273
  with gr.Row():
274
  with gr.Column(scale=1):
275
  agent_dropdown = gr.Dropdown(label="Select Agent")
@@ -291,7 +458,9 @@ with gr.Blocks() as demo:
291
  inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown, task_dropdown],
292
  outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
293
 
294
- gr.Markdown("# Raw Predictions")
 
 
295
  with gr.Row():
296
  with gr.Column(scale=1):
297
  raw_agent_dropdown = gr.Dropdown(label="Select Agent")
@@ -366,7 +535,9 @@ with gr.Blocks() as demo:
366
  with gr.Row():
367
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
368
 
369
- gr.Markdown("# Task Success Heatmap")
 
 
370
  with gr.Row():
371
  task_success_heatmap = gr.Plot()
372
  demo.load(
@@ -377,7 +548,9 @@ with gr.Blocks() as demo:
377
  outputs=[task_success_heatmap]
378
  )
379
 
380
- gr.Markdown("# Failure Report")
 
 
381
  with gr.Row():
382
  with gr.Column(scale=1):
383
  failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
@@ -398,7 +571,9 @@ with gr.Blocks() as demo:
398
  inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
399
  outputs=[failure_categories_overview, failure_categories_chart])
400
 
401
- gr.Markdown("# Agent Monitor")
 
 
402
  with gr.Row():
403
  with gr.Column(scale=1):
404
  agent_dropdown = gr.Dropdown(label="Select Agent")
@@ -419,8 +594,9 @@ with gr.Blocks() as demo:
419
  task_dropdown.change(update_task_details,
420
  inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
421
  outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
422
-
423
- gr.Markdown("# Raw Predictions")
 
424
  with gr.Row():
425
  with gr.Column(scale=1):
426
  raw_agent_dropdown = gr.Dropdown(label="Select Agent")
@@ -494,7 +670,9 @@ with gr.Blocks() as demo:
494
  with gr.Row():
495
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
496
 
497
- gr.Markdown("# Task Success Heatmap")
 
 
498
  with gr.Row():
499
  task_success_heatmap = gr.Plot()
500
  demo.load(
@@ -505,7 +683,9 @@ with gr.Blocks() as demo:
505
  outputs=[task_success_heatmap]
506
  )
507
 
508
- gr.Markdown("# Failure Report")
 
 
509
  with gr.Row():
510
  with gr.Column(scale=1):
511
  failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
@@ -526,7 +706,9 @@ with gr.Blocks() as demo:
526
  inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
527
  outputs=[failure_categories_overview, failure_categories_chart])
528
 
529
- gr.Markdown("# Agent Monitor")
 
 
530
  with gr.Row():
531
  with gr.Column(scale=1):
532
  agent_dropdown = gr.Dropdown(label="Select Agent")
@@ -548,7 +730,9 @@ with gr.Blocks() as demo:
548
  inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
549
  outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
550
 
551
- gr.Markdown("# Raw Predictions")
 
 
552
  with gr.Row():
553
  with gr.Column(scale=1):
554
  raw_agent_dropdown = gr.Dropdown(label="Select Agent")
@@ -622,7 +806,9 @@ with gr.Blocks() as demo:
622
  with gr.Row():
623
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
624
 
625
- gr.Markdown("# Failure Report")
 
 
626
  with gr.Row():
627
  with gr.Column(scale=1):
628
  failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
@@ -643,7 +829,9 @@ with gr.Blocks() as demo:
643
  inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
644
  outputs=[failure_categories_overview, failure_categories_chart])
645
 
646
- gr.Markdown("# Agent Monitor")
 
 
647
  with gr.Row():
648
  with gr.Column(scale=1):
649
  agent_dropdown = gr.Dropdown(label="Select Agent")
@@ -665,7 +853,9 @@ with gr.Blocks() as demo:
665
  inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
666
  outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
667
 
668
- gr.Markdown("# Raw Predictions")
 
 
669
  with gr.Row():
670
  with gr.Column(scale=1):
671
  raw_agent_dropdown = gr.Dropdown(label="Select Agent")
 
18
  from apscheduler.schedulers.asyncio import AsyncIOScheduler
19
  import weave
20
  from utils.db import TracePreprocessor
21
+ from gradio.themes.soft import Soft
22
 
23
  preprocessor = TracePreprocessor()
24
 
 
87
 
88
  summary = analysis.get('task_analysis', {})
89
 
90
+ overview = f"### Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n"
91
  # overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
92
  # overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
93
  # overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"
 
158
  }}
159
  </style>
160
 
161
+ <h3>Step {step_index + 1}: {analysis.get('headline', '')}</h3>
162
 
163
+ <h4>Call Metadata</h4>
164
  <ul>
165
  <li><strong>Weave Task ID:</strong> {call_data['weave_task_id']}</li>
166
  <li><strong>Trace ID:</strong> {call_data['trace_id']}</li>
 
169
  <li><strong>Model:</strong> {call_data['inputs']['model']}</li>
170
  </ul>
171
 
172
+ <h4>Inputs</h4>
173
  {format_json(call_data['inputs'])}
174
 
175
+ <h4>Outputs</h4>
176
  {format_json(call_data['outputs'])}
177
 
178
+ <h4>Usage</h4>
179
  {format_json(call_data['summary'])}
180
 
181
+ <h4>Analysis</h4>
182
  <ul>
183
  <li><strong>Description:</strong> {analysis['description']}</li>
184
  <li><strong>Assessment:</strong> {analysis['assessment']}</li>
 
195
  return "No failure report available for this agent.", None
196
 
197
  # Create overview of failure categories
198
+ categories_overview = "### Failure Categories:\n\n"
199
  for category in failure_report['failure_categories']:
200
+ categories_overview += f"#### {category['category_name']}\n"
201
  categories_overview += f"{category['description']}\n\n"
202
 
203
  # Count tasks affected by each category
 
215
 
216
  return categories_overview, chart
217
 
218
+ from gradio.themes.utils import colors, fonts, sizes
219
+ from typing import Iterable
220
+ class MyTheme(Soft):
221
+ def __init__(
222
+ self,
223
+ *,
224
+ primary_hue: colors.Color | str = colors.blue,
225
+ text_size: sizes.Size | str = sizes.text_lg,
226
+ font: fonts.Font
227
+ | str
228
+ | Iterable[fonts.Font | str] = (
229
+ fonts.GoogleFont("Lato"),
230
+ "ui-sans-serif",
231
+ "sans-serif",
232
+ ),
233
+ font_mono: fonts.Font
234
+ | str
235
+ | Iterable[fonts.Font | str] = (
236
+ fonts.GoogleFont("IBM Plex Mono"),
237
+ "ui-monospace",
238
+ "monospace",
239
+ ),
240
+ ):
241
+ super().__init__(
242
+ primary_hue=primary_hue,
243
+ text_size=text_size,
244
+ font=font,
245
+ font_mono=font_mono,
246
+ )
247
+
248
+ my_theme = MyTheme()
249
+
250
+ with gr.Blocks(theme=my_theme, css='css.css') as demo:
251
+ gr.Markdown((Path(__file__).parent / "header.md").read_text(), elem_classes=["text-large"])
252
+ gr.HTML("""
253
+ <style>
254
+ .feature-row {
255
+ display: flex;
256
+ justify-content: space-between;
257
+ margin-top: 20px;
258
+ margin-bottom: 20px;
259
+ }
260
+ .feature-column {
261
+ flex: 1;
262
+ padding: 25px;
263
+ background-color: #ffffff;
264
+ border-radius: 10px;
265
+ margin: 0 15px;
266
+ text-align: left;
267
+ box-shadow: 0 6px 12px rgba(0, 0, 0, 0.1);
268
+ display: flex;
269
+ flex-direction: column;
270
+ align-items: flex-start;
271
+ border-top: 5px solid #3498db;
272
+ transition: transform 0.3s ease, box-shadow 0.3s ease;
273
+ }
274
+ .feature-column:hover {
275
+ transform: translateY(-5px);
276
+ box-shadow: 0 5px 10px rgba(0, 0, 0, 0.15);
277
+ }
278
+ .feature-keyword {
279
+ font-size: 1.2em;
280
+ font-weight: bold;
281
+ color: #1b9e77;
282
+ margin-bottom: 10px;
283
+ text-transform: uppercase;
284
+ letter-spacing: 1px;
285
+ }
286
+ .feature-content {
287
+ flex-grow: 1;
288
+ }
289
+ .feature-description {
290
+ font-size: 0.95em;
291
+ line-height: 1.6;
292
+ color: #333;
293
+ }
294
+ </style>
295
+
296
+ <div class="feature-row">
297
+ <div class="feature-column">
298
+ <div class="feature-keyword">Centralized</div>
299
+ <div class="feature-content">
300
+ <p class="feature-description">Evaluations across agent benchmarks are all recorded to a single leaderboard that evaluates every listed agent in the same way.</p>
301
+ </div>
302
+ </div>
303
+ <div class="feature-column">
304
+ <div class="feature-keyword">Third-party</div>
305
+ <div class="feature-content">
306
+ <p class="feature-description">Agent developers clearly have competing objectives in reporting accuracy: they want to achieve state-of-the-art performance.</p>
307
+ </div>
308
+ </div>
309
+ <div class="feature-column">
310
+ <div class="feature-keyword">Cost-controlled</div>
311
+ <div class="feature-content">
312
+ <p class="feature-description">For downstream users, understanding the cost of running agents is a significant need for adoption. For agent developers, cost-controlled evaluations help develop accurate baselines.</p>
313
+ </div>
314
+ </div>
315
+ </div>
316
+ <style>
317
+ .section-heading {
318
+ font-size: 1.8em;
319
+ font-weight: bold;
320
+ color: #2c3e50;
321
+ margin-top: 40px;
322
+ margin-bottom: 20px;
323
+ text-align: left;
324
+ }
325
+ .user-types-container {
326
+ display: grid;
327
+ grid-template-columns: repeat(2, 1fr);
328
+ gap: 20px;
329
+ margin-top: 20px;
330
+ }
331
+ .user-type {
332
+ background-color: #ffffff;
333
+ border-radius: 10px;
334
+ padding: 25px;
335
+ box-shadow: 0 6px 12px rgba(0, 0, 0, 0.1);
336
+ transition: transform 0.3s ease, box-shadow 0.3s ease;
337
+ border-left: 5px solid #3498db;
338
+ }
339
+ .user-type:hover {
340
+ transform: translateY(-5px);
341
+ box-shadow: 0 5px 10px rgba(0, 0, 0, 0.15);
342
+ }
343
+ .user-type-title {
344
+ font-size: 1.2em;
345
+ font-weight: bold;
346
+ color: #3498db;
347
+ margin-bottom: 10px;
348
+ }
349
+ .user-type-description {
350
+ font-size: 0.95em;
351
+ line-height: 1.6;
352
+ color: #333;
353
+ }
354
+ </style>
355
+ <br/>
356
+ <h2 class="section-heading">Who is it for?</h2>
357
+ <p>We see HAL being useful for four types of users:</p>
358
+
359
+ <div class="user-types-container">
360
+ <div class="user-type">
361
+ <h3 class="user-type-title">Downstream Users & Procurers</h3>
362
+ <p class="user-type-description">Customers looking to deploy agents can get visibility into existing benchmarks, know developers building useful agents, and identify the state of the art for both cost and accuracy for their tasks of interest.</p>
363
+ </div>
364
+ <div class="user-type">
365
+ <h3 class="user-type-title">Agent Benchmark Developers</h3>
366
+ <p class="user-type-description">Reporting results on a centralized leaderboard could allow improved visibility into agent benchmarks that measure real-world utility.</p>
367
+ </div>
368
+ <div class="user-type">
369
+ <h3 class="user-type-title">Agent Developers</h3>
370
+ <p class="user-type-description">HAL allows for easy reproduction of past agents, clear comparison with past baselines, and a straightforward way to compete on a leaderboard.</p>
371
+ </div>
372
+ <div class="user-type">
373
+ <h3 class="user-type-title">Safety Researchers</h3>
374
+ <p class="user-type-description">Understanding agent capabilities on real-world safety threats and their associated costs is crucial. For example, Cybench evaluations could provide insights into agent performance and affordability for potential adversaries.</p>
375
+ </div>
376
+ </div>
377
+ <br/>
378
+ """)
379
 
380
  with gr.Tabs():
381
  with gr.Tab("USACO"):
382
+ gr.Markdown("""The USA Computing Olympiad (USACO) is a computer programming competition for pre-college students. This benchmark evaluates the performance of AI agents on a set of 307 USACO tasks. The agents are evaluated based on the number of tasks correctly solved.""")
383
  with gr.Row():
384
  with gr.Column(scale=2):
385
  Leaderboard(
 
392
  hide_columns=config.USACO_HIDE_COLUMNS,
393
  search_columns=config.USACO_SEARCH_COLUMNS,
394
  )
395
+ with gr.Row():
396
+ gr.Markdown("### Accuracy vs. Cost for USACO agents")
397
  with gr.Row():
398
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
399
 
400
+ gr.Markdown("")
401
+ gr.Markdown("")
402
+ gr.Markdown("## Task success heatmap")
403
+ gr.Markdown("The task success heatmap shows which agents can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted in increasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the fewest).")
404
  with gr.Row():
405
  task_success_heatmap = gr.Plot()
406
  demo.load(
 
410
  ),
411
  outputs=[task_success_heatmap]
412
  )
413
+ gr.Markdown("")
414
+ gr.Markdown("")
415
+ gr.Markdown("## Failure report for each agent")
416
+ gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
417
  with gr.Row():
418
  with gr.Column(scale=1):
419
  failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
 
434
  inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
435
  outputs=[failure_categories_overview, failure_categories_chart])
436
 
437
+ gr.Markdown("")
438
+ gr.Markdown("")
439
+ gr.Markdown("## Agent monitor")
440
  with gr.Row():
441
  with gr.Column(scale=1):
442
  agent_dropdown = gr.Dropdown(label="Select Agent")
 
458
  inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown, task_dropdown],
459
  outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
460
 
461
+ gr.Markdown("")
462
+ gr.Markdown("")
463
+ gr.Markdown("## Raw predictions")
464
  with gr.Row():
465
  with gr.Column(scale=1):
466
  raw_agent_dropdown = gr.Dropdown(label="Select Agent")
 
535
  with gr.Row():
536
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
537
 
538
+ gr.Markdown("")
539
+ gr.Markdown("")
540
+ gr.Markdown("## Task success heatmap")
541
  with gr.Row():
542
  task_success_heatmap = gr.Plot()
543
  demo.load(
 
548
  outputs=[task_success_heatmap]
549
  )
550
 
551
+ gr.Markdown("")
552
+ gr.Markdown("")
553
+ gr.Markdown("## Failure report for each agent")
554
  with gr.Row():
555
  with gr.Column(scale=1):
556
  failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
 
571
  inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
572
  outputs=[failure_categories_overview, failure_categories_chart])
573
 
574
+ gr.Markdown("")
575
+ gr.Markdown("")
576
+ gr.Markdown("## Agent monitor")
577
  with gr.Row():
578
  with gr.Column(scale=1):
579
  agent_dropdown = gr.Dropdown(label="Select Agent")
 
594
  task_dropdown.change(update_task_details,
595
  inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
596
  outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
597
+ gr.Markdown("")
598
+ gr.Markdown("")
599
+ gr.Markdown("## Raw predictions")
600
  with gr.Row():
601
  with gr.Column(scale=1):
602
  raw_agent_dropdown = gr.Dropdown(label="Select Agent")
 
670
  with gr.Row():
671
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
672
 
673
+ gr.Markdown("")
674
+ gr.Markdown("")
675
+ gr.Markdown("## Task success heatmap")
676
  with gr.Row():
677
  task_success_heatmap = gr.Plot()
678
  demo.load(
 
683
  outputs=[task_success_heatmap]
684
  )
685
 
686
+ gr.Markdown("")
687
+ gr.Markdown("")
688
+ gr.Markdown("## Failure report for each agent")
689
  with gr.Row():
690
  with gr.Column(scale=1):
691
  failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
 
706
  inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
707
  outputs=[failure_categories_overview, failure_categories_chart])
708
 
709
+ gr.Markdown("")
710
+ gr.Markdown("")
711
+ gr.Markdown("## Agent monitor")
712
  with gr.Row():
713
  with gr.Column(scale=1):
714
  agent_dropdown = gr.Dropdown(label="Select Agent")
 
730
  inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
731
  outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
732
 
733
+ gr.Markdown("")
734
+ gr.Markdown("")
735
+ gr.Markdown("## Raw predictions")
736
  with gr.Row():
737
  with gr.Column(scale=1):
738
  raw_agent_dropdown = gr.Dropdown(label="Select Agent")
 
806
  with gr.Row():
807
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
808
 
809
+ gr.Markdown("")
810
+ gr.Markdown("")
811
+ gr.Markdown("## Failure report for each agent")
812
  with gr.Row():
813
  with gr.Column(scale=1):
814
  failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
 
829
  inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
830
  outputs=[failure_categories_overview, failure_categories_chart])
831
 
832
+ gr.Markdown("")
833
+ gr.Markdown("")
834
+ gr.Markdown("## Agent monitor")
835
  with gr.Row():
836
  with gr.Column(scale=1):
837
  agent_dropdown = gr.Dropdown(label="Select Agent")
 
853
  inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
854
  outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
855
 
856
+ gr.Markdown("")
857
+ gr.Markdown("")
858
+ gr.Markdown("## Raw predictions")
859
  with gr.Row():
860
  with gr.Column(scale=1):
861
  raw_agent_dropdown = gr.Dropdown(label="Select Agent")
css.css CHANGED
@@ -1,157 +1,50 @@
1
- html {
2
- font-family: Inter;
3
- font-size: 16px;
4
- font-weight: 400;
5
- line-height: 1.5;
6
- -webkit-text-size-adjust: 100%;
7
- background: #fff;
8
- color: #323232;
9
- -webkit-font-smoothing: antialiased;
10
- -moz-osx-font-smoothing: grayscale;
11
- text-rendering: optimizeLegibility;
12
- }
13
-
14
  :root {
15
- --space: 1;
16
- --vspace: calc(var(--space) * 1rem);
17
- --vspace-0: calc(3 * var(--space) * 1rem);
18
- --vspace-1: calc(2 * var(--space) * 1rem);
19
- --vspace-2: calc(1.5 * var(--space) * 1rem);
20
- --vspace-3: calc(0.5 * var(--space) * 1rem);
21
- }
22
-
23
- .app {
24
- max-width: 748px !important;
25
- }
26
-
27
- .prose p {
28
- margin: var(--vspace) 0;
29
- line-height: var(--vspace * 2);
30
- font-size: 1rem;
31
- }
32
-
33
- code {
34
- font-family: "Inconsolata", sans-serif;
35
- font-size: 16px;
36
- }
37
-
38
- h1,
39
- h1 code {
40
- font-weight: 400;
41
- line-height: calc(2.5 / var(--space) * var(--vspace));
42
- }
43
-
44
- h1 code {
45
- background: none;
46
  border: none;
47
- letter-spacing: 0.05em;
48
- padding-bottom: 5px;
49
- position: relative;
50
- padding: 0;
51
- }
52
-
53
- h2 {
54
- margin: var(--vspace-1) 0 var(--vspace-2) 0;
55
- line-height: 1em;
56
- }
57
-
58
- h3,
59
- h3 code {
60
- margin: var(--vspace-1) 0 var(--vspace-2) 0;
61
- line-height: 1em;
62
- }
63
-
64
- h4,
65
- h5,
66
- h6 {
67
- margin: var(--vspace-3) 0 var(--vspace-3) 0;
68
- line-height: var(--vspace);
69
- }
70
-
71
- .bigtitle,
72
- h1,
73
- h1 code {
74
- font-size: calc(8px * 4.5);
75
- word-break: break-word;
76
- }
77
-
78
- .title,
79
- h2,
80
- h2 code {
81
- font-size: calc(8px * 3.375);
82
- font-weight: lighter;
83
- word-break: break-word;
84
- border: none;
85
- background: none;
86
- }
87
-
88
- .subheading1,
89
- h3,
90
- h3 code {
91
- font-size: calc(8px * 1.8);
92
- font-weight: 600;
93
- border: none;
94
- background: none;
95
- letter-spacing: 0.1em;
96
- text-transform: uppercase;
97
- }
98
-
99
- h2 code {
100
- padding: 0;
101
- position: relative;
102
- letter-spacing: 0.05em;
103
- }
104
-
105
- blockquote {
106
- font-size: calc(8px * 1.1667);
107
- font-style: italic;
108
- line-height: calc(1.1667 * var(--vspace));
109
- margin: var(--vspace-2) var(--vspace-2);
110
- }
111
-
112
- .subheading2,
113
- h4 {
114
- font-size: calc(8px * 1.4292);
115
- text-transform: uppercase;
116
- font-weight: 600;
117
- }
118
-
119
- .subheading3,
120
- h5 {
121
- font-size: calc(8px * 1.2917);
122
- line-height: calc(1.2917 * var(--vspace));
123
-
124
- font-weight: lighter;
125
- text-transform: uppercase;
126
- letter-spacing: 0.15em;
127
- }
128
-
129
- h6 {
130
- font-size: calc(8px * 1.1667);
131
- font-size: 1.1667em;
132
- font-weight: normal;
133
- font-style: italic;
134
- font-family: "le-monde-livre-classic-byol", serif !important;
135
- letter-spacing: 0px !important;
136
- }
137
-
138
- #start .md > *:first-child {
139
- margin-top: 0;
140
- }
141
-
142
- h2 + h3 {
143
- margin-top: 0;
144
- }
145
-
146
- .md hr {
147
- border: none;
148
- border-top: 1px solid var(--block-border-color);
149
- margin: var(--vspace-2) 0 var(--vspace-2) 0;
150
- }
151
- .prose ul {
152
- margin: var(--vspace-2) 0 var(--vspace-1) 0;
153
- }
154
-
155
- .gap {
156
- gap: 0;
157
- }
 
1
+ /* Base styles and variables */
 
 
 
 
 
 
 
 
 
 
 
 
2
  :root {
3
+ --primary-color: #3498db;
4
+ --secondary-color: #2c3e50;
5
+ --background-color: #f8f9fa;
6
+ --text-color: #333;
7
+ --accent-color: #e74c3c;
8
+ --space: 1rem;
9
+ }
10
+
11
+ /* Tabs */
12
+ .tab-nav {
13
+ display: flex;
14
+ background-color: var(--secondary-color);
15
+ border-radius: 8px 8px 0 0;
16
+ overflow: hidden;
17
+ }
18
+
19
+ .tab-nav button {
20
+ padding: 1rem 1.5rem;
21
+ background-color: transparent;
22
+ color: #fff;
 
 
 
 
 
 
 
 
 
 
 
23
  border: none;
24
+ cursor: pointer;
25
+ transition: background-color 0.3s;
26
+ }
27
+
28
+ .tab-nav button:hover,
29
+ .tab-nav button.selected {
30
+ background-color: var(--primary-color);
31
+ }
32
+
33
+
34
+ .svelte-iibkxk .stretch {
35
+ display: none;
36
+ }
37
+
38
+ /* Utility classes */
39
+ .text-center { text-align: center; }
40
+ .text-right { text-align: right; }
41
+ .font-bold { font-weight: 700; }
42
+ .text-small { font-size: 0.875rem; }
43
+ .text-large { font-size: 1.25rem; }
44
+ .mt-1 { margin-top: 1rem; }
45
+ .mb-1 { margin-bottom: 1rem; }
46
+ .ml-1 { margin-left: 1rem; }
47
+ .mr-1 { margin-right: 1rem; }
48
+
49
+
50
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
header.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Holistic Agent Leaderboard (HAL)
2
+
3
+ **A centralized, standardized, cost-aware leaderboard for evaluating agents.**
scratch.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ def process_json_files(directory, suffix="_updated"):
6
+ # Iterate through all JSON files in the directory
7
+ for filename in os.listdir(directory):
8
+ if filename.endswith(".json") and "USACO" in filename:
9
+ file_path = os.path.join(directory, filename)
10
+
11
+ # Read the JSON file
12
+ with open(file_path, 'r') as f:
13
+ data = json.load(f)
14
+
15
+ # Extract sdict from raw_eval_results
16
+ sdict = data['raw_eval_results']['sdict']
17
+
18
+ # Calculate successful_tasks and failed_tasks
19
+ successful_tasks = [key for key in sdict if float(sdict[key][0]['result']['fraction_passed']) == 1]
20
+ failed_tasks = [key for key in sdict if float(sdict[key][0]['result']['fraction_passed']) < 1]
21
+
22
+ # Add new key-value pairs to the results
23
+ data['results']['successful_tasks'] = successful_tasks
24
+ data['results']['failed_tasks'] = failed_tasks
25
+
26
+ # Create new filename with suffix
27
+ new_filename = f"{Path(filename).stem}{suffix}{Path(filename).suffix}"
28
+ new_file_path = os.path.join(directory, new_filename)
29
+
30
+ # Write updated data to new file
31
+ with open(new_file_path, 'w') as f:
32
+ json.dump(data, f, indent=4)
33
+
34
+ print(f"Processed {filename} and saved as {new_filename}")
35
+
36
+ # Usage
37
+ directory_path = "/Users/benediktstroebl/Documents/GitHub/leaderboard/evals_live"
38
+ process_json_files(directory_path)
utils/viz.py CHANGED
@@ -26,7 +26,7 @@ def create_task_success_heatmap(df, benchmark_name):
26
  z=pivot_df.values,
27
  y=pivot_df.index,
28
  x=pivot_df.columns,
29
- colorscale=[[0, 'white'], [1, '#1b9e77']], # White for failed, green for success
30
  showscale=False,
31
  hovertemplate='<b>Agent:</b> %{y}<br>' +
32
  '<b>Task:</b> %{x}<br>' +
@@ -37,7 +37,7 @@ def create_task_success_heatmap(df, benchmark_name):
37
  fig.update_layout(
38
  xaxis_title='Task ID',
39
  height=total_height,
40
- width=1300,
41
  yaxis=dict(
42
  autorange='reversed',
43
  showticklabels=True, # Show y-axis tick labels (agent names)
@@ -81,16 +81,23 @@ def create_bar_chart(categories, values, x_label, y_label, title):
81
  sorted_data = sorted(zip(categories, values), key=lambda x: x[1], reverse=True)
82
  categories, values = zip(*sorted_data)
83
 
 
 
 
 
 
 
84
  fig = go.Figure(data=[go.Bar(
85
  y=categories,
86
  x=values,
87
  orientation='h',
88
- marker_color='#1b9e77', # Same color as the scatter plot
89
- text=values,
90
  textposition='auto',
 
91
  textfont=dict(color='black', size=14, family='Arial', weight=2),
92
  hovertemplate='<b>%{y}</b><br>' +
93
- 'Affected Tasks: %{x}<br>'
94
  )])
95
 
96
  fig.update_layout(
@@ -144,7 +151,7 @@ def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str =
144
  ])
145
  )
146
 
147
- fig.update_traces(marker=dict(size=10, color='#1b9e77'),
148
  hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),)
149
 
150
 
@@ -164,7 +171,7 @@ def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str =
164
  fig.update_xaxes(rangemode="tozero")
165
 
166
  fig.update_layout(
167
- width = 1300,
168
  height = 600,
169
  xaxis_title = x_label,
170
  yaxis_title = y_label,
@@ -287,7 +294,7 @@ def create_flow_chart(steps):
287
  hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
288
  marker=dict(
289
  # color=node_colors,
290
- color='#1b9e77',
291
  size=30,
292
  line_width=2,
293
  # symbol=node_shapes
 
26
  z=pivot_df.values,
27
  y=pivot_df.index,
28
  x=pivot_df.columns,
29
+ colorscale=[[0, 'white'], [1, '#3498db']], # White for failed, blue for success
30
  showscale=False,
31
  hovertemplate='<b>Agent:</b> %{y}<br>' +
32
  '<b>Task:</b> %{x}<br>' +
 
37
  fig.update_layout(
38
  xaxis_title='Task ID',
39
  height=total_height,
40
+ width=1150,
41
  yaxis=dict(
42
  autorange='reversed',
43
  showticklabels=True, # Show y-axis tick labels (agent names)
 
81
  sorted_data = sorted(zip(categories, values), key=lambda x: x[1], reverse=True)
82
  categories, values = zip(*sorted_data)
83
 
84
+ # Sum of all category counts; used as the denominator for the percentage labels below
85
+ total_tasks = sum(values)
86
+
87
+ text_labels = [f"({value/total_tasks:.1%} of failures)" for value in values]
88
+
89
+
90
  fig = go.Figure(data=[go.Bar(
91
  y=categories,
92
  x=values,
93
  orientation='h',
94
+ marker_color='#3498db', # Same color as the scatter plot
95
+ text=text_labels,
96
  textposition='auto',
97
+ customdata=[f'{value} tasks ({value/total_tasks:.1%} of failures)' for value in values],
98
  textfont=dict(color='black', size=14, family='Arial', weight=2),
99
  hovertemplate='<b>%{y}</b><br>' +
100
+ 'Affected Tasks: %{customdata}<extra></extra>'
101
  )])
102
 
103
  fig.update_layout(
 
151
  ])
152
  )
153
 
154
+ fig.update_traces(marker=dict(size=10, color='#3498db'),
155
  hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),)
156
 
157
 
 
171
  fig.update_xaxes(rangemode="tozero")
172
 
173
  fig.update_layout(
174
+ width = 1150,
175
  height = 600,
176
  xaxis_title = x_label,
177
  yaxis_title = y_label,
 
294
  hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
295
  marker=dict(
296
  # color=node_colors,
297
+ color='#3498db',
298
  size=30,
299
  line_width=2,
300
  # symbol=node_shapes