Spaces:
Running
Running
Commit
·
e24146f
1
Parent(s):
bb2348f
vis update
Browse files- app.py +221 -31
- css.css +48 -155
- header.md +3 -0
- scratch.py +38 -0
- utils/viz.py +15 -8
app.py
CHANGED
@@ -18,6 +18,7 @@ import asyncio
|
|
18 |
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
19 |
import weave
|
20 |
from utils.db import TracePreprocessor
|
|
|
21 |
|
22 |
preprocessor = TracePreprocessor()
|
23 |
|
@@ -86,7 +87,7 @@ def update_task_details(benchmark_name, agent_name, task_id):
|
|
86 |
|
87 |
summary = analysis.get('task_analysis', {})
|
88 |
|
89 |
-
overview = f"
|
90 |
# overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
|
91 |
# overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
|
92 |
# overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"
|
@@ -157,9 +158,9 @@ def format_call_info(step, step_index):
|
|
157 |
}}
|
158 |
</style>
|
159 |
|
160 |
-
<
|
161 |
|
162 |
-
<
|
163 |
<ul>
|
164 |
<li><strong>Weave Task ID:</strong> {call_data['weave_task_id']}</li>
|
165 |
<li><strong>Trace ID:</strong> {call_data['trace_id']}</li>
|
@@ -168,16 +169,16 @@ def format_call_info(step, step_index):
|
|
168 |
<li><strong>Model:</strong> {call_data['inputs']['model']}</li>
|
169 |
</ul>
|
170 |
|
171 |
-
<
|
172 |
{format_json(call_data['inputs'])}
|
173 |
|
174 |
-
<
|
175 |
{format_json(call_data['outputs'])}
|
176 |
|
177 |
-
<
|
178 |
{format_json(call_data['summary'])}
|
179 |
|
180 |
-
<
|
181 |
<ul>
|
182 |
<li><strong>Description:</strong> {analysis['description']}</li>
|
183 |
<li><strong>Assessment:</strong> {analysis['assessment']}</li>
|
@@ -194,9 +195,9 @@ def update_failure_report(agent_name, benchmark_name):
|
|
194 |
return "No failure report available for this agent.", None
|
195 |
|
196 |
# Create overview of failure categories
|
197 |
-
categories_overview = "
|
198 |
for category in failure_report['failure_categories']:
|
199 |
-
categories_overview += f"
|
200 |
categories_overview += f"{category['description']}\n\n"
|
201 |
|
202 |
# Count tasks affected by each category
|
@@ -214,14 +215,171 @@ def update_failure_report(agent_name, benchmark_name):
|
|
214 |
|
215 |
return categories_overview, chart
|
216 |
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
|
223 |
with gr.Tabs():
|
224 |
with gr.Tab("USACO"):
|
|
|
225 |
with gr.Row():
|
226 |
with gr.Column(scale=2):
|
227 |
Leaderboard(
|
@@ -234,10 +392,15 @@ with gr.Blocks() as demo:
|
|
234 |
hide_columns=config.USACO_HIDE_COLUMNS,
|
235 |
search_columns=config.USACO_SEARCH_COLUMNS,
|
236 |
)
|
|
|
|
|
237 |
with gr.Row():
|
238 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
239 |
|
240 |
-
gr.Markdown("
|
|
|
|
|
|
|
241 |
with gr.Row():
|
242 |
task_success_heatmap = gr.Plot()
|
243 |
demo.load(
|
@@ -247,8 +410,10 @@ with gr.Blocks() as demo:
|
|
247 |
),
|
248 |
outputs=[task_success_heatmap]
|
249 |
)
|
250 |
-
|
251 |
-
gr.Markdown("
|
|
|
|
|
252 |
with gr.Row():
|
253 |
with gr.Column(scale=1):
|
254 |
failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
@@ -269,7 +434,9 @@ with gr.Blocks() as demo:
|
|
269 |
inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
|
270 |
outputs=[failure_categories_overview, failure_categories_chart])
|
271 |
|
272 |
-
gr.Markdown("
|
|
|
|
|
273 |
with gr.Row():
|
274 |
with gr.Column(scale=1):
|
275 |
agent_dropdown = gr.Dropdown(label="Select Agent")
|
@@ -291,7 +458,9 @@ with gr.Blocks() as demo:
|
|
291 |
inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown, task_dropdown],
|
292 |
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
293 |
|
294 |
-
gr.Markdown("
|
|
|
|
|
295 |
with gr.Row():
|
296 |
with gr.Column(scale=1):
|
297 |
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
@@ -366,7 +535,9 @@ with gr.Blocks() as demo:
|
|
366 |
with gr.Row():
|
367 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
368 |
|
369 |
-
gr.Markdown("
|
|
|
|
|
370 |
with gr.Row():
|
371 |
task_success_heatmap = gr.Plot()
|
372 |
demo.load(
|
@@ -377,7 +548,9 @@ with gr.Blocks() as demo:
|
|
377 |
outputs=[task_success_heatmap]
|
378 |
)
|
379 |
|
380 |
-
gr.Markdown("
|
|
|
|
|
381 |
with gr.Row():
|
382 |
with gr.Column(scale=1):
|
383 |
failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
@@ -398,7 +571,9 @@ with gr.Blocks() as demo:
|
|
398 |
inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
|
399 |
outputs=[failure_categories_overview, failure_categories_chart])
|
400 |
|
401 |
-
gr.Markdown("
|
|
|
|
|
402 |
with gr.Row():
|
403 |
with gr.Column(scale=1):
|
404 |
agent_dropdown = gr.Dropdown(label="Select Agent")
|
@@ -419,8 +594,9 @@ with gr.Blocks() as demo:
|
|
419 |
task_dropdown.change(update_task_details,
|
420 |
inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
|
421 |
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
422 |
-
|
423 |
-
gr.Markdown("
|
|
|
424 |
with gr.Row():
|
425 |
with gr.Column(scale=1):
|
426 |
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
@@ -494,7 +670,9 @@ with gr.Blocks() as demo:
|
|
494 |
with gr.Row():
|
495 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
496 |
|
497 |
-
gr.Markdown("
|
|
|
|
|
498 |
with gr.Row():
|
499 |
task_success_heatmap = gr.Plot()
|
500 |
demo.load(
|
@@ -505,7 +683,9 @@ with gr.Blocks() as demo:
|
|
505 |
outputs=[task_success_heatmap]
|
506 |
)
|
507 |
|
508 |
-
gr.Markdown("
|
|
|
|
|
509 |
with gr.Row():
|
510 |
with gr.Column(scale=1):
|
511 |
failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
@@ -526,7 +706,9 @@ with gr.Blocks() as demo:
|
|
526 |
inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
|
527 |
outputs=[failure_categories_overview, failure_categories_chart])
|
528 |
|
529 |
-
gr.Markdown("
|
|
|
|
|
530 |
with gr.Row():
|
531 |
with gr.Column(scale=1):
|
532 |
agent_dropdown = gr.Dropdown(label="Select Agent")
|
@@ -548,7 +730,9 @@ with gr.Blocks() as demo:
|
|
548 |
inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
|
549 |
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
550 |
|
551 |
-
gr.Markdown("
|
|
|
|
|
552 |
with gr.Row():
|
553 |
with gr.Column(scale=1):
|
554 |
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
@@ -622,7 +806,9 @@ with gr.Blocks() as demo:
|
|
622 |
with gr.Row():
|
623 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
|
624 |
|
625 |
-
gr.Markdown("
|
|
|
|
|
626 |
with gr.Row():
|
627 |
with gr.Column(scale=1):
|
628 |
failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
@@ -643,7 +829,9 @@ with gr.Blocks() as demo:
|
|
643 |
inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
|
644 |
outputs=[failure_categories_overview, failure_categories_chart])
|
645 |
|
646 |
-
gr.Markdown("
|
|
|
|
|
647 |
with gr.Row():
|
648 |
with gr.Column(scale=1):
|
649 |
agent_dropdown = gr.Dropdown(label="Select Agent")
|
@@ -665,7 +853,9 @@ with gr.Blocks() as demo:
|
|
665 |
inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
|
666 |
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
667 |
|
668 |
-
gr.Markdown("
|
|
|
|
|
669 |
with gr.Row():
|
670 |
with gr.Column(scale=1):
|
671 |
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
|
|
18 |
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
19 |
import weave
|
20 |
from utils.db import TracePreprocessor
|
21 |
+
from gradio.themes.soft import Soft
|
22 |
|
23 |
preprocessor = TracePreprocessor()
|
24 |
|
|
|
87 |
|
88 |
summary = analysis.get('task_analysis', {})
|
89 |
|
90 |
+
overview = f"### Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n"
|
91 |
# overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
|
92 |
# overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
|
93 |
# overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"
|
|
|
158 |
}}
|
159 |
</style>
|
160 |
|
161 |
+
<h3>Step {step_index + 1}: {analysis.get('headline', '')}</h3>
|
162 |
|
163 |
+
<h4>Call Metadata</h4>
|
164 |
<ul>
|
165 |
<li><strong>Weave Task ID:</strong> {call_data['weave_task_id']}</li>
|
166 |
<li><strong>Trace ID:</strong> {call_data['trace_id']}</li>
|
|
|
169 |
<li><strong>Model:</strong> {call_data['inputs']['model']}</li>
|
170 |
</ul>
|
171 |
|
172 |
+
<h4>Inputs</h4>
|
173 |
{format_json(call_data['inputs'])}
|
174 |
|
175 |
+
<h4>Outputs</h4>
|
176 |
{format_json(call_data['outputs'])}
|
177 |
|
178 |
+
<h4>Usage</h4>
|
179 |
{format_json(call_data['summary'])}
|
180 |
|
181 |
+
<h4>Analysis</h4>
|
182 |
<ul>
|
183 |
<li><strong>Description:</strong> {analysis['description']}</li>
|
184 |
<li><strong>Assessment:</strong> {analysis['assessment']}</li>
|
|
|
195 |
return "No failure report available for this agent.", None
|
196 |
|
197 |
# Create overview of failure categories
|
198 |
+
categories_overview = "### Failure Categories:\n\n"
|
199 |
for category in failure_report['failure_categories']:
|
200 |
+
categories_overview += f"#### {category['category_name']}\n"
|
201 |
categories_overview += f"{category['description']}\n\n"
|
202 |
|
203 |
# Count tasks affected by each category
|
|
|
215 |
|
216 |
return categories_overview, chart
|
217 |
|
218 |
+
from gradio.themes.utils import colors, fonts, sizes
|
219 |
+
from typing import Iterable
|
220 |
+
class MyTheme(Soft):
    """Variant of the ``Soft`` Gradio theme with this app's default styling.

    All constructor arguments are keyword-only and are handed to
    ``Soft.__init__`` unchanged.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.blue,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Lato"),
            "ui-sans-serif",
            "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        """Build the theme.

        Args:
            primary_hue: Primary colour; defaults to ``colors.blue``.
            text_size: Text size; defaults to ``sizes.text_lg``.
            font: Font or sequence of fonts for regular text; defaults to
                the Google font "Lato", then "ui-sans-serif", "sans-serif".
            font_mono: Font or sequence of fonts for monospace text;
                defaults to the Google font "IBM Plex Mono", then
                "ui-monospace", "monospace".
        """
        # Collect the options once, then forward them to the parent theme.
        theme_options = {
            "primary_hue": primary_hue,
            "text_size": text_size,
            "font": font,
            "font_mono": font_mono,
        }
        super().__init__(**theme_options)
|
247 |
+
|
248 |
+
my_theme = MyTheme()
|
249 |
+
|
250 |
+
with gr.Blocks(theme=my_theme, css='css.css') as demo:
|
251 |
+
gr.Markdown((Path(__file__).parent / "header.md").read_text(), elem_classes=["text-large"])
|
252 |
+
gr.HTML("""
|
253 |
+
<style>
|
254 |
+
.feature-row {
|
255 |
+
display: flex;
|
256 |
+
justify-content: space-between;
|
257 |
+
margin-top: 20px;
|
258 |
+
margin-bottom: 20px;
|
259 |
+
}
|
260 |
+
.feature-column {
|
261 |
+
flex: 1;
|
262 |
+
padding: 25px;
|
263 |
+
background-color: #ffffff;
|
264 |
+
border-radius: 10px;
|
265 |
+
margin: 0 15px;
|
266 |
+
text-align: left;
|
267 |
+
box-shadow: 0 6px 12px rgba(0, 0, 0, 0.1);
|
268 |
+
display: flex;
|
269 |
+
flex-direction: column;
|
270 |
+
align-items: flex-start;
|
271 |
+
border-top: 5px solid #3498db;
|
272 |
+
transition: transform 0.3s ease, box-shadow 0.3s ease;
|
273 |
+
}
|
274 |
+
.feature-column:hover {
|
275 |
+
transform: translateY(-5px);
|
276 |
+
box-shadow: 0 5px 10px rgba(0, 0, 0, 0.15);
|
277 |
+
}
|
278 |
+
.feature-keyword {
|
279 |
+
font-size: 1.2em;
|
280 |
+
font-weight: bold;
|
281 |
+
color: #1b9e77;
|
282 |
+
margin-bottom: 10px;
|
283 |
+
text-transform: uppercase;
|
284 |
+
letter-spacing: 1px;
|
285 |
+
}
|
286 |
+
.feature-content {
|
287 |
+
flex-grow: 1;
|
288 |
+
}
|
289 |
+
.feature-description {
|
290 |
+
font-size: 0.95em;
|
291 |
+
line-height: 1.6;
|
292 |
+
color: #333;
|
293 |
+
}
|
294 |
+
</style>
|
295 |
+
|
296 |
+
<div class="feature-row">
|
297 |
+
<div class="feature-column">
|
298 |
+
<div class="feature-keyword">Centralized</div>
|
299 |
+
<div class="feature-content">
|
300 |
+
<p class="feature-description">Evaluations across agent benchmarks are all recorded to a single leaderboard that evaluates every listed agent in the same way.</p>
|
301 |
+
</div>
|
302 |
+
</div>
|
303 |
+
<div class="feature-column">
|
304 |
+
<div class="feature-keyword">Third-party</div>
|
305 |
+
<div class="feature-content">
|
306 |
+
<p class="feature-description">Agent developers clearly have competing objectives in reporting accuracy: they want to achieve state-of-the-art performance.</p>
|
307 |
+
</div>
|
308 |
+
</div>
|
309 |
+
<div class="feature-column">
|
310 |
+
<div class="feature-keyword">Cost-controlled</div>
|
311 |
+
<div class="feature-content">
|
312 |
+
<p class="feature-description">For downstream users, understanding the cost of running agents is a significant need for adoption. For agent developers, cost-controlled evaluations help develop accurate baselines.</p>
|
313 |
+
</div>
|
314 |
+
</div>
|
315 |
+
</div>
|
316 |
+
<style>
|
317 |
+
.section-heading {
|
318 |
+
font-size: 1.8em;
|
319 |
+
font-weight: bold;
|
320 |
+
color: #2c3e50;
|
321 |
+
margin-top: 40px;
|
322 |
+
margin-bottom: 20px;
|
323 |
+
text-align: left;
|
324 |
+
}
|
325 |
+
.user-types-container {
|
326 |
+
display: grid;
|
327 |
+
grid-template-columns: repeat(2, 1fr);
|
328 |
+
gap: 20px;
|
329 |
+
margin-top: 20px;
|
330 |
+
}
|
331 |
+
.user-type {
|
332 |
+
background-color: #ffffff;
|
333 |
+
border-radius: 10px;
|
334 |
+
padding: 25px;
|
335 |
+
box-shadow: 0 6px 12px rgba(0, 0, 0, 0.1);
|
336 |
+
transition: transform 0.3s ease, box-shadow 0.3s ease;
|
337 |
+
border-left: 5px solid #3498db;
|
338 |
+
}
|
339 |
+
.user-type:hover {
|
340 |
+
transform: translateY(-5px);
|
341 |
+
box-shadow: 0 5px 10px rgba(0, 0, 0, 0.15);
|
342 |
+
}
|
343 |
+
.user-type-title {
|
344 |
+
font-size: 1.2em;
|
345 |
+
font-weight: bold;
|
346 |
+
color: #3498db;
|
347 |
+
margin-bottom: 10px;
|
348 |
+
}
|
349 |
+
.user-type-description {
|
350 |
+
font-size: 0.95em;
|
351 |
+
line-height: 1.6;
|
352 |
+
color: #333;
|
353 |
+
}
|
354 |
+
</style>
|
355 |
+
<br/>
|
356 |
+
<h2 class="section-heading">Who is it for?</h2>
|
357 |
+
<p>We see HAL being useful for four types of users:</p>
|
358 |
+
|
359 |
+
<div class="user-types-container">
|
360 |
+
<div class="user-type">
|
361 |
+
<h3 class="user-type-title">Downstream Users & Procurers</h3>
|
362 |
+
<p class="user-type-description">Customers looking to deploy agents can get visibility into existing benchmarks, know developers building useful agents, and identify the state of the art for both cost and accuracy for their tasks of interest.</p>
|
363 |
+
</div>
|
364 |
+
<div class="user-type">
|
365 |
+
<h3 class="user-type-title">Agent Benchmark Developers</h3>
|
366 |
+
<p class="user-type-description">Reporting results on a centralized leaderboard could allow improved visibility into agent benchmarks that measure real-world utility.</p>
|
367 |
+
</div>
|
368 |
+
<div class="user-type">
|
369 |
+
<h3 class="user-type-title">Agent Developers</h3>
|
370 |
+
<p class="user-type-description">HAL allows for easy reproduction of past agents, clear comparison with past baselines, and a straightforward way to compete on a leaderboard.</p>
|
371 |
+
</div>
|
372 |
+
<div class="user-type">
|
373 |
+
<h3 class="user-type-title">Safety Researchers</h3>
|
374 |
+
<p class="user-type-description">Understanding agent capabilities on real-world safety threats and their associated costs is crucial. For example, Cybench evaluations could provide insights into agent performance and affordability for potential adversaries.</p>
|
375 |
+
</div>
|
376 |
+
</div>
|
377 |
+
<br/>
|
378 |
+
""")
|
379 |
|
380 |
with gr.Tabs():
|
381 |
with gr.Tab("USACO"):
|
382 |
+
gr.Markdown("""The USA Computing Olympiad (USACO) is a computer programming competition for pre-college students. This benchmark evaluates the performance of AI agents on a set of 307 USACO tasks. The agents are evaluated based on the number of tasks correctly solved.""")
|
383 |
with gr.Row():
|
384 |
with gr.Column(scale=2):
|
385 |
Leaderboard(
|
|
|
392 |
hide_columns=config.USACO_HIDE_COLUMNS,
|
393 |
search_columns=config.USACO_SEARCH_COLUMNS,
|
394 |
)
|
395 |
+
with gr.Row():
|
396 |
+
gr.Markdown("### Accuracy vs. Cost for USACO agents")
|
397 |
with gr.Row():
|
398 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
399 |
|
400 |
+
gr.Markdown("")
|
401 |
+
gr.Markdown("")
|
402 |
+
gr.Markdown("## Task success heatmap")
|
403 |
+
gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by increasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the fewest).")
|
404 |
with gr.Row():
|
405 |
task_success_heatmap = gr.Plot()
|
406 |
demo.load(
|
|
|
410 |
),
|
411 |
outputs=[task_success_heatmap]
|
412 |
)
|
413 |
+
gr.Markdown("")
|
414 |
+
gr.Markdown("")
|
415 |
+
gr.Markdown("## Failure report for each agent")
|
416 |
+
gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
|
417 |
with gr.Row():
|
418 |
with gr.Column(scale=1):
|
419 |
failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
|
|
434 |
inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
|
435 |
outputs=[failure_categories_overview, failure_categories_chart])
|
436 |
|
437 |
+
gr.Markdown("")
|
438 |
+
gr.Markdown("")
|
439 |
+
gr.Markdown("## Agent monitor")
|
440 |
with gr.Row():
|
441 |
with gr.Column(scale=1):
|
442 |
agent_dropdown = gr.Dropdown(label="Select Agent")
|
|
|
458 |
inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown, task_dropdown],
|
459 |
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
460 |
|
461 |
+
gr.Markdown("")
|
462 |
+
gr.Markdown("")
|
463 |
+
gr.Markdown("## Raw predictions")
|
464 |
with gr.Row():
|
465 |
with gr.Column(scale=1):
|
466 |
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
|
|
535 |
with gr.Row():
|
536 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
537 |
|
538 |
+
gr.Markdown("")
|
539 |
+
gr.Markdown("")
|
540 |
+
gr.Markdown("## Task success heatmap")
|
541 |
with gr.Row():
|
542 |
task_success_heatmap = gr.Plot()
|
543 |
demo.load(
|
|
|
548 |
outputs=[task_success_heatmap]
|
549 |
)
|
550 |
|
551 |
+
gr.Markdown("")
|
552 |
+
gr.Markdown("")
|
553 |
+
gr.Markdown("## Failure report for each agent")
|
554 |
with gr.Row():
|
555 |
with gr.Column(scale=1):
|
556 |
failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
|
|
571 |
inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
|
572 |
outputs=[failure_categories_overview, failure_categories_chart])
|
573 |
|
574 |
+
gr.Markdown("")
|
575 |
+
gr.Markdown("")
|
576 |
+
gr.Markdown("## Agent monitor")
|
577 |
with gr.Row():
|
578 |
with gr.Column(scale=1):
|
579 |
agent_dropdown = gr.Dropdown(label="Select Agent")
|
|
|
594 |
task_dropdown.change(update_task_details,
|
595 |
inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
|
596 |
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
597 |
+
gr.Markdown("")
|
598 |
+
gr.Markdown("")
|
599 |
+
gr.Markdown("## Raw predictions")
|
600 |
with gr.Row():
|
601 |
with gr.Column(scale=1):
|
602 |
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
|
|
670 |
with gr.Row():
|
671 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
672 |
|
673 |
+
gr.Markdown("")
|
674 |
+
gr.Markdown("")
|
675 |
+
gr.Markdown("## Task success heatmap")
|
676 |
with gr.Row():
|
677 |
task_success_heatmap = gr.Plot()
|
678 |
demo.load(
|
|
|
683 |
outputs=[task_success_heatmap]
|
684 |
)
|
685 |
|
686 |
+
gr.Markdown("")
|
687 |
+
gr.Markdown("")
|
688 |
+
gr.Markdown("## Failure report for each agent")
|
689 |
with gr.Row():
|
690 |
with gr.Column(scale=1):
|
691 |
failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
|
|
706 |
inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
|
707 |
outputs=[failure_categories_overview, failure_categories_chart])
|
708 |
|
709 |
+
gr.Markdown("")
|
710 |
+
gr.Markdown("")
|
711 |
+
gr.Markdown("## Agent monitor")
|
712 |
with gr.Row():
|
713 |
with gr.Column(scale=1):
|
714 |
agent_dropdown = gr.Dropdown(label="Select Agent")
|
|
|
730 |
inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
|
731 |
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
732 |
|
733 |
+
gr.Markdown("")
|
734 |
+
gr.Markdown("")
|
735 |
+
gr.Markdown("## Raw predictions")
|
736 |
with gr.Row():
|
737 |
with gr.Column(scale=1):
|
738 |
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
|
|
806 |
with gr.Row():
|
807 |
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
|
808 |
|
809 |
+
gr.Markdown("")
|
810 |
+
gr.Markdown("")
|
811 |
+
gr.Markdown("## Failure report for each agent")
|
812 |
with gr.Row():
|
813 |
with gr.Column(scale=1):
|
814 |
failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
|
|
829 |
inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
|
830 |
outputs=[failure_categories_overview, failure_categories_chart])
|
831 |
|
832 |
+
gr.Markdown("")
|
833 |
+
gr.Markdown("")
|
834 |
+
gr.Markdown("## Agent monitor")
|
835 |
with gr.Row():
|
836 |
with gr.Column(scale=1):
|
837 |
agent_dropdown = gr.Dropdown(label="Select Agent")
|
|
|
853 |
inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
|
854 |
outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
855 |
|
856 |
+
gr.Markdown("")
|
857 |
+
gr.Markdown("")
|
858 |
+
gr.Markdown("## Raw predictions")
|
859 |
with gr.Row():
|
860 |
with gr.Column(scale=1):
|
861 |
raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
css.css
CHANGED
@@ -1,157 +1,50 @@
|
|
1 |
-
|
2 |
-
font-family: Inter;
|
3 |
-
font-size: 16px;
|
4 |
-
font-weight: 400;
|
5 |
-
line-height: 1.5;
|
6 |
-
-webkit-text-size-adjust: 100%;
|
7 |
-
background: #fff;
|
8 |
-
color: #323232;
|
9 |
-
-webkit-font-smoothing: antialiased;
|
10 |
-
-moz-osx-font-smoothing: grayscale;
|
11 |
-
text-rendering: optimizeLegibility;
|
12 |
-
}
|
13 |
-
|
14 |
:root {
|
15 |
-
--
|
16 |
-
--
|
17 |
-
--
|
18 |
-
--
|
19 |
-
--
|
20 |
-
--
|
21 |
-
}
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
font-size: 16px;
|
36 |
-
}
|
37 |
-
|
38 |
-
h1,
|
39 |
-
h1 code {
|
40 |
-
font-weight: 400;
|
41 |
-
line-height: calc(2.5 / var(--space) * var(--vspace));
|
42 |
-
}
|
43 |
-
|
44 |
-
h1 code {
|
45 |
-
background: none;
|
46 |
border: none;
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
}
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
}
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
font-size: calc(8px * 4.5);
|
75 |
-
word-break: break-word;
|
76 |
-
}
|
77 |
-
|
78 |
-
.title,
|
79 |
-
h2,
|
80 |
-
h2 code {
|
81 |
-
font-size: calc(8px * 3.375);
|
82 |
-
font-weight: lighter;
|
83 |
-
word-break: break-word;
|
84 |
-
border: none;
|
85 |
-
background: none;
|
86 |
-
}
|
87 |
-
|
88 |
-
.subheading1,
|
89 |
-
h3,
|
90 |
-
h3 code {
|
91 |
-
font-size: calc(8px * 1.8);
|
92 |
-
font-weight: 600;
|
93 |
-
border: none;
|
94 |
-
background: none;
|
95 |
-
letter-spacing: 0.1em;
|
96 |
-
text-transform: uppercase;
|
97 |
-
}
|
98 |
-
|
99 |
-
h2 code {
|
100 |
-
padding: 0;
|
101 |
-
position: relative;
|
102 |
-
letter-spacing: 0.05em;
|
103 |
-
}
|
104 |
-
|
105 |
-
blockquote {
|
106 |
-
font-size: calc(8px * 1.1667);
|
107 |
-
font-style: italic;
|
108 |
-
line-height: calc(1.1667 * var(--vspace));
|
109 |
-
margin: var(--vspace-2) var(--vspace-2);
|
110 |
-
}
|
111 |
-
|
112 |
-
.subheading2,
|
113 |
-
h4 {
|
114 |
-
font-size: calc(8px * 1.4292);
|
115 |
-
text-transform: uppercase;
|
116 |
-
font-weight: 600;
|
117 |
-
}
|
118 |
-
|
119 |
-
.subheading3,
|
120 |
-
h5 {
|
121 |
-
font-size: calc(8px * 1.2917);
|
122 |
-
line-height: calc(1.2917 * var(--vspace));
|
123 |
-
|
124 |
-
font-weight: lighter;
|
125 |
-
text-transform: uppercase;
|
126 |
-
letter-spacing: 0.15em;
|
127 |
-
}
|
128 |
-
|
129 |
-
h6 {
|
130 |
-
font-size: calc(8px * 1.1667);
|
131 |
-
font-size: 1.1667em;
|
132 |
-
font-weight: normal;
|
133 |
-
font-style: italic;
|
134 |
-
font-family: "le-monde-livre-classic-byol", serif !important;
|
135 |
-
letter-spacing: 0px !important;
|
136 |
-
}
|
137 |
-
|
138 |
-
#start .md > *:first-child {
|
139 |
-
margin-top: 0;
|
140 |
-
}
|
141 |
-
|
142 |
-
h2 + h3 {
|
143 |
-
margin-top: 0;
|
144 |
-
}
|
145 |
-
|
146 |
-
.md hr {
|
147 |
-
border: none;
|
148 |
-
border-top: 1px solid var(--block-border-color);
|
149 |
-
margin: var(--vspace-2) 0 var(--vspace-2) 0;
|
150 |
-
}
|
151 |
-
.prose ul {
|
152 |
-
margin: var(--vspace-2) 0 var(--vspace-1) 0;
|
153 |
-
}
|
154 |
-
|
155 |
-
.gap {
|
156 |
-
gap: 0;
|
157 |
-
}
|
|
|
1 |
+
/* Base styles and variables */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
:root {
|
3 |
+
--primary-color: #3498db;
|
4 |
+
--secondary-color: #2c3e50;
|
5 |
+
--background-color: #f8f9fa;
|
6 |
+
--text-color: #333;
|
7 |
+
--accent-color: #e74c3c;
|
8 |
+
--space: 1rem;
|
9 |
+
}
|
10 |
+
|
11 |
+
/* Tabs */
|
12 |
+
.tab-nav {
|
13 |
+
display: flex;
|
14 |
+
background-color: var(--secondary-color);
|
15 |
+
border-radius: 8px 8px 0 0;
|
16 |
+
overflow: hidden;
|
17 |
+
}
|
18 |
+
|
19 |
+
.tab-nav button {
|
20 |
+
padding: 1rem 1.5rem;
|
21 |
+
background-color: transparent;
|
22 |
+
color: #fff;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
border: none;
|
24 |
+
cursor: pointer;
|
25 |
+
transition: background-color 0.3s;
|
26 |
+
}
|
27 |
+
|
28 |
+
.tab-nav button:hover,
|
29 |
+
.tab-nav button.selected {
|
30 |
+
background-color: var(--primary-color);
|
31 |
+
}
|
32 |
+
|
33 |
+
|
34 |
+
.svelte-iibkxk .stretch {
|
35 |
+
display: none;
|
36 |
+
}
|
37 |
+
|
38 |
+
/* Utility classes */
|
39 |
+
.text-center { text-align: center; }
|
40 |
+
.text-right { text-align: right; }
|
41 |
+
.font-bold { font-weight: 700; }
|
42 |
+
.text-small { font-size: 0.875rem; }
|
43 |
+
.text-large { font-size: 1.25rem; }
|
44 |
+
.mt-1 { margin-top: 1rem; }
|
45 |
+
.mb-1 { margin-bottom: 1rem; }
|
46 |
+
.ml-1 { margin-left: 1rem; }
|
47 |
+
.mr-1 { margin-right: 1rem; }
|
48 |
+
|
49 |
+
|
50 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
header.md
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Holistic Agent Leaderboard (HAL)
|
2 |
+
|
3 |
+
**A centralized, standardized, cost-aware leaderboard for evaluating agents.**
|
scratch.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
+
def process_json_files(directory, suffix="_updated"):
    """Add per-task success/failure lists to USACO result files.

    Every file directly inside ``directory`` whose name ends with ``.json``
    and contains ``"USACO"`` is loaded. Each task in
    ``data['raw_eval_results']['sdict']`` is classified by the
    ``fraction_passed`` value of its first entry: exactly 1 counts as
    successful, anything below 1 as failed. The two lists of task ids are
    stored under ``data['results']`` and the document is written next to
    the input as ``<stem><suffix>.json``.

    Files whose stem already ends with ``suffix`` are treated as output of
    an earlier run and skipped, so running the function repeatedly does not
    produce ``*_updated_updated.json`` files.

    Args:
        directory: Path of the folder to scan (not recursive).
        suffix: Text appended to the input file's stem to form the output
            file name. An empty suffix overwrites each input file in place.

    Raises:
        KeyError: If a processed file lacks one of the expected keys.
    """
    for filename in os.listdir(directory):
        if not (filename.endswith(".json") and "USACO" in filename):
            continue

        source_name = Path(filename)
        # Skip files this function generated on a previous run.
        if suffix and source_name.stem.endswith(suffix):
            continue

        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        sdict = data['raw_eval_results']['sdict']

        # Parse each task's score once, then sort it into one of the lists.
        successful_tasks = []
        failed_tasks = []
        for task_id, attempts in sdict.items():
            fraction_passed = float(attempts[0]['result']['fraction_passed'])
            if fraction_passed == 1:
                successful_tasks.append(task_id)
            elif fraction_passed < 1:
                failed_tasks.append(task_id)

        data['results']['successful_tasks'] = successful_tasks
        data['results']['failed_tasks'] = failed_tasks

        new_filename = f"{source_name.stem}{suffix}{source_name.suffix}"
        new_file_path = os.path.join(directory, new_filename)

        with open(new_file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4)

        print(f"Processed {filename} and saved as {new_filename}")
|
35 |
+
|
36 |
+
# Usage
|
37 |
+
directory_path = "/Users/benediktstroebl/Documents/GitHub/leaderboard/evals_live"
|
38 |
+
process_json_files(directory_path)
|
utils/viz.py
CHANGED
@@ -26,7 +26,7 @@ def create_task_success_heatmap(df, benchmark_name):
|
|
26 |
z=pivot_df.values,
|
27 |
y=pivot_df.index,
|
28 |
x=pivot_df.columns,
|
29 |
-
colorscale=[[0, 'white'], [1, '#
|
30 |
showscale=False,
|
31 |
hovertemplate='<b>Agent:</b> %{y}<br>' +
|
32 |
'<b>Task:</b> %{x}<br>' +
|
@@ -37,7 +37,7 @@ def create_task_success_heatmap(df, benchmark_name):
|
|
37 |
fig.update_layout(
|
38 |
xaxis_title='Task ID',
|
39 |
height=total_height,
|
40 |
-
width=
|
41 |
yaxis=dict(
|
42 |
autorange='reversed',
|
43 |
showticklabels=True, # Show y-axis tick labels (agent names)
|
@@ -81,16 +81,23 @@ def create_bar_chart(categories, values, x_label, y_label, title):
|
|
81 |
sorted_data = sorted(zip(categories, values), key=lambda x: x[1], reverse=True)
|
82 |
categories, values = zip(*sorted_data)
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
fig = go.Figure(data=[go.Bar(
|
85 |
y=categories,
|
86 |
x=values,
|
87 |
orientation='h',
|
88 |
-
marker_color='#
|
89 |
-
text=
|
90 |
textposition='auto',
|
|
|
91 |
textfont=dict(color='black', size=14, family='Arial', weight=2),
|
92 |
hovertemplate='<b>%{y}</b><br>' +
|
93 |
-
'Affected Tasks: %{
|
94 |
)])
|
95 |
|
96 |
fig.update_layout(
|
@@ -144,7 +151,7 @@ def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str =
|
|
144 |
])
|
145 |
)
|
146 |
|
147 |
-
fig.update_traces(marker=dict(size=10, color='#
|
148 |
hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),)
|
149 |
|
150 |
|
@@ -164,7 +171,7 @@ def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str =
|
|
164 |
fig.update_xaxes(rangemode="tozero")
|
165 |
|
166 |
fig.update_layout(
|
167 |
-
width =
|
168 |
height = 600,
|
169 |
xaxis_title = x_label,
|
170 |
yaxis_title = y_label,
|
@@ -287,7 +294,7 @@ def create_flow_chart(steps):
|
|
287 |
hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
|
288 |
marker=dict(
|
289 |
# color=node_colors,
|
290 |
-
color='#
|
291 |
size=30,
|
292 |
line_width=2,
|
293 |
# symbol=node_shapes
|
|
|
26 |
z=pivot_df.values,
|
27 |
y=pivot_df.index,
|
28 |
x=pivot_df.columns,
|
29 |
+
colorscale=[[0, 'white'], [1, '#3498db']], # White for failed, blue for success
|
30 |
showscale=False,
|
31 |
hovertemplate='<b>Agent:</b> %{y}<br>' +
|
32 |
'<b>Task:</b> %{x}<br>' +
|
|
|
37 |
fig.update_layout(
|
38 |
xaxis_title='Task ID',
|
39 |
height=total_height,
|
40 |
+
width=1150,
|
41 |
yaxis=dict(
|
42 |
autorange='reversed',
|
43 |
showticklabels=True, # Show y-axis tick labels (agent names)
|
|
|
81 |
sorted_data = sorted(zip(categories, values), key=lambda x: x[1], reverse=True)
|
82 |
categories, values = zip(*sorted_data)
|
83 |
|
84 |
+
# get total number of tasks
|
85 |
+
total_tasks = sum(values)
|
86 |
+
|
87 |
+
text_labels = [f"({value/total_tasks:.1%} of failures)" for value in values]
|
88 |
+
|
89 |
+
|
90 |
fig = go.Figure(data=[go.Bar(
|
91 |
y=categories,
|
92 |
x=values,
|
93 |
orientation='h',
|
94 |
+
marker_color='#3498db', # Same color as the scatter plot
|
95 |
+
text=text_labels,
|
96 |
textposition='auto',
|
97 |
+
customdata=[f'{value} tasks ({value/total_tasks:.1%} of failures)' for value in values],
|
98 |
textfont=dict(color='black', size=14, family='Arial', weight=2),
|
99 |
hovertemplate='<b>%{y}</b><br>' +
|
100 |
+
'Affected Tasks: %{customdata}<extra></extra>'
|
101 |
)])
|
102 |
|
103 |
fig.update_layout(
|
|
|
151 |
])
|
152 |
)
|
153 |
|
154 |
+
fig.update_traces(marker=dict(size=10, color='#3498db'),
|
155 |
hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),)
|
156 |
|
157 |
|
|
|
171 |
fig.update_xaxes(rangemode="tozero")
|
172 |
|
173 |
fig.update_layout(
|
174 |
+
width = 1150,
|
175 |
height = 600,
|
176 |
xaxis_title = x_label,
|
177 |
yaxis_title = y_label,
|
|
|
294 |
hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
|
295 |
marker=dict(
|
296 |
# color=node_colors,
|
297 |
+
color='#3498db',
|
298 |
size=30,
|
299 |
line_width=2,
|
300 |
# symbol=node_shapes
|