import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import os
import base64
# Define the benchmark categories and their component metrics
CATEGORIES = {
"Document Understanding": {
"metrics": [
"Invoice ID Detection",
"Date Field Recognition",
"Address Block Parsing",
"Table Structure Recognition"
],
"weight": 0.25
},
"Data Extraction": {
"metrics": [
"Line Item Extraction",
"Numerical Value Accuracy",
"Text Field Accuracy",
"Field Completeness"
],
"weight": 0.25
},
"Bookkeeping Intelligence": {
"metrics": [
"VAT Calculation",
"Total Reconciliation",
"Tax Code Assignment",
"Account Classification"
],
"weight": 0.25
},
"Error Handling": {
"metrics": [
"Validation Rules",
"Inconsistency Detection",
"Missing Data Handling",
"Format Validation"
],
"weight": 0.25
}
}
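# Note: the metric names above are descriptive labels for each category; the scoring
# helpers below use only the category weights (which sum to 1.0) together with the
# per-metric scores stored in MODELS.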
# Benchmark results for each model
MODELS = {
"Ark II": {
"version": "ark-ii-v1",
"type": "Text + Vision",
"provider": "Jenesys AI",
"inference_time": "17.94s",
"scores": {
"Document Understanding": {
"Invoice ID": 0.733,
"Date of Invoice": 0.887,
"Line Items Total": 0.803,
"Overall": 0.808
},
"Data Extraction": {
"Supplier": 0.735,
"Line Items Quantity": 0.882,
"Line Items Description": 0.555,
"VAT Number": 0.768,
"Line Items Total": 0.803,
"Overall": 0.749
},
"Bookkeeping Intelligence": {
"Discount Total": 0.800,
"Line Items VAT": 0.590,
"VAT Exclusive": 0.694,
"VAT Number": 0.768,
"Discount Verification": 0.800,
"Overall": 0.730
},
"Error Handling": {
"Mean Accuracy": 0.718,
"Overall": 0.718
}
}
},
"Claude-3-5-Sonnet": {
"version": "claude-3-5-sonnet-20241022",
"type": "Text + Vision",
"provider": "Anthropic",
"inference_time": "26.51s",
"scores": {
"Document Understanding": {
"Invoice ID": 0.773,
"Date of Invoice": 0.806,
"Line Items Total": 0.533,
"Overall": 0.704
},
"Data Extraction": {
"Supplier": 0.706,
"Line Items Quantity": 0.597,
"Line Items Description": 0.504,
"VAT Number": 0.708,
"Line Items Total": 0.533,
"Overall": 0.609
},
"Bookkeeping Intelligence": {
"Discount Total": 0.600,
"Line Items VAT": 0.524,
"VAT Exclusive": 0.706,
"VAT Number": 0.708,
"Discount Verification": 0.600,
"Overall": 0.628
},
"Error Handling": {
"Mean Accuracy": 0.675,
"Overall": 0.675
}
}
},
"GPT-4o": {
"version": "gpt-4o",
"type": "Text + Vision",
"provider": "OpenAI",
"inference_time": "19.88s",
"scores": {
"Document Understanding": {
"Invoice ID": 0.600,
"Date of Invoice": 0.917,
"Line Items Total": 0.571,
"Overall": 0.696
},
"Data Extraction": {
"Supplier": 0.818,
"Line Items Quantity": 0.722,
"Line Items Description": 0.619,
"VAT Number": 0.714,
"Line Items Total": 0.571,
"Overall": 0.689
},
"Bookkeeping Intelligence": {
"Discount Total": 0.000,
"Line Items VAT": 0.313,
"VAT Exclusive": 0.250,
"VAT Number": 0.714,
"Discount Verification": 0.000,
"Overall": 0.255
},
"Error Handling": {
"Mean Accuracy": 0.683,
"Overall": 0.683
}
}
},
"Ark I": {
"version": "ark-i-v1",
"type": "Text + Vision",
"provider": "Jenesys AI",
"inference_time": "7.955s",
"scores": {
"Document Understanding": {
"Invoice ID": 0.747,
"Date of Invoice": 0.905,
"Line Items Total": 0.703,
"Overall": 0.785
},
"Data Extraction": {
"Supplier": 0.792,
"Line Items Quantity": 0.811,
"Line Items Description": 0.521,
"VAT Number": 0.719,
"Line Items Total": 0.703,
"Overall": 0.709
},
"Bookkeeping Intelligence": {
"Discount Total": 0.600,
"Line Items VAT": 0.434,
"VAT Exclusive": 0.491,
"VAT Number": 0.719,
"Discount Verification": 0.600,
"Overall": 0.569
},
"Error Handling": {
"Mean Accuracy": 0.641,
"Overall": 0.641
}
}
}
}
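# Each category block above carries a precomputed "Overall" value; the helpers below
# recompute category averages from the individual metrics and skip that key.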
def calculate_category_score(scores):
"""Calculate average score for a category's metrics."""
# Skip 'Overall' when calculating average
metrics = {k: v for k, v in scores.items() if k != 'Overall'}
return sum(metrics.values()) / len(metrics)
def calculate_overall_score(model_data):
"""Calculate the weighted average score across all categories."""
category_scores = {}
for category, metrics in model_data["scores"].items():
# Skip 'Overall' when calculating
category_metrics = {k: v for k, v in metrics.items() if k != 'Overall'}
category_scores[category] = sum(category_metrics.values()) / len(category_metrics) * CATEGORIES[category]["weight"]
return sum(category_scores.values())
def create_leaderboard_df():
"""Create a DataFrame for the leaderboard with detailed metrics."""
data = []
for model_name, model_info in MODELS.items():
# Calculate category scores
category_scores = {
category: calculate_category_score(metrics)
for category, metrics in model_info["scores"].items()
}
        # Use the weighted average across all categories as the headline score
        overall_score = calculate_overall_score(model_info)
        row = {
            "Model": model_name,
            "Version": model_info["version"],
            "Type": model_info["type"],
            "Provider": model_info["provider"],
            "Average Score": overall_score,
**category_scores
}
data.append(row)
df = pd.DataFrame(data)
return df.sort_values("Average Score", ascending=False)
def create_category_comparison():
"""Create a bar chart comparing all models across categories."""
df = create_leaderboard_df()
df_melted = df.melt(
id_vars=["Model"],
value_vars=list(CATEGORIES.keys()),
var_name="Category",
value_name="Score"
)
fig = px.bar(
df_melted,
x="Category",
y="Score",
color="Model",
barmode="group",
title="Model Performance by Category",
range_y=[0, 1.0]
)
fig.update_layout(
xaxis_title="Category",
yaxis_title="Score",
legend_title="Model",
font=dict(size=14),
title=dict(
text="Model Performance by Category",
x=0.5,
y=0.95,
xanchor='center',
yanchor='top',
font=dict(size=20)
),
yaxis=dict(
tickmode='array',
ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
gridcolor='rgba(0, 0, 0, 0.1)',
zeroline=True,
zerolinecolor='rgba(0, 0, 0, 0.2)',
zerolinewidth=1
),
xaxis=dict(
tickangle=-45,
gridcolor='rgba(0, 0, 0, 0.1)'
),
bargap=0.2,
bargroupgap=0.1,
paper_bgcolor='rgba(255, 255, 255, 0.9)',
plot_bgcolor='rgba(255, 255, 255, 0.9)',
margin=dict(t=100, b=100, l=100, r=20),
showlegend=True,
legend=dict(
yanchor="top",
y=1,
xanchor="left",
x=1.02,
bgcolor='rgba(255, 255, 255, 0.9)',
bordercolor='rgba(0, 0, 0, 0.1)',
borderwidth=1
)
)
return fig
def create_combined_radar_chart():
"""Create a radar chart showing all models together."""
try:
categories = list(CATEGORIES.keys())
# Define colors for each model
colors = {
"Ark II": "rgb(99, 110, 250)", # Blue
"Claude-3-5-Sonnet": "rgb(239, 85, 59)", # Red
"GPT-4o": "rgb(0, 204, 150)", # Green
"Ark I": "rgb(171, 99, 250)" # Purple
}
fig = go.Figure()
# Add trace for each model
for model_name, color in colors.items():
model_data = MODELS[model_name]
values = []
for category in categories:
metrics = {k: v for k, v in model_data["scores"][category].items() if k != 'Overall'}
if category == "Error Handling":
values.append(metrics.get("Mean Accuracy", 0.0))
else:
values.append(sum(metrics.values()) / len(metrics) if metrics else 0.0)
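            # Repeat the first point so the Scatterpolar trace closes into a polygon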
fig.add_trace(go.Scatterpolar(
r=values + [values[0]],
theta=categories + [categories[0]],
fill='none',
line=dict(color=color, width=2),
name=model_name
))
# Update layout
fig.update_layout(
polar=dict(
radialaxis=dict(
visible=True,
range=[0, 1.0],
tickmode='array',
ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
gridcolor='rgba(0, 0, 0, 0.1)',
linecolor='rgba(0, 0, 0, 0.1)'
),
angularaxis=dict(
gridcolor='rgba(0, 0, 0, 0.1)',
linecolor='rgba(0, 0, 0, 0.1)'
),
bgcolor='rgba(255, 255, 255, 0.9)'
),
showlegend=True,
paper_bgcolor='rgba(255, 255, 255, 0.9)',
plot_bgcolor='rgba(255, 255, 255, 0.9)',
title=dict(
text="Model Performance Comparison",
x=0.5,
y=0.95,
xanchor='center',
yanchor='top',
font=dict(size=20)
),
legend=dict(
yanchor="top",
y=1,
xanchor="left",
x=1.02
),
margin=dict(t=100, b=100, l=100, r=100)
)
return fig
except Exception as e:
print(f"Error creating radar chart: {str(e)}")
return go.Figure()
def create_comparison_metrics_df(model_name):
"""Create a DataFrame showing detailed metrics with comparisons."""
base_model = "Ark II"
data = []
base_data = MODELS[base_model]["scores"]
compare_data = MODELS[model_name]["scores"]
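    # Differences are computed relative to the Ark II baseline: a positive value means
    # the selected model scored higher on that metric.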
for category in CATEGORIES.keys():
base_metrics = {k: v for k, v in base_data[category].items() if k != 'Overall'}
compare_metrics = {k: v for k, v in compare_data[category].items() if k != 'Overall'}
for metric in base_metrics.keys():
if metric in compare_metrics:
base_value = base_metrics[metric]
compare_value = compare_metrics[metric]
diff = compare_value - base_value
data.append({
"Category": category,
"Metric": metric,
f"{model_name} Score": compare_value,
f"{base_model} Score": base_value,
"Difference": diff,
"Better/Worse": "↑" if diff > 0 else "↓" if diff < 0 else "="
})
df = pd.DataFrame(data)
return df
def update_model_details(model_name):
"""Update the detailed metrics view for a selected model."""
try:
df = create_comparison_metrics_df(model_name)
return [df, create_combined_radar_chart()]
except Exception as e:
print(f"Error in update_model_details: {str(e)}")
return [pd.DataFrame(), go.Figure()]
# Load the logo and return it as an inline base64-encoded <img> tag
def get_logo_html():
    """Return an HTML <img> tag with the logo embedded as base64 data."""
    logo_path = os.path.join(os.path.dirname(__file__), "jenesys.jpg")
    with open(logo_path, "rb") as f:
        encoded_logo = base64.b64encode(f.read()).decode()
    # Embed the encoded image directly; the alt text and height are illustrative defaults
    return f'<img src="data:image/jpeg;base64,{encoded_logo}" alt="Jenesys AI logo" style="height: 60px;">'
# Create the Gradio interface
with gr.Blocks(title="AI Bookkeeper Leaderboard") as demo:
gr.Markdown(f"""