import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import os
import base64

# Define the benchmark categories and their component metrics
CATEGORIES = {
    "Document Understanding": {
        "metrics": [
            "Invoice ID Detection",
            "Date Field Recognition",
            "Address Block Parsing",
            "Table Structure Recognition"
        ],
        "weight": 0.25
    },
    "Data Extraction": {
        "metrics": [
            "Line Item Extraction",
            "Numerical Value Accuracy",
            "Text Field Accuracy",
            "Field Completeness"
        ],
        "weight": 0.25
    },
    "Bookkeeping Intelligence": {
        "metrics": [
            "VAT Calculation",
            "Total Reconciliation",
            "Tax Code Assignment",
            "Account Classification"
        ],
        "weight": 0.25
    },
    "Error Handling": {
        "metrics": [
            "Validation Rules",
            "Inconsistency Detection",
            "Missing Data Handling",
            "Format Validation"
        ],
        "weight": 0.25
    }
}

# Updated benchmark data with real metrics
MODELS = {
    "Ark II": {
        "version": "ark-ii-v1",
        "type": "Text + Vision",
        "provider": "Jenesys AI",
        "inference_time": "17.94s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.733,
                "Date of Invoice": 0.887,
                "Line Items Total": 0.803,
                "Overall": 0.808
            },
            "Data Extraction": {
                "Supplier": 0.735,
                "Line Items Quantity": 0.882,
                "Line Items Description": 0.555,
                "VAT Number": 0.768,
                "Line Items Total": 0.803,
                "Overall": 0.749
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.800,
                "Line Items VAT": 0.590,
                "VAT Exclusive": 0.694,
                "VAT Number": 0.768,
                "Discount Verification": 0.800,
                "Overall": 0.730
            },
            "Error Handling": {
                "Mean Accuracy": 0.718,
                "Overall": 0.718
            }
        }
    },
    "Claude-3-5-Sonnet": {
        "version": "claude-3-5-sonnet-20241022",
        "type": "Text + Vision",
        "provider": "Anthropic",
        "inference_time": "26.51s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.773,
                "Date of Invoice": 0.806,
                "Line Items Total": 0.533,
                "Overall": 0.704
            },
            "Data Extraction": {
                "Supplier": 0.706,
                "Line Items Quantity": 0.597,
                "Line Items Description": 0.504,
                "VAT Number": 0.708,
                "Line Items Total": 0.533,
                "Overall": 0.609
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.600,
                "Line Items VAT": 0.524,
                "VAT Exclusive": 0.706,
                "VAT Number": 0.708,
                "Discount Verification": 0.600,
                "Overall": 0.628
            },
            "Error Handling": {
                "Mean Accuracy": 0.675,
                "Overall": 0.675
            }
        }
    },
    "GPT-4o": {
        "version": "gpt-4o",
        "type": "Text + Vision",
        "provider": "OpenAI",
        "inference_time": "19.88s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.600,
                "Date of Invoice": 0.917,
                "Line Items Total": 0.571,
                "Overall": 0.696
            },
            "Data Extraction": {
                "Supplier": 0.818,
                "Line Items Quantity": 0.722,
                "Line Items Description": 0.619,
                "VAT Number": 0.714,
                "Line Items Total": 0.571,
                "Overall": 0.689
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.000,
                "Line Items VAT": 0.313,
                "VAT Exclusive": 0.250,
                "VAT Number": 0.714,
                "Discount Verification": 0.000,
                "Overall": 0.255
            },
            "Error Handling": {
                "Mean Accuracy": 0.683,
                "Overall": 0.683
            }
        }
    },
    "Ark I": {
        "version": "ark-i-v1",
        "type": "Text + Vision",
        "provider": "Jenesys AI",
        "inference_time": "7.955s",
        "scores": {
            "Document Understanding": {
                "Invoice ID": 0.747,
                "Date of Invoice": 0.905,
                "Line Items Total": 0.703,
                "Overall": 0.785
            },
            "Data Extraction": {
                "Supplier": 0.792,
                "Line Items Quantity": 0.811,
                "Line Items Description": 0.521,
                "VAT Number": 0.719,
                "Line Items Total": 0.703,
                "Overall": 0.709
            },
            "Bookkeeping Intelligence": {
                "Discount Total": 0.600,
                "Line Items VAT": 0.434,
                "VAT Exclusive": 0.491,
                "VAT Number": 0.719,
                "Discount Verification": 0.600,
                "Overall": 0.569
            },
            "Error Handling": {
                "Mean Accuracy": 0.641,
                "Overall": 0.641
            }
        }
    }
}
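
# Sanity check (added, illustrative): the weighted overall score below assumes the
# category weights sum to 1.0, so the weighted average stays on the same 0-1 scale
# as the individual metrics.
assert abs(sum(c["weight"] for c in CATEGORIES.values()) - 1.0) < 1e-9, \
    "Category weights should sum to 1.0"
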
def calculate_category_score(scores):
    """Calculate the average score for a category's metrics."""
    # Skip 'Overall' when calculating the average
    metrics = {k: v for k, v in scores.items() if k != 'Overall'}
    return sum(metrics.values()) / len(metrics)


def calculate_overall_score(model_data):
    """Calculate the weighted average score across all categories."""
    category_scores = {}
    for category, metrics in model_data["scores"].items():
        # Skip 'Overall' when calculating
        category_metrics = {k: v for k, v in metrics.items() if k != 'Overall'}
        category_scores[category] = (
            sum(category_metrics.values()) / len(category_metrics) * CATEGORIES[category]["weight"]
        )
    return sum(category_scores.values())
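
# Worked example (illustrative, values rounded): with equal 0.25 weights, Ark II's
# per-category means of roughly 0.808, 0.749, 0.730 and 0.718 combine to
# 0.25*0.808 + 0.25*0.749 + 0.25*0.730 + 0.25*0.718 ≈ 0.751.
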
def create_leaderboard_df():
    """Create a DataFrame for the leaderboard with detailed metrics."""
    data = []
    for model_name, model_info in MODELS.items():
        # Calculate category scores
        category_scores = {
            category: calculate_category_score(metrics)
            for category, metrics in model_info["scores"].items()
        }

        # Use Error Handling score as Average Score
        error_handling_score = calculate_category_score(model_info["scores"]["Error Handling"])

        row = {
            "Model": model_name,
            "Version": model_info["version"],
            "Type": model_info["type"],
            "Provider": model_info["provider"],
            "Average Score": error_handling_score,  # Using Error Handling score
            **category_scores
        }
        data.append(row)

    df = pd.DataFrame(data)
    return df.sort_values("Average Score", ascending=False)


def create_category_comparison():
    """Create a bar chart comparing all models across categories."""
    df = create_leaderboard_df()
    df_melted = df.melt(
        id_vars=["Model"],
        value_vars=list(CATEGORIES.keys()),
        var_name="Category",
        value_name="Score"
    )

    fig = px.bar(
        df_melted,
        x="Category",
        y="Score",
        color="Model",
        barmode="group",
        title="Model Performance by Category",
        range_y=[0, 1.0]
    )

    fig.update_layout(
        xaxis_title="Category",
        yaxis_title="Score",
        legend_title="Model",
        font=dict(size=14),
        title=dict(
            text="Model Performance by Category",
            x=0.5,
            y=0.95,
            xanchor='center',
            yanchor='top',
            font=dict(size=20)
        ),
        yaxis=dict(
            tickmode='array',
            ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
            tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
            gridcolor='rgba(0, 0, 0, 0.1)',
            zeroline=True,
            zerolinecolor='rgba(0, 0, 0, 0.2)',
            zerolinewidth=1
        ),
        xaxis=dict(
            tickangle=-45,
            gridcolor='rgba(0, 0, 0, 0.1)'
        ),
        bargap=0.2,
        bargroupgap=0.1,
        paper_bgcolor='rgba(255, 255, 255, 0.9)',
        plot_bgcolor='rgba(255, 255, 255, 0.9)',
        margin=dict(t=100, b=100, l=100, r=20),
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.02,
            bgcolor='rgba(255, 255, 255, 0.9)',
            bordercolor='rgba(0, 0, 0, 0.1)',
            borderwidth=1
        )
    )
    return fig


def create_combined_radar_chart():
    """Create a radar chart showing all models together."""
    try:
        categories = list(CATEGORIES.keys())

        # Define colors for each model
        colors = {
            "Ark II": "rgb(99, 110, 250)",            # Blue
            "Claude-3-5-Sonnet": "rgb(239, 85, 59)",  # Red
            "GPT-4o": "rgb(0, 204, 150)",             # Green
            "Ark I": "rgb(171, 99, 250)"              # Purple
        }

        fig = go.Figure()

        # Add a trace for each model
        for model_name, color in colors.items():
            model_data = MODELS[model_name]
            values = []
            for category in categories:
                metrics = {k: v for k, v in model_data["scores"][category].items() if k != 'Overall'}
                if category == "Error Handling":
                    values.append(metrics.get("Mean Accuracy", 0.0))
                else:
                    values.append(sum(metrics.values()) / len(metrics) if metrics else 0.0)

            fig.add_trace(go.Scatterpolar(
                r=values + [values[0]],
                theta=categories + [categories[0]],
                fill='none',
                line=dict(color=color, width=2),
                name=model_name
            ))

        # Update layout
        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1.0],
                    tickmode='array',
                    ticktext=['0%', '20%', '40%', '60%', '80%', '100%'],
                    tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
                    gridcolor='rgba(0, 0, 0, 0.1)',
                    linecolor='rgba(0, 0, 0, 0.1)'
                ),
                angularaxis=dict(
                    gridcolor='rgba(0, 0, 0, 0.1)',
                    linecolor='rgba(0, 0, 0, 0.1)'
                ),
                bgcolor='rgba(255, 255, 255, 0.9)'
            ),
            showlegend=True,
            paper_bgcolor='rgba(255, 255, 255, 0.9)',
            plot_bgcolor='rgba(255, 255, 255, 0.9)',
            title=dict(
                text="Model Performance Comparison",
                x=0.5,
                y=0.95,
                xanchor='center',
                yanchor='top',
                font=dict(size=20)
            ),
            legend=dict(
                yanchor="top",
                y=1,
                xanchor="left",
                x=1.02
            ),
            margin=dict(t=100, b=100, l=100, r=100)
        )
        return fig
    except Exception as e:
        print(f"Error creating radar chart: {str(e)}")
        return go.Figure()


def create_comparison_metrics_df(model_name):
    """Create a DataFrame showing detailed metrics with comparisons against the Ark II baseline."""
    base_model = "Ark II"
    data = []
    base_data = MODELS[base_model]["scores"]
    compare_data = MODELS[model_name]["scores"]

    for category in CATEGORIES.keys():
        base_metrics = {k: v for k, v in base_data[category].items() if k != 'Overall'}
        compare_metrics = {k: v for k, v in compare_data[category].items() if k != 'Overall'}

        for metric in base_metrics.keys():
            if metric in compare_metrics:
                base_value = base_metrics[metric]
                compare_value = compare_metrics[metric]
                diff = compare_value - base_value
                data.append({
                    "Category": category,
                    "Metric": metric,
                    f"{model_name} Score": compare_value,
                    f"{base_model} Score": base_value,
                    "Difference": diff,
                    "Better/Worse": "↑" if diff > 0 else "↓" if diff < 0 else "="
                })

    df = pd.DataFrame(data)
    return df


def update_model_details(model_name):
    """Update the detailed metrics view for a selected model."""
    try:
        df = create_comparison_metrics_df(model_name)
        return [df, create_combined_radar_chart()]
    except Exception as e:
        print(f"Error in update_model_details: {str(e)}")
        return [pd.DataFrame(), go.Figure()]


# Load the logo and embed it as an inline base64 data URI
def get_logo_html():
    logo_path = os.path.join(os.path.dirname(__file__), "jenesys.jpg")
    with open(logo_path, "rb") as f:
        encoded_logo = base64.b64encode(f.read()).decode()
    return f'<img src="data:image/jpeg;base64,{encoded_logo}" alt="Jenesys AI logo" style="height: 60px;">'
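
# Note (descriptive, not from the original source): Gradio maps the list returned by
# update_model_details to the callback's `outputs` in order, so the DataFrame feeds
# the comparison table and the Figure feeds the radar chart defined below.
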
# Create the Gradio interface
with gr.Blocks(title="AI Bookkeeper Leaderboard") as demo:
    gr.Markdown(f"""
<div style="display: flex; align-items: center; justify-content: center; gap: 15px;">
    {get_logo_html()}
    <h1 style="margin: 0;">AI Bookkeeper Leaderboard</h1>
</div>
""") gr.Markdown(f"Last updated: {datetime.now().strftime('%Y-%m-%d')}") gr.Markdown(""" ## About the Benchmark 📊 This benchmark evaluates Large Vision Language Models on their ability to process and understand bookkeeping documents across four main categories: 1. **Document Understanding (25%)**: Ability to parse and understand document structure 2. **Data Extraction (25%)**: Accuracy in extracting specific data points 3. **Bookkeeping Intelligence (25%)**: Understanding of bookkeeping concepts, calculations and general ledger accounting 4. **Error Handling (25%)**: Ability to detect and handle inconsistencies Each metric is scored from 0 to 1, where: - 0.90-1.00 = Excellent - 0.80-0.89 = Good - 0.70-0.79 = Acceptable - < 0.70 = Needs improvement """) with gr.Row(): leaderboard = gr.DataFrame( create_leaderboard_df(), label="Overall Leaderboard", height=200 ) with gr.Row(): with gr.Column(scale=1, min_width=1200): category_plot = gr.Plot( value=create_category_comparison() ) with gr.Row(): with gr.Column(scale=1): model_selector = gr.Dropdown( choices=[m for m in list(MODELS.keys()) if m != "Ark II"], label="Select Model to Compare with Ark II", value="Claude-3-5-Sonnet", interactive=True ) with gr.Row(): with gr.Column(scale=2): metrics_table = gr.DataFrame( create_comparison_metrics_df("Claude-3-5-Sonnet"), label="Comparison Metrics (vs Ark II)", height=400 ) with gr.Row(): with gr.Column(scale=1, min_width=1200): radar_chart = gr.Plot(value=create_combined_radar_chart()) # Update callback model_selector.change( fn=update_model_details, inputs=[model_selector], outputs=[metrics_table, radar_chart] ) if __name__ == "__main__": demo.launch(share=True)