Martin Jurkovic committed · 370d5a0
Parent(s): 385e405

Add single column leaderboard

Files changed:
- app.py +29 -3
- src/about.py +18 -0
- src/display/utils.py +16 -1
- src/populate.py +35 -3
app.py CHANGED

@@ -20,6 +20,7 @@ from src.display.utils import (
     EVAL_TYPES,
     AutoEvalColumn,
     singletable_AutoEvalColumn,
+    singlecolumn_AutoEvalColumn,
     ModelType,
     fields,
     # WeightType,
@@ -50,7 +51,7 @@ except Exception:
     restart_space()
 
 
-SINGLETABLE_LEADERBOARD_DF, MULTITABLE_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+SINGLECOLUMN_LEADERBOARD_DF, SINGLETABLE_LEADERBOARD_DF, MULTITABLE_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
 
 (
     finished_eval_queue_df,
@@ -111,6 +112,28 @@ def init_singletable_leaderboard(dataframe):
         interactive=False,
     )
 
+def init_singlecolumn_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(singlecolumn_AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(singlecolumn_AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(singlecolumn_AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[singlecolumn_AutoEvalColumn.model.name],  # AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(singlecolumn_AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(singlecolumn_AutoEvalColumn.dataset.name, type="checkboxgroup", label="Datasets"),
+            ColumnFilter(singlecolumn_AutoEvalColumn.table.name, type="checkboxgroup", label="Tables"),
+            ColumnFilter(singlecolumn_AutoEvalColumn.model.name, type="checkboxgroup", label="Models"),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -124,12 +147,15 @@ with demo:
         with gr.TabItem("🏅 SingleTable", elem_id="syntherela-benchmark-tab-table", id=1):
             singletable_leaderboard = init_singletable_leaderboard(SINGLETABLE_LEADERBOARD_DF)
 
+        with gr.TabItem("🏅 SingleColumn", elem_id="syntherela-benchmark-tab-table", id=2):
+            singlecolumn_leaderboard = init_singlecolumn_leaderboard(SINGLECOLUMN_LEADERBOARD_DF)
+
 
 
-        with gr.TabItem("📝 About", elem_id="syntherela-benchmark-tab-table", id=2):
+        with gr.TabItem("📝 About", elem_id="syntherela-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="syntherela-benchmark-tab-table", id=3):
+        with gr.TabItem("🚀 Submit here! ", elem_id="syntherela-benchmark-tab-table", id=4):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
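
Note (not part of the commit): the new init_singlecolumn_leaderboard mirrors init_singletable_leaderboard, and every Leaderboard argument is derived from the column metadata added to src/display/utils.py below. A minimal sketch of what those comprehensions evaluate to, assuming the repo layout shown in this commit:

# Illustrative sketch only; run from the Space's repo root.
from src.display.utils import fields, singlecolumn_AutoEvalColumn

# Gradio column datatypes, one per column: Dataset/Table are "str",
# Model is "markdown", and every single-column metric is "number".
print([c.type for c in fields(singlecolumn_AutoEvalColumn)])

# Columns that can never be hidden in the SingleColumn tab: Dataset, Model, Table.
print([c.name for c in fields(singlecolumn_AutoEvalColumn) if c.never_hidden])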
src/about.py CHANGED

@@ -27,6 +27,24 @@ class SingleTableTasks(Enum):
     # SingleTableDetection-XGBClassifier
     task_3 = Task("single-table", "SingleTableDetection-XGBClassifier", "SingleTableDetection-XGBClassifier ⬇️")
 
+class SingleColumnTasks(Enum):
+    # ChiSquareTest
+    task_0 = Task("single-column", "ChiSquareTest", "ChiSquareTest ⬇️")
+    # HellingerDistance
+    task_1 = Task("single-column", "HellingerDistance", "HellingerDistance ⬇️")
+    # JensenShannonDistance
+    task_2 = Task("single-column", "JensenShannonDistance", "JensenShannonDistance ⬇️")
+    # KolmogorovSmirnovTest
+    task_3 = Task("single-column", "KolmogorovSmirnovTest", "KolmogorovSmirnovTest ⬇️")
+    # SingleColumnDetection-LogisticRegression
+    task_4 = Task("single-column", "SingleColumnDetection-LogisticRegression", "SingleColumnDetection-LogisticRegression ⬇️")
+    # SingleColumnDetection-XGBClassifier
+    task_5 = Task("single-column", "SingleColumnDetection-XGBClassifier", "SingleColumnDetection-XGBClassifier ⬇️")
+    # TotalVariationDistance
+    task_6 = Task("single-column", "TotalVariationDistance", "TotalVariationDistance ⬇️")
+    # WassersteinDistance
+    task_7 = Task("single-column", "WassersteinDistance", "WassersteinDistance ⬇️")
+
 NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
 
src/display/utils.py CHANGED

@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks, SingleTableTasks
+from src.about import Tasks, SingleTableTasks, SingleColumnTasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -55,6 +55,21 @@ for task in SingleTableTasks:
 
 singletable_AutoEvalColumn = make_dataclass("AutoEvalColumn", singletable_auto_eval_column_dict, frozen=True)
 
+# SINGLE COLUMN
+
+singlecolumn_auto_eval_column_dict = []
+# Init
+singlecolumn_auto_eval_column_dict.append(["dataset", ColumnContent, ColumnContent("Dataset", "str", True, never_hidden=True)])
+singlecolumn_auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+singlecolumn_auto_eval_column_dict.append(["table", ColumnContent, ColumnContent("Table", "str", True, never_hidden=True)])
+# Scores
+for task in SingleColumnTasks:
+    singlecolumn_auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
+singlecolumn_AutoEvalColumn = make_dataclass("AutoEvalColumn", singlecolumn_auto_eval_column_dict, frozen=True)
+
+
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
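
Note (not part of the commit): the make_dataclass pattern above turns the list of [field_name, type, default] triples into a frozen class whose class attributes hold the column metadata, and the module's fields() helper then iterates those attributes. A self-contained sketch, with ColumnContent approximated from how it is called in this diff (its real definition sits earlier in src/display/utils.py):

# Self-contained approximation; ColumnContent's real definition is outside this diff.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)  # frozen only to keep this sketch portable across Python versions
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

spec = [
    ["dataset", ColumnContent, ColumnContent("Dataset", "str", True, never_hidden=True)],
    ["task_0", ColumnContent, ColumnContent("ChiSquareTest ⬇️", "number", True)],
]
DemoAutoEvalColumn = make_dataclass("AutoEvalColumn", spec, frozen=True)

# app.py reads the metadata as class attributes, e.g. for search/filter columns:
print(DemoAutoEvalColumn.dataset.name)  # Dataset
print(DemoAutoEvalColumn.task_0.type)   # number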
src/populate.py CHANGED

@@ -7,7 +7,7 @@ import numpy as np
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
-from src.about import Tasks, SingleTableTasks
+from src.about import Tasks, SingleTableTasks, SingleColumnTasks
 
 
 # def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
@@ -43,12 +43,13 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
         all_data_json.append(data)
 
     multi_table_metrics = [task.value.col_name for task in Tasks]
-
     single_table_metrics = [task.value.col_name for task in SingleTableTasks]
+    single_column_metrics = [task.value.col_name for task in SingleColumnTasks]
 
     # create empty dataframe with the columns multi_table_metrics
     multitable_df = pd.DataFrame(columns=["Dataset", "Model"] + multi_table_metrics)
     singletable_df = pd.DataFrame(columns=["Dataset", "Model"] + single_table_metrics)
+    singlecolumn_df = pd.DataFrame(columns=["Dataset", "Table", "Model"] + single_column_metrics)
 
     # iterate through all json files and add the data to the dataframe
     for data in all_data_json:
@@ -86,7 +87,38 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
                 singletable_row[metric] = np.nan
         singletable_df = pd.concat([singletable_df, pd.DataFrame([singletable_row])], ignore_index=True)
 
-    return singletable_df, multitable_df
+        singlecolumn_row = {"Dataset": dataset, "Model": model, "Table": ""}
+        # insert row
+        for metric in single_column_metrics:
+            stripped_metric = strip_emoji(metric)
+            if stripped_metric in data["single_column_metrics"]:
+                for table in data["single_column_metrics"][stripped_metric].keys():
+                    # check if row where dataset = dataset, model = model, table = table exists
+                    if singlecolumn_df[
+                        (singlecolumn_df["Dataset"] == dataset) &
+                        (singlecolumn_df["Model"] == model) &
+                        (singlecolumn_df["Table"] == table)
+                    ].empty:
+                        singlecolumn_row = {"Dataset": dataset, "Model": model, "Table": table}
+                        singlecolumn_df = pd.concat([singlecolumn_df, pd.DataFrame([singlecolumn_row])], ignore_index=True)
+
+                    metric_values = []
+                    for column in data["single_column_metrics"][stripped_metric][table].keys():
+                        if "accuracy" in data["single_column_metrics"][stripped_metric][table][column]:
+                            metric_values.append(data["single_column_metrics"][stripped_metric][table][column]["accuracy"])
+                        if "value" in data["single_column_metrics"][stripped_metric][table][column]:
+                            metric_values.append(data["single_column_metrics"][stripped_metric][table][column]["value"])
+                        if "statistic" in data["single_column_metrics"][stripped_metric][table][column]:
+                            metric_values.append(data["single_column_metrics"][stripped_metric][table][column]["statistic"])
+
+                    # save np.mean(metric_values).round(decimals=2) to singlecolumn_df where dataset = dataset, model = model, table = table
+                    singlecolumn_df.loc[
+                        (singlecolumn_df["Dataset"] == dataset) &
+                        (singlecolumn_df["Model"] == model) &
+                        (singlecolumn_df["Table"] == table), metric] = np.mean(metric_values).round(decimals=2)
+
+
+    return singlecolumn_df, singletable_df, multitable_df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
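
Note (not part of the commit): the new block leans on two things that live outside this diff. strip_emoji, defined elsewhere in src/populate.py and already used by the single-table code, maps a display column name back to the raw metric key, and each results JSON carries a nested single_column_metrics mapping of metric -> table -> column -> score. A hedged sketch of both, with the layout inferred from the indexing above; everything except the single_column_metrics / accuracy / value / statistic keys is made-up sample data:

# Illustrative only. strip_emoji's real definition is elsewhere in src/populate.py;
# this stand-in assumes the display name differs from the metric key only by the
# trailing " ⬇️" marker.
def strip_emoji(col_name: str) -> str:
    return col_name.replace("⬇️", "").strip()

data = {
    "single_column_metrics": {
        "ChiSquareTest": {                  # metric key (after strip_emoji)
            "users": {                      # table
                "age": {"value": 0.12},         # one entry per column; the loop
                "country": {"statistic": 0.08}  # averages accuracy/value/statistic
            }
        }
    }
}

metric = "ChiSquareTest ⬇️"
table_scores = data["single_column_metrics"][strip_emoji(metric)]["users"]
values = [v for col in table_scores.values() for k, v in col.items()
          if k in ("accuracy", "value", "statistic")]
print(sum(values) / len(values))  # ~0.1, roughly what gets written into singlecolumn_df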