Martin Jurkovic committed
Commit 370d5a0 · 1 Parent(s): 385e405

Add single column leaderboard

Files changed (4)
  1. app.py +29 -3
  2. src/about.py +18 -0
  3. src/display/utils.py +16 -1
  4. src/populate.py +35 -3
app.py CHANGED
@@ -20,6 +20,7 @@ from src.display.utils import (
     EVAL_TYPES,
     AutoEvalColumn,
     singletable_AutoEvalColumn,
+    singlecolumn_AutoEvalColumn,
     ModelType,
     fields,
     # WeightType,
@@ -50,7 +51,7 @@ except Exception:
     restart_space()


-SINGLETABLE_LEADERBOARD_DF, MULTITABLE_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+SINGLECOLUMN_LEADERBOARD_DF, SINGLETABLE_LEADERBOARD_DF, MULTITABLE_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)

 (
     finished_eval_queue_df,
@@ -111,6 +112,28 @@ def init_singletable_leaderboard(dataframe):
         interactive=False,
     )

+def init_singlecolumn_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(singlecolumn_AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(singlecolumn_AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(singlecolumn_AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[singlecolumn_AutoEvalColumn.model.name],  # AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(singlecolumn_AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(singlecolumn_AutoEvalColumn.dataset.name, type="checkboxgroup", label="Datasets"),
+            ColumnFilter(singlecolumn_AutoEvalColumn.table.name, type="checkboxgroup", label="Tables"),
+            ColumnFilter(singlecolumn_AutoEvalColumn.model.name, type="checkboxgroup", label="Models"),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+

 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -124,12 +147,15 @@ with demo:
         with gr.TabItem("🏅 SingleTable", elem_id="syntherela-benchmark-tab-table", id=1):
             singletable_leaderboard = init_singletable_leaderboard(SINGLETABLE_LEADERBOARD_DF)

+        with gr.TabItem("🏅 SingleColumn", elem_id="syntherela-benchmark-tab-table", id=2):
+            singlecolumn_leaderboard = init_singlecolumn_leaderboard(SINGLECOLUMN_LEADERBOARD_DF)
+


-        with gr.TabItem("📝 About", elem_id="syntherela-benchmark-tab-table", id=2):
+        with gr.TabItem("📝 About", elem_id="syntherela-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("🚀 Submit here! ", elem_id="syntherela-benchmark-tab-table", id=3):
+        with gr.TabItem("🚀 Submit here! ", elem_id="syntherela-benchmark-tab-table", id=4):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
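Note: init_singlecolumn_leaderboard mirrors the existing init_singletable_leaderboard helper, so it relies on components app.py already imports rather than anything new in this commit. A minimal sketch of the assumed imports (standard gradio_leaderboard usage, not part of this diff):

import gradio as gr
# Leaderboard, ColumnFilter and SelectColumns come from the gradio_leaderboard package
# used by the Hugging Face leaderboard template; this commit only adds a new caller.
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns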
src/about.py CHANGED
@@ -27,6 +27,24 @@ class SingleTableTasks(Enum):
     # SingleTableDetection-XGBClassifier
     task_3 = Task("single-table", "SingleTableDetection-XGBClassifier", "SingleTableDetection-XGBClassifier ⬇️")

+class SingleColumnTasks(Enum):
+    # ChiSquareTest
+    task_0 = Task("single-column", "ChiSquareTest", "ChiSquareTest ⬇️")
+    # HellingerDistance
+    task_1 = Task("single-column", "HellingerDistance", "HellingerDistance ⬇️")
+    # JensenShannonDistance
+    task_2 = Task("single-column", "JensenShannonDistance", "JensenShannonDistance ⬇️")
+    # KolmogorovSmirnovTest
+    task_3 = Task("single-column", "KolmogorovSmirnovTest", "KolmogorovSmirnovTest ⬇️")
+    # SingleColumnDetection-LogisticRegression
+    task_4 = Task("single-column", "SingleColumnDetection-LogisticRegression", "SingleColumnDetection-LogisticRegression ⬇️")
+    # SingleColumnDetection-XGBClassifier
+    task_5 = Task("single-column", "SingleColumnDetection-XGBClassifier", "SingleColumnDetection-XGBClassifier ⬇️")
+    # TotalVariationDistance
+    task_6 = Task("single-column", "TotalVariationDistance", "TotalVariationDistance ⬇️")
+    # WassersteinDistance
+    task_7 = Task("single-column", "WassersteinDistance", "WassersteinDistance ⬇️")
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
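Each enum member wraps a Task whose third field, col_name, is what src/display/utils.py and src/populate.py later read via task.value.col_name. A minimal sketch of the assumed Task definition (it lives earlier in src/about.py in the standard leaderboard template and is not shown in this diff):

from dataclasses import dataclass

@dataclass
class Task:
    benchmark: str  # metric group, e.g. "single-column"
    metric: str     # raw metric key, e.g. "ChiSquareTest"
    col_name: str   # leaderboard column header, e.g. "ChiSquareTest ⬇️"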
 
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum

 import pandas as pd

-from src.about import Tasks, SingleTableTasks
+from src.about import Tasks, SingleTableTasks, SingleColumnTasks

 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -55,6 +55,21 @@ for task in SingleTableTasks:

 singletable_AutoEvalColumn = make_dataclass("AutoEvalColumn", singletable_auto_eval_column_dict, frozen=True)

+# SINGLE COLUMN
+
+singlecolumn_auto_eval_column_dict = []
+# Init
+singlecolumn_auto_eval_column_dict.append(["dataset", ColumnContent, ColumnContent("Dataset", "str", True, never_hidden=True)])
+singlecolumn_auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+singlecolumn_auto_eval_column_dict.append(["table", ColumnContent, ColumnContent("Table", "str", True, never_hidden=True)])
+# Scores
+for task in SingleColumnTasks:
+    singlecolumn_auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
+singlecolumn_AutoEvalColumn = make_dataclass("AutoEvalColumn", singlecolumn_auto_eval_column_dict, frozen=True)
+
+
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
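For reference, each entry appended above becomes a field of the generated frozen dataclass, with a ColumnContent instance as its default; fields(singlecolumn_AutoEvalColumn) therefore yields those ColumnContent objects, so e.g. singlecolumn_AutoEvalColumn.table.name resolves to "Table" in app.py's filters. A hedged sketch of the assumed ColumnContent definition (defined earlier in utils.py in the standard leaderboard template, not in this diff):

from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str                   # header shown in the leaderboard
    type: str                   # "str", "markdown" or "number"
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False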
src/populate.py CHANGED
@@ -7,7 +7,7 @@ import numpy as np
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
-from src.about import Tasks, SingleTableTasks
+from src.about import Tasks, SingleTableTasks, SingleColumnTasks


 # def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
@@ -43,12 +43,13 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
         all_data_json.append(data)

     multi_table_metrics = [task.value.col_name for task in Tasks]
-
     single_table_metrics = [task.value.col_name for task in SingleTableTasks]
+    single_column_metrics = [task.value.col_name for task in SingleColumnTasks]

     # create empty dataframe with the columns multi_table_metrics
     multitable_df = pd.DataFrame(columns=["Dataset", "Model"] + multi_table_metrics)
     singletable_df = pd.DataFrame(columns=["Dataset", "Model"] + single_table_metrics)
+    singlecolumn_df = pd.DataFrame(columns=["Dataset", "Table", "Model"] + single_column_metrics)

     # iterate through all json files and add the data to the dataframe
     for data in all_data_json:
@@ -86,7 +87,38 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
                 singletable_row[metric] = np.nan
         singletable_df = pd.concat([singletable_df, pd.DataFrame([singletable_row])], ignore_index=True)

-    return singletable_df, multitable_df
+        singlecolumn_row = {"Dataset": dataset, "Model": model, "Table": ""}
+        # insert row
+        for metric in single_column_metrics:
+            stripped_metric = strip_emoji(metric)
+            if stripped_metric in data["single_column_metrics"]:
+                for table in data["single_column_metrics"][stripped_metric].keys():
+                    # check if row where dataset = dataset, model = model, table = table exists
+                    if singlecolumn_df[
+                        (singlecolumn_df["Dataset"] == dataset) &
+                        (singlecolumn_df["Model"] == model) &
+                        (singlecolumn_df["Table"] == table)
+                    ].empty:
+                        singlecolumn_row = {"Dataset": dataset, "Model": model, "Table": table}
+                        singlecolumn_df = pd.concat([singlecolumn_df, pd.DataFrame([singlecolumn_row])], ignore_index=True)

+                    metric_values = []
+                    for column in data["single_column_metrics"][stripped_metric][table].keys():
+                        if "accuracy" in data["single_column_metrics"][stripped_metric][table][column]:
+                            metric_values.append(data["single_column_metrics"][stripped_metric][table][column]["accuracy"])
+                        if "value" in data["single_column_metrics"][stripped_metric][table][column]:
+                            metric_values.append(data["single_column_metrics"][stripped_metric][table][column]["value"])
+                        if "statistic" in data["single_column_metrics"][stripped_metric][table][column]:
+                            metric_values.append(data["single_column_metrics"][stripped_metric][table][column]["statistic"])

+                    # save np.mean(metric_values).round(decimals=2) to singlecolumn_df where dataset = dataset, model = model, table = table
+                    singlecolumn_df.loc[
+                        (singlecolumn_df["Dataset"] == dataset) &
+                        (singlecolumn_df["Model"] == model) &
+                        (singlecolumn_df["Table"] == table), metric] = np.mean(metric_values).round(decimals=2)
+
+
+    return singlecolumn_df, singletable_df, multitable_df


 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
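To make the nesting the new loop walks more concrete, here is a hedged, hypothetical fragment of a results JSON (the key layout follows the code above; the metric, table, column names and numbers are made up for illustration):

import numpy as np

data = {
    "single_column_metrics": {
        "ChiSquareTest": {                  # stripped metric name (emoji suffix removed)
            "customer": {                   # table name
                "age": {"value": 0.87},     # one entry per column
                "gender": {"value": 0.91},
            }
        }
    }
}

# The leaderboard cell for ("ChiSquareTest ⬇️", table "customer") becomes the rounded mean
# of every "accuracy" / "value" / "statistic" found across that table's columns:
values = [v["value"] for v in data["single_column_metrics"]["ChiSquareTest"]["customer"].values()]
print(np.mean(values).round(decimals=2))  # 0.89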