Jacqueline Garrahan
committed on
Check in updates
Browse files
- app.py +6 -5
- src/display/utils.py +25 -20
- src/populate.py +5 -1
app.py
CHANGED
@@ -60,6 +60,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
 
 
 def init_leaderboard(dataframe):
+    eval_col_instance = AutoEvalColumn()
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
 
@@ -68,15 +69,15 @@ def init_leaderboard(dataframe):
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
         select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            default_selection=[c.name for c in fields(eval_col_instance) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(eval_col_instance) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        search_columns=[eval_col_instance.model.name, eval_col_instance.license.name],
+        hide_columns=[c.name for c in fields(eval_col_instance) if c.hidden],
         filter_columns=[
             ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, …
+                eval_col_instance.still_on_hub.name, type="boolean", label="External Providers", default=False
             ),
         ],
         bool_checkboxgroup_label="Hide models",
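Context for the app.py change: the repo's fields() helper in src/display/utils.py walks __dict__, and once the columns are declared with field(default_factory=...) their ColumnContent values only exist on an instance, not on the class, which is why init_leaderboard now builds eval_col_instance first. A minimal standalone sketch of that behavior (the Demo class is hypothetical and not part of this commit):

# Hedged sketch, not the commit's code: why the class must be instantiated
# before the custom fields() helper can see any column values.
from dataclasses import field, make_dataclass

def fields(raw_class):
    # Same helper as src/display/utils.py: every non-dunder attribute of __dict__.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

# Hypothetical stand-in for AutoEvalColumn with one default_factory column.
Demo = make_dataclass(
    "Demo",
    [("model", list, field(default_factory=lambda: ["Model"]))],
    frozen=True,
)

print(fields(Demo))    # [] -- the class __dict__ holds no column values
print(fields(Demo()))  # [['Model']] -- the instance __dict__ does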
src/display/utils.py
CHANGED
@@ -1,4 +1,4 @@
-from dataclasses import dataclass, make_dataclass
+from dataclasses import dataclass, make_dataclass, field
 from enum import Enum
 
 import pandas as pd
@@ -8,10 +8,7 @@ from src.about import Tasks
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
-
-# These classes are for user facing column names,
-# to avoid having to change them all around the code
-# when a modif is needed
+# Define ColumnContent class
 @dataclass
 class ColumnContent:
     name: str
@@ -20,23 +17,30 @@
     hidden: bool = False
     never_hidden: bool = False
 
-
-auto_eval_column_dict = [
-    …
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Aiera Score ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-
-auto_eval_column_dict.append(
-    …
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+# Define auto_eval_column_dict with correct structure
+auto_eval_column_dict = [
+    ("model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))),
+    ("org", ColumnContent, field(default_factory=lambda: ColumnContent("Organization", "str", True))),
+    ("average", ColumnContent, field(default_factory=lambda: ColumnContent("Aiera Score ⬆️", "number", True))),
+]
+
+# Add task-specific columns
+for task in Tasks:
+    auto_eval_column_dict.append(
+        (task.value.benchmark, ColumnContent, field(default_factory=lambda task=task: ColumnContent(task.value.col_name, "number", True)))
+    )
+
+# Add remaining columns
+auto_eval_column_dict.extend([
+    ("params", ColumnContent, field(default_factory=lambda: ColumnContent("#Params (B)", "number", False))),
+    ("still_on_hub", ColumnContent, field(default_factory=lambda: ColumnContent("Available on the hub", "bool", False))),
+    ("license", ColumnContent, field(default_factory=lambda: ColumnContent("License", "str", False))),
+])
+
+# Dynamically create the AutoEvalColumn dataclass
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -60,7 +64,8 @@ class ModelDetails:
     symbol: str = "" # emoji
 
 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+eval_col_instance = AutoEvalColumn()
+COLS = [c.name for c in fields(eval_col_instance) if not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
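A quick sketch of the make_dataclass pattern adopted above: each entry is an (attribute_name, type, field(default_factory=...)) triple, and the factories run only when AutoEvalColumn() is instantiated. The names below mirror the diff, but the snippet itself is illustrative rather than the repo's module:

# Hedged sketch of the runtime dataclass construction used in src/display/utils.py.
from dataclasses import dataclass, field, make_dataclass

@dataclass
class ColumnContent:  # same shape as in the diff above
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# Column specs as (attribute_name, type, field(default_factory=...)) triples.
columns = [
    ("model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))),
    ("average", ColumnContent, field(default_factory=lambda: ColumnContent("Aiera Score ⬆️", "number", True))),
]

# Frozen dataclass built at runtime; defaults materialize per instance.
AutoEvalColumn = make_dataclass("AutoEvalColumn", columns, frozen=True)

inst = AutoEvalColumn()
print(inst.model.name)    # Model
print(inst.average.type)  # number

The lambda task=task default argument in the diff's loop binds each Task at definition time, so the per-task factories do not all capture the final loop value.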
src/populate.py
CHANGED
@@ -10,13 +10,17 @@ from src.leaderboard.read_evals import get_raw_eval_results
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
+
+    auto_eval_instance = AutoEvalColumn()
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+
+    df = df.sort_values(by=[auto_eval_instance.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
+
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
     return df
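To illustrate the sort added in get_leaderboard_df: rows are ordered by the average column's display name ("Aiera Score ⬆️", i.e. auto_eval_instance.average.name) before the frame is trimmed to cols and rounded. A small sketch with made-up records:

# Hedged sketch; the records are fabricated for illustration only -- the real
# frame comes from get_raw_eval_results(...).to_dict() entries.
import pandas as pd

records = [
    {"Model": "model-a", "Aiera Score ⬆️": 71.237},
    {"Model": "model-b", "Aiera Score ⬆️": 84.912},
]
df = pd.DataFrame.from_records(records)
df = df.sort_values(by=["Aiera Score ⬆️"], ascending=False)  # auto_eval_instance.average.name
df = df.round(decimals=2)
print(df)  # model-b (84.91) now sorts above model-a (71.24)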