Check in all
- app.py +3 -2
- src/about.py +13 -5
- src/envs.py +4 -5
- src/leaderboard/read_evals.py +4 -5
app.py CHANGED
@@ -57,6 +57,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -76,12 +77,12 @@ def init_leaderboard(dataframe):
             ColumnFilter(
                 AutoEvalColumn.params.name,
                 type="slider",
-                min=0.
+                min=0.00,
                 max=150,
                 label="Select the number of parameters (B)",
             ),
             ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
             ),
         ],
         bool_checkboxgroup_label="Hide models",
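For context, these ColumnFilter entries live in the filter_columns list that init_leaderboard hands to the gradio_leaderboard Leaderboard component. Below is a minimal sketch of that surrounding function as it appears in the stock leaderboard template, with this commit's values filled in; the AutoEvalColumn/fields helpers and the exact keyword arguments are assumptions based on the template, not part of this diff.

# Sketch only: surrounding init_leaderboard from the stock template, with the
# values changed in this commit (min=0.00, default=False) filled in.
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

from src.display.utils import AutoEvalColumn, fields  # assumed template helpers


def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.00,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )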
src/about.py CHANGED
@@ -12,26 +12,34 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("aiera_speaker_assign", "accuracy,none", "SPEAKER_ID")
+    task1 = Task("aiera_transcript_sentiment", "accuracy,none","SENT")
+    task2 = Task("bbh_zeroshot_causal_judgement", "exact_match,flexible-extract","BBH0")
+    task3 = Task("flare_ectsum", "recall,none","ECTSUM")
+    task4 = Task("flare_edtsum", "rougeLsum,none","EDTSUM")
+    task5 = Task("flare_finqa", "exact_match_manual,none","FINQA")
+    task6 = Task("flare_fiqasa", "accuracy,none","FIQASA")
+    task7 = Task("flare_ner", "accuracy,none","NER")
+
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
 
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">Aiera Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+The Aiera Financial Leaderboard evaluates the performance of LLMs on a variety of tasks tailored to financial services.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 
+Proof something happenind
+
 ## Reproducibility
 To reproduce our results, here is the commands you can run:
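Each entry ties an lm-evaluation-harness task key and metric key to a leaderboard column. A minimal sketch of the Task container these lines rely on (defined just above this hunk in the template) and of how the enum is typically consumed follows; BENCHMARK_COLS is an illustrative name assumed from the template, not shown in this commit.

# Sketch only: the Task container assumed by the entries above, and one way the
# template derives the benchmark columns from the Tasks enum.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # task key in the results json, e.g. "flare_finqa"
    metric: str     # metric key in the results json, e.g. "exact_match_manual,none"
    col_name: str   # column name displayed on the leaderboard, e.g. "FINQA"


class Tasks(Enum):
    task5 = Task("flare_finqa", "exact_match_manual,none", "FINQA")
    task6 = Task("flare_fiqasa", "accuracy,none", "FIQASA")


# Adding task2..task7 is enough to surface the new benchmarks, because the
# display code iterates the enum rather than hard-coding columns:
BENCHMARK_COLS = [t.value.col_name for t in Tasks]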
src/envs.py CHANGED
@@ -5,13 +5,12 @@ from huggingface_hub import HfApi
 # Info to change for your repository
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
-
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "Aiera" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/
-RESULTS_REPO = f"{OWNER}/results"
+REPO_ID = f"{OWNER}/aiera-finance-leaderboard"
+QUEUE_REPO = f"{OWNER}/aiera-leaderboard-queue"
+RESULTS_REPO = f"{OWNER}/aiera-leaderboard-results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
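These repository IDs are what the Space syncs from at startup, so the queue and results datasets must exist under the Aiera org with the expected layout. A hedged sketch of how the template's app.py typically pulls them; the EVAL_REQUESTS_PATH and EVAL_RESULTS_PATH constants are assumed to come from the same envs module.

# Sketch only: startup sync as done in the stock template's app.py.
from huggingface_hub import snapshot_download

from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO, TOKEN

# Pull the pending-request queue and the published results datasets locally.
snapshot_download(
    repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset",
    etag_timeout=30, token=TOKEN,
)
snapshot_download(
    repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset",
    etag_timeout=30, token=TOKEN,
)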
src/leaderboard/read_evals.py CHANGED
@@ -93,7 +93,7 @@ class EvalResult:
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model
+        request_file = get_request_file_for_model(requests_path, self.full_model)
 
         try:
             with open(request_file, "r") as f:
@@ -105,7 +105,7 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model}
+            print(f"Could not find request file for {self.org}/{self.model}")
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -132,11 +132,11 @@ class EvalResult:
         return data_dict
 
 
-def get_request_file_for_model(requests_path, model_name
+def get_request_file_for_model(requests_path, model_name):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
         requests_path,
-        f"{model_name}
+        f"{model_name}*.json",
     )
     request_files = glob.glob(request_files)
 
@@ -148,7 +148,6 @@ def get_request_file_for_model(requests_path, model_name, precision):
             req_content = json.load(f)
             if (
                 req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
             ):
                 request_file = tmp_request_file
     return request_file
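Net effect: request files are now matched on model name alone and the precision check is dropped. A self-contained sketch of the simplified lookup, assuming request JSONs still carry a "status" field as in the template; the tie-breaking order among multiple matches is an assumption for illustration.

# Sketch only: the simplified request-file lookup after this commit
# (match on f"{model_name}*.json", keep a FINISHED run, no precision filter).
import glob
import json
import os


def get_request_file_for_model(requests_path, model_name):
    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED."""
    pattern = os.path.join(requests_path, f"{model_name}*.json")
    request_file = ""
    for tmp_request_file in sorted(glob.glob(pattern)):
        with open(tmp_request_file, "r") as f:
            req_content = json.load(f)
        if req_content["status"] in ["FINISHED"]:
            # Later files overwrite earlier ones; the last matching FINISHED run wins.
            request_file = tmp_request_file
    return request_file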