jrg-123 committed
Commit c02e9eb · 1 Parent(s): efc4686

Check in all

Files changed (4)
  1. app.py +3 -2
  2. src/about.py +13 -5
  3. src/envs.py +4 -5
  4. src/leaderboard/read_evals.py +4 -5
app.py CHANGED
@@ -57,6 +57,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -76,12 +77,12 @@ def init_leaderboard(dataframe):
             ColumnFilter(
                 AutoEvalColumn.params.name,
                 type="slider",
-                min=0.01,
+                min=0.00,
                 max=150,
                 label="Select the number of parameters (B)",
             ),
             ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
             ),
         ],
         bool_checkboxgroup_label="Hide models",
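
Note on the two filter changes above: lowering the slider floor to 0.00 and defaulting the Deleted/incomplete checkbox to off both make more rows visible out of the box. A minimal sketch of the slider effect, assuming a pandas dataframe with a params column named "#Params (B)" like the template's (the column name and sample rows are illustrative, not taken from this repo):

import pandas as pd

# Two hypothetical leaderboard rows; a model with an unparsed size often ends up with 0 params.
df = pd.DataFrame({"model": ["small-model", "unknown-size"], "#Params (B)": [1.1, 0.0]})

old_view = df[df["#Params (B)"] >= 0.01]  # previous slider floor: hides the 0-param row
new_view = df[df["#Params (B)"] >= 0.00]  # new slider floor: keeps both rows
print(len(old_view), len(new_view))       # 1 2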
src/about.py CHANGED
@@ -12,26 +12,34 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("aiera_speaker_assign", "accuracy,none", "SPEAKER_ID")
+    task1 = Task("aiera_transcript_sentiment", "accuracy,none","SENT")
+    task2 = Task("bbh_zeroshot_causal_judgement", "exact_match,flexible-extract","BBH0")
+    task3 = Task("flare_ectsum", "recall,none","ECTSUM")
+    task4 = Task("flare_edtsum", "rougeLsum,none","EDTSUM")
+    task5 = Task("flare_finqa", "exact_match_manual,none","FINQA")
+    task6 = Task("flare_fiqasa", "accuracy,none","FIQASA")
+    task7 = Task("flare_ner", "accuracy,none","NER")
+
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
 
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">Aiera Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+The Aiera Financial Leaderboard evaluates the performance of LLMs on a variety of tasks tailored to financial services.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 
+Proof something happenind
+
 ## Reproducibility
 To reproduce our results, here is the commands you can run:
 
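Note on the new Tasks entries: each Task pairs a task key and a metric key from the evaluation results JSON with the column name shown on the leaderboard. A minimal sketch of how such entries are typically consumed, assuming the stock template's Task dataclass fields (benchmark, metric, col_name); the sample results dict is made up:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # task key in the results json
    metric: str     # metric key in the results json
    col_name: str   # column name displayed on the leaderboard

class Tasks(Enum):
    task0 = Task("aiera_speaker_assign", "accuracy,none", "SPEAKER_ID")
    task3 = Task("flare_ectsum", "recall,none", "ECTSUM")

# Illustrative results blob in the shape the lookups expect.
sample_results = {
    "aiera_speaker_assign": {"accuracy,none": 0.81},
    "flare_ectsum": {"recall,none": 0.37},
}

row = {t.value.col_name: sample_results[t.value.benchmark][t.value.metric] for t in Tasks}
print(row)  # {'SPEAKER_ID': 0.81, 'ECTSUM': 0.37}
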
src/envs.py CHANGED
@@ -5,13 +5,12 @@ from huggingface_hub import HfApi
 # Info to change for your repository
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
-
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "Aiera" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+REPO_ID = f"{OWNER}/aiera-finance-leaderboard"
+QUEUE_REPO = f"{OWNER}/aiera-leaderboard-queue"
+RESULTS_REPO = f"{OWNER}/aiera-leaderboard-results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
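
Note on the repo constants: in the stock template these feed snapshot_download calls in app.py that mirror the queue and results datasets locally before the dataframes are built. A rough sketch under that assumption (local paths below are illustrative):

import os
from huggingface_hub import snapshot_download

TOKEN = os.environ.get("HF_TOKEN")
OWNER = "Aiera"
QUEUE_REPO = f"{OWNER}/aiera-leaderboard-queue"
RESULTS_REPO = f"{OWNER}/aiera-leaderboard-results"

CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")

# Pull the request queue and results datasets locally so read_evals.py can glob them.
snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=TOKEN)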
src/leaderboard/read_evals.py CHANGED
@@ -93,7 +93,7 @@ class EvalResult:
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+        request_file = get_request_file_for_model(requests_path, self.full_model)
 
         try:
             with open(request_file, "r") as f:
@@ -105,7 +105,7 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+            print(f"Could not find request file for {self.org}/{self.model}")
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -132,11 +132,11 @@ class EvalResult:
         return data_dict
 
 
-def get_request_file_for_model(requests_path, model_name, precision):
+def get_request_file_for_model(requests_path, model_name):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
         requests_path,
-        f"{model_name}_eval_request_*.json",
+        f"{model_name}*.json",
     )
     request_files = glob.glob(request_files)
 
@@ -148,7 +148,6 @@ def get_request_file_for_model(requests_path, model_name, precision):
             req_content = json.load(f)
             if (
                 req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
             ):
                 request_file = tmp_request_file
     return request_file
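
Note on the relaxed lookup: dropping the precision argument also broadens the glob from "{model_name}_eval_request_*.json" to "{model_name}*.json", so any JSON whose name merely starts with the model id will match, and the last FINISHED request found wins. A small illustration of the pattern difference (filenames are made up):

import fnmatch

files = [
    "org/model_eval_request_False_float16.json",
    "org/model_notes.json",
]
print(fnmatch.filter(files, "org/model_eval_request_*.json"))  # old pattern: eval request file only
print(fnmatch.filter(files, "org/model*.json"))                # new pattern: both files match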