hysts HF Staff committed on
Commit
52de44b
·
1 Parent(s): f69892b
Files changed (2) hide show
  1. app.py +9 -5
  2. semantic_search.py +13 -6
app.py CHANGED
@@ -127,8 +127,9 @@ def update_df(
127
  except pl.exceptions.ComputeError as e:
128
  raise gr.Error(str(e)) from e
129
  else:
130
- paper_ids = semantic_search(search_query, candidate_pool_size, score_threshold)
131
- df = df.filter(pl.col("paper_id").is_in(paper_ids))
 
132
 
133
  if presentation_type != "(ALL)":
134
  df = df.filter(pl.col("Type").str.contains(presentation_type))
@@ -156,12 +157,15 @@ with gr.Blocks(css_paths="style.css") as demo:
156
  choices=["Semantic Search", "Title Search"],
157
  value="Semantic Search",
158
  show_label=False,
 
159
  )
160
  search_query = gr.Textbox(label="Search", submit_btn=True, show_label=False, placeholder="Enter query here")
161
- with gr.Accordion(label="Advanced Search Options", open=False, visible=True) as advanced_search_options:
162
  with gr.Row():
163
- candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=200, step=1, value=100)
164
- score_threshold = gr.Slider(label="Score Threshold", minimum=0, maximum=1, step=0.01, value=0.7)
 
 
165
 
166
  presentation_type = gr.Radio(
167
  label="Presentation Type",
 
127
  except pl.exceptions.ComputeError as e:
128
  raise gr.Error(str(e)) from e
129
  else:
130
+ paper_ids, scores = semantic_search(search_query, candidate_pool_size, score_threshold)
131
+ df = pl.DataFrame({"paper_id": paper_ids, "score": scores}).join(df, on="paper_id", how="inner")
132
+ df = df.sort("score", descending=True).drop("score")
133
 
134
  if presentation_type != "(ALL)":
135
  df = df.filter(pl.col("Type").str.contains(presentation_type))
 
157
  choices=["Semantic Search", "Title Search"],
158
  value="Semantic Search",
159
  show_label=False,
160
+ info="Note: Semantic search consumes your ZeroGPU quota.",
161
  )
162
  search_query = gr.Textbox(label="Search", submit_btn=True, show_label=False, placeholder="Enter query here")
163
+ with gr.Accordion(label="Advanced Search Options", open=False) as advanced_search_options:
164
  with gr.Row():
165
+ candidate_pool_size = gr.Slider(
166
+ label="Candidate Pool Size", minimum=1, maximum=1000, step=1, value=300
167
+ )
168
+ score_threshold = gr.Slider(label="Score Threshold", minimum=0, maximum=1, step=0.01, value=0.5)
169
 
170
  presentation_type = gr.Radio(
171
  label="Presentation Type",
semantic_search.py CHANGED
@@ -16,7 +16,9 @@ reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
16
 
17
 
18
  @spaces.GPU(duration=5)
19
- def semantic_search(query: str, candidate_pool_size: int = 100, score_threshold: float = 0.7) -> list[int]:
 
 
20
  query_vec = model.encode(query)
21
  _, retrieved_data = ds.get_nearest_examples("embedding", query_vec, k=candidate_pool_size)
22
 
@@ -27,8 +29,13 @@ def semantic_search(query: str, candidate_pool_size: int = 100, score_threshold:
27
  rerank_scores = reranker.predict(rerank_inputs)
28
  sorted_indices = np.argsort(rerank_scores)[::-1]
29
 
30
- return [
31
- retrieved_data["paper_id"][i]
32
- for i in sorted_indices
33
- if scipy.special.expit(rerank_scores[i]) >= score_threshold
34
- ]
 
 
 
 
 
 
16
 
17
 
18
  @spaces.GPU(duration=5)
19
+ def semantic_search(
20
+ query: str, candidate_pool_size: int = 300, score_threshold: float = 0.5
21
+ ) -> tuple[list[int], list[float]]:
22
  query_vec = model.encode(query)
23
  _, retrieved_data = ds.get_nearest_examples("embedding", query_vec, k=candidate_pool_size)
24
 
 
29
  rerank_scores = reranker.predict(rerank_inputs)
30
  sorted_indices = np.argsort(rerank_scores)[::-1]
31
 
32
+ paper_ids = []
33
+ scores = []
34
+ for i in sorted_indices:
35
+ score = float(scipy.special.expit(rerank_scores[i]))
36
+ if score < score_threshold:
37
+ break
38
+ paper_ids.append(retrieved_data["paper_id"][i])
39
+ scores.append(score)
40
+
41
+ return paper_ids, scores