# Web-page header residue captured with this file ("Spaces: Sleeping"); not part of the program.
import gradio as gr
import rapidfuzz
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
# Corpus setup: runs once at import time so every request reuses the same data.
dataset = load_dataset("furkanunluturk/turkce-atasozleri")["train"]

# Materialise the column as a plain list so later integer indexing and
# encoding do not depend on the container type `datasets` returns.
proverbs = list(dataset["text"])

# First listed definition of each proverb: shown in the result tables and
# used as the corpus for the fuzzy definition search.
definitions = [definition[0]["text"] for definition in dataset["all_definitions"]]

# All definitions of a proverb joined into one string: used only as the text
# that gets embedded for the semantic definition search.
combined_definitions = [
    " ".join(definition["text"] for definition in item["all_definitions"])
    for item in dataset
]

model = SentenceTransformer("emrecan/bert-base-turkish-cased-mean-nli-stsb-tr")

# Pre-compute both corpus embeddings up front; queries only encode the input.
definition_embeddings = model.encode(combined_definitions, convert_to_tensor=True)
proverb_embeddings = model.encode(proverbs, convert_to_tensor=True)
def embedding_search(input_text, embeddings, top_x):
    """Return the `top_x` corpus entries semantically closest to `input_text`.

    Args:
        input_text: Query string; encoded with the module-level `model`.
        embeddings: Pre-computed corpus embeddings to search. Row *i* must
            correspond to `proverbs[i]` / `definitions[i]`.
        top_x: Maximum number of hits to return.

    Returns:
        A list of `(proverb, first_definition, score)` tuples in the order
        produced by `util.semantic_search`.
    """
    query_embedding = model.encode(input_text, convert_to_tensor=True)
    # `int(...)` guards against the UI delivering the slider value as a float.
    # A single query is passed, so only the first result list is relevant.
    hits = util.semantic_search(query_embedding, embeddings, top_k=int(top_x))[0]
    return [
        (
            proverbs[hit["corpus_id"]],
            definitions[hit["corpus_id"]],
            hit["score"],
        )
        for hit in hits
    ]
def fuzzy_search(input_text, corpus, top_x):
    """Return the `top_x` entries of `corpus` that best fuzzy-match `input_text`.

    Args:
        input_text: Query string.
        corpus: Sequence of strings to match against. Position *i* must
            correspond to `proverbs[i]` / `definitions[i]`.
        top_x: Maximum number of matches to return.

    Returns:
        A list of `(proverb, first_definition, score)` tuples, where `score`
        is the value reported by `rapidfuzz.process.extract`.
    """

    def turkish_process(text):
        # Map ASCII capital "I" to dotless "ı" before the default processor
        # normalises the text, so it is not folded into a dotted "i".
        return rapidfuzz.utils.default_process(text.replace("I", "ı"))

    matches = rapidfuzz.process.extract(
        query=input_text,
        choices=corpus,
        # `int(...)` guards against the UI delivering the slider value as a float.
        limit=int(top_x),
        processor=turkish_process,
    )
    # Each match is (choice, score, index); the index selects the row to show.
    return [
        (proverbs[index], definitions[index], score)
        for _choice, score, index in matches
    ]
# Combined function to return all types of recommendations
def recommend_proverbs(input_text, top_x):
    """Run all four searches for `input_text` and return their raw results.

    Returns:
        A 4-tuple of result lists, in this order:
        (embedding search over definitions, embedding search over proverbs,
        fuzzy search over definitions, fuzzy search over proverbs).
        Each list holds `(proverb, definition, score)` tuples.
    """
    return (
        embedding_search(input_text, definition_embeddings, top_x),
        embedding_search(input_text, proverb_embeddings, top_x),
        fuzzy_search(input_text, definitions, top_x),
        fuzzy_search(input_text, proverbs, top_x),
    )
# Format results for display
def format_results(results):
    """Convert `(proverb, definition, score)` tuples into table rows.

    Float scores are rendered as strings with four decimal places; any other
    score value is passed through unchanged.

    Args:
        results: Iterable of `(proverb, definition, score)` triples.

    Returns:
        A list of `[proverb, definition, score]` lists, one per input triple.
    """
    return [
        [proverb, definition, f"{score:.4f}" if isinstance(score, float) else score]
        for proverb, definition, score in results
    ]
def search_proverbs(input_text, top_x):
    """Gradio callback: run every search and return display-ready tables.

    Returns:
        A 4-tuple of formatted row lists in the order of the `outputs`
        components: embedding (definition), fuzzy (definition),
        embedding (proverb), fuzzy (proverb).
    """
    (
        embedding_def_results,
        embedding_prov_results,
        fuzzy_def_results,
        fuzzy_prov_results,
    ) = recommend_proverbs(input_text, top_x)
    # Deliberately reordered relative to `recommend_proverbs`: the UI groups
    # the tables by searched field (definition first, then proverb).
    return (
        format_results(embedding_def_results),
        format_results(fuzzy_def_results),
        format_results(embedding_prov_results),
        format_results(fuzzy_prov_results),
    )
# Define Gradio app inputs and outputs
# Order matches the positional parameters of the callback: (input_text, top_x).
inputs = [
    gr.Textbox(label="Input Text", placeholder="Enter a phrase or sentence..."),
    gr.Slider(label="Top X Results", minimum=1, maximum=10, step=1, value=5),
]
# One table per search. The order must match the tuple returned by
# `search_proverbs`. In each header row the upper-cased column names the
# field that was searched.
outputs = [
    gr.Dataframe(
        headers=["Proverb", "DEFINITION", "Score"],
        label="Embedding-Based Search (Definition)",
        wrap=True,
    ),
    gr.Dataframe(
        headers=["Proverb", "DEFINITION", "WRatio"],
        label="Fuzzy Search (Definition)",
        wrap=True,
    ),
    gr.Dataframe(
        headers=["PROVERB", "Definition", "Score"],
        label="Embedding-Based Search (Proverb)",
        wrap=True,
    ),
    gr.Dataframe(
        headers=["PROVERB", "Definition", "WRatio"],
        label="Fuzzy Search (Proverb)",
        wrap=True,
    ),
]
# Gradio app initialization
app = gr.Interface(
    fn=search_proverbs,
    inputs=inputs,
    outputs=outputs,
    title="Turkish Proverb Recommender",
    description=(
        "Compare recommendations using embedding-based similarity and fuzzy search. "
        "Search proverbs and definitions based on semantic and literal similarities."
    ),
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    app.launch()