# atasozu-onerici / app.py
import gradio as gr
import rapidfuzz
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
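
# Load the Turkish proverbs dataset and keep the proverb texts plus the first
# definition of each proverb for display in the result tables.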
dataset = load_dataset("furkanunluturk/turkce-atasozleri")["train"]
proverbs = dataset["text"]
definitions = [defs[0]["text"] for defs in dataset["all_definitions"]]
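
# Concatenate every definition of a proverb into one string so that a single
# embedding can capture all of its listed senses.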
combined_definitions = [
" ".join(definition["text"] for definition in item["all_definitions"])
for item in dataset
]
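
# Turkish sentence-embedding model; definitions and proverbs are encoded once
# at startup so each query only needs a single forward pass.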
model = SentenceTransformer("emrecan/bert-base-turkish-cased-mean-nli-stsb-tr")
definition_embeddings = model.encode(combined_definitions, convert_to_tensor=True)
proverb_embeddings = model.encode(proverbs, convert_to_tensor=True)
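
# Rank corpus entries by embedding similarity to the query and return
# (proverb, first definition, similarity score) tuples.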
def embedding_search(input_text, embeddings, top_x):
input_embedding = model.encode(input_text, convert_to_tensor=True)
results = util.semantic_search(input_embedding, embeddings, top_k=top_x)[0]
return [
(
proverbs[result["corpus_id"]],
definitions[result["corpus_id"]],
result["score"],
)
for result in results
]
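
# Fuzzy string matching with RapidFuzz (WRatio scorer by default). The custom
# processor maps the capital "I" to the Turkish lowercase dotless "ı" before
# the default lowercasing, which would otherwise turn "I" into "i".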
def fuzzy_search(input_text, corpus, top_x):
matches = rapidfuzz.process.extract(
query=input_text,
choices=corpus,
limit=top_x,
processor=lambda text: rapidfuzz.utils.default_process(text.replace("I", "ı")),
)
    # extract() yields (choice, score, index) tuples; map each index back to the corpus.
    return [(proverbs[index], definitions[index], score) for _, score, index in matches]
# Combined function to return all types of recommendations
def recommend_proverbs(input_text, top_x):
return (
embedding_search(input_text, definition_embeddings, top_x),
embedding_search(input_text, proverb_embeddings, top_x),
fuzzy_search(input_text, definitions, top_x),
fuzzy_search(input_text, proverbs, top_x),
)
# Format results for display
def format_results(results):
return [
[proverb, definition, f"{score:.4f}" if isinstance(score, float) else score]
for proverb, definition, score in results
]
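
# Run all four searches and format each result set for its Dataframe output.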
def search_proverbs(input_text, top_x):
(
embedding_def_results,
embedding_prov_results,
fuzzy_def_results,
fuzzy_prov_results,
) = recommend_proverbs(input_text, top_x)
return (
format_results(embedding_def_results),
format_results(fuzzy_def_results),
format_results(embedding_prov_results),
format_results(fuzzy_prov_results),
)
# Define Gradio app inputs and outputs
inputs = [
gr.Textbox(label="Input Text", placeholder="Enter a phrase or sentence..."),
gr.Slider(label="Top X Results", minimum=1, maximum=10, step=1, value=5),
]
outputs = [
gr.Dataframe(
headers=["Proverb", "DEFINITION", "Score"],
label="Embedding-Based Search (Definition)",
wrap=True,
),
gr.Dataframe(
headers=["Proverb", "DEFINITION", "WRatio"],
label="Fuzzy Search (Definition)",
wrap=True,
),
gr.Dataframe(
headers=["PROVERB", "Definition", "Score"],
label="Embedding-Based Search (Proverb)",
wrap=True,
),
gr.Dataframe(
headers=["PROVERB", "Definition", "WRatio"],
label="Fuzzy Search (Proverb)",
wrap=True,
),
]
# Gradio app initialization
app = gr.Interface(
fn=search_proverbs,
inputs=inputs,
outputs=outputs,
title="Turkish Proverb Recommender",
description=(
"Compare recommendations using embedding-based similarity and fuzzy search. "
"Search proverbs and definitions based on semantic and literal similarities."
),
)
if __name__ == "__main__":
app.launch()