|
import faiss
|
|
from sentence_transformers import SentenceTransformer
|
|
import PyPDF2
|
|
import gradio as gr
|
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """
    Extract the text of a PDF file and split it into sentences.

    The text of all pages is concatenated before splitting, so a sentence
    that continues across a page break is returned as a single entry.
    Whitespace runs (including newlines) are collapsed to single spaces, and
    sentences keep their terminating punctuation.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of non-empty, stripped sentence strings. The list is empty
        when no page yields any text.
    """
    import re  # local import: only this function needs it

    page_texts = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text = page.extract_text()
            # Pages without extractable text yield an empty/None result.
            if text:
                page_texts.append(text)

    # Normalise every whitespace run to one space; this also rejoins
    # sentences that were wrapped across lines or pages.
    full_text = " ".join(" ".join(page_texts).split())

    # Split after ., ! or ? (kept on the sentence) instead of on the literal
    # ". ", which dropped the period and ignored other terminators.
    sentences = re.split(r"(?<=[.!?])\s+", full_text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]
|
|
|
|
|
|
# Module-level state shared between calls to upload_and_query.
# Sentences of the currently loaded PDF; empty until a document is processed.
knowledge_base = []
# Vector index over knowledge_base; None until a document is processed.
index = None

# Sentence-embedding model, instantiated once when the module is imported.
embedder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
|
|
|
|
# Path of the PDF the current index was built from. Comparing against it lets
# a newly uploaded document replace the previous one instead of being ignored.
_indexed_pdf_path = None


def upload_and_query(pdf_file, query):
    """
    Index an uploaded PDF and return the passages most relevant to a query.

    The PDF is (re)indexed whenever no index exists yet or ``pdf_file``
    differs from the file that was indexed last. If a query is supplied in
    the same call, it is answered right after indexing.

    Args:
        pdf_file: Path of the uploaded PDF; a falsy value means no upload.
        query: The user's question; may be empty.

    Returns:
        A status message when the upload is missing, unreadable, or no query
        was given; otherwise up to five retrieved sentences joined by spaces.
    """
    global index, knowledge_base, _indexed_pdf_path

    if not pdf_file:
        return "Please upload a valid PDF file."

    needs_indexing = (
        index is None
        or not knowledge_base
        or pdf_file != _indexed_pdf_path
    )
    if needs_indexing:
        sentences = extract_text_from_pdf(pdf_file)
        if not sentences:
            # Forget any earlier document so later queries are not answered
            # from content that no longer matches the upload.
            index = None
            knowledge_base = []
            _indexed_pdf_path = None
            return "The uploaded PDF does not contain any readable text."

        document_embeddings = embedder.encode(sentences)
        dimension = document_embeddings.shape[1]
        new_index = faiss.IndexFlatL2(dimension)
        new_index.add(document_embeddings)

        # Commit the new state only after indexing has succeeded.
        index = new_index
        knowledge_base = sentences
        _indexed_pdf_path = pdf_file

        if not query:
            return "PDF uploaded successfully. Now ask your query."

    if not query:
        return "Please enter a query after uploading a PDF."

    query_embedding = embedder.encode([query])
    # Never request more neighbours than there are sentences.
    top_k = min(5, len(knowledge_base))
    _distances, indices = index.search(query_embedding, top_k)

    # Missing results are reported as -1; skip them so they do not wrap
    # around to the last sentence.
    retrieved_docs = [
        knowledge_base[idx]
        for idx in indices[0]
        if 0 <= idx < len(knowledge_base)
    ]

    context = " ".join(retrieved_docs).replace("\n", " ").strip()
    return context
|
|
|
|
|
|
# Gradio front end: a PDF upload plus a query box wired to upload_and_query,
# with the returned string shown as plain text.
interface = gr.Interface(
    title="PDF Knowledge Base Query",
    description="Upload a PDF document and ask questions based on its content.",
    fn=upload_and_query,
    inputs=[
        gr.File(label="Upload a PDF document", type="filepath"),
        gr.Textbox(label="Enter your query"),
    ],
    outputs="text",
)


# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()
|
|
|