File size: 2,893 Bytes
898f281 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import hashlib

import faiss
import gradio as gr
import PyPDF2
from sentence_transformers import SentenceTransformer
# Step 1: Extract Knowledge Base from PDF
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return its text as a list of sentence-like fragments.

    Each page is processed on its own: newlines are turned into spaces and
    the result is split on ". ". Fragments are stripped of surrounding
    whitespace and empty ones are dropped. Pages for which no text can be
    extracted contribute nothing.
    """
    sentences = []
    with open(pdf_path, 'rb') as pdf_stream:
        for page in PyPDF2.PdfReader(pdf_stream).pages:
            page_text = page.extract_text()
            if not page_text:
                continue
            # Collapse line breaks first so a sentence wrapped across lines
            # stays in one piece when splitting.
            flattened = " ".join(page_text.split("\n"))
            for fragment in flattened.split(". "):
                cleaned = fragment.strip()
                if cleaned:
                    sentences.append(cleaned)
    return sentences
# Step 2: Create the Retriever
# Module-level state shared by every call to upload_and_query.

# Sentence-embedding model; constructed once when the module is imported and
# reused for both the document sentences and the queries.
embedder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1') # Embedding model for document similarity
# FAISS index over the sentence embeddings; stays None until upload_and_query
# has indexed a PDF.
index = None # Global variable to store FAISS index
# Sentences extracted from the PDF. Position i here corresponds to the i-th
# vector added to `index`, which is how search hits are mapped back to text.
knowledge_base = [] # Global variable to store the knowledge base
_TOP_K = 5  # Maximum number of sentences returned for one query
_indexed_fingerprint = None  # Content hash of the PDF currently held in `index`


def _pdf_fingerprint(pdf_path):
    """Return the SHA-256 hex digest of the bytes of the file at pdf_path."""
    digest = hashlib.sha256()
    with open(pdf_path, 'rb') as file:
        for chunk in iter(lambda: file.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()


def upload_and_query(pdf_file, query):
    """
    Index an uploaded PDF, or answer a query against the indexed PDF.

    The first call made with a given PDF extracts its sentences, builds the
    FAISS index and returns a confirmation message; a query passed on that
    same call is not answered. Later calls with the same PDF embed the query
    and return the closest sentences. Supplying a PDF whose content differs
    from the indexed one triggers re-indexing.

    Args:
        pdf_file: Path of the uploaded PDF; falsy when nothing was uploaded.
        query: The question text; falsy when left empty.

    Returns:
        A status or prompt message, or up to ``_TOP_K`` retrieved sentences
        joined by single spaces, in the order FAISS returned them.
    """
    global index, knowledge_base, _indexed_fingerprint

    if not pdf_file:
        return "Please upload a valid PDF file."

    # Compare file content rather than the path, so the check does not depend
    # on how the caller names its temporary upload files.
    fingerprint = _pdf_fingerprint(pdf_file)
    if not knowledge_base or fingerprint != _indexed_fingerprint:
        sentences = extract_text_from_pdf(pdf_file)
        if not sentences:
            # Forget any previously indexed document so later queries are not
            # answered from content the user has replaced.
            knowledge_base = []
            index = None
            _indexed_fingerprint = None
            return "The uploaded PDF does not contain any readable text."

        document_embeddings = embedder.encode(sentences)
        new_index = faiss.IndexFlatL2(document_embeddings.shape[1])
        new_index.add(document_embeddings)

        # Publish the new state only after every step succeeded; a failure
        # above must not leave sentences stored without a matching index.
        knowledge_base = sentences
        index = new_index
        _indexed_fingerprint = fingerprint
        return "PDF uploaded successfully. Now ask your query."

    # Handle queries
    if not query:
        return "Please enter a query after uploading a PDF."
    if index is None or not knowledge_base:
        return "Please upload a PDF document before asking a query."

    query_embedding = embedder.encode([query])
    # Never ask for more neighbours than there are sentences: FAISS pads the
    # missing slots with -1, which would index the list from the end.
    top_k = min(_TOP_K, len(knowledge_base))
    _distances, indices = index.search(query_embedding, top_k)
    retrieved_docs = [knowledge_base[idx] for idx in indices[0] if idx >= 0]

    # Clean up and format the retrieved context
    context = " ".join(retrieved_docs).replace("\n", " ").strip()
    return context
# Step 3: Create Gradio Interface
# Wires upload_and_query to a two-input form (PDF upload, query text); every
# branch of upload_and_query returns a string, shown via the "text" output.
interface = gr.Interface(
    fn=upload_and_query,
    inputs=[
        # upload_and_query hands this value to extract_text_from_pdf, which
        # passes it to open(), so it must arrive as a path on disk.
        gr.File(label="Upload a PDF document", type="filepath"),
        gr.Textbox(label="Enter your query"),
    ],
    outputs="text",
    title="PDF Knowledge Base Query",
    description="Upload a PDF document and ask questions based on its content."
)
# Launch the Interface
# Guarded so the interface is launched only when this file is run as a
# script, not when the module is imported.
if __name__ == "__main__":
    interface.launch()
|