RAG_PDF / app.py
Jagadesheeden's picture
Upload 2 files
898f281 verified
raw
history blame contribute delete
2.89 kB
import faiss
from sentence_transformers import SentenceTransformer
import PyPDF2
import gradio as gr
# Step 1: Extract Knowledge Base from PDF
def extract_text_from_pdf(pdf_path):
"""
Extract and clean text from a PDF file.
"""
knowledge = []
with open(pdf_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
for page in reader.pages:
text = page.extract_text()
if text:
# Replace newlines with spaces for better readability
text = text.replace("\n", " ")
# Split the text into meaningful sentences
knowledge.extend(text.split(". "))
# Clean up each sentence and remove empty strings
return [sentence.strip() for sentence in knowledge if sentence.strip()]
# Step 2: Create the Retriever
embedder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1') # Embedding model for document similarity
index = None # Global variable to store FAISS index
knowledge_base = [] # Global variable to store the knowledge base
def upload_and_query(pdf_file, query):
"""
Handle PDF upload and process queries.
"""
global index, knowledge_base
if not pdf_file:
return "Please upload a valid PDF file."
# Process the uploaded PDF
if not knowledge_base: # Only process if the knowledge base is empty
knowledge_base = extract_text_from_pdf(pdf_file)
if not knowledge_base:
return "The uploaded PDF does not contain any readable text."
document_embeddings = embedder.encode(knowledge_base)
dimension = document_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(document_embeddings)
return "PDF uploaded successfully. Now ask your query."
# Handle queries
if not query:
return "Please enter a query after uploading a PDF."
if index is None or not knowledge_base:
return "Please upload a PDF document before asking a query."
# Generate query embeddings and perform retrieval
query_embedding = embedder.encode([query])
distances, indices = index.search(query_embedding, 5) # Retrieve top 5 results
retrieved_docs = [knowledge_base[idx] for idx in indices[0]]
# Clean up and format the retrieved context
context = " ".join(retrieved_docs).replace("\n", " ").strip()
return context
# Step 3: Create Gradio Interface
interface = gr.Interface(
fn=upload_and_query,
inputs=[
gr.File(label="Upload a PDF document", type="filepath"),
gr.Textbox(label="Enter your query"),
],
outputs="text",
title="PDF Knowledge Base Query",
description="Upload a PDF document and ask questions based on its content."
)
# Launch the Interface
if __name__ == "__main__":
interface.launch()