Jagadesheeden committed on
Commit
898f281
·
verified ·
1 Parent(s): f1c1404

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +83 -0
  2. requirements +4 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ from sentence_transformers import SentenceTransformer
3
+ import PyPDF2
4
+ import gradio as gr
5
+
6
+ # Step 1: Extract Knowledge Base from PDF
7
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return its text as a list of sentence-like fragments.

    Each page's extracted text has its newlines replaced by spaces and is
    then split on the literal ". " separator. Fragments are stripped of
    surrounding whitespace, and fragments that end up empty are dropped.
    Pages for which no text is extracted are skipped.

    Args:
        pdf_path: Path of the PDF file to open (opened in binary mode).

    Returns:
        A list of non-empty, stripped text fragments in page order.
    """
    with open(pdf_path, 'rb') as handle:
        # Extraction happens while the file is still open, because the
        # reader pulls page content from the underlying handle.
        page_texts = [
            page.extract_text() for page in PyPDF2.PdfReader(handle).pages
        ]

    fragments = []
    for raw_text in page_texts:
        if not raw_text:
            continue
        # Flatten line breaks first so the split sees one continuous line.
        fragments += raw_text.replace("\n", " ").split(". ")

    stripped = (fragment.strip() for fragment in fragments)
    return [fragment for fragment in stripped if fragment]
23
+
24
+ # Step 2: Create the Retriever
25
# Sentence-embedding model used for both the document fragments and queries.
embedder = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# FAISS index over the document embeddings; None until one has been built.
index = None

# Text fragments of the knowledge base; empty until a PDF has been processed.
knowledge_base = []
29
+
30
# Path of the PDF that the current `index` / `knowledge_base` were built from.
# Used to detect that the user has uploaded a different document.
_indexed_pdf_path = None


def upload_and_query(pdf_file, query):
    """
    Index the uploaded PDF (when needed) and answer a query against it.

    The PDF is (re)indexed whenever no index exists yet or the uploaded file
    differs from the one the current index was built from, so switching
    documents no longer returns passages from the previous PDF.

    Args:
        pdf_file: Path of the uploaded PDF, or a falsy value when nothing
            has been uploaded.
        query: The user's question; may be empty.

    Returns:
        A status/help message when there is no file, no readable text, or no
        query; otherwise the most similar fragments (up to 5) joined by
        single spaces.
    """
    global index, knowledge_base, _indexed_pdf_path

    if not pdf_file:
        return "Please upload a valid PDF file."

    newly_indexed = False
    if pdf_file != _indexed_pdf_path or index is None or not knowledge_base:
        fragments = extract_text_from_pdf(pdf_file)
        if not fragments:
            # Drop any state from a previously indexed document so that a
            # later query cannot be answered from the wrong PDF.
            index = None
            knowledge_base = []
            _indexed_pdf_path = None
            return "The uploaded PDF does not contain any readable text."

        document_embeddings = embedder.encode(fragments)
        new_index = faiss.IndexFlatL2(document_embeddings.shape[1])
        new_index.add(document_embeddings)

        # Publish the new state only after indexing succeeded.
        index = new_index
        knowledge_base = fragments
        _indexed_pdf_path = pdf_file
        newly_indexed = True

    if not query or not query.strip():
        if newly_indexed:
            return "PDF uploaded successfully. Now ask your query."
        return "Please enter a query after uploading a PDF."

    # Never ask FAISS for more neighbours than there are fragments: missing
    # results come back as label -1, which would otherwise index the list
    # from the end and return the last fragment as a bogus hit.
    top_k = min(5, len(knowledge_base))
    query_embedding = embedder.encode([query])
    _, indices = index.search(query_embedding, top_k)
    retrieved_docs = [knowledge_base[i] for i in indices[0] if i >= 0]

    # Clean up and format the retrieved context.
    return " ".join(retrieved_docs).replace("\n", " ").strip()
68
+
69
+ # Step 3: Create Gradio Interface
70
# Input widgets: the PDF to search (handed to the callback as a file path)
# and the free-text question.
pdf_input = gr.File(label="Upload a PDF document", type="filepath")
query_input = gr.Textbox(label="Enter your query")

# Gradio UI wiring both inputs to upload_and_query, with plain-text output.
interface = gr.Interface(
    fn=upload_and_query,
    inputs=[pdf_input, query_input],
    outputs="text",
    title="PDF Knowledge Base Query",
    description="Upload a PDF document and ask questions based on its content.",
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()
requirements ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ faiss-cpu
2
+ sentence-transformers
3
+ PyPDF2
4
+ gradio