import base64
import html
import os
from io import BytesIO

import gradio as gr
import pytesseract
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from pdf2image import convert_from_path
from PIL import Image
# Set up the Groq API key and LLM.
# SECURITY: the key is read from the GROQ_API_KEY environment variable and is
# never hard-coded in source. Any key that has ever been committed to version
# control must be treated as leaked: revoke it and issue a new one.
if not os.environ.get("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment before "
        "starting the app."
    )

# Shared chat model used by the retrieval QA chain.
llm = ChatGroq(
    model='llama3-70b-8192',
    temperature=0.5,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)
# OCR Functions
def ocr_image(image_path, language='eng+guj'):
    """Run Tesseract OCR on a single image file.

    Args:
        image_path: Path of the image to read.
        language: Tesseract language spec passed as ``lang`` (default
            ``'eng+guj'``).

    Returns:
        The text produced by ``pytesseract.image_to_string``.
    """
    # Use the image as a context manager so the underlying file handle is
    # released as soon as OCR finishes instead of lingering until GC.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img, lang=language)
def ocr_pdf(pdf_path, language='eng+guj'):
    """Run Tesseract OCR on every page of a PDF.

    The PDF is rasterised with ``convert_from_path`` and each resulting page
    image is passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF to read.
        language: Tesseract language spec passed as ``lang``.

    Returns:
        The text of all pages concatenated in order, each page followed by a
        newline.
    """
    pages = convert_from_path(pdf_path)
    return "".join(
        pytesseract.image_to_string(page, lang=language) + "\n"
        for page in pages
    )
def ocr_file(file_path):
    """Dispatch a file to the matching OCR routine based on its extension.

    Args:
        file_path: Path of a PDF or image file.

    Returns:
        The text extracted by ``ocr_pdf`` or ``ocr_image`` (both invoked with
        ``language='guj+eng'``).

    Raises:
        ValueError: If the extension is not .pdf, .jpg, .jpeg, .png or .bmp.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == ".pdf":
        return ocr_pdf(file_path, language='guj+eng')
    if suffix in (".jpg", ".jpeg", ".png", ".bmp"):
        return ocr_image(file_path, language='guj+eng')

    raise ValueError("Unsupported file format. Supported formats are PDF, JPG, JPEG, PNG, BMP.")
def get_text_chunks(text):
    """Split text into overlapping chunks for embedding.

    Args:
        text: The text to split.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_text`` using
        ``chunk_size=500`` and ``chunk_overlap=100``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    return splitter.split_text(text)
def get_vector_store(text_chunks):
    """Embed text chunks, build a FAISS index and persist it to disk.

    Args:
        text_chunks: Non-empty sequence of text chunks to index.

    Returns:
        The FAISS vector store built from ``text_chunks``. The same store is
        also saved to the local ``faiss_index`` directory.

    Raises:
        ValueError: If ``text_chunks`` is empty (for example when OCR found no
            text), since an index cannot be built from zero vectors.
    """
    # Fail early with an actionable message; otherwise FAISS.from_texts fails
    # deep inside the library with an opaque error on an empty list.
    if not text_chunks:
        raise ValueError(
            "No text was extracted from the uploaded documents; "
            "cannot build a vector store."
        )

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    os.makedirs("faiss_index", exist_ok=True)
    vector_store.save_local("faiss_index")
    return vector_store
def process_ocr_and_pdf_files(file_paths):
    """OCR a batch of files and index the combined text.

    Args:
        file_paths: Iterable of file paths accepted by ``ocr_file``.

    Returns:
        The vector store returned by ``get_vector_store`` for the chunked,
        concatenated OCR text (each file's text is followed by a newline).
    """
    combined_text = "".join(ocr_file(path) + "\n" for path in file_paths)
    return get_vector_store(get_text_chunks(combined_text))
def get_conversational_chain():
    """Build a RetrievalQA chain over the persisted ``faiss_index`` store.

    The chain uses the module-level ``llm``, a "stuff" document chain driven
    by the prompt template below, and a ``ConversationBufferMemory`` that is
    created anew on every call to this function.

    Returns:
        A ``RetrievalQA`` chain whose retriever reads from the FAISS index
        loaded from the local ``faiss_index`` directory.
    """
    template = """You are an intelligent educational assistant specialized in handling queries about documents. You have been provided with OCR-processed text from the uploaded files that contains important educational information.
Core Responsibilities:
1. Language Processing:
- Identify the language of the user's query (English or Gujarati)
- Respond in the same language as the query
- If the query is in Gujarati, ensure the response maintains proper Gujarati grammar and terminology
- For technical terms, provide both English and Gujarati versions when relevant
2. Document Understanding:
- Analyze the OCR-processed text from the uploaded files
- Account for potential OCR errors or misinterpretations
- Focus on extracting accurate information despite possible OCR imperfections
3. Response Guidelines:
- Provide direct, clear answers based solely on the document content
- If information is unclear due to OCR quality, mention this limitation
- For numerical data (dates, percentages, marks), double-check accuracy before responding
- If information is not found in the documents, clearly state: "This information is not present in the uploaded documents"
4. Educational Context:
- Maintain focus on educational queries related to the document content
- For admission-related queries, emphasize important deadlines and requirements
- For scholarship information, highlight eligibility criteria and application processes
- For course-related queries, provide detailed, accurate information from the documents
5. Response Format:
- Structure responses clearly with relevant subpoints when necessary
- For complex information, break down the answer into digestible parts
- Include relevant reference points from the documents when applicable
- Format numerical data and dates clearly
6. Quality Control:
- Verify that responses align with the document content
- Don't make assumptions beyond the provided information
- If multiple interpretations are possible due to OCR quality, mention all possibilities
- Maintain consistency in terminology throughout the conversation
Important Rules:
- Never make up information not present in the documents
- Don't combine information from previous conversations or external knowledge
- Always indicate if certain parts of the documents are unclear due to OCR quality
- Maintain professional tone while being accessible to students and parents
- If the query is out of scope of the uploaded documents, politely redirect to relevant official sources
Context from uploaded documents:
{context}
Chat History:
{history}
Current Question: {question}
Assistant: Let me provide a clear and accurate response based on the uploaded documents...
"""
    # The query-time embedding model MUST be the one used when the index was
    # built ("sentence-transformers/all-MiniLM-L6-v2"). A different model of
    # the same dimensionality loads without error but embeds queries into an
    # unrelated vector space, silently producing irrelevant retrievals.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )
    # NOTE(review): allow_dangerous_deserialization unpickles the index
    # metadata; only acceptable because the index is written by this app.
    new_vector_store = FAISS.load_local(
        "faiss_index", embeddings, allow_dangerous_deserialization=True
    )
    QA_CHAIN_PROMPT = PromptTemplate(
        input_variables=["history", "context", "question"],
        template=template
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=new_vector_store.as_retriever(),
        chain_type='stuff',
        verbose=True,
        chain_type_kwargs={
            "verbose": True,
            "prompt": QA_CHAIN_PROMPT,
            "memory": ConversationBufferMemory(memory_key="history", input_key="question"),
        }
    )
    return qa_chain
def process_files_and_query(files, query):
    """OCR the uploaded files, index them, and answer a question about them.

    Args:
        files: List of uploaded file paths (at most 5). ``None`` or an empty
            list yields an error message rather than an exception.
        query: The user's question.

    Returns:
        The chain's ``"result"`` string, ``"No result found"`` if the chain
        returned no result, or an ``"Error: ..."`` message when the upload is
        missing or too large.
    """
    # Gradio passes None when nothing has been uploaded; guard before len().
    if not files:
        return "Error: Please upload at least one file."
    if len(files) > 5:
        return "Error: You can upload a maximum of 5 files only."

    # Ensure temp directory exists
    os.makedirs("temp", exist_ok=True)

    # Save uploaded files
    file_paths = []
    for file in files:
        file_path = os.path.join("temp", os.path.basename(file))
        # Read fully before opening the destination so both handles are
        # closed deterministically and a source that already lives at the
        # destination path is not truncated before it is read.
        with open(file, "rb") as src:
            data = src.read()
        with open(file_path, "wb") as dst:
            dst.write(data)
        file_paths.append(file_path)

    # Process files and create vector store
    process_ocr_and_pdf_files(file_paths)

    # Perform query
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )
    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    # NOTE(review): the chain has its own retriever, so this explicit search
    # and the "input_documents" key are presumably redundant - confirm before
    # removing.
    docs = new_db.similarity_search(query)
    chain = get_conversational_chain()
    response = chain({"input_documents": docs, "query": query}, return_only_outputs=True)
    return response.get("result", "No result found")
def _resolve_upload_path(uploaded_file):
    """Return the filesystem path of one uploaded item.

    Accepts either a plain path (what ``gr.File(type="filepath")`` delivers)
    or an object exposing the path through a ``name`` attribute.
    """
    if isinstance(uploaded_file, (str, os.PathLike)):
        return os.fspath(uploaded_file)
    return uploaded_file.name


def _render_preview(file_path):
    """Return an HTML snippet previewing a single file.

    NOTE(review): the HTML tags in the original string literals were lost
    (they appeared as empty f-strings), so the markup below is a
    reconstruction based on the base64 values the code computed - confirm
    the intended sizing/styling.
    """
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == ".pdf":
        # Embed the PDF as a base64 data URI so no file serving is needed.
        with open(file_path, "rb") as pdf_file:
            pdf_base64 = base64.b64encode(pdf_file.read()).decode('utf-8')
        return (
            f'<iframe src="data:application/pdf;base64,{pdf_base64}" '
            'width="100%" height="600px"></iframe>'
        )

    if file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
        # Re-encode every image as PNG so one MIME type covers all inputs.
        img_byte_array = BytesIO()
        with Image.open(file_path) as img:
            img.save(img_byte_array, format="PNG")
        img_base64 = base64.b64encode(img_byte_array.getvalue()).decode('utf-8')
        return (
            f'<img src="data:image/png;base64,{img_base64}" '
            'style="max-width: 100%;"/>'
        )

    # Anything else is shown as text. Escape it so file content cannot inject
    # markup into the page, and tolerate bytes that are not valid UTF-8.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()
    return f"<pre>{html.escape(content)}</pre>"


def handle_uploaded_file(uploaded_files, show_in_sidebar=False):
    """Build the HTML preview for a set of uploaded files.

    Args:
        uploaded_files: A list of uploaded items (paths, or objects with a
            ``name`` path attribute), a single such item, or ``None``.
        show_in_sidebar: Accepted for interface compatibility; not used.

    Returns:
        Concatenated HTML previews, an empty string when nothing is
        uploaded, or an error message when more than 5 files are given.
    """
    if not uploaded_files:
        return ""
    # Tolerate a single item being passed instead of a list.
    if isinstance(uploaded_files, (str, os.PathLike)):
        uploaded_files = [uploaded_files]
    if len(uploaded_files) > 5:
        return "Error: You can upload a maximum of 5 files only."

    return "".join(
        _render_preview(_resolve_upload_path(uploaded_file))
        for uploaded_file in uploaded_files
    )


# Gradio interface setup
def upload_and_display(files):
    """Gradio callback: return preview HTML for the current upload.

    Returns an empty string when the upload is cleared (``files`` is
    ``None``) and an error message when more than 5 files are selected.
    """
    if not files:
        return ""
    if len(files) > 5:
        return "Error: You can upload a maximum of 5 files only."
    return handle_uploaded_file(files, show_in_sidebar=True)


def launch_gradio_app():
    """Build and return the Gradio Blocks UI (does not start the server)."""
    with gr.Blocks() as demo:
        gr.Markdown("# Document OCR and Q&A Assistant")
        with gr.Row():
            with gr.Column(scale=1):  # Main content area (scale must be an integer)
                file_input = gr.File(
                    file_count="multiple",
                    type="filepath",  # callbacks receive uploads as file paths
                    file_types=[".pdf", ".jpg", ".jpeg", ".png", ".bmp"],
                    label="Upload Documents (PDF/Images)"
                )
                query_input = gr.Textbox(
                    label="Ask a Question about the Documents",
                    lines=3
                )
                submit_btn = gr.Button("Process and Query")
                output = gr.Textbox(label="Answer", lines=5)
                submit_btn.click(
                    fn=process_files_and_query,
                    inputs=[file_input, query_input],
                    outputs=[output]
                )
            with gr.Column(scale=1):  # Sidebar (scale must be an integer)
                gr.Markdown("## Sidebar")
                file_preview = gr.HTML(label="File Preview")  # Display the preview content here
        # Refresh the preview whenever the set of uploaded files changes.
        file_input.change(fn=upload_and_display, inputs=file_input, outputs=file_preview)
    return demo


# Launch the Gradio app
if __name__ == "__main__":
    app = launch_gradio_app()
    app.launch(share=True)  # Set share=True to create a public link