Spaces:

aquibmoin
/

EM-GEN

Sleeping

App Files Files Community

EM-GEN / app.py

aquibmoin

Update app.py

4350956 verified about 1 month ago

raw

history blame contribute delete

3.49 kB

	# Re-build 250325

	import gradio as gr
	from transformers import AutoTokenizer, AutoModel
	import torch
	import numpy as np
	from PyPDF2 import PdfReader
	from pinecone import Pinecone, ServerlessSpec, CloudProvider, AwsRegion, VectorType
	import os
	import hashlib
	import time

	# Load the NASA bi-encoder: tokenizer and encoder come from the same checkpoint
	MODEL_NAME = "nasa-impact/nasa-smd-ibm-st-v2"
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	model = AutoModel.from_pretrained(MODEL_NAME)

	# Pinecone client, authenticated with the API key read from the environment
	pinecone_api_key = os.environ.get('PINECONE_API_KEY')
	pc = Pinecone(api_key=pinecone_api_key)

	# Make sure the target index exists (768-dimensional dense vectors, cosine metric)
	index_name = "scdd-index"
	existing_index_names = pc.list_indexes().names()
	if index_name not in existing_index_names:
	    serverless_spec = ServerlessSpec(
	        cloud=CloudProvider.AWS,
	        region=AwsRegion.US_EAST_1,
	    )
	    pc.create_index(
	        name=index_name,
	        dimension=768,
	        spec=serverless_spec,
	        vector_type=VectorType.DENSE,
	        metric="cosine",
	    )

	# Open a handle to the index
	index = pc.Index(index_name)

	# Function to encode text using bi-encoder in batches
	def encode_chunks_batch(chunks, batch_size=8):
	    """Embed text chunks with the module-level bi-encoder, one batch at a time.

	    Each batch is tokenized (padded to the longest chunk in the batch and
	    truncated to 128 tokens), run through ``model`` without gradient
	    tracking, mean-pooled over the token axis and L2-normalized.

	    Padding positions are excluded from the mean by weighting with the
	    tokenizer's attention mask, so a chunk's embedding does not depend on
	    which other chunks happen to share its batch.

	    Args:
	        chunks: List of strings to embed.
	        batch_size: Number of chunks sent through the model per forward pass.

	    Returns:
	        A list with one NumPy vector per chunk, in input order. An empty
	        input yields an empty list.
	    """
	    embeddings = []
	    for start in range(0, len(chunks), batch_size):
	        batch_chunks = chunks[start:start + batch_size]
	        inputs = tokenizer(batch_chunks, return_tensors='pt', padding=True, truncation=True, max_length=128)
	        with torch.no_grad():
	            output = model(**inputs)
	            token_embeddings = output.last_hidden_state
	            # Zero out padding tokens before averaging; a plain mean over the
	            # token axis would dilute short chunks with pad-token states.
	            mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
	            token_counts = mask.sum(dim=1).clamp(min=1e-9)
	            batch_embeddings = (token_embeddings * mask).sum(dim=1) / token_counts
	            # Clamp the norm so an all-zero vector cannot produce NaNs.
	            norms = batch_embeddings.norm(dim=1, keepdim=True).clamp(min=1e-12)
	            batch_embeddings = batch_embeddings / norms
	        embeddings.extend(batch_embeddings.cpu().numpy())
	    return embeddings

	# Function to generate a chunk ID from the file name, chunk text and chunk index
	def generate_chunk_id(pdf_file, chunk_text, chunk_idx):
	    """Build the vector ID for one chunk of an uploaded PDF.

	    The ID has the form ``<basename>-<md5 hex digest>-chunk-<index>``, where
	    the basename is taken from ``pdf_file.name`` and the digest is computed
	    over the UTF-8 encoded chunk text (not over the whole file).

	    Args:
	        pdf_file: Object exposing the file path as ``.name``.
	        chunk_text: Text of the chunk.
	        chunk_idx: Position of the chunk within the file.

	    Returns:
	        The ID string.
	    """
	    text_digest = hashlib.md5(chunk_text.encode('utf-8')).hexdigest()
	    base_name = os.path.basename(pdf_file.name)
	    return f"{base_name}-{text_digest}-chunk-{chunk_idx}"

	# Function to process PDFs and upsert embeddings to Pinecone
	def process_pdfs(pdf_files):
	    """Extract text from uploaded PDFs, embed it and upsert it into Pinecone.

	    This is a generator; every yielded string is a status message. For each
	    file, the text of all pages is concatenated, cut into 500-character
	    chunks, embedded with ``encode_chunks_batch`` and upserted into the
	    module-level ``index`` with the chunk text stored as ``"text"`` metadata.

	    Args:
	        pdf_files: Uploaded file objects exposing the file path as ``.name``.
	            ``None`` or an empty list produces a single notice and no work.

	    Yields:
	        A progress message per file that contains text, then a final summary
	        with the total elapsed time and the current index statistics.
	    """
	    if not pdf_files:
	        # Nothing was uploaded: report it instead of failing on iteration.
	        yield "No PDF files were uploaded."
	        return

	    # Send vectors in bounded requests; a single upsert holding every chunk
	    # of a large PDF can exceed Pinecone's per-request limits.
	    upsert_batch_size = 100

	    start_time = time.time()

	    for pdf_file in pdf_files:
	        reader = PdfReader(pdf_file.name)
	        # Extract each page exactly once; pages without text are skipped.
	        page_texts = (page.extract_text() for page in reader.pages)
	        pdf_text = "".join(text for text in page_texts if text)

	        # Split text into smaller chunks
	        chunks = [pdf_text[i:i + 500] for i in range(0, len(pdf_text), 500)]
	        if not chunks:
	            # No extractable text in this file, so there is nothing to embed
	            # or upsert.
	            continue

	        yield "Processing file, generating Embeddings and pushing to Pinecone...Please wait..."

	        # Generate embeddings in batches
	        embeddings = encode_chunks_batch(chunks, batch_size=8)

	        # Prepare (id, values, metadata) tuples for Pinecone
	        vectors = [
	            (generate_chunk_id(pdf_file, chunk, idx), embedding.tolist(), {"text": chunk})
	            for idx, (embedding, chunk) in enumerate(zip(embeddings, chunks))
	        ]

	        # Upsert embeddings into Pinecone in bounded batches
	        for offset in range(0, len(vectors), upsert_batch_size):
	            index.upsert(vectors[offset:offset + upsert_batch_size])

	    # Fetch index stats
	    stats = index.describe_index_stats()

	    elapsed_time = time.time() - start_time

	    yield f"Processed PDF and embeddings stored in Pinecone successfully in {elapsed_time:.2f} seconds. Current Index Stats: {stats}"

	# Gradio Interface: a multi-file PDF upload wired to process_pdfs, with text output
	pdf_upload = gr.Files(label="Upload PDF", file_types=[".pdf"])
	demo = gr.Interface(
	    fn=process_pdfs,
	    inputs=pdf_upload,
	    outputs="text",
	    title="NASA Bi-encoder PDF Embedding & Pinecone Storage",
	    description="Upload PDF files to generate embeddings with NASA Bi-encoder and store in Pinecone.",
	)

	# Start the app
	demo.launch()