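# Gradio app: extract text from uploaded PDFs, embed 500-character chunks with the
# NASA SMD bi-encoder (nasa-impact/nasa-smd-ibm-st-v2), and upsert the vectors into
# a Pinecone serverless index.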
import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
from PyPDF2 import PdfReader
from pinecone import Pinecone, ServerlessSpec, CloudProvider, AwsRegion, VectorType
import os
import hashlib
import time
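
# Load the NASA SMD bi-encoder tokenizer and model used to embed text chunks.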
tokenizer = AutoTokenizer.from_pretrained("nasa-impact/nasa-smd-ibm-st-v2")
model = AutoModel.from_pretrained("nasa-impact/nasa-smd-ibm-st-v2")
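
# Connect to Pinecone using the API key from the environment (set PINECONE_API_KEY).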
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)
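
# Create the serverless index on first run. dimension=768 matches the bi-encoder's
# hidden size, and the cosine metric pairs with the L2-normalized embeddings below.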
index_name = "scdd-index"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,
        spec=ServerlessSpec(
            cloud=CloudProvider.AWS,
            region=AwsRegion.US_EAST_1
        ),
        vector_type=VectorType.DENSE,
        metric="cosine"
    )

index = pc.Index(index_name)


def encode_chunks_batch(chunks, batch_size=8):
    """Embed chunks in batches: mean-pool the last hidden state, then L2-normalize."""
    embeddings = []
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = tokenizer(batch_chunks, return_tensors='pt', padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            output = model(**inputs)
        # Mean-pool token embeddings into a single vector per chunk.
        batch_embeddings = output.last_hidden_state.mean(dim=1)
        # L2-normalize so cosine similarity in Pinecone behaves as a dot product.
        batch_embeddings = batch_embeddings / batch_embeddings.norm(dim=1, keepdim=True)
        embeddings.extend(batch_embeddings.cpu().numpy())
    return embeddings


def generate_chunk_id(pdf_file, chunk_text, chunk_idx):
    """Deterministic vector ID: source file name + MD5 of the chunk text + chunk index."""
    chunk_hash = hashlib.md5(chunk_text.encode('utf-8')).hexdigest()
    return f"{os.path.basename(pdf_file.name)}-{chunk_hash}-chunk-{chunk_idx}"


def process_pdfs(pdf_files):
    start_time = time.time()

    for pdf_file in pdf_files:
        reader = PdfReader(pdf_file.name)
        pdf_text = "".join(page.extract_text() for page in reader.pages if page.extract_text())

        # Split the extracted text into fixed 500-character chunks (no overlap).
        chunks = [pdf_text[i:i + 500] for i in range(0, len(pdf_text), 500)]

        yield "Processing file, generating embeddings, and pushing to Pinecone... Please wait."

        embeddings = encode_chunks_batch(chunks, batch_size=8)

        vectors = [
            (generate_chunk_id(pdf_file, chunk, idx), embedding.tolist(), {"text": chunk})
            for idx, (embedding, chunk) in enumerate(zip(embeddings, chunks))
        ]

        # Upsert in slices of 100 to stay under Pinecone's per-request size limits.
        for j in range(0, len(vectors), 100):
            index.upsert(vectors[j:j + 100])

    stats = index.describe_index_stats()
    elapsed_time = time.time() - start_time
    yield f"Processed PDFs and stored embeddings in Pinecone in {elapsed_time:.2f} seconds. Current index stats: {stats}"


demo = gr.Interface(
    fn=process_pdfs,
    inputs=gr.Files(label="Upload PDFs", file_types=[".pdf"]),
    outputs="text",
    title="NASA Bi-encoder PDF Embedding & Pinecone Storage",
    description="Upload PDF files to generate embeddings with the NASA bi-encoder and store them in Pinecone."
)

demo.launch()