File size: 3,493 Bytes
9f95d04
2f964fa
a612ef1
 
 
c9a2e13
 
 
 
a77728f
4350956
a612ef1
c9a2e13
 
 
 
 
 
 
 
 
7694d83
c9a2e13
 
 
 
 
 
 
 
 
 
 
a612ef1
c9a2e13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a77728f
 
 
 
 
 
 
c9a2e13
b691349
4350956
 
c18941c
a612ef1
c18941c
c9a2e13
 
 
 
 
a612ef1
4350956
c18941c
c9a2e13
b691349
a612ef1
a77728f
c9a2e13
a77728f
c9a2e13
4350956
c18941c
b691349
 
a77728f
b6a6419
 
a612ef1
4350956
 
 
a612ef1
b691349
 
 
c18941c
b691349
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Re-build 250325

import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from PyPDF2 import PdfReader
from pinecone import Pinecone, ServerlessSpec, CloudProvider, AwsRegion, VectorType
import os
import hashlib
import time

# Load the NASA bi-encoder: tokenizer and encoder come from the same checkpoint
MODEL_ID = "nasa-impact/nasa-smd-ibm-st-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(MODEL_ID)

# Pinecone client, authenticated with the key read from the environment
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)

# Make sure the target index exists before opening a handle to it
index_name = "scdd-index"
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(
        cloud=CloudProvider.AWS,
        region=AwsRegion.US_EAST_1,
    )
    pc.create_index(
        name=index_name,
        # NOTE(review): 768 presumably matches the encoder's hidden size — confirm
        dimension=768,
        spec=serverless_spec,
        vector_type=VectorType.DENSE,
        metric="cosine",
    )

# Handle on the index, shared by the rest of the module
index = pc.Index(index_name)

# Function to encode text using bi-encoder in batches
def encode_chunks_batch(chunks, batch_size=8, max_length=128):
    """Embed text chunks with the module-level bi-encoder, one batch at a time.

    Each batch is tokenized (padded to its longest item, truncated to
    ``max_length`` tokens), run through ``model`` without gradient tracking,
    mean-pooled over the real (non-padding) tokens and L2-normalized.

    Args:
        chunks: Sequence of strings to embed.
        batch_size: Number of chunks per forward pass; must be >= 1.
        max_length: Token limit handed to the tokenizer for truncation.

    Returns:
        A list holding one NumPy vector per chunk, in input order. The list
        is empty when ``chunks`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return [].
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embeddings = []
    for start in range(0, len(chunks), batch_size):
        batch_chunks = chunks[start:start + batch_size]
        inputs = tokenizer(
            batch_chunks,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state
            # Weight every token by its attention mask so padding positions do
            # not leak into the average; otherwise a chunk's embedding would
            # depend on how long its batch neighbours are.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            token_count = mask.sum(dim=1).clamp(min=1e-9)
            batch_embeddings = token_sum / token_count
            # Unit-length vectors; the clamp avoids dividing by a zero norm.
            norms = batch_embeddings.norm(dim=1, keepdim=True).clamp(min=1e-12)
            batch_embeddings = batch_embeddings / norms
        # Extending with the 2-D array appends one row (vector) per chunk.
        embeddings.extend(batch_embeddings.cpu().numpy())
    return embeddings

# Function to build a deterministic chunk ID from file name, chunk text and position
def generate_chunk_id(pdf_file, chunk_text, chunk_idx):
    """Return the record ID for one chunk of an uploaded PDF.

    The ID has the form ``<file basename>-<md5 of chunk text>-chunk-<index>``.
    The digest covers only ``chunk_text`` (UTF-8 encoded), not the whole file.

    Args:
        pdf_file: Object whose ``name`` attribute is the PDF's path.
        chunk_text: Text of the chunk being identified.
        chunk_idx: Position of the chunk within its file.

    Returns:
        The ID as a string.
    """
    source_name = os.path.basename(pdf_file.name)
    text_digest = hashlib.md5(chunk_text.encode('utf-8')).hexdigest()
    return "-".join((source_name, text_digest, "chunk", str(chunk_idx)))

# Characters per text chunk, and records sent per Pinecone upsert request.
_CHUNK_SIZE = 500
_UPSERT_BATCH_SIZE = 100


# Function to process PDFs and upsert embeddings to Pinecone
def process_pdfs(pdf_files):
    """Extract text from uploaded PDFs, embed it and store it in Pinecone.

    This is a generator: it yields status strings as work progresses and
    finishes with a summary that includes the elapsed time and the current
    index statistics.

    For every file the text of all pages is concatenated, cut into
    fixed-size character chunks, embedded with ``encode_chunks_batch`` and
    upserted with the chunk text stored under the ``"text"`` metadata key.

    Args:
        pdf_files: Iterable of uploaded file objects exposing the file path
            as ``.name``. ``None`` or an empty collection is reported to the
            user instead of being processed.

    Yields:
        Human-readable progress and result messages.
    """
    if not pdf_files:
        # Nothing was uploaded: report it rather than failing on None or
        # claiming success for zero files.
        yield "No PDF files were uploaded."
        return

    start_time = time.time()
    skipped = []

    for pdf_file in pdf_files:
        reader = PdfReader(pdf_file.name)
        # Extract each page once; text extraction is the expensive step.
        page_texts = (page.extract_text() for page in reader.pages)
        pdf_text = "".join(text for text in page_texts if text)

        # Split text into smaller chunks
        chunks = [
            pdf_text[i:i + _CHUNK_SIZE]
            for i in range(0, len(pdf_text), _CHUNK_SIZE)
        ]
        if not chunks:
            # No extractable text (e.g. a scanned PDF): there is nothing to
            # embed, and an empty upsert request would be pointless.
            skipped.append(os.path.basename(pdf_file.name))
            continue

        yield "Processing file, generating Embeddings and pushing to Pinecone...Please wait..."

        # Generate embeddings in batches
        embeddings = encode_chunks_batch(chunks, batch_size=8)

        # Prepare data for Pinecone with unique IDs
        vectors = [
            (generate_chunk_id(pdf_file, chunk, idx), embedding.tolist(), {"text": chunk})
            for idx, (embedding, chunk) in enumerate(zip(embeddings, chunks))
        ]

        # Upsert in bounded batches: Pinecone documents per-request limits
        # for upserts, so a large PDF must not be sent as a single request.
        for start in range(0, len(vectors), _UPSERT_BATCH_SIZE):
            index.upsert(vectors[start:start + _UPSERT_BATCH_SIZE])

    # Fetch index stats
    stats = index.describe_index_stats()

    elapsed_time = time.time() - start_time

    message = f"Processed PDF and embeddings stored in Pinecone successfully in {elapsed_time:.2f} seconds. Current Index Stats: {stats}"
    if skipped:
        message += f" Skipped (no extractable text): {', '.join(skipped)}."
    yield message

# Gradio UI: a PDF upload input wired to process_pdfs, with plain-text output
pdf_upload = gr.Files(label="Upload PDF", file_types=[".pdf"])

demo = gr.Interface(
    fn=process_pdfs,
    inputs=pdf_upload,
    outputs="text",
    title="NASA Bi-encoder PDF Embedding & Pinecone Storage",
    description="Upload PDF files to generate embeddings with NASA Bi-encoder and store in Pinecone.",
)

# Start the app
demo.launch()