aquibmoin committed on
Commit
4350956
·
verified ·
1 Parent(s): c18941c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -11
app.py CHANGED
@@ -8,6 +8,7 @@ from PyPDF2 import PdfReader
8
  from pinecone import Pinecone, ServerlessSpec, CloudProvider, AwsRegion, VectorType
9
  import os
10
  import hashlib
 
11
 
12
  # Load NASA-specific bi-encoder model
13
  tokenizer = AutoTokenizer.from_pretrained("nasa-impact/nasa-smd-ibm-st-v2")
@@ -56,33 +57,27 @@ def generate_chunk_id(pdf_file, chunk_text, chunk_idx):
56
 
57
  # Function to process PDFs and upsert embeddings to Pinecone
58
  def process_pdfs(pdf_files):
 
 
59
 
60
  for pdf_file in pdf_files:
61
-
62
- yield "Reading PDF..."
63
 
64
  reader = PdfReader(pdf_file.name)
65
  pdf_text = "".join(page.extract_text() for page in reader.pages if page.extract_text())
66
-
67
- yield "Processing PDF..."
68
 
69
  # Split text into smaller chunks
70
  chunks = [pdf_text[i:i+500] for i in range(0, len(pdf_text), 500)]
71
 
72
- yield "Generating Embeddings..."
73
 
74
  # Generate embeddings in batches
75
  embeddings = encode_chunks_batch(chunks, batch_size=8)
76
 
77
- yield "Embeddings generated successfully...Preparing..."
78
-
79
  # Prepare data for Pinecone with unique IDs
80
  vectors = [
81
  (generate_chunk_id(pdf_file, chunk, idx), embedding.tolist(), {"text": chunk})
82
  for idx, (embedding, chunk) in enumerate(zip(embeddings, chunks))
83
- ]
84
-
85
- yield "Pushing to Pinecone...Please wait"
86
 
87
  # Upsert embeddings into Pinecone
88
  index.upsert(vectors)
@@ -90,7 +85,9 @@ def process_pdfs(pdf_files):
90
  # Fetch index stats
91
  stats = index.describe_index_stats()
92
 
93
- yield f"Processed {len(pdf_files)} PDF(s) successfully and embeddings stored in Pinecone. Current Index Stats: {stats}"
 
 
94
 
95
  # Gradio Interface
96
  demo = gr.Interface(
 
8
  from pinecone import Pinecone, ServerlessSpec, CloudProvider, AwsRegion, VectorType
9
  import os
10
  import hashlib
11
+ import time
12
 
13
  # Load NASA-specific bi-encoder model
14
  tokenizer = AutoTokenizer.from_pretrained("nasa-impact/nasa-smd-ibm-st-v2")
 
57
 
58
  # Function to process PDFs and upsert embeddings to Pinecone
59
  def process_pdfs(pdf_files):
60
+
61
+ start_time = time.time()
62
 
63
  for pdf_file in pdf_files:
 
 
64
 
65
  reader = PdfReader(pdf_file.name)
66
  pdf_text = "".join(page.extract_text() for page in reader.pages if page.extract_text())
 
 
67
 
68
  # Split text into smaller chunks
69
  chunks = [pdf_text[i:i+500] for i in range(0, len(pdf_text), 500)]
70
 
71
+ yield "Processing file, generating Embeddings and pushing to Pinecone...Please wait..."
72
 
73
  # Generate embeddings in batches
74
  embeddings = encode_chunks_batch(chunks, batch_size=8)
75
 
 
 
76
  # Prepare data for Pinecone with unique IDs
77
  vectors = [
78
  (generate_chunk_id(pdf_file, chunk, idx), embedding.tolist(), {"text": chunk})
79
  for idx, (embedding, chunk) in enumerate(zip(embeddings, chunks))
80
+ ]
 
 
81
 
82
  # Upsert embeddings into Pinecone
83
  index.upsert(vectors)
 
85
  # Fetch index stats
86
  stats = index.describe_index_stats()
87
 
88
+ elapsed_time = time.time() - start_time
89
+
90
+ yield f"Processed PDF and embeddings stored in Pinecone successfully in {elapsed_time:.2f} seconds. Current Index Stats: {stats}"
91
 
92
  # Gradio Interface
93
  demo = gr.Interface(