Spaces:
Running
Running
third commit
Browse files- app/data_processor.py +174 -88
- app/evaluation.py +73 -7
- app/generate_ground_truth.py +56 -39
- app/main.py +117 -239
- app/transcript_extractor.py +71 -51
- data/sqlite.db +0 -0
- docker-compose.yaml +10 -2
- requirements.txt +3 -1
- run-docker-compose-windows.ps1 +20 -3
app/data_processor.py
CHANGED
@@ -1,15 +1,13 @@
|
|
1 |
-
import logging
|
2 |
from minsearch import Index
|
3 |
from sentence_transformers import SentenceTransformer
|
4 |
import numpy as np
|
5 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
-
import re
|
7 |
from elasticsearch import Elasticsearch
|
8 |
import os
|
9 |
import json
|
10 |
-
|
|
|
11 |
|
12 |
-
logging.basicConfig(level=logging.
|
13 |
logger = logging.getLogger(__name__)
|
14 |
|
15 |
def clean_text(text):
|
@@ -18,13 +16,17 @@ def clean_text(text):
|
|
18 |
return ""
|
19 |
cleaned = re.sub(r'[^\w\s.,!?]', ' ', text)
|
20 |
cleaned = re.sub(r'\s+', ' ', cleaned).strip()
|
21 |
-
logger.
|
|
|
22 |
return cleaned
|
23 |
|
24 |
class DataProcessor:
|
25 |
def __init__(self, text_fields=["content", "title", "description"],
|
26 |
keyword_fields=["video_id", "author", "upload_date"],
|
27 |
-
embedding_model="
|
|
|
|
|
|
|
28 |
self.text_index = Index(text_fields=text_fields, keyword_fields=keyword_fields)
|
29 |
self.embedding_model = SentenceTransformer(embedding_model)
|
30 |
self.documents = []
|
@@ -39,18 +41,29 @@ class DataProcessor:
|
|
39 |
logger.info(f"DataProcessor initialized with Elasticsearch at {elasticsearch_host}:{elasticsearch_port}")
|
40 |
|
41 |
def process_transcript(self, video_id, transcript_data):
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
return None
|
45 |
|
46 |
metadata = transcript_data['metadata']
|
47 |
transcript = transcript_data['transcript']
|
48 |
|
49 |
-
logger.info(f"Processing transcript for video {video_id}")
|
50 |
logger.info(f"Number of transcript segments: {len(transcript)}")
|
51 |
|
52 |
full_transcript = " ".join([segment.get('text', '') for segment in transcript])
|
|
|
|
|
|
|
53 |
cleaned_transcript = clean_text(full_transcript)
|
|
|
|
|
54 |
|
55 |
if not cleaned_transcript:
|
56 |
logger.warning(f"Empty cleaned transcript for video {video_id}")
|
@@ -59,17 +72,25 @@ class DataProcessor:
|
|
59 |
doc = {
|
60 |
"video_id": video_id,
|
61 |
"content": cleaned_transcript,
|
62 |
-
"segment_id": f"{video_id}_full",
|
63 |
"title": clean_text(metadata.get('title', '')),
|
|
|
64 |
"author": metadata.get('author', ''),
|
65 |
"upload_date": metadata.get('upload_date', ''),
|
|
|
66 |
"view_count": metadata.get('view_count', 0),
|
67 |
"like_count": metadata.get('like_count', 0),
|
68 |
"comment_count": metadata.get('comment_count', 0),
|
69 |
"video_duration": metadata.get('duration', '')
|
70 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
self.documents.append(doc)
|
72 |
-
self.
|
|
|
73 |
|
74 |
logger.info(f"Processed transcript for video {video_id}")
|
75 |
return f"video_{video_id}_{self.embedding_model.get_sentence_embedding_dimension()}"
|
@@ -80,28 +101,52 @@ class DataProcessor:
|
|
80 |
return None
|
81 |
|
82 |
logger.info(f"Building index with {len(self.documents)} documents")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
try:
|
84 |
-
|
|
|
85 |
self.index_built = True
|
86 |
logger.info("Text index built successfully")
|
87 |
except Exception as e:
|
88 |
logger.error(f"Error building text index: {str(e)}")
|
89 |
raise
|
90 |
|
91 |
-
self.embeddings = np.array(self.embeddings)
|
92 |
-
|
93 |
try:
|
94 |
if not self.es.indices.exists(index=index_name):
|
95 |
self.es.indices.create(index=index_name, body={
|
96 |
"mappings": {
|
97 |
"properties": {
|
98 |
-
"embedding": {"type": "dense_vector", "dims": self.embeddings
|
99 |
"content": {"type": "text"},
|
100 |
-
"video_id": {"type": "keyword"},
|
101 |
-
"segment_id": {"type": "keyword"},
|
102 |
"title": {"type": "text"},
|
|
|
|
|
103 |
"author": {"type": "keyword"},
|
104 |
"upload_date": {"type": "date"},
|
|
|
105 |
"view_count": {"type": "integer"},
|
106 |
"like_count": {"type": "integer"},
|
107 |
"comment_count": {"type": "integer"},
|
@@ -122,19 +167,71 @@ class DataProcessor:
|
|
122 |
except Exception as e:
|
123 |
logger.error(f"Error building Elasticsearch index: {str(e)}")
|
124 |
raise
|
|
|
|
|
|
|
125 |
|
126 |
-
def
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
def search(self, query, filter_dict={}, boost_dict={}, num_results=10, method='hybrid', index_name=None):
|
140 |
if not index_name:
|
@@ -147,78 +244,67 @@ class DataProcessor:
|
|
147 |
|
148 |
logger.info(f"Performing {method} search for query: {query} in index: {index_name}")
|
149 |
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
|
|
158 |
|
159 |
def text_search(self, query, filter_dict={}, boost_dict={}, num_results=10, index_name=None):
|
160 |
if not index_name:
|
161 |
logger.error("No index name provided for text search.")
|
162 |
raise ValueError("No index name provided for text search.")
|
163 |
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
|
|
|
|
|
|
176 |
|
177 |
def embedding_search(self, query, num_results=10, index_name=None):
|
178 |
if not index_name:
|
179 |
logger.error("No index name provided for embedding search.")
|
180 |
raise ValueError("No index name provided for embedding search.")
|
181 |
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
"
|
186 |
-
|
187 |
-
"
|
188 |
-
|
|
|
|
|
189 |
}
|
190 |
}
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
for i in range(max(len(text_results), len(embedding_results))):
|
205 |
-
if i < len(text_results):
|
206 |
-
combined.append(text_results[i])
|
207 |
-
if i < len(embedding_results):
|
208 |
-
combined.append(embedding_results[i])
|
209 |
-
|
210 |
-
seen = set()
|
211 |
-
deduped = []
|
212 |
-
for doc in combined:
|
213 |
-
if doc['segment_id'] not in seen:
|
214 |
-
seen.add(doc['segment_id'])
|
215 |
-
deduped.append(doc)
|
216 |
-
|
217 |
-
return deduped[:num_results]
|
218 |
-
|
219 |
-
def process_query(self, query):
|
220 |
-
return clean_text(query)
|
221 |
-
|
222 |
def set_embedding_model(self, model_name):
|
223 |
self.embedding_model = SentenceTransformer(model_name)
|
224 |
logger.info(f"Embedding model set to: {model_name}")
|
|
|
|
|
1 |
from minsearch import Index
|
2 |
from sentence_transformers import SentenceTransformer
|
3 |
import numpy as np
|
|
|
|
|
4 |
from elasticsearch import Elasticsearch
|
5 |
import os
|
6 |
import json
|
7 |
+
import logging
|
8 |
+
import re
|
9 |
|
10 |
+
logging.basicConfig(level=logging.DEBUG)
|
11 |
logger = logging.getLogger(__name__)
|
12 |
|
13 |
def clean_text(text):
|
|
|
16 |
return ""
|
17 |
cleaned = re.sub(r'[^\w\s.,!?]', ' ', text)
|
18 |
cleaned = re.sub(r'\s+', ' ', cleaned).strip()
|
19 |
+
logger.debug(f"Original text length: {len(text)}, Cleaned text length: {len(cleaned)}")
|
20 |
+
logger.debug(f"Cleaned text sample: '{cleaned[:100]}...'")
|
21 |
return cleaned
|
22 |
|
23 |
class DataProcessor:
|
24 |
def __init__(self, text_fields=["content", "title", "description"],
|
25 |
keyword_fields=["video_id", "author", "upload_date"],
|
26 |
+
embedding_model="multi-qa-MiniLM-L6-cos-v1"):
|
27 |
+
self.text_fields = text_fields
|
28 |
+
self.keyword_fields = keyword_fields
|
29 |
+
self.all_fields = text_fields + keyword_fields
|
30 |
self.text_index = Index(text_fields=text_fields, keyword_fields=keyword_fields)
|
31 |
self.embedding_model = SentenceTransformer(embedding_model)
|
32 |
self.documents = []
|
|
|
41 |
logger.info(f"DataProcessor initialized with Elasticsearch at {elasticsearch_host}:{elasticsearch_port}")
|
42 |
|
43 |
def process_transcript(self, video_id, transcript_data):
|
44 |
+
logger.info(f"Processing transcript for video {video_id}")
|
45 |
+
|
46 |
+
if not transcript_data:
|
47 |
+
logger.error(f"Transcript data is None for video {video_id}")
|
48 |
+
return None
|
49 |
+
|
50 |
+
if 'metadata' not in transcript_data or 'transcript' not in transcript_data:
|
51 |
+
logger.error(f"Invalid transcript data structure for video {video_id}")
|
52 |
+
logger.debug(f"Transcript data keys: {transcript_data.keys()}")
|
53 |
return None
|
54 |
|
55 |
metadata = transcript_data['metadata']
|
56 |
transcript = transcript_data['transcript']
|
57 |
|
|
|
58 |
logger.info(f"Number of transcript segments: {len(transcript)}")
|
59 |
|
60 |
full_transcript = " ".join([segment.get('text', '') for segment in transcript])
|
61 |
+
logger.debug(f"Full transcript length before cleaning: {len(full_transcript)}")
|
62 |
+
logger.debug(f"Full transcript sample before cleaning: '{full_transcript[:500]}...'")
|
63 |
+
|
64 |
cleaned_transcript = clean_text(full_transcript)
|
65 |
+
logger.debug(f"Cleaned transcript length: {len(cleaned_transcript)}")
|
66 |
+
logger.debug(f"Cleaned transcript sample: '{cleaned_transcript[:500]}...'")
|
67 |
|
68 |
if not cleaned_transcript:
|
69 |
logger.warning(f"Empty cleaned transcript for video {video_id}")
|
|
|
72 |
doc = {
|
73 |
"video_id": video_id,
|
74 |
"content": cleaned_transcript,
|
|
|
75 |
"title": clean_text(metadata.get('title', '')),
|
76 |
+
"description": clean_text(metadata.get('description', 'Not Available')),
|
77 |
"author": metadata.get('author', ''),
|
78 |
"upload_date": metadata.get('upload_date', ''),
|
79 |
+
"segment_id": f"{video_id}_full",
|
80 |
"view_count": metadata.get('view_count', 0),
|
81 |
"like_count": metadata.get('like_count', 0),
|
82 |
"comment_count": metadata.get('comment_count', 0),
|
83 |
"video_duration": metadata.get('duration', '')
|
84 |
}
|
85 |
+
|
86 |
+
logger.debug(f"Document created for video {video_id}")
|
87 |
+
for field in self.all_fields:
|
88 |
+
logger.debug(f"Document {field} length: {len(str(doc.get(field, '')))}")
|
89 |
+
logger.debug(f"Document {field} sample: '{str(doc.get(field, ''))[:100]}...'")
|
90 |
+
|
91 |
self.documents.append(doc)
|
92 |
+
embedding = self.embedding_model.encode(cleaned_transcript + " " + metadata.get('title', ''))
|
93 |
+
self.embeddings.append(embedding)
|
94 |
|
95 |
logger.info(f"Processed transcript for video {video_id}")
|
96 |
return f"video_{video_id}_{self.embedding_model.get_sentence_embedding_dimension()}"
|
|
|
101 |
return None
|
102 |
|
103 |
logger.info(f"Building index with {len(self.documents)} documents")
|
104 |
+
|
105 |
+
# Fields to include in the fit function
|
106 |
+
index_fields = self.text_fields + self.keyword_fields
|
107 |
+
|
108 |
+
# Create a list of dictionaries with only the fields we want to index
|
109 |
+
docs_to_index = []
|
110 |
+
for doc in self.documents:
|
111 |
+
indexed_doc = {field: doc.get(field, '') for field in index_fields}
|
112 |
+
if all(indexed_doc.values()): # Check if all required fields have values
|
113 |
+
docs_to_index.append(indexed_doc)
|
114 |
+
else:
|
115 |
+
missing_fields = [field for field, value in indexed_doc.items() if not value]
|
116 |
+
logger.warning(f"Document with video_id {doc.get('video_id', 'unknown')} is missing values for fields: {missing_fields}")
|
117 |
+
|
118 |
+
if not docs_to_index:
|
119 |
+
logger.error("No valid documents to index")
|
120 |
+
return None
|
121 |
+
|
122 |
+
logger.info(f"Number of valid documents to index: {len(docs_to_index)}")
|
123 |
+
|
124 |
+
# Log the structure of the first document to be indexed
|
125 |
+
logger.debug("Structure of the first document to be indexed:")
|
126 |
+
logger.debug(json.dumps(docs_to_index[0], indent=2))
|
127 |
+
|
128 |
try:
|
129 |
+
logger.info("Fitting text index")
|
130 |
+
self.text_index.fit(docs_to_index)
|
131 |
self.index_built = True
|
132 |
logger.info("Text index built successfully")
|
133 |
except Exception as e:
|
134 |
logger.error(f"Error building text index: {str(e)}")
|
135 |
raise
|
136 |
|
|
|
|
|
137 |
try:
|
138 |
if not self.es.indices.exists(index=index_name):
|
139 |
self.es.indices.create(index=index_name, body={
|
140 |
"mappings": {
|
141 |
"properties": {
|
142 |
+
"embedding": {"type": "dense_vector", "dims": len(self.embeddings[0]), "index": True, "similarity": "cosine"},
|
143 |
"content": {"type": "text"},
|
|
|
|
|
144 |
"title": {"type": "text"},
|
145 |
+
"description": {"type": "text"},
|
146 |
+
"video_id": {"type": "keyword"},
|
147 |
"author": {"type": "keyword"},
|
148 |
"upload_date": {"type": "date"},
|
149 |
+
"segment_id": {"type": "keyword"},
|
150 |
"view_count": {"type": "integer"},
|
151 |
"like_count": {"type": "integer"},
|
152 |
"comment_count": {"type": "integer"},
|
|
|
167 |
except Exception as e:
|
168 |
logger.error(f"Error building Elasticsearch index: {str(e)}")
|
169 |
raise
|
170 |
+
|
171 |
+
def compute_rrf(self, rank, k=60):
|
172 |
+
return 1 / (k + rank)
|
173 |
|
174 |
+
def hybrid_search(self, query, index_name, num_results=5):
|
175 |
+
if not index_name:
|
176 |
+
logger.error("No index name provided for hybrid search.")
|
177 |
+
raise ValueError("No index name provided for hybrid search.")
|
178 |
+
|
179 |
+
vector = self.embedding_model.encode(query)
|
180 |
+
|
181 |
+
knn_query = {
|
182 |
+
"field": "embedding",
|
183 |
+
"query_vector": vector.tolist(),
|
184 |
+
"k": 10,
|
185 |
+
"num_candidates": 100
|
186 |
+
}
|
187 |
+
|
188 |
+
keyword_query = {
|
189 |
+
"multi_match": {
|
190 |
+
"query": query,
|
191 |
+
"fields": self.text_fields
|
192 |
+
}
|
193 |
+
}
|
194 |
+
|
195 |
+
try:
|
196 |
+
knn_results = self.es.search(
|
197 |
+
index=index_name,
|
198 |
+
body={
|
199 |
+
"knn": knn_query,
|
200 |
+
"size": 10
|
201 |
+
}
|
202 |
+
)['hits']['hits']
|
203 |
+
|
204 |
+
keyword_results = self.es.search(
|
205 |
+
index=index_name,
|
206 |
+
body={
|
207 |
+
"query": keyword_query,
|
208 |
+
"size": 10
|
209 |
+
}
|
210 |
+
)['hits']['hits']
|
211 |
+
|
212 |
+
rrf_scores = {}
|
213 |
+
for rank, hit in enumerate(knn_results):
|
214 |
+
doc_id = hit['_id']
|
215 |
+
rrf_scores[doc_id] = self.compute_rrf(rank + 1)
|
216 |
+
|
217 |
+
for rank, hit in enumerate(keyword_results):
|
218 |
+
doc_id = hit['_id']
|
219 |
+
if doc_id in rrf_scores:
|
220 |
+
rrf_scores[doc_id] += self.compute_rrf(rank + 1)
|
221 |
+
else:
|
222 |
+
rrf_scores[doc_id] = self.compute_rrf(rank + 1)
|
223 |
+
|
224 |
+
reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
|
225 |
+
|
226 |
+
final_results = []
|
227 |
+
for doc_id, score in reranked_docs[:num_results]:
|
228 |
+
doc = self.es.get(index=index_name, id=doc_id)
|
229 |
+
final_results.append(doc['_source'])
|
230 |
+
|
231 |
+
return final_results
|
232 |
+
except Exception as e:
|
233 |
+
logger.error(f"Error in hybrid search: {str(e)}")
|
234 |
+
raise
|
235 |
|
236 |
def search(self, query, filter_dict={}, boost_dict={}, num_results=10, method='hybrid', index_name=None):
|
237 |
if not index_name:
|
|
|
244 |
|
245 |
logger.info(f"Performing {method} search for query: {query} in index: {index_name}")
|
246 |
|
247 |
+
try:
|
248 |
+
if method == 'text':
|
249 |
+
return self.text_search(query, filter_dict, boost_dict, num_results, index_name)
|
250 |
+
elif method == 'embedding':
|
251 |
+
return self.embedding_search(query, num_results, index_name)
|
252 |
+
else: # hybrid search
|
253 |
+
return self.hybrid_search(query, index_name, num_results)
|
254 |
+
except Exception as e:
|
255 |
+
logger.error(f"Error in search method {method}: {str(e)}")
|
256 |
+
raise
|
257 |
|
258 |
def text_search(self, query, filter_dict={}, boost_dict={}, num_results=10, index_name=None):
|
259 |
if not index_name:
|
260 |
logger.error("No index name provided for text search.")
|
261 |
raise ValueError("No index name provided for text search.")
|
262 |
|
263 |
+
try:
|
264 |
+
search_body = {
|
265 |
+
"query": {
|
266 |
+
"multi_match": {
|
267 |
+
"query": query,
|
268 |
+
"fields": self.text_fields
|
269 |
+
}
|
270 |
+
},
|
271 |
+
"size": num_results
|
272 |
+
}
|
273 |
+
response = self.es.search(index=index_name, body=search_body)
|
274 |
+
return [hit['_source'] for hit in response['hits']['hits']]
|
275 |
+
except Exception as e:
|
276 |
+
logger.error(f"Error in text search: {str(e)}")
|
277 |
+
raise
|
278 |
|
279 |
def embedding_search(self, query, num_results=10, index_name=None):
|
280 |
if not index_name:
|
281 |
logger.error("No index name provided for embedding search.")
|
282 |
raise ValueError("No index name provided for embedding search.")
|
283 |
|
284 |
+
try:
|
285 |
+
query_vector = self.embedding_model.encode(query).tolist()
|
286 |
+
script_query = {
|
287 |
+
"script_score": {
|
288 |
+
"query": {"match_all": {}},
|
289 |
+
"script": {
|
290 |
+
"source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
|
291 |
+
"params": {"query_vector": query_vector}
|
292 |
+
}
|
293 |
}
|
294 |
}
|
295 |
+
response = self.es.search(
|
296 |
+
index=index_name,
|
297 |
+
body={
|
298 |
+
"size": num_results,
|
299 |
+
"query": script_query,
|
300 |
+
"_source": {"excludes": ["embedding"]}
|
301 |
+
}
|
302 |
+
)
|
303 |
+
return [hit['_source'] for hit in response['hits']['hits']]
|
304 |
+
except Exception as e:
|
305 |
+
logger.error(f"Error in embedding search: {str(e)}")
|
306 |
+
raise
|
307 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
308 |
def set_embedding_model(self, model_name):
|
309 |
self.embedding_model = SentenceTransformer(model_name)
|
310 |
logger.info(f"Embedding model set to: {model_name}")
|
app/evaluation.py
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
from sklearn.metrics.pairwise import cosine_similarity
|
2 |
import numpy as np
|
|
|
|
|
|
|
3 |
|
4 |
class EvaluationSystem:
|
5 |
def __init__(self, data_processor, database_handler):
|
@@ -7,15 +10,15 @@ class EvaluationSystem:
|
|
7 |
self.db_handler = database_handler
|
8 |
|
9 |
def relevance_scoring(self, query, retrieved_docs, top_k=5):
|
10 |
-
query_embedding = self.data_processor.
|
11 |
-
doc_embeddings = [self.data_processor.
|
12 |
|
13 |
similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
|
14 |
return np.mean(sorted(similarities, reverse=True)[:top_k])
|
15 |
|
16 |
def answer_similarity(self, generated_answer, reference_answer):
|
17 |
-
gen_embedding = self.data_processor.
|
18 |
-
ref_embedding = self.data_processor.
|
19 |
return cosine_similarity([gen_embedding], [ref_embedding])[0][0]
|
20 |
|
21 |
def human_evaluation(self, video_id, query):
|
@@ -34,8 +37,8 @@ class EvaluationSystem:
|
|
34 |
human_scores = []
|
35 |
|
36 |
for query, reference in zip(test_queries, reference_answers):
|
37 |
-
retrieved_docs = rag_system.
|
38 |
-
generated_answer = rag_system.query(
|
39 |
|
40 |
relevance_scores.append(self.relevance_scoring(query, retrieved_docs))
|
41 |
similarity_scores.append(self.answer_similarity(generated_answer, reference))
|
@@ -45,4 +48,67 @@ class EvaluationSystem:
|
|
45 |
"avg_relevance_score": np.mean(relevance_scores),
|
46 |
"avg_similarity_score": np.mean(similarity_scores),
|
47 |
"avg_human_score": np.mean(human_scores)
|
48 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from sklearn.metrics.pairwise import cosine_similarity
|
2 |
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import json
|
5 |
+
import ollama
|
6 |
|
7 |
class EvaluationSystem:
|
8 |
def __init__(self, data_processor, database_handler):
|
|
|
10 |
self.db_handler = database_handler
|
11 |
|
12 |
def relevance_scoring(self, query, retrieved_docs, top_k=5):
|
13 |
+
query_embedding = self.data_processor.embedding_model.encode(query)
|
14 |
+
doc_embeddings = [self.data_processor.embedding_model.encode(doc['content']) for doc in retrieved_docs]
|
15 |
|
16 |
similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
|
17 |
return np.mean(sorted(similarities, reverse=True)[:top_k])
|
18 |
|
19 |
def answer_similarity(self, generated_answer, reference_answer):
|
20 |
+
gen_embedding = self.data_processor.embedding_model.encode(generated_answer)
|
21 |
+
ref_embedding = self.data_processor.embedding_model.encode(reference_answer)
|
22 |
return cosine_similarity([gen_embedding], [ref_embedding])[0][0]
|
23 |
|
24 |
def human_evaluation(self, video_id, query):
|
|
|
37 |
human_scores = []
|
38 |
|
39 |
for query, reference in zip(test_queries, reference_answers):
|
40 |
+
retrieved_docs = rag_system.data_processor.search(query, num_results=5, method='hybrid', index_name=index_name)
|
41 |
+
generated_answer, _ = rag_system.query(query, search_method='hybrid', index_name=index_name)
|
42 |
|
43 |
relevance_scores.append(self.relevance_scoring(query, retrieved_docs))
|
44 |
similarity_scores.append(self.answer_similarity(generated_answer, reference))
|
|
|
48 |
"avg_relevance_score": np.mean(relevance_scores),
|
49 |
"avg_similarity_score": np.mean(similarity_scores),
|
50 |
"avg_human_score": np.mean(human_scores)
|
51 |
+
}
|
52 |
+
|
53 |
+
def llm_as_judge(self, question, generated_answer, prompt_template):
|
54 |
+
prompt = prompt_template.format(question=question, answer_llm=generated_answer)
|
55 |
+
|
56 |
+
try:
|
57 |
+
response = ollama.chat(
|
58 |
+
model='phi3.5',
|
59 |
+
messages=[{"role": "user", "content": prompt}]
|
60 |
+
)
|
61 |
+
evaluation = json.loads(response['message']['content'])
|
62 |
+
return evaluation
|
63 |
+
except Exception as e:
|
64 |
+
print(f"Error in LLM evaluation: {str(e)}")
|
65 |
+
return None
|
66 |
+
|
67 |
+
def evaluate_rag(self, rag_system, ground_truth_file, sample_size=200, prompt_template=None):
|
68 |
+
try:
|
69 |
+
ground_truth = pd.read_csv(ground_truth_file)
|
70 |
+
except FileNotFoundError:
|
71 |
+
print("Ground truth file not found. Please generate ground truth data first.")
|
72 |
+
return None
|
73 |
+
|
74 |
+
sample = ground_truth.sample(n=min(sample_size, len(ground_truth)), random_state=1)
|
75 |
+
evaluations = []
|
76 |
+
|
77 |
+
for _, row in sample.iterrows():
|
78 |
+
question = row['question']
|
79 |
+
video_id = row['video_id']
|
80 |
+
|
81 |
+
index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id, "multi-qa-MiniLM-L6-cos-v1")
|
82 |
+
|
83 |
+
if not index_name:
|
84 |
+
print(f"No index found for video {video_id}. Skipping this question.")
|
85 |
+
continue
|
86 |
+
|
87 |
+
try:
|
88 |
+
answer_llm, _ = rag_system.query(question, search_method='hybrid', index_name=index_name)
|
89 |
+
except ValueError as e:
|
90 |
+
print(f"Error querying RAG system: {str(e)}")
|
91 |
+
continue
|
92 |
+
|
93 |
+
if prompt_template:
|
94 |
+
evaluation = self.llm_as_judge(question, answer_llm, prompt_template)
|
95 |
+
if evaluation:
|
96 |
+
evaluations.append((
|
97 |
+
str(video_id),
|
98 |
+
str(question),
|
99 |
+
str(answer_llm),
|
100 |
+
str(evaluation.get('Relevance', 'UNKNOWN')),
|
101 |
+
str(evaluation.get('Explanation', 'No explanation provided'))
|
102 |
+
))
|
103 |
+
else:
|
104 |
+
# Fallback to cosine similarity if no prompt template is provided
|
105 |
+
similarity = self.answer_similarity(answer_llm, row.get('reference_answer', ''))
|
106 |
+
evaluations.append((
|
107 |
+
str(video_id),
|
108 |
+
str(question),
|
109 |
+
str(answer_llm),
|
110 |
+
f"Similarity: {similarity}",
|
111 |
+
"Cosine similarity used for evaluation"
|
112 |
+
))
|
113 |
+
|
114 |
+
return evaluations
|
app/generate_ground_truth.py
CHANGED
@@ -1,27 +1,16 @@
|
|
1 |
-
import os
|
2 |
import pandas as pd
|
3 |
import json
|
4 |
-
from youtube_transcript_api import YouTubeTranscriptApi
|
5 |
from tqdm import tqdm
|
6 |
-
import
|
7 |
-
|
8 |
-
OLLAMA_HOST = os.getenv('OLLAMA_HOST', 'localhost')
|
9 |
-
OLLAMA_PORT = os.getenv('OLLAMA_PORT', '11434')
|
10 |
-
|
11 |
-
def get_transcript(video_id):
|
12 |
-
try:
|
13 |
-
transcript = YouTubeTranscriptApi.get_transcript(video_id)
|
14 |
-
return " ".join([entry['text'] for entry in transcript])
|
15 |
-
except Exception as e:
|
16 |
-
print(f"Error extracting transcript for video {video_id}: {str(e)}")
|
17 |
-
return None
|
18 |
|
19 |
def generate_questions(transcript):
|
20 |
prompt_template = """
|
21 |
You are an AI assistant tasked with generating questions based on a YouTube video transcript.
|
22 |
-
Formulate 10 questions that a user might ask based on the provided transcript.
|
23 |
Make the questions specific to the content of the transcript.
|
24 |
The questions should be complete and not too short. Use as few words as possible from the transcript.
|
|
|
25 |
|
26 |
The transcript:
|
27 |
|
@@ -34,34 +23,62 @@ def generate_questions(transcript):
|
|
34 |
|
35 |
prompt = prompt_template.format(transcript=transcript)
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
else:
|
45 |
-
print(f"
|
46 |
return None
|
47 |
|
48 |
-
|
49 |
-
video_id = "zjkBMFhNj_g"
|
50 |
-
transcript = get_transcript(video_id)
|
51 |
|
52 |
-
if
|
53 |
-
|
54 |
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
else:
|
62 |
-
print("Failed to
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
else:
|
64 |
-
print("Failed to generate
|
65 |
-
|
66 |
-
if __name__ == "__main__":
|
67 |
-
main()
|
|
|
|
|
1 |
import pandas as pd
|
2 |
import json
|
|
|
3 |
from tqdm import tqdm
|
4 |
+
import ollama
|
5 |
+
from transcript_extractor import get_transcript
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
def generate_questions(transcript):
|
8 |
prompt_template = """
|
9 |
You are an AI assistant tasked with generating questions based on a YouTube video transcript.
|
10 |
+
Formulate at least 10 questions that a user might ask based on the provided transcript.
|
11 |
Make the questions specific to the content of the transcript.
|
12 |
The questions should be complete and not too short. Use as few words as possible from the transcript.
|
13 |
+
It is important that the questions are relevant to the content of the transcript and are at least 10 in number.
|
14 |
|
15 |
The transcript:
|
16 |
|
|
|
23 |
|
24 |
prompt = prompt_template.format(transcript=transcript)
|
25 |
|
26 |
+
try:
|
27 |
+
response = ollama.chat(
|
28 |
+
model='phi3.5',
|
29 |
+
messages=[{"role": "user", "content": prompt}]
|
30 |
+
)
|
31 |
+
return json.loads(response['message']['content'])
|
32 |
+
except Exception as e:
|
33 |
+
print(f"Error generating questions: {str(e)}")
|
34 |
+
return None
|
35 |
+
|
36 |
+
def generate_ground_truth(db_handler, data_processor, video_id):
|
37 |
+
transcript_data = get_transcript(video_id)
|
38 |
+
if transcript_data and 'transcript' in transcript_data:
|
39 |
+
full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
|
40 |
+
# Process the transcript
|
41 |
+
data_processor.process_transcript(video_id, transcript_data)
|
42 |
else:
|
43 |
+
print(f"Failed to retrieve transcript for video {video_id}")
|
44 |
return None
|
45 |
|
46 |
+
questions = generate_questions(full_transcript)
|
|
|
|
|
47 |
|
48 |
+
if questions and 'questions' in questions:
|
49 |
+
df = pd.DataFrame([(video_id, q) for q in questions['questions']], columns=['video_id', 'question'])
|
50 |
|
51 |
+
csv_path = 'data/ground-truth-retrieval.csv'
|
52 |
+
df.to_csv(csv_path, index=False)
|
53 |
+
print(f"Ground truth data saved to {csv_path}")
|
54 |
+
return df
|
55 |
+
else:
|
56 |
+
print("Failed to generate questions.")
|
57 |
+
return None
|
58 |
+
|
59 |
+
def generate_ground_truth_for_all_videos(db_handler, data_processor):
|
60 |
+
videos = db_handler.get_all_videos()
|
61 |
+
all_questions = []
|
62 |
+
|
63 |
+
for video in tqdm(videos, desc="Generating ground truth"):
|
64 |
+
video_id = video[0] # Assuming the video ID is the first element in the tuple
|
65 |
+
transcript_data = get_transcript(video_id)
|
66 |
+
if transcript_data and 'transcript' in transcript_data:
|
67 |
+
full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
|
68 |
+
# Process the transcript
|
69 |
+
data_processor.process_transcript(video_id, transcript_data)
|
70 |
+
questions = generate_questions(full_transcript)
|
71 |
+
if questions and 'questions' in questions:
|
72 |
+
all_questions.extend([(video_id, q) for q in questions['questions']])
|
73 |
else:
|
74 |
+
print(f"Failed to retrieve transcript for video {video_id}")
|
75 |
+
|
76 |
+
if all_questions:
|
77 |
+
df = pd.DataFrame(all_questions, columns=['video_id', 'question'])
|
78 |
+
csv_path = 'data/ground-truth-retrieval.csv'
|
79 |
+
df.to_csv(csv_path, index=False)
|
80 |
+
print(f"Ground truth data for all videos saved to {csv_path}")
|
81 |
+
return df
|
82 |
else:
|
83 |
+
print("Failed to generate questions for any video.")
|
84 |
+
return None
|
|
|
|
app/main.py
CHANGED
@@ -1,21 +1,18 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
-
from transcript_extractor import
|
4 |
from data_processor import DataProcessor
|
5 |
from database import DatabaseHandler
|
6 |
from rag import RAGSystem
|
7 |
from query_rewriter import QueryRewriter
|
8 |
from evaluation import EvaluationSystem
|
|
|
9 |
from sentence_transformers import SentenceTransformer
|
10 |
import os
|
11 |
-
import
|
12 |
-
import requests
|
13 |
-
from tqdm import tqdm
|
14 |
-
import sqlite3
|
15 |
import logging
|
16 |
-
import ollama
|
17 |
|
18 |
-
logging.basicConfig(level=logging.
|
19 |
logger = logging.getLogger(__name__)
|
20 |
|
21 |
@st.cache_resource
|
@@ -33,170 +30,64 @@ def init_components():
|
|
33 |
st.error(f"Error initializing components: {str(e)}")
|
34 |
st.error("Please check your configuration and ensure all services are running.")
|
35 |
return None, None, None, None, None
|
36 |
-
|
37 |
-
components = init_components()
|
38 |
-
if components:
|
39 |
-
db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components
|
40 |
-
else:
|
41 |
-
st.stop()
|
42 |
-
|
43 |
-
# Ground Truth Generation
|
44 |
|
45 |
-
def
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
-
|
56 |
|
57 |
-
|
|
|
58 |
|
59 |
-
|
60 |
-
|
61 |
|
62 |
-
|
|
|
|
|
|
|
63 |
|
64 |
-
|
65 |
-
response = ollama.chat(
|
66 |
-
model='phi3.5',
|
67 |
-
messages=[{"role": "user", "content": prompt}]
|
68 |
-
)
|
69 |
-
print("Printing the response from OLLAMA: " + response['message']['content'])
|
70 |
-
return json.loads(response['message']['content'])
|
71 |
-
except Exception as e:
|
72 |
-
logger.error(f"Error generating questions: {str(e)}")
|
73 |
-
return None
|
74 |
-
|
75 |
-
def generate_ground_truth(video_id=None, existing_transcript=None):
|
76 |
-
if video_id is None and existing_transcript is None:
|
77 |
-
st.error("Please provide either a video ID or an existing transcript.")
|
78 |
-
return None
|
79 |
-
|
80 |
-
if video_id:
|
81 |
-
transcript_data = get_transcript(video_id)
|
82 |
-
if transcript_data and 'transcript' in transcript_data:
|
83 |
-
full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
|
84 |
-
else:
|
85 |
-
logger.error("Failed to retrieve transcript for the provided video ID.")
|
86 |
-
st.error("Failed to retrieve transcript for the provided video ID.")
|
87 |
-
return None
|
88 |
-
else:
|
89 |
-
full_transcript = existing_transcript
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
os.makedirs('data', exist_ok=True)
|
97 |
-
df.to_csv('data/ground-truth-retrieval.csv', index=False)
|
98 |
-
st.success("Ground truth data generated and saved to data/ground-truth-retrieval.csv")
|
99 |
-
return df
|
100 |
-
else:
|
101 |
-
logger.error("Failed to generate questions.")
|
102 |
-
st.error("Failed to generate questions.")
|
103 |
-
return None
|
104 |
|
105 |
-
|
106 |
-
def evaluate_rag(sample_size=200):
|
107 |
-
try:
|
108 |
-
ground_truth = pd.read_csv('data/ground-truth-retrieval.csv')
|
109 |
-
except FileNotFoundError:
|
110 |
-
logger.error("Ground truth file not found. Please generate ground truth data first.")
|
111 |
-
st.error("Ground truth file not found. Please generate ground truth data first.")
|
112 |
-
return None
|
113 |
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
You are an expert evaluator for a Youtube transcript assistant.
|
119 |
-
Your task is to analyze the relevance of the generated answer to the given question.
|
120 |
-
Based on the relevance of the generated answer, you will classify it
|
121 |
-
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".
|
122 |
-
|
123 |
-
Here is the data for evaluation:
|
124 |
-
|
125 |
-
Question: {question}
|
126 |
-
Generated Answer: {answer_llm}
|
127 |
-
|
128 |
-
Please analyze the content and context of the generated answer in relation to the question
|
129 |
-
and provide your evaluation in parsable JSON without using code blocks:
|
130 |
-
|
131 |
-
{{
|
132 |
-
"Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
|
133 |
-
"Explanation": "[Provide a brief explanation for your evaluation]"
|
134 |
-
}}
|
135 |
-
""".strip()
|
136 |
-
|
137 |
-
progress_bar = st.progress(0)
|
138 |
-
for i, (_, row) in enumerate(sample.iterrows()):
|
139 |
-
question = row['question']
|
140 |
-
video_id = row['video_id']
|
141 |
-
|
142 |
-
# Get the index name for the video (you might need to adjust this based on your setup)
|
143 |
-
index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id, "all-MiniLM-L6-v2") # Assuming you're using this embedding model
|
144 |
-
|
145 |
-
if not index_name:
|
146 |
-
logger.warning(f"No index found for video {video_id}. Skipping this question.")
|
147 |
-
continue
|
148 |
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
logger.error(f"Error querying RAG system: {str(e)}")
|
153 |
-
continue
|
154 |
|
155 |
-
|
156 |
-
try:
|
157 |
-
response = ollama.chat(
|
158 |
-
model='phi3.5',
|
159 |
-
messages=[{"role": "user", "content": prompt}]
|
160 |
-
)
|
161 |
-
evaluation_json = json.loads(response['message']['content'])
|
162 |
-
evaluations.append((
|
163 |
-
str(video_id),
|
164 |
-
str(question),
|
165 |
-
str(answer_llm),
|
166 |
-
str(evaluation_json.get('Relevance', 'UNKNOWN')),
|
167 |
-
str(evaluation_json.get('Explanation', 'No explanation provided'))
|
168 |
-
))
|
169 |
-
except Exception as e:
|
170 |
-
logger.warning(f"Failed to evaluate question: {question}. Error: {str(e)}")
|
171 |
-
st.warning(f"Failed to evaluate question: {question}")
|
172 |
-
progress_bar.progress((i + 1) / len(sample))
|
173 |
-
|
174 |
-
# Store RAG evaluations in the database
|
175 |
-
conn = sqlite3.connect('data/sqlite.db')
|
176 |
-
cursor = conn.cursor()
|
177 |
-
cursor.execute('''
|
178 |
-
CREATE TABLE IF NOT EXISTS rag_evaluations (
|
179 |
-
video_id TEXT,
|
180 |
-
question TEXT,
|
181 |
-
answer TEXT,
|
182 |
-
relevance TEXT,
|
183 |
-
explanation TEXT
|
184 |
-
)
|
185 |
-
''')
|
186 |
-
cursor.executemany('''
|
187 |
-
INSERT INTO rag_evaluations (video_id, question, answer, relevance, explanation)
|
188 |
-
VALUES (?, ?, ?, ?, ?)
|
189 |
-
''', evaluations)
|
190 |
-
conn.commit()
|
191 |
-
conn.close()
|
192 |
-
|
193 |
-
logger.info("Evaluation complete. Results stored in the database.")
|
194 |
-
st.success("Evaluation complete. Results stored in the database.")
|
195 |
-
return evaluations
|
196 |
-
|
197 |
-
@st.cache_data
|
198 |
-
def process_single_video(video_id, embedding_model):
|
199 |
-
# Check if the video has already been processed with the current embedding model
|
200 |
existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
|
201 |
if existing_index:
|
202 |
logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
|
@@ -205,9 +96,9 @@ def process_single_video(video_id, embedding_model):
|
|
205 |
transcript_data = get_transcript(video_id)
|
206 |
if transcript_data is None:
|
207 |
logger.error(f"Failed to retrieve transcript for video {video_id}")
|
|
|
208 |
return None
|
209 |
|
210 |
-
# Store video metadata in the database
|
211 |
video_data = {
|
212 |
'video_id': video_id,
|
213 |
'title': transcript_data['metadata'].get('title', 'Unknown Title'),
|
@@ -222,67 +113,78 @@ def process_single_video(video_id, embedding_model):
|
|
222 |
db_handler.add_video(video_data)
|
223 |
except Exception as e:
|
224 |
logger.error(f"Error adding video to database: {str(e)}")
|
|
|
225 |
return None
|
226 |
|
227 |
-
# Process transcript for RAG system
|
228 |
try:
|
229 |
data_processor.process_transcript(video_id, transcript_data)
|
230 |
except Exception as e:
|
231 |
logger.error(f"Error processing transcript: {str(e)}")
|
|
|
232 |
return None
|
233 |
|
234 |
-
# Create Elasticsearch index
|
235 |
index_name = f"video_{video_id}_{embedding_model}".lower()
|
236 |
try:
|
237 |
index_name = data_processor.build_index(index_name)
|
238 |
logger.info(f"Successfully built index: {index_name}")
|
239 |
except Exception as e:
|
240 |
logger.error(f"Error building index: {str(e)}")
|
|
|
241 |
return None
|
242 |
|
243 |
-
# Add embedding model to the database
|
244 |
embedding_model_id = db_handler.add_embedding_model(embedding_model, "Description of the model")
|
245 |
|
246 |
-
# Get the video ID from the database
|
247 |
video_db_record = db_handler.get_video_by_youtube_id(video_id)
|
248 |
if video_db_record is None:
|
249 |
logger.error(f"Failed to retrieve video record from database for video {video_id}")
|
|
|
250 |
return None
|
251 |
-
video_db_id = video_db_record[0]
|
252 |
|
253 |
-
# Store Elasticsearch index information
|
254 |
db_handler.add_elasticsearch_index(video_db_id, index_name, embedding_model_id)
|
255 |
|
256 |
logger.info(f"Processed and indexed transcript for video {video_id}")
|
|
|
257 |
return index_name
|
258 |
|
259 |
-
|
260 |
-
def process_multiple_videos(video_ids, embedding_model):
|
261 |
indices = []
|
262 |
for video_id in video_ids:
|
263 |
-
index = process_single_video(video_id, embedding_model)
|
264 |
if index:
|
265 |
indices.append(index)
|
266 |
logger.info(f"Processed and indexed transcripts for {len(indices)} videos")
|
267 |
st.success(f"Processed and indexed transcripts for {len(indices)} videos")
|
268 |
return indices
|
269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
270 |
def main():
|
271 |
st.title("YouTube Transcript RAG System")
|
272 |
|
273 |
-
|
274 |
-
if not all(components):
|
275 |
-
st.error("Failed to initialize one or more components. Please check the logs and your configuration.")
|
276 |
-
return
|
277 |
-
|
278 |
-
db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components
|
279 |
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
tab1, tab2, tab3 = st.tabs(["RAG System", "Ground Truth Generation", "Evaluation"])
|
281 |
|
282 |
with tab1:
|
283 |
st.header("RAG System")
|
284 |
|
285 |
-
|
|
|
286 |
st.subheader("Select a Video")
|
287 |
videos = db_handler.get_all_videos()
|
288 |
if not videos:
|
@@ -290,21 +192,15 @@ def main():
|
|
290 |
else:
|
291 |
video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
|
292 |
|
293 |
-
# Allow filtering by channel name
|
294 |
channels = sorted(video_df['channel_name'].unique())
|
295 |
selected_channel = st.selectbox("Filter by Channel", ["All"] + channels)
|
296 |
|
297 |
if selected_channel != "All":
|
298 |
video_df = video_df[video_df['channel_name'] == selected_channel]
|
299 |
|
300 |
-
# Display videos and allow selection
|
301 |
st.dataframe(video_df)
|
302 |
selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0])
|
303 |
|
304 |
-
# Embedding model selection
|
305 |
-
embedding_model = st.selectbox("Select embedding model:", ["all-MiniLM-L6-v2", "all-mpnet-base-v2"])
|
306 |
-
|
307 |
-
# Get the index name for the selected video and embedding model
|
308 |
index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id, embedding_model)
|
309 |
|
310 |
if index_name:
|
@@ -312,18 +208,17 @@ def main():
|
|
312 |
else:
|
313 |
st.warning("No index found for the selected video and embedding model. The index will be built when you search.")
|
314 |
|
315 |
-
# Process new video section
|
316 |
st.subheader("Process New Video")
|
317 |
input_type = st.radio("Select input type:", ["Video URL", "Channel URL", "YouTube ID"])
|
318 |
input_value = st.text_input("Enter the URL or ID:")
|
319 |
|
320 |
if st.button("Process"):
|
321 |
with st.spinner("Processing..."):
|
322 |
-
data_processor.
|
323 |
if input_type == "Video URL":
|
324 |
video_id = extract_video_id(input_value)
|
325 |
if video_id:
|
326 |
-
index_name = process_single_video(video_id, embedding_model)
|
327 |
if index_name is None:
|
328 |
st.error(f"Failed to process video {video_id}")
|
329 |
else:
|
@@ -333,7 +228,7 @@ def main():
|
|
333 |
elif input_type == "Channel URL":
|
334 |
channel_videos = get_channel_videos(input_value)
|
335 |
if channel_videos:
|
336 |
-
index_names = process_multiple_videos([video['video_id'] for video in channel_videos], embedding_model)
|
337 |
if not index_names:
|
338 |
st.error("Failed to process any videos from the channel")
|
339 |
else:
|
@@ -341,13 +236,12 @@ def main():
|
|
341 |
else:
|
342 |
st.error("Failed to retrieve videos from the channel")
|
343 |
else:
|
344 |
-
index_name = process_single_video(input_value, embedding_model)
|
345 |
if index_name is None:
|
346 |
st.error(f"Failed to process video {input_value}")
|
347 |
else:
|
348 |
st.success(f"Successfully processed video {input_value}")
|
349 |
|
350 |
-
# Query section
|
351 |
st.subheader("Query the RAG System")
|
352 |
query = st.text_input("Enter your query:")
|
353 |
rewrite_method = st.radio("Query rewriting method:", ["None", "Chain of Thought", "ReAct"])
|
@@ -375,10 +269,9 @@ def main():
|
|
375 |
|
376 |
search_method_map = {"Hybrid": "hybrid", "Text-only": "text", "Embedding-only": "embedding"}
|
377 |
try:
|
378 |
-
# Ensure index is built before searching
|
379 |
if not index_name:
|
380 |
st.info("Building index for the selected video...")
|
381 |
-
index_name = process_single_video(selected_video_id, embedding_model)
|
382 |
if not index_name:
|
383 |
st.error("Failed to build index for the selected video.")
|
384 |
return
|
@@ -405,66 +298,49 @@ def main():
|
|
405 |
|
406 |
with tab2:
|
407 |
st.header("Ground Truth Generation")
|
408 |
-
use_existing_transcript = st.checkbox("Use existing transcript")
|
409 |
|
410 |
-
|
411 |
-
|
412 |
-
videos
|
413 |
-
if not videos:
|
414 |
-
st.warning("No videos available. Please process some videos first.")
|
415 |
-
else:
|
416 |
-
video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
|
417 |
-
|
418 |
-
# Allow filtering by channel name
|
419 |
-
channels = sorted(video_df['channel_name'].unique())
|
420 |
-
selected_channel = st.selectbox("Filter by Channel", ["All"] + channels, key="gt_channel_filter")
|
421 |
-
|
422 |
-
if selected_channel != "All":
|
423 |
-
video_df = video_df[video_df['channel_name'] == selected_channel]
|
424 |
-
|
425 |
-
# Display videos and allow selection
|
426 |
-
st.dataframe(video_df)
|
427 |
-
selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(),
|
428 |
-
format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0],
|
429 |
-
key="gt_video_select")
|
430 |
-
|
431 |
-
if st.button("Generate Ground Truth from Existing Transcript"):
|
432 |
-
with st.spinner("Generating ground truth..."):
|
433 |
-
# Retrieve the transcript content (you'll need to implement this method)
|
434 |
-
transcript_data = get_transcript(selected_video_id)
|
435 |
-
if transcript_data and 'transcript' in transcript_data:
|
436 |
-
full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
|
437 |
-
ground_truth_df = generate_ground_truth(video_id=selected_video_id, existing_transcript=full_transcript)
|
438 |
-
if ground_truth_df is not None:
|
439 |
-
st.dataframe(ground_truth_df)
|
440 |
-
csv = ground_truth_df.to_csv(index=False)
|
441 |
-
st.download_button(
|
442 |
-
label="Download Ground Truth CSV",
|
443 |
-
data=csv,
|
444 |
-
file_name=f"ground_truth_{selected_video_id}.csv",
|
445 |
-
mime="text/csv",
|
446 |
-
)
|
447 |
-
else:
|
448 |
-
st.error("Failed to retrieve transcript content.")
|
449 |
else:
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
454 |
if ground_truth_df is not None:
|
455 |
st.dataframe(ground_truth_df)
|
456 |
csv = ground_truth_df.to_csv(index=False)
|
457 |
st.download_button(
|
458 |
-
label="Download Ground Truth CSV",
|
459 |
data=csv,
|
460 |
-
file_name=
|
461 |
mime="text/csv",
|
462 |
)
|
463 |
|
464 |
with tab3:
|
465 |
st.header("RAG Evaluation")
|
466 |
|
467 |
-
# Load ground truth data
|
468 |
try:
|
469 |
ground_truth_df = pd.read_csv('data/ground-truth-retrieval.csv')
|
470 |
ground_truth_available = True
|
@@ -480,7 +356,7 @@ def main():
|
|
480 |
|
481 |
if st.button("Run Evaluation"):
|
482 |
with st.spinner("Running evaluation..."):
|
483 |
-
evaluation_results = evaluate_rag(sample_size)
|
484 |
if evaluation_results:
|
485 |
st.write("Evaluation Results:")
|
486 |
st.dataframe(pd.DataFrame(evaluation_results, columns=['Video ID', 'Question', 'Answer', 'Relevance', 'Explanation']))
|
@@ -488,7 +364,6 @@ def main():
|
|
488 |
st.warning("No ground truth data available. Please generate ground truth data first.")
|
489 |
st.button("Run Evaluation", disabled=True)
|
490 |
|
491 |
-
# Add a section to generate ground truth if it's not available
|
492 |
if not ground_truth_available:
|
493 |
st.subheader("Generate Ground Truth")
|
494 |
st.write("You need to generate ground truth data before running the evaluation.")
|
@@ -497,4 +372,7 @@ def main():
|
|
497 |
st.experimental_rerun()
|
498 |
|
499 |
if __name__ == "__main__":
|
|
|
|
|
|
|
500 |
main()
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
+
from transcript_extractor import get_transcript, get_youtube_client, extract_video_id, get_channel_videos, test_api_key, initialize_youtube_api
|
4 |
from data_processor import DataProcessor
|
5 |
from database import DatabaseHandler
|
6 |
from rag import RAGSystem
|
7 |
from query_rewriter import QueryRewriter
|
8 |
from evaluation import EvaluationSystem
|
9 |
+
from generate_ground_truth import generate_ground_truth, generate_ground_truth_for_all_videos
|
10 |
from sentence_transformers import SentenceTransformer
|
11 |
import os
|
12 |
+
import sys
|
|
|
|
|
|
|
13 |
import logging
|
|
|
14 |
|
15 |
+
logging.basicConfig(level=logging.DEBUG)
|
16 |
logger = logging.getLogger(__name__)
|
17 |
|
18 |
@st.cache_resource
|
|
|
30 |
st.error(f"Error initializing components: {str(e)}")
|
31 |
st.error("Please check your configuration and ensure all services are running.")
|
32 |
return None, None, None, None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
+
def check_api_key():
|
35 |
+
if test_api_key():
|
36 |
+
st.success("YouTube API key is valid and working.")
|
37 |
+
else:
|
38 |
+
st.error("YouTube API key is invalid or not set. Please check your .env file.")
|
39 |
+
new_api_key = st.text_input("Enter your YouTube API key:")
|
40 |
+
if new_api_key:
|
41 |
+
os.environ['YOUTUBE_API_KEY'] = new_api_key
|
42 |
+
with open('.env', 'a') as f:
|
43 |
+
f.write(f"\nYOUTUBE_API_KEY={new_api_key}")
|
44 |
+
st.success("API key saved. Reinitializing YouTube client...")
|
45 |
+
get_youtube_client.cache_clear() # Clear the cache to force reinitialization
|
46 |
+
if test_api_key():
|
47 |
+
st.success("YouTube client reinitialized successfully.")
|
48 |
+
else:
|
49 |
+
st.error("Failed to reinitialize YouTube client. Please check your API key.")
|
50 |
+
st.experimental_rerun()
|
51 |
|
52 |
+
# LLM-as-a-judge prompt template
|
53 |
+
prompt_template = """
|
54 |
+
You are an expert evaluator for a Youtube transcript assistant.
|
55 |
+
Your task is to analyze the relevance of the generated answer to the given question.
|
56 |
+
Based on the relevance of the generated answer, you will classify it
|
57 |
+
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".
|
58 |
|
59 |
+
Here is the data for evaluation:
|
60 |
|
61 |
+
Question: {question}
|
62 |
+
Generated Answer: {answer_llm}
|
63 |
|
64 |
+
Please analyze the content and context of the generated answer in relation to the question
|
65 |
+
and provide your evaluation in the following JSON format:
|
66 |
|
67 |
+
{{
|
68 |
+
"Relevance": "NON_RELEVANT",
|
69 |
+
"Explanation": "Your explanation here"
|
70 |
+
}}
|
71 |
|
72 |
+
OR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
+
{{
|
75 |
+
"Relevance": "PARTLY_RELEVANT",
|
76 |
+
"Explanation": "Your explanation here"
|
77 |
+
}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
+
OR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
+
{{
|
82 |
+
"Relevance": "RELEVANT",
|
83 |
+
"Explanation": "Your explanation here"
|
84 |
+
}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
+
Ensure your response is a valid JSON object with these exact keys and one of the three exact values for "Relevance".
|
87 |
+
Do not include any text outside of this JSON object.
|
88 |
+
"""
|
|
|
|
|
89 |
|
90 |
+
def process_single_video(db_handler, data_processor, video_id, embedding_model):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
|
92 |
if existing_index:
|
93 |
logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
|
|
|
96 |
transcript_data = get_transcript(video_id)
|
97 |
if transcript_data is None:
|
98 |
logger.error(f"Failed to retrieve transcript for video {video_id}")
|
99 |
+
st.error(f"Failed to retrieve transcript for video {video_id}. Please check if the video ID is correct and the video has captions available.")
|
100 |
return None
|
101 |
|
|
|
102 |
video_data = {
|
103 |
'video_id': video_id,
|
104 |
'title': transcript_data['metadata'].get('title', 'Unknown Title'),
|
|
|
113 |
db_handler.add_video(video_data)
|
114 |
except Exception as e:
|
115 |
logger.error(f"Error adding video to database: {str(e)}")
|
116 |
+
st.error(f"Error adding video {video_id} to database: {str(e)}")
|
117 |
return None
|
118 |
|
|
|
119 |
try:
|
120 |
data_processor.process_transcript(video_id, transcript_data)
|
121 |
except Exception as e:
|
122 |
logger.error(f"Error processing transcript: {str(e)}")
|
123 |
+
st.error(f"Error processing transcript for video {video_id}: {str(e)}")
|
124 |
return None
|
125 |
|
|
|
126 |
index_name = f"video_{video_id}_{embedding_model}".lower()
|
127 |
try:
|
128 |
index_name = data_processor.build_index(index_name)
|
129 |
logger.info(f"Successfully built index: {index_name}")
|
130 |
except Exception as e:
|
131 |
logger.error(f"Error building index: {str(e)}")
|
132 |
+
st.error(f"Error building index for video {video_id}: {str(e)}")
|
133 |
return None
|
134 |
|
|
|
135 |
embedding_model_id = db_handler.add_embedding_model(embedding_model, "Description of the model")
|
136 |
|
|
|
137 |
video_db_record = db_handler.get_video_by_youtube_id(video_id)
|
138 |
if video_db_record is None:
|
139 |
logger.error(f"Failed to retrieve video record from database for video {video_id}")
|
140 |
+
st.error(f"Failed to retrieve video record from database for video {video_id}")
|
141 |
return None
|
142 |
+
video_db_id = video_db_record[0]
|
143 |
|
|
|
144 |
db_handler.add_elasticsearch_index(video_db_id, index_name, embedding_model_id)
|
145 |
|
146 |
logger.info(f"Processed and indexed transcript for video {video_id}")
|
147 |
+
st.success(f"Successfully processed and indexed transcript for video {video_id}")
|
148 |
return index_name
|
149 |
|
150 |
+
def process_multiple_videos(db_handler, data_processor, video_ids, embedding_model):
|
|
|
151 |
indices = []
|
152 |
for video_id in video_ids:
|
153 |
+
index = process_single_video(db_handler, data_processor, video_id, embedding_model)
|
154 |
if index:
|
155 |
indices.append(index)
|
156 |
logger.info(f"Processed and indexed transcripts for {len(indices)} videos")
|
157 |
st.success(f"Processed and indexed transcripts for {len(indices)} videos")
|
158 |
return indices
|
159 |
|
160 |
+
def ensure_video_processed(db_handler, data_processor, video_id, embedding_model):
|
161 |
+
index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
|
162 |
+
if not index_name:
|
163 |
+
st.warning(f"Video {video_id} has not been processed yet. Processing now...")
|
164 |
+
index_name = process_single_video(db_handler, data_processor, video_id, embedding_model)
|
165 |
+
if not index_name:
|
166 |
+
st.error(f"Failed to process video {video_id}. Please check the logs for more information.")
|
167 |
+
return False
|
168 |
+
return True
|
169 |
+
|
170 |
def main():
|
171 |
st.title("YouTube Transcript RAG System")
|
172 |
|
173 |
+
check_api_key()
|
|
|
|
|
|
|
|
|
|
|
174 |
|
175 |
+
components = init_components()
|
176 |
+
if components:
|
177 |
+
db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components
|
178 |
+
else:
|
179 |
+
st.stop()
|
180 |
+
|
181 |
tab1, tab2, tab3 = st.tabs(["RAG System", "Ground Truth Generation", "Evaluation"])
|
182 |
|
183 |
with tab1:
|
184 |
st.header("RAG System")
|
185 |
|
186 |
+
embedding_model = st.selectbox("Select embedding model:", ["multi-qa-MiniLM-L6-cos-v1", "all-mpnet-base-v2"])
|
187 |
+
|
188 |
st.subheader("Select a Video")
|
189 |
videos = db_handler.get_all_videos()
|
190 |
if not videos:
|
|
|
192 |
else:
|
193 |
video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
|
194 |
|
|
|
195 |
channels = sorted(video_df['channel_name'].unique())
|
196 |
selected_channel = st.selectbox("Filter by Channel", ["All"] + channels)
|
197 |
|
198 |
if selected_channel != "All":
|
199 |
video_df = video_df[video_df['channel_name'] == selected_channel]
|
200 |
|
|
|
201 |
st.dataframe(video_df)
|
202 |
selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0])
|
203 |
|
|
|
|
|
|
|
|
|
204 |
index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id, embedding_model)
|
205 |
|
206 |
if index_name:
|
|
|
208 |
else:
|
209 |
st.warning("No index found for the selected video and embedding model. The index will be built when you search.")
|
210 |
|
|
|
211 |
st.subheader("Process New Video")
|
212 |
input_type = st.radio("Select input type:", ["Video URL", "Channel URL", "YouTube ID"])
|
213 |
input_value = st.text_input("Enter the URL or ID:")
|
214 |
|
215 |
if st.button("Process"):
|
216 |
with st.spinner("Processing..."):
|
217 |
+
data_processor.set_embedding_model(embedding_model)
|
218 |
if input_type == "Video URL":
|
219 |
video_id = extract_video_id(input_value)
|
220 |
if video_id:
|
221 |
+
index_name = process_single_video(db_handler, data_processor, video_id, embedding_model)
|
222 |
if index_name is None:
|
223 |
st.error(f"Failed to process video {video_id}")
|
224 |
else:
|
|
|
228 |
elif input_type == "Channel URL":
|
229 |
channel_videos = get_channel_videos(input_value)
|
230 |
if channel_videos:
|
231 |
+
index_names = process_multiple_videos(db_handler, data_processor, [video['video_id'] for video in channel_videos], embedding_model)
|
232 |
if not index_names:
|
233 |
st.error("Failed to process any videos from the channel")
|
234 |
else:
|
|
|
236 |
else:
|
237 |
st.error("Failed to retrieve videos from the channel")
|
238 |
else:
|
239 |
+
index_name = process_single_video(db_handler, data_processor, input_value, embedding_model)
|
240 |
if index_name is None:
|
241 |
st.error(f"Failed to process video {input_value}")
|
242 |
else:
|
243 |
st.success(f"Successfully processed video {input_value}")
|
244 |
|
|
|
245 |
st.subheader("Query the RAG System")
|
246 |
query = st.text_input("Enter your query:")
|
247 |
rewrite_method = st.radio("Query rewriting method:", ["None", "Chain of Thought", "ReAct"])
|
|
|
269 |
|
270 |
search_method_map = {"Hybrid": "hybrid", "Text-only": "text", "Embedding-only": "embedding"}
|
271 |
try:
|
|
|
272 |
if not index_name:
|
273 |
st.info("Building index for the selected video...")
|
274 |
+
index_name = process_single_video(db_handler, data_processor, selected_video_id, embedding_model)
|
275 |
if not index_name:
|
276 |
st.error("Failed to build index for the selected video.")
|
277 |
return
|
|
|
298 |
|
299 |
with tab2:
|
300 |
st.header("Ground Truth Generation")
|
|
|
301 |
|
302 |
+
videos = db_handler.get_all_videos()
|
303 |
+
if not videos:
|
304 |
+
st.warning("No videos available. Please process some videos first.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
else:
|
306 |
+
video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
|
307 |
+
|
308 |
+
st.dataframe(video_df)
|
309 |
+
selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(),
|
310 |
+
format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0],
|
311 |
+
key="gt_video_select")
|
312 |
+
|
313 |
+
if st.button("Generate Ground Truth for Selected Video"):
|
314 |
+
if ensure_video_processed(db_handler, data_processor, selected_video_id, embedding_model):
|
315 |
+
with st.spinner("Generating ground truth..."):
|
316 |
+
ground_truth_df = generate_ground_truth(db_handler, data_processor, selected_video_id)
|
317 |
+
if ground_truth_df is not None:
|
318 |
+
st.dataframe(ground_truth_df)
|
319 |
+
csv = ground_truth_df.to_csv(index=False)
|
320 |
+
st.download_button(
|
321 |
+
label="Download Ground Truth CSV",
|
322 |
+
data=csv,
|
323 |
+
file_name=f"ground_truth_{selected_video_id}.csv",
|
324 |
+
mime="text/csv",
|
325 |
+
)
|
326 |
+
if st.button("Generate Ground Truth for All Videos"):
|
327 |
+
with st.spinner("Processing videos and generating ground truth..."):
|
328 |
+
for video_id in video_df['youtube_id']:
|
329 |
+
ensure_video_processed(db_handler, data_processor, video_id, embedding_model)
|
330 |
+
ground_truth_df = generate_ground_truth_for_all_videos(db_handler, data_processor)
|
331 |
if ground_truth_df is not None:
|
332 |
st.dataframe(ground_truth_df)
|
333 |
csv = ground_truth_df.to_csv(index=False)
|
334 |
st.download_button(
|
335 |
+
label="Download Ground Truth CSV (All Videos)",
|
336 |
data=csv,
|
337 |
+
file_name="ground_truth_all_videos.csv",
|
338 |
mime="text/csv",
|
339 |
)
|
340 |
|
341 |
with tab3:
|
342 |
st.header("RAG Evaluation")
|
343 |
|
|
|
344 |
try:
|
345 |
ground_truth_df = pd.read_csv('data/ground-truth-retrieval.csv')
|
346 |
ground_truth_available = True
|
|
|
356 |
|
357 |
if st.button("Run Evaluation"):
|
358 |
with st.spinner("Running evaluation..."):
|
359 |
+
evaluation_results = evaluation_system.evaluate_rag(rag_system, 'data/ground-truth-retrieval.csv', sample_size, prompt_template)
|
360 |
if evaluation_results:
|
361 |
st.write("Evaluation Results:")
|
362 |
st.dataframe(pd.DataFrame(evaluation_results, columns=['Video ID', 'Question', 'Answer', 'Relevance', 'Explanation']))
|
|
|
364 |
st.warning("No ground truth data available. Please generate ground truth data first.")
|
365 |
st.button("Run Evaluation", disabled=True)
|
366 |
|
|
|
367 |
if not ground_truth_available:
|
368 |
st.subheader("Generate Ground Truth")
|
369 |
st.write("You need to generate ground truth data before running the evaluation.")
|
|
|
372 |
st.experimental_rerun()
|
373 |
|
374 |
if __name__ == "__main__":
|
375 |
+
if not initialize_youtube_api():
|
376 |
+
logger.error("Failed to initialize YouTube API. Exiting.")
|
377 |
+
sys.exit(1)
|
378 |
main()
|
app/transcript_extractor.py
CHANGED
@@ -3,29 +3,51 @@ from dotenv import load_dotenv
|
|
3 |
from youtube_transcript_api import YouTubeTranscriptApi
|
4 |
from googleapiclient.discovery import build
|
5 |
from googleapiclient.errors import HttpError
|
|
|
|
|
|
|
6 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
# Get the directory of the current script
|
9 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
10 |
# Construct the path to the .env file (one directory up from the current script)
|
11 |
dotenv_path = os.path.join(os.path.dirname(current_dir), '.env')
|
12 |
-
|
13 |
# Load environment variables from .env file
|
14 |
load_dotenv(dotenv_path)
|
15 |
|
16 |
# Get API key from environment variable
|
17 |
API_KEY = os.getenv('YOUTUBE_API_KEY')
|
18 |
-
|
|
|
19 |
if not API_KEY:
|
20 |
raise ValueError("YouTube API key not found. Make sure it's set in your .env file in the parent directory of the 'app' folder.")
|
21 |
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
|
|
29 |
|
30 |
def extract_video_id(url):
|
31 |
if not url:
|
@@ -36,16 +58,22 @@ def extract_video_id(url):
|
|
36 |
return None
|
37 |
|
38 |
def get_video_metadata(video_id):
|
|
|
39 |
try:
|
40 |
request = youtube.videos().list(
|
41 |
part="snippet,contentDetails,statistics",
|
42 |
id=video_id
|
43 |
)
|
44 |
response = request.execute()
|
45 |
-
|
46 |
if 'items' in response and len(response['items']) > 0:
|
47 |
video = response['items'][0]
|
48 |
snippet = video['snippet']
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
return {
|
50 |
'title': snippet['title'],
|
51 |
'author': snippet['channelTitle'],
|
@@ -53,48 +81,24 @@ def get_video_metadata(video_id):
|
|
53 |
'view_count': video['statistics'].get('viewCount', '0'),
|
54 |
'like_count': video['statistics'].get('likeCount', '0'),
|
55 |
'comment_count': video['statistics'].get('commentCount', '0'),
|
56 |
-
'duration': video['contentDetails']['duration']
|
|
|
57 |
}
|
58 |
else:
|
59 |
-
|
60 |
return None
|
61 |
-
except HttpError as e:
|
62 |
-
print(f"An HTTP error {e.resp.status} occurred: {e.content}")
|
63 |
-
return None
|
64 |
except Exception as e:
|
65 |
-
|
66 |
return None
|
67 |
-
|
68 |
-
def get_transcript(video_id):
|
69 |
-
# Get the directory of the current script
|
70 |
-
current_dir = os.path.dirname(os.path.abspath(__file__))
|
71 |
-
# Construct the path to the .env file (one directory up from the current script)
|
72 |
-
dotenv_path = os.path.join(os.path.dirname(current_dir), '.env')
|
73 |
-
print("the .env path is :" + dotenv_path)
|
74 |
-
# Load environment variables from .env file
|
75 |
-
load_dotenv(dotenv_path)
|
76 |
-
|
77 |
-
# Get API key from environment variable
|
78 |
-
API_KEY = os.getenv('YOUTUBE_API_KEY')
|
79 |
-
print("the api key is :" + API_KEY)
|
80 |
-
if not API_KEY:
|
81 |
-
raise ValueError("YouTube API key not found. Make sure it's set in your .env file in the parent directory of the 'app' folder.")
|
82 |
-
|
83 |
-
print(f"API_KEY: {API_KEY[:5]}...{API_KEY[-5:]}") # Print first and last 5 characters for verification
|
84 |
-
|
85 |
-
try:
|
86 |
-
youtube = build('youtube', 'v3', developerKey=API_KEY)
|
87 |
-
except Exception as e:
|
88 |
-
print(f"Error initializing YouTube API client: {str(e)}")
|
89 |
-
raise
|
90 |
|
|
|
91 |
if not video_id:
|
92 |
return None
|
93 |
try:
|
94 |
transcript = YouTubeTranscriptApi.get_transcript(video_id)
|
95 |
metadata = get_video_metadata(video_id)
|
96 |
-
|
97 |
-
|
98 |
if not metadata:
|
99 |
return None
|
100 |
return {
|
@@ -102,13 +106,14 @@ def get_transcript(video_id):
|
|
102 |
'metadata': metadata
|
103 |
}
|
104 |
except Exception as e:
|
105 |
-
|
106 |
return None
|
107 |
|
108 |
def get_channel_videos(channel_url):
|
|
|
109 |
channel_id = extract_channel_id(channel_url)
|
110 |
if not channel_id:
|
111 |
-
|
112 |
return []
|
113 |
try:
|
114 |
request = youtube.search().list(
|
@@ -129,10 +134,10 @@ def get_channel_videos(channel_url):
|
|
129 |
})
|
130 |
return videos
|
131 |
except HttpError as e:
|
132 |
-
|
133 |
return []
|
134 |
except Exception as e:
|
135 |
-
|
136 |
return []
|
137 |
|
138 |
def extract_channel_id(url):
|
@@ -141,10 +146,25 @@ def extract_channel_id(url):
|
|
141 |
return channel_id_match.group(1)
|
142 |
return None
|
143 |
|
144 |
-
def
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from youtube_transcript_api import YouTubeTranscriptApi
|
4 |
from googleapiclient.discovery import build
|
5 |
from googleapiclient.errors import HttpError
|
6 |
+
import google_auth_oauthlib.flow
|
7 |
+
import googleapiclient.discovery
|
8 |
+
import googleapiclient.errors
|
9 |
import re
|
10 |
+
import logging
|
11 |
+
import ssl
|
12 |
+
import certifi
|
13 |
+
import requests
|
14 |
+
|
15 |
+
# Set up logging
|
16 |
+
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
|
19 |
# Get the directory of the current script
|
20 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
21 |
# Construct the path to the .env file (one directory up from the current script)
|
22 |
dotenv_path = os.path.join(os.path.dirname(current_dir), '.env')
|
23 |
+
logger.info(f"The .env path is: {dotenv_path}")
|
24 |
# Load environment variables from .env file
|
25 |
load_dotenv(dotenv_path)
|
26 |
|
27 |
# Get API key from environment variable.
API_KEY = os.getenv('YOUTUBE_API_KEY')
# Guard BEFORE logging: slicing a missing (None) key would raise TypeError
# instead of the intended, actionable ValueError.
if not API_KEY:
    raise ValueError("YouTube API key not found. Make sure it's set in your .env file in the parent directory of the 'app' folder.")
# Log only a redacted fingerprint of the key (first/last 5 chars), never the full value.
logger.info(f"API_KEY: {API_KEY[:5]}...{API_KEY[-5:]}")  # Log first and last 5 characters for verification
|
33 |
|
34 |
def get_youtube_client():
    """Build and return a YouTube Data API v3 client.

    SSL verification is pinned to certifi's CA bundle via httplib2's
    ``ca_certs`` argument — the supported mechanism. The previous approach
    (setting a ``verify`` attribute on the http object, mirroring a
    ``requests.Session``) was a silent no-op: httplib2 ignores unknown
    attributes and ``requests`` is never involved in googleapiclient calls.

    Returns:
        googleapiclient.discovery.Resource: a ready-to-use YouTube client.

    Raises:
        Exception: re-raised after logging if client construction fails.
    """
    try:
        import httplib2  # already a project dependency (see requirements.txt)
        # Point httplib2 at certifi's CA bundle for certificate verification.
        http = httplib2.Http(ca_certs=certifi.where())
        # Build the YouTube client with the custom HTTP object.
        youtube = build('youtube', 'v3', developerKey=API_KEY, http=http)
        logger.info("YouTube API client initialized successfully")
        return youtube
    except Exception as e:
        logger.error(f"Error initializing YouTube API client: {str(e)}")
        raise
51 |
|
52 |
def extract_video_id(url):
|
53 |
if not url:
|
|
|
58 |
return None
|
59 |
|
60 |
def get_video_metadata(video_id):
|
61 |
+
youtube = get_youtube_client()
|
62 |
try:
|
63 |
request = youtube.videos().list(
|
64 |
part="snippet,contentDetails,statistics",
|
65 |
id=video_id
|
66 |
)
|
67 |
response = request.execute()
|
|
|
68 |
if 'items' in response and len(response['items']) > 0:
|
69 |
video = response['items'][0]
|
70 |
snippet = video['snippet']
|
71 |
+
|
72 |
+
# Get the description and set default if it's blank
|
73 |
+
description = snippet.get('description', '').strip()
|
74 |
+
if not description:
|
75 |
+
description = 'Not Available'
|
76 |
+
|
77 |
return {
|
78 |
'title': snippet['title'],
|
79 |
'author': snippet['channelTitle'],
|
|
|
81 |
'view_count': video['statistics'].get('viewCount', '0'),
|
82 |
'like_count': video['statistics'].get('likeCount', '0'),
|
83 |
'comment_count': video['statistics'].get('commentCount', '0'),
|
84 |
+
'duration': video['contentDetails']['duration'],
|
85 |
+
'description': description # Add the description to the metadata
|
86 |
}
|
87 |
else:
|
88 |
+
logger.error(f"No video found with id: {video_id}")
|
89 |
return None
|
|
|
|
|
|
|
90 |
except Exception as e:
|
91 |
+
logger.error(f"An error occurred while fetching metadata for video {video_id}: {str(e)}")
|
92 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
+
def get_transcript(video_id):
|
95 |
if not video_id:
|
96 |
return None
|
97 |
try:
|
98 |
transcript = YouTubeTranscriptApi.get_transcript(video_id)
|
99 |
metadata = get_video_metadata(video_id)
|
100 |
+
logger.info(f"Metadata for video {video_id}: {metadata}")
|
101 |
+
logger.info(f"Transcript length for video {video_id}: {len(transcript)}")
|
102 |
if not metadata:
|
103 |
return None
|
104 |
return {
|
|
|
106 |
'metadata': metadata
|
107 |
}
|
108 |
except Exception as e:
|
109 |
+
logger.error(f"Error extracting transcript for video {video_id}: {str(e)}")
|
110 |
return None
|
111 |
|
112 |
def get_channel_videos(channel_url):
|
113 |
+
youtube = get_youtube_client()
|
114 |
channel_id = extract_channel_id(channel_url)
|
115 |
if not channel_id:
|
116 |
+
logger.error(f"Invalid channel URL: {channel_url}")
|
117 |
return []
|
118 |
try:
|
119 |
request = youtube.search().list(
|
|
|
134 |
})
|
135 |
return videos
|
136 |
except HttpError as e:
|
137 |
+
logger.error(f"An HTTP error {e.resp.status} occurred: {e.content}")
|
138 |
return []
|
139 |
except Exception as e:
|
140 |
+
logger.error(f"An error occurred while fetching channel videos: {str(e)}")
|
141 |
return []
|
142 |
|
143 |
def extract_channel_id(url):
|
|
|
146 |
return channel_id_match.group(1)
|
147 |
return None
|
148 |
|
149 |
def test_api_key():
    """Smoke-test the configured API key with one lightweight videos.list call.

    Fetches metadata for a well-known public video and reports whether the
    call succeeded. Returns True when the key works, False otherwise.
    """
    youtube = get_youtube_client()
    try:
        # One cheap request is enough to validate the key end-to-end.
        response = youtube.videos().list(part="snippet", id="dQw4w9WgXcQ").execute()
    except Exception as e:
        logger.error(f"API key test failed: {str(e)}")
        return False
    if 'items' in response:
        logger.info("API key is valid and working")
        return True
    logger.error("API request successful but returned unexpected response")
    return False
|
163 |
+
|
164 |
def initialize_youtube_api():
    """Verify YouTube API connectivity at startup; return True on success."""
    ok = test_api_key()
    if ok:
        logger.info("YouTube API initialized successfully")
    else:
        logger.error("Failed to initialize YouTube API")
    return ok
|
data/sqlite.db
CHANGED
Binary files a/data/sqlite.db and b/data/sqlite.db differ
|
|
docker-compose.yaml
CHANGED
@@ -20,14 +20,21 @@ services:
|
|
20 |
volumes:
|
21 |
- ./data:/app/data
|
22 |
- ./config:/app/config
|
|
|
23 |
|
24 |
elasticsearch:
|
25 |
-
image: docker.elastic.co/elasticsearch/elasticsearch:
|
|
|
26 |
environment:
|
27 |
- discovery.type=single-node
|
28 |
-
-
|
29 |
ports:
|
30 |
- "9200:9200"
|
|
|
|
|
|
|
|
|
|
|
31 |
volumes:
|
32 |
- esdata:/usr/share/elasticsearch/data
|
33 |
|
@@ -50,5 +57,6 @@ services:
|
|
50 |
|
51 |
volumes:
|
52 |
esdata:
|
|
|
53 |
grafana-storage:
|
54 |
ollama_data:
|
|
|
20 |
volumes:
|
21 |
- ./data:/app/data
|
22 |
- ./config:/app/config
|
23 |
+
- ./app:/app/app # Add this line to map your local app directory
|
24 |
|
25 |
elasticsearch:
|
26 |
+
image: docker.elastic.co/elasticsearch/elasticsearch:8.9.0
|
27 |
+
container_name: elasticsearch
|
28 |
environment:
|
29 |
- discovery.type=single-node
|
30 |
+
- xpack.security.enabled=false
|
31 |
ports:
|
32 |
- "9200:9200"
|
33 |
+
- "9300:9300"
|
34 |
+
deploy:
|
35 |
+
resources:
|
36 |
+
limits:
|
37 |
+
memory: 2G
|
38 |
volumes:
|
39 |
- esdata:/usr/share/elasticsearch/data
|
40 |
|
|
|
57 |
|
58 |
volumes:
|
59 |
esdata:
|
60 |
+
driver: local
|
61 |
grafana-storage:
|
62 |
ollama_data:
|
requirements.txt
CHANGED
@@ -12,4 +12,6 @@ ollama
|
|
12 |
requests
|
13 |
matplotlib
|
14 |
tqdm
|
15 |
-
python-dotenv
|
|
|
|
|
|
12 |
requests
|
13 |
matplotlib
|
14 |
tqdm
|
15 |
+
python-dotenv
|
16 |
+
certifi
|
17 |
+
httplib2
|
run-docker-compose-windows.ps1
CHANGED
@@ -5,18 +5,35 @@ $envPath = ".\.env"
|
|
5 |
if (Test-Path $envPath) {
|
6 |
# Read the .env file
|
7 |
$envContent = Get-Content $envPath
|
8 |
-
|
9 |
# Parse the environment variables
|
10 |
foreach ($line in $envContent) {
|
11 |
if ($line -match '^([^=]+)=(.*)$') {
|
12 |
$name = $matches[1]
|
13 |
$value = $matches[2]
|
14 |
[Environment]::SetEnvironmentVariable($name, $value, "Process")
|
|
|
15 |
}
|
16 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
# Run
|
19 |
-
|
|
|
20 |
}
|
21 |
else {
|
22 |
Write-Error "The .env file was not found at $envPath"
|
|
|
5 |
if (Test-Path $envPath) {
|
6 |
# Read the .env file
|
7 |
$envContent = Get-Content $envPath
|
|
|
8 |
# Parse the environment variables
|
9 |
foreach ($line in $envContent) {
|
10 |
if ($line -match '^([^=]+)=(.*)$') {
|
11 |
$name = $matches[1]
|
12 |
$value = $matches[2]
|
13 |
[Environment]::SetEnvironmentVariable($name, $value, "Process")
|
14 |
+
Write-Host "Loaded environment variable: $name"
|
15 |
}
|
16 |
}
|
17 |
+
|
18 |
+
# Stop existing containers
|
19 |
+
Write-Host "Stopping existing containers..."
|
20 |
+
docker-compose down
|
21 |
+
|
22 |
+
# Rebuild the container
|
23 |
+
Write-Host "Rebuilding Docker containers..."
|
24 |
+
docker-compose build --no-cache app
|
25 |
+
|
26 |
+
# Start the services
|
27 |
+
Write-Host "Starting Docker services..."
|
28 |
+
docker-compose up -d
|
29 |
+
|
30 |
+
# Wait for services to be ready
|
31 |
+
Write-Host "Waiting for services to start up..."
|
32 |
+
Start-Sleep -Seconds 20
|
33 |
|
34 |
+
# Run the Streamlit app
|
35 |
+
Write-Host "Starting Streamlit app..."
|
36 |
+
docker-compose exec -T app sh -c "cd /app/app && streamlit run main.py"
|
37 |
}
|
38 |
else {
|
39 |
Write-Error "The .env file was not found at $envPath"
|