ganesh3 committed on
Commit
25b2b2b
·
1 Parent(s): a0d49e7

third commit

Browse files
app/data_processor.py CHANGED
@@ -1,15 +1,13 @@
1
- import logging
2
  from minsearch import Index
3
  from sentence_transformers import SentenceTransformer
4
  import numpy as np
5
- from sklearn.metrics.pairwise import cosine_similarity
6
- import re
7
  from elasticsearch import Elasticsearch
8
  import os
9
  import json
10
- from transcript_extractor import get_transcript
 
11
 
12
- logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
14
 
15
  def clean_text(text):
@@ -18,13 +16,17 @@ def clean_text(text):
18
  return ""
19
  cleaned = re.sub(r'[^\w\s.,!?]', ' ', text)
20
  cleaned = re.sub(r'\s+', ' ', cleaned).strip()
21
- logger.info(f"Cleaned text: '{cleaned[:100]}...'")
 
22
  return cleaned
23
 
24
  class DataProcessor:
25
  def __init__(self, text_fields=["content", "title", "description"],
26
  keyword_fields=["video_id", "author", "upload_date"],
27
- embedding_model="all-MiniLM-L6-v2"):
 
 
 
28
  self.text_index = Index(text_fields=text_fields, keyword_fields=keyword_fields)
29
  self.embedding_model = SentenceTransformer(embedding_model)
30
  self.documents = []
@@ -39,18 +41,29 @@ class DataProcessor:
39
  logger.info(f"DataProcessor initialized with Elasticsearch at {elasticsearch_host}:{elasticsearch_port}")
40
 
41
  def process_transcript(self, video_id, transcript_data):
42
- if not transcript_data or 'metadata' not in transcript_data or 'transcript' not in transcript_data:
43
- logger.error(f"Invalid transcript data for video {video_id}")
 
 
 
 
 
 
 
44
  return None
45
 
46
  metadata = transcript_data['metadata']
47
  transcript = transcript_data['transcript']
48
 
49
- logger.info(f"Processing transcript for video {video_id}")
50
  logger.info(f"Number of transcript segments: {len(transcript)}")
51
 
52
  full_transcript = " ".join([segment.get('text', '') for segment in transcript])
 
 
 
53
  cleaned_transcript = clean_text(full_transcript)
 
 
54
 
55
  if not cleaned_transcript:
56
  logger.warning(f"Empty cleaned transcript for video {video_id}")
@@ -59,17 +72,25 @@ class DataProcessor:
59
  doc = {
60
  "video_id": video_id,
61
  "content": cleaned_transcript,
62
- "segment_id": f"{video_id}_full",
63
  "title": clean_text(metadata.get('title', '')),
 
64
  "author": metadata.get('author', ''),
65
  "upload_date": metadata.get('upload_date', ''),
 
66
  "view_count": metadata.get('view_count', 0),
67
  "like_count": metadata.get('like_count', 0),
68
  "comment_count": metadata.get('comment_count', 0),
69
  "video_duration": metadata.get('duration', '')
70
  }
 
 
 
 
 
 
71
  self.documents.append(doc)
72
- self.embeddings.append(self.embedding_model.encode(cleaned_transcript + " " + metadata.get('title', '')))
 
73
 
74
  logger.info(f"Processed transcript for video {video_id}")
75
  return f"video_{video_id}_{self.embedding_model.get_sentence_embedding_dimension()}"
@@ -80,28 +101,52 @@ class DataProcessor:
80
  return None
81
 
82
  logger.info(f"Building index with {len(self.documents)} documents")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  try:
84
- self.text_index.fit(self.documents)
 
85
  self.index_built = True
86
  logger.info("Text index built successfully")
87
  except Exception as e:
88
  logger.error(f"Error building text index: {str(e)}")
89
  raise
90
 
91
- self.embeddings = np.array(self.embeddings)
92
-
93
  try:
94
  if not self.es.indices.exists(index=index_name):
95
  self.es.indices.create(index=index_name, body={
96
  "mappings": {
97
  "properties": {
98
- "embedding": {"type": "dense_vector", "dims": self.embeddings.shape[1]},
99
  "content": {"type": "text"},
100
- "video_id": {"type": "keyword"},
101
- "segment_id": {"type": "keyword"},
102
  "title": {"type": "text"},
 
 
103
  "author": {"type": "keyword"},
104
  "upload_date": {"type": "date"},
 
105
  "view_count": {"type": "integer"},
106
  "like_count": {"type": "integer"},
107
  "comment_count": {"type": "integer"},
@@ -122,19 +167,71 @@ class DataProcessor:
122
  except Exception as e:
123
  logger.error(f"Error building Elasticsearch index: {str(e)}")
124
  raise
 
 
 
125
 
126
- def ensure_index_built(self, video_id, embedding_model):
127
- index_name = f"video_{video_id}_{embedding_model.replace('-', '_')}".lower()
128
- if not self.es.indices.exists(index=index_name):
129
- logger.info(f"Index {index_name} does not exist. Building now...")
130
- transcript_data = get_transcript(video_id)
131
- if transcript_data:
132
- self.process_transcript(video_id, transcript_data)
133
- return self.build_index(index_name)
134
- else:
135
- logger.error(f"Failed to retrieve transcript for video {video_id}")
136
- return None
137
- return index_name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  def search(self, query, filter_dict={}, boost_dict={}, num_results=10, method='hybrid', index_name=None):
140
  if not index_name:
@@ -147,78 +244,67 @@ class DataProcessor:
147
 
148
  logger.info(f"Performing {method} search for query: {query} in index: {index_name}")
149
 
150
- if method == 'text':
151
- return self.text_search(query, filter_dict, boost_dict, num_results, index_name)
152
- elif method == 'embedding':
153
- return self.embedding_search(query, num_results, index_name)
154
- else: # hybrid search
155
- text_results = self.text_search(query, filter_dict, boost_dict, num_results, index_name)
156
- embedding_results = self.embedding_search(query, num_results, index_name)
157
- return self.combine_results(text_results, embedding_results, num_results)
 
 
158
 
159
  def text_search(self, query, filter_dict={}, boost_dict={}, num_results=10, index_name=None):
160
  if not index_name:
161
  logger.error("No index name provided for text search.")
162
  raise ValueError("No index name provided for text search.")
163
 
164
- # Perform text search using Elasticsearch
165
- search_body = {
166
- "query": {
167
- "multi_match": {
168
- "query": query,
169
- "fields": ["content", "title"]
170
- }
171
- },
172
- "size": num_results
173
- }
174
- response = self.es.search(index=index_name, body=search_body)
175
- return [hit['_source'] for hit in response['hits']['hits']]
 
 
 
176
 
177
  def embedding_search(self, query, num_results=10, index_name=None):
178
  if not index_name:
179
  logger.error("No index name provided for embedding search.")
180
  raise ValueError("No index name provided for embedding search.")
181
 
182
- query_vector = self.embedding_model.encode(query).tolist()
183
- script_query = {
184
- "script_score": {
185
- "query": {"match_all": {}},
186
- "script": {
187
- "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
188
- "params": {"query_vector": query_vector}
 
 
189
  }
190
  }
191
- }
192
- response = self.es.search(
193
- index=index_name,
194
- body={
195
- "size": num_results,
196
- "query": script_query,
197
- "_source": {"excludes": ["embedding"]}
198
- }
199
- )
200
- return [hit['_source'] for hit in response['hits']['hits']]
201
-
202
- def combine_results(self, text_results, embedding_results, num_results):
203
- combined = []
204
- for i in range(max(len(text_results), len(embedding_results))):
205
- if i < len(text_results):
206
- combined.append(text_results[i])
207
- if i < len(embedding_results):
208
- combined.append(embedding_results[i])
209
-
210
- seen = set()
211
- deduped = []
212
- for doc in combined:
213
- if doc['segment_id'] not in seen:
214
- seen.add(doc['segment_id'])
215
- deduped.append(doc)
216
-
217
- return deduped[:num_results]
218
-
219
- def process_query(self, query):
220
- return clean_text(query)
221
-
222
  def set_embedding_model(self, model_name):
223
  self.embedding_model = SentenceTransformer(model_name)
224
  logger.info(f"Embedding model set to: {model_name}")
 
 
1
  from minsearch import Index
2
  from sentence_transformers import SentenceTransformer
3
  import numpy as np
 
 
4
  from elasticsearch import Elasticsearch
5
  import os
6
  import json
7
+ import logging
8
+ import re
9
 
10
+ logging.basicConfig(level=logging.DEBUG)
11
  logger = logging.getLogger(__name__)
12
 
13
  def clean_text(text):
 
16
  return ""
17
  cleaned = re.sub(r'[^\w\s.,!?]', ' ', text)
18
  cleaned = re.sub(r'\s+', ' ', cleaned).strip()
19
+ logger.debug(f"Original text length: {len(text)}, Cleaned text length: {len(cleaned)}")
20
+ logger.debug(f"Cleaned text sample: '{cleaned[:100]}...'")
21
  return cleaned
22
 
23
  class DataProcessor:
24
  def __init__(self, text_fields=["content", "title", "description"],
25
  keyword_fields=["video_id", "author", "upload_date"],
26
+ embedding_model="multi-qa-MiniLM-L6-cos-v1"):
27
+ self.text_fields = text_fields
28
+ self.keyword_fields = keyword_fields
29
+ self.all_fields = text_fields + keyword_fields
30
  self.text_index = Index(text_fields=text_fields, keyword_fields=keyword_fields)
31
  self.embedding_model = SentenceTransformer(embedding_model)
32
  self.documents = []
 
41
  logger.info(f"DataProcessor initialized with Elasticsearch at {elasticsearch_host}:{elasticsearch_port}")
42
 
43
  def process_transcript(self, video_id, transcript_data):
44
+ logger.info(f"Processing transcript for video {video_id}")
45
+
46
+ if not transcript_data:
47
+ logger.error(f"Transcript data is None for video {video_id}")
48
+ return None
49
+
50
+ if 'metadata' not in transcript_data or 'transcript' not in transcript_data:
51
+ logger.error(f"Invalid transcript data structure for video {video_id}")
52
+ logger.debug(f"Transcript data keys: {transcript_data.keys()}")
53
  return None
54
 
55
  metadata = transcript_data['metadata']
56
  transcript = transcript_data['transcript']
57
 
 
58
  logger.info(f"Number of transcript segments: {len(transcript)}")
59
 
60
  full_transcript = " ".join([segment.get('text', '') for segment in transcript])
61
+ logger.debug(f"Full transcript length before cleaning: {len(full_transcript)}")
62
+ logger.debug(f"Full transcript sample before cleaning: '{full_transcript[:500]}...'")
63
+
64
  cleaned_transcript = clean_text(full_transcript)
65
+ logger.debug(f"Cleaned transcript length: {len(cleaned_transcript)}")
66
+ logger.debug(f"Cleaned transcript sample: '{cleaned_transcript[:500]}...'")
67
 
68
  if not cleaned_transcript:
69
  logger.warning(f"Empty cleaned transcript for video {video_id}")
 
72
  doc = {
73
  "video_id": video_id,
74
  "content": cleaned_transcript,
 
75
  "title": clean_text(metadata.get('title', '')),
76
+ "description": clean_text(metadata.get('description', 'Not Available')),
77
  "author": metadata.get('author', ''),
78
  "upload_date": metadata.get('upload_date', ''),
79
+ "segment_id": f"{video_id}_full",
80
  "view_count": metadata.get('view_count', 0),
81
  "like_count": metadata.get('like_count', 0),
82
  "comment_count": metadata.get('comment_count', 0),
83
  "video_duration": metadata.get('duration', '')
84
  }
85
+
86
+ logger.debug(f"Document created for video {video_id}")
87
+ for field in self.all_fields:
88
+ logger.debug(f"Document {field} length: {len(str(doc.get(field, '')))}")
89
+ logger.debug(f"Document {field} sample: '{str(doc.get(field, ''))[:100]}...'")
90
+
91
  self.documents.append(doc)
92
+ embedding = self.embedding_model.encode(cleaned_transcript + " " + metadata.get('title', ''))
93
+ self.embeddings.append(embedding)
94
 
95
  logger.info(f"Processed transcript for video {video_id}")
96
  return f"video_{video_id}_{self.embedding_model.get_sentence_embedding_dimension()}"
 
101
  return None
102
 
103
  logger.info(f"Building index with {len(self.documents)} documents")
104
+
105
+ # Fields to include in the fit function
106
+ index_fields = self.text_fields + self.keyword_fields
107
+
108
+ # Create a list of dictionaries with only the fields we want to index
109
+ docs_to_index = []
110
+ for doc in self.documents:
111
+ indexed_doc = {field: doc.get(field, '') for field in index_fields}
112
+ if all(indexed_doc.values()): # Check if all required fields have values
113
+ docs_to_index.append(indexed_doc)
114
+ else:
115
+ missing_fields = [field for field, value in indexed_doc.items() if not value]
116
+ logger.warning(f"Document with video_id {doc.get('video_id', 'unknown')} is missing values for fields: {missing_fields}")
117
+
118
+ if not docs_to_index:
119
+ logger.error("No valid documents to index")
120
+ return None
121
+
122
+ logger.info(f"Number of valid documents to index: {len(docs_to_index)}")
123
+
124
+ # Log the structure of the first document to be indexed
125
+ logger.debug("Structure of the first document to be indexed:")
126
+ logger.debug(json.dumps(docs_to_index[0], indent=2))
127
+
128
  try:
129
+ logger.info("Fitting text index")
130
+ self.text_index.fit(docs_to_index)
131
  self.index_built = True
132
  logger.info("Text index built successfully")
133
  except Exception as e:
134
  logger.error(f"Error building text index: {str(e)}")
135
  raise
136
 
 
 
137
  try:
138
  if not self.es.indices.exists(index=index_name):
139
  self.es.indices.create(index=index_name, body={
140
  "mappings": {
141
  "properties": {
142
+ "embedding": {"type": "dense_vector", "dims": len(self.embeddings[0]), "index": True, "similarity": "cosine"},
143
  "content": {"type": "text"},
 
 
144
  "title": {"type": "text"},
145
+ "description": {"type": "text"},
146
+ "video_id": {"type": "keyword"},
147
  "author": {"type": "keyword"},
148
  "upload_date": {"type": "date"},
149
+ "segment_id": {"type": "keyword"},
150
  "view_count": {"type": "integer"},
151
  "like_count": {"type": "integer"},
152
  "comment_count": {"type": "integer"},
 
167
  except Exception as e:
168
  logger.error(f"Error building Elasticsearch index: {str(e)}")
169
  raise
170
+
171
def compute_rrf(self, rank, k=60):
    """Reciprocal Rank Fusion (RRF) score for a 1-based rank.

    The constant ``k`` (default 60, the value used in the original RRF
    paper) dampens the dominance of top-ranked items when fusing lists.
    """
    return 1.0 / (rank + k)
173
 
174
def hybrid_search(self, query, index_name, num_results=5):
    """Hybrid retrieval: kNN vector search + keyword search fused with RRF.

    Runs an embedding (kNN) query and a multi_match keyword query against
    the same Elasticsearch index, fuses the two rankings with Reciprocal
    Rank Fusion, and returns the top ``num_results`` documents.

    Args:
        query: Free-text user query.
        index_name: Elasticsearch index to search; required.
        num_results: Number of fused results to return.

    Returns:
        List of document ``_source`` dicts, best-scored first.

    Raises:
        ValueError: If ``index_name`` is falsy.
        Exception: Elasticsearch errors are logged and re-raised.
    """
    if not index_name:
        logger.error("No index name provided for hybrid search.")
        raise ValueError("No index name provided for hybrid search.")

    vector = self.embedding_model.encode(query)

    knn_query = {
        "field": "embedding",
        "query_vector": vector.tolist(),
        "k": 10,
        "num_candidates": 100,
    }

    keyword_query = {
        "multi_match": {
            "query": query,
            "fields": self.text_fields,
        }
    }

    try:
        knn_results = self.es.search(
            index=index_name,
            body={
                "knn": knn_query,
                "size": 10,
            },
        )['hits']['hits']

        keyword_results = self.es.search(
            index=index_name,
            body={
                "query": keyword_query,
                "size": 10,
            },
        )['hits']['hits']

        # Fuse the two rankings with Reciprocal Rank Fusion.
        rrf_scores = {}
        # Fix: cache each hit's _source so we don't refetch every result
        # with an extra es.get round-trip (was an N+1 query pattern).
        sources = {}
        for rank, hit in enumerate(knn_results):
            doc_id = hit['_id']
            rrf_scores[doc_id] = self.compute_rrf(rank + 1)
            sources[doc_id] = hit.get('_source')

        for rank, hit in enumerate(keyword_results):
            doc_id = hit['_id']
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + self.compute_rrf(rank + 1)
            sources.setdefault(doc_id, hit.get('_source'))

        reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)

        final_results = []
        for doc_id, _score in reranked_docs[:num_results]:
            source = sources.get(doc_id)
            if source is None:
                # Fallback for hits that carried no _source payload.
                source = self.es.get(index=index_name, id=doc_id)['_source']
            final_results.append(source)

        return final_results
    except Exception as e:
        logger.error(f"Error in hybrid search: {str(e)}")
        raise
235
 
236
  def search(self, query, filter_dict={}, boost_dict={}, num_results=10, method='hybrid', index_name=None):
237
  if not index_name:
 
244
 
245
  logger.info(f"Performing {method} search for query: {query} in index: {index_name}")
246
 
247
+ try:
248
+ if method == 'text':
249
+ return self.text_search(query, filter_dict, boost_dict, num_results, index_name)
250
+ elif method == 'embedding':
251
+ return self.embedding_search(query, num_results, index_name)
252
+ else: # hybrid search
253
+ return self.hybrid_search(query, index_name, num_results)
254
+ except Exception as e:
255
+ logger.error(f"Error in search method {method}: {str(e)}")
256
+ raise
257
 
258
def text_search(self, query, filter_dict=None, boost_dict=None, num_results=10, index_name=None):
    """Keyword (BM25) search over the configured text fields.

    Args:
        query: Free-text query string.
        filter_dict: Accepted for interface compatibility; currently unused.
        boost_dict: Accepted for interface compatibility; currently unused.
        num_results: Maximum number of hits to return.
        index_name: Elasticsearch index to query; required.

    Returns:
        List of matching document ``_source`` dicts.

    Raises:
        ValueError: If ``index_name`` is falsy.
        Exception: Elasticsearch errors are logged and re-raised.
    """
    # Fix: the original used mutable default arguments ({}); None sentinels
    # avoid the shared-mutable-default pitfall. Both params are unused in
    # the body, so observable behavior is unchanged.
    if not index_name:
        logger.error("No index name provided for text search.")
        raise ValueError("No index name provided for text search.")

    try:
        search_body = {
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": self.text_fields,
                }
            },
            "size": num_results,
        }
        response = self.es.search(index=index_name, body=search_body)
        return [hit['_source'] for hit in response['hits']['hits']]
    except Exception as e:
        logger.error(f"Error in text search: {str(e)}")
        raise
+ raise
278
 
279
def embedding_search(self, query, num_results=10, index_name=None):
    """Semantic search ranking documents by cosine similarity to the query.

    Encodes ``query`` with the sentence-transformer model and runs an
    Elasticsearch script_score query against the stored ``embedding``
    vectors; the embedding field is excluded from returned documents.

    Raises:
        ValueError: If ``index_name`` is falsy.
    """
    if not index_name:
        logger.error("No index name provided for embedding search.")
        raise ValueError("No index name provided for embedding search.")

    try:
        query_vector = self.embedding_model.encode(query).tolist()
        # "+ 1.0" keeps script scores non-negative (ES requirement).
        script_query = {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                    "params": {"query_vector": query_vector},
                },
            }
        }
        request_body = {
            "size": num_results,
            "query": script_query,
            "_source": {"excludes": ["embedding"]},
        }
        hits = self.es.search(index=index_name, body=request_body)['hits']['hits']
        return [hit['_source'] for hit in hits]
    except Exception as e:
        logger.error(f"Error in embedding search: {str(e)}")
        raise
307
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
def set_embedding_model(self, model_name):
    """Replace the current sentence-transformer with the named model."""
    model = SentenceTransformer(model_name)
    self.embedding_model = model
    logger.info(f"Embedding model set to: {model_name}")
app/evaluation.py CHANGED
@@ -1,5 +1,8 @@
1
  from sklearn.metrics.pairwise import cosine_similarity
2
  import numpy as np
 
 
 
3
 
4
  class EvaluationSystem:
5
  def __init__(self, data_processor, database_handler):
@@ -7,15 +10,15 @@ class EvaluationSystem:
7
  self.db_handler = database_handler
8
 
9
  def relevance_scoring(self, query, retrieved_docs, top_k=5):
10
- query_embedding = self.data_processor.process_query(query)
11
- doc_embeddings = [self.data_processor.process_query(doc) for doc in retrieved_docs]
12
 
13
  similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
14
  return np.mean(sorted(similarities, reverse=True)[:top_k])
15
 
16
  def answer_similarity(self, generated_answer, reference_answer):
17
- gen_embedding = self.data_processor.process_query(generated_answer)
18
- ref_embedding = self.data_processor.process_query(reference_answer)
19
  return cosine_similarity([gen_embedding], [ref_embedding])[0][0]
20
 
21
  def human_evaluation(self, video_id, query):
@@ -34,8 +37,8 @@ class EvaluationSystem:
34
  human_scores = []
35
 
36
  for query, reference in zip(test_queries, reference_answers):
37
- retrieved_docs = rag_system.es_handler.search(index_name, rag_system.data_processor.process_query(query))
38
- generated_answer = rag_system.query(index_name, query)
39
 
40
  relevance_scores.append(self.relevance_scoring(query, retrieved_docs))
41
  similarity_scores.append(self.answer_similarity(generated_answer, reference))
@@ -45,4 +48,67 @@ class EvaluationSystem:
45
  "avg_relevance_score": np.mean(relevance_scores),
46
  "avg_similarity_score": np.mean(similarity_scores),
47
  "avg_human_score": np.mean(human_scores)
48
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from sklearn.metrics.pairwise import cosine_similarity
2
  import numpy as np
3
+ import pandas as pd
4
+ import json
5
+ import ollama
6
 
7
  class EvaluationSystem:
8
  def __init__(self, data_processor, database_handler):
 
10
  self.db_handler = database_handler
11
 
12
def relevance_scoring(self, query, retrieved_docs, top_k=5):
    """Mean cosine similarity between the query and the top_k closest docs.

    Args:
        query: Query string.
        retrieved_docs: Iterable of dicts, each carrying a 'content' key.
        top_k: Number of best-matching documents to average over.

    Returns:
        float: Mean of the top_k similarities; 0.0 when ``retrieved_docs``
        is empty (the original raised inside cosine_similarity).
    """
    # Fix: guard against an empty result set, which previously crashed.
    if not retrieved_docs:
        return 0.0
    query_embedding = self.data_processor.embedding_model.encode(query)
    doc_embeddings = [self.data_processor.embedding_model.encode(doc['content']) for doc in retrieved_docs]

    similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
    return np.mean(sorted(similarities, reverse=True)[:top_k])
18
 
19
def answer_similarity(self, generated_answer, reference_answer):
    """Cosine similarity between the embeddings of two answer strings."""
    encode = self.data_processor.embedding_model.encode
    gen_embedding = encode(generated_answer)
    ref_embedding = encode(reference_answer)
    return cosine_similarity([gen_embedding], [ref_embedding])[0][0]
23
 
24
  def human_evaluation(self, video_id, query):
 
37
  human_scores = []
38
 
39
  for query, reference in zip(test_queries, reference_answers):
40
+ retrieved_docs = rag_system.data_processor.search(query, num_results=5, method='hybrid', index_name=index_name)
41
+ generated_answer, _ = rag_system.query(query, search_method='hybrid', index_name=index_name)
42
 
43
  relevance_scores.append(self.relevance_scoring(query, retrieved_docs))
44
  similarity_scores.append(self.answer_similarity(generated_answer, reference))
 
48
  "avg_relevance_score": np.mean(relevance_scores),
49
  "avg_similarity_score": np.mean(similarity_scores),
50
  "avg_human_score": np.mean(human_scores)
51
+ }
52
+
53
def llm_as_judge(self, question, generated_answer, prompt_template):
    """Grade an answer with a local LLM (ollama); return parsed JSON or None.

    ``prompt_template`` must expose ``question`` and ``answer_llm``
    placeholders and instruct the model to reply with parsable JSON.
    """
    prompt = prompt_template.format(question=question, answer_llm=generated_answer)

    try:
        reply = ollama.chat(
            model='phi3.5',
            messages=[{"role": "user", "content": prompt}],
        )
        return json.loads(reply['message']['content'])
    except Exception as e:
        # Best-effort grading: report the failure but never raise.
        print(f"Error in LLM evaluation: {str(e)}")
        return None
66
+
67
def evaluate_rag(self, rag_system, ground_truth_file, sample_size=200, prompt_template=None):
    """Evaluate a RAG system against a CSV of ground-truth questions.

    Samples up to ``sample_size`` rows from the CSV, queries ``rag_system``
    for each question, and grades the answer either with the LLM judge
    (when ``prompt_template`` is given) or by cosine similarity against a
    ``reference_answer`` column.

    Returns:
        List of (video_id, question, answer, verdict, explanation) string
        tuples, or None when the ground-truth file is missing.
    """
    try:
        ground_truth = pd.read_csv(ground_truth_file)
    except FileNotFoundError:
        print("Ground truth file not found. Please generate ground truth data first.")
        return None

    # Fixed random_state keeps the sample reproducible across runs.
    sample = ground_truth.sample(n=min(sample_size, len(ground_truth)), random_state=1)
    evaluations = []

    for _, row in sample.iterrows():
        question = row['question']
        video_id = row['video_id']

        # assumes all indexes were built with this embedding model — TODO confirm
        index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id, "multi-qa-MiniLM-L6-cos-v1")

        if not index_name:
            print(f"No index found for video {video_id}. Skipping this question.")
            continue

        try:
            answer_llm, _ = rag_system.query(question, search_method='hybrid', index_name=index_name)
        except ValueError as e:
            print(f"Error querying RAG system: {str(e)}")
            continue

        if prompt_template:
            # LLM-as-judge path; rows where grading fails are skipped.
            evaluation = self.llm_as_judge(question, answer_llm, prompt_template)
            if evaluation:
                evaluations.append((
                    str(video_id),
                    str(question),
                    str(answer_llm),
                    str(evaluation.get('Relevance', 'UNKNOWN')),
                    str(evaluation.get('Explanation', 'No explanation provided'))
                ))
        else:
            # Fallback to cosine similarity if no prompt template is provided
            similarity = self.answer_similarity(answer_llm, row.get('reference_answer', ''))
            evaluations.append((
                str(video_id),
                str(question),
                str(answer_llm),
                f"Similarity: {similarity}",
                "Cosine similarity used for evaluation"
            ))

    return evaluations
app/generate_ground_truth.py CHANGED
@@ -1,27 +1,16 @@
1
- import os
2
  import pandas as pd
3
  import json
4
- from youtube_transcript_api import YouTubeTranscriptApi
5
  from tqdm import tqdm
6
- import requests
7
-
8
- OLLAMA_HOST = os.getenv('OLLAMA_HOST', 'localhost')
9
- OLLAMA_PORT = os.getenv('OLLAMA_PORT', '11434')
10
-
11
- def get_transcript(video_id):
12
- try:
13
- transcript = YouTubeTranscriptApi.get_transcript(video_id)
14
- return " ".join([entry['text'] for entry in transcript])
15
- except Exception as e:
16
- print(f"Error extracting transcript for video {video_id}: {str(e)}")
17
- return None
18
 
19
  def generate_questions(transcript):
20
  prompt_template = """
21
  You are an AI assistant tasked with generating questions based on a YouTube video transcript.
22
- Formulate 10 questions that a user might ask based on the provided transcript.
23
  Make the questions specific to the content of the transcript.
24
  The questions should be complete and not too short. Use as few words as possible from the transcript.
 
25
 
26
  The transcript:
27
 
@@ -34,34 +23,62 @@ def generate_questions(transcript):
34
 
35
  prompt = prompt_template.format(transcript=transcript)
36
 
37
- response = requests.post(f'http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/generate', json={
38
- 'model': 'phi3.5',
39
- 'prompt': prompt
40
- })
41
-
42
- if response.status_code == 200:
43
- return json.loads(response.json()['response'])
 
 
 
 
 
 
 
 
 
44
  else:
45
- print(f"Error: {response.status_code} - {response.text}")
46
  return None
47
 
48
- def main():
49
- video_id = "zjkBMFhNj_g"
50
- transcript = get_transcript(video_id)
51
 
52
- if transcript:
53
- questions = generate_questions(transcript)
54
 
55
- if questions:
56
- df = pd.DataFrame([(video_id, q) for q in questions['questions']], columns=['video_id', 'question'])
57
-
58
- os.makedirs('data', exist_ok=True)
59
- df.to_csv('data/ground-truth-retrieval.csv', index=False)
60
- print("Ground truth data saved to data/ground-truth-retrieval.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  else:
62
- print("Failed to generate questions.")
 
 
 
 
 
 
 
63
  else:
64
- print("Failed to generate ground truth data due to transcript retrieval error.")
65
-
66
- if __name__ == "__main__":
67
- main()
 
 
1
  import pandas as pd
2
  import json
 
3
  from tqdm import tqdm
4
+ import ollama
5
+ from transcript_extractor import get_transcript
 
 
 
 
 
 
 
 
 
 
6
 
7
  def generate_questions(transcript):
8
  prompt_template = """
9
  You are an AI assistant tasked with generating questions based on a YouTube video transcript.
10
+ Formulate at least 10 questions that a user might ask based on the provided transcript.
11
  Make the questions specific to the content of the transcript.
12
  The questions should be complete and not too short. Use as few words as possible from the transcript.
13
+ It is important that the questions are relevant to the content of the transcript and are at least 10 in number.
14
 
15
  The transcript:
16
 
 
23
 
24
  prompt = prompt_template.format(transcript=transcript)
25
 
26
+ try:
27
+ response = ollama.chat(
28
+ model='phi3.5',
29
+ messages=[{"role": "user", "content": prompt}]
30
+ )
31
+ return json.loads(response['message']['content'])
32
+ except Exception as e:
33
+ print(f"Error generating questions: {str(e)}")
34
+ return None
35
+
36
def generate_ground_truth(db_handler, data_processor, video_id):
    """Generate ground-truth questions for one video and save them to CSV.

    Fetches the transcript, indexes it via ``data_processor``, asks the LLM
    for questions, and writes a (video_id, question) CSV.

    Args:
        db_handler: Database handler (accepted for interface parity; unused here).
        data_processor: DataProcessor used to index the transcript.
        video_id: YouTube video id.

    Returns:
        pandas.DataFrame of generated questions, or None on failure.
    """
    import os  # local import keeps this fix self-contained

    transcript_data = get_transcript(video_id)
    if transcript_data and 'transcript' in transcript_data:
        full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
        # Process the transcript
        data_processor.process_transcript(video_id, transcript_data)
    else:
        print(f"Failed to retrieve transcript for video {video_id}")
        return None

    questions = generate_questions(full_transcript)

    if questions and 'questions' in questions:
        df = pd.DataFrame([(video_id, q) for q in questions['questions']], columns=['video_id', 'question'])

        csv_path = 'data/ground-truth-retrieval.csv'
        # Fix: ensure the output directory exists (the earlier version
        # called os.makedirs; this rewrite had dropped it).
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        df.to_csv(csv_path, index=False)
        print(f"Ground truth data saved to {csv_path}")
        return df
    else:
        print("Failed to generate questions.")
        return None
58
+
59
def generate_ground_truth_for_all_videos(db_handler, data_processor):
    """Generate ground-truth questions for every video known to the database.

    Iterates all videos, indexes each transcript, generates questions per
    video, and writes one combined CSV.

    Args:
        db_handler: Database handler providing ``get_all_videos()``.
        data_processor: DataProcessor used to index each transcript.

    Returns:
        pandas.DataFrame of all (video_id, question) rows, or None when no
        questions could be generated.
    """
    import os  # local import keeps this fix self-contained

    videos = db_handler.get_all_videos()
    all_questions = []

    for video in tqdm(videos, desc="Generating ground truth"):
        video_id = video[0]  # Assuming the video ID is the first element in the tuple
        transcript_data = get_transcript(video_id)
        if transcript_data and 'transcript' in transcript_data:
            full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
            # Process the transcript
            data_processor.process_transcript(video_id, transcript_data)
            questions = generate_questions(full_transcript)
            if questions and 'questions' in questions:
                all_questions.extend([(video_id, q) for q in questions['questions']])
        else:
            print(f"Failed to retrieve transcript for video {video_id}")

    if all_questions:
        df = pd.DataFrame(all_questions, columns=['video_id', 'question'])
        csv_path = 'data/ground-truth-retrieval.csv'
        # Fix: ensure the output directory exists before writing the CSV.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        df.to_csv(csv_path, index=False)
        print(f"Ground truth data for all videos saved to {csv_path}")
        return df
    else:
        print("Failed to generate questions for any video.")
        return None
 
 
app/main.py CHANGED
@@ -1,21 +1,18 @@
1
  import streamlit as st
2
  import pandas as pd
3
- from transcript_extractor import extract_video_id, get_transcript, get_channel_videos, process_videos
4
  from data_processor import DataProcessor
5
  from database import DatabaseHandler
6
  from rag import RAGSystem
7
  from query_rewriter import QueryRewriter
8
  from evaluation import EvaluationSystem
 
9
  from sentence_transformers import SentenceTransformer
10
  import os
11
- import json
12
- import requests
13
- from tqdm import tqdm
14
- import sqlite3
15
  import logging
16
- import ollama
17
 
18
- logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
 
21
  @st.cache_resource
@@ -33,170 +30,64 @@ def init_components():
33
  st.error(f"Error initializing components: {str(e)}")
34
  st.error("Please check your configuration and ensure all services are running.")
35
  return None, None, None, None, None
36
-
37
- components = init_components()
38
- if components:
39
- db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components
40
- else:
41
- st.stop()
42
-
43
- # Ground Truth Generation
44
 
45
- def generate_questions(transcript):
46
- prompt_template = """
47
- You are an AI assistant tasked with generating questions based on a YouTube video transcript.
48
- Formulate atleast 10 questions that a user might ask based on the provided transcript.
49
- Make the questions specific to the content of the transcript.
50
- The questions should be complete and not too short. Use as few words as possible from the transcript.
51
- It is important that the questions are relevant to the content of the transcript and are atleast 10 in number.
 
 
 
 
 
 
 
 
 
 
52
 
53
- The transcript:
 
 
 
 
 
54
 
55
- {transcript}
56
 
57
- Provide the output in parsable JSON without using code blocks:
 
58
 
59
- {{"questions": ["question1", "question2", ..., "question10"]}}
60
- """.strip()
61
 
62
- prompt = prompt_template.format(transcript=transcript)
 
 
 
63
 
64
- try:
65
- response = ollama.chat(
66
- model='phi3.5',
67
- messages=[{"role": "user", "content": prompt}]
68
- )
69
- print("Printing the response from OLLAMA: " + response['message']['content'])
70
- return json.loads(response['message']['content'])
71
- except Exception as e:
72
- logger.error(f"Error generating questions: {str(e)}")
73
- return None
74
-
75
- def generate_ground_truth(video_id=None, existing_transcript=None):
76
- if video_id is None and existing_transcript is None:
77
- st.error("Please provide either a video ID or an existing transcript.")
78
- return None
79
-
80
- if video_id:
81
- transcript_data = get_transcript(video_id)
82
- if transcript_data and 'transcript' in transcript_data:
83
- full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
84
- else:
85
- logger.error("Failed to retrieve transcript for the provided video ID.")
86
- st.error("Failed to retrieve transcript for the provided video ID.")
87
- return None
88
- else:
89
- full_transcript = existing_transcript
90
 
91
- questions = generate_questions(full_transcript)
92
-
93
- if questions and 'questions' in questions:
94
- df = pd.DataFrame([(video_id if video_id else "custom", q) for q in questions['questions']], columns=['video_id', 'question'])
95
-
96
- os.makedirs('data', exist_ok=True)
97
- df.to_csv('data/ground-truth-retrieval.csv', index=False)
98
- st.success("Ground truth data generated and saved to data/ground-truth-retrieval.csv")
99
- return df
100
- else:
101
- logger.error("Failed to generate questions.")
102
- st.error("Failed to generate questions.")
103
- return None
104
 
105
- # RAG Evaluation
106
- def evaluate_rag(sample_size=200):
107
- try:
108
- ground_truth = pd.read_csv('data/ground-truth-retrieval.csv')
109
- except FileNotFoundError:
110
- logger.error("Ground truth file not found. Please generate ground truth data first.")
111
- st.error("Ground truth file not found. Please generate ground truth data first.")
112
- return None
113
 
114
- sample = ground_truth.sample(n=min(sample_size, len(ground_truth)), random_state=1)
115
- evaluations = []
116
-
117
- prompt_template = """
118
- You are an expert evaluator for a Youtube transcript assistant.
119
- Your task is to analyze the relevance of the generated answer to the given question.
120
- Based on the relevance of the generated answer, you will classify it
121
- as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".
122
-
123
- Here is the data for evaluation:
124
-
125
- Question: {question}
126
- Generated Answer: {answer_llm}
127
-
128
- Please analyze the content and context of the generated answer in relation to the question
129
- and provide your evaluation in parsable JSON without using code blocks:
130
-
131
- {{
132
- "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
133
- "Explanation": "[Provide a brief explanation for your evaluation]"
134
- }}
135
- """.strip()
136
-
137
- progress_bar = st.progress(0)
138
- for i, (_, row) in enumerate(sample.iterrows()):
139
- question = row['question']
140
- video_id = row['video_id']
141
-
142
- # Get the index name for the video (you might need to adjust this based on your setup)
143
- index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id, "all-MiniLM-L6-v2") # Assuming you're using this embedding model
144
-
145
- if not index_name:
146
- logger.warning(f"No index found for video {video_id}. Skipping this question.")
147
- continue
148
 
149
- try:
150
- answer_llm, _ = rag_system.query(question, index_name=index_name)
151
- except ValueError as e:
152
- logger.error(f"Error querying RAG system: {str(e)}")
153
- continue
154
 
155
- prompt = prompt_template.format(question=question, answer_llm=answer_llm)
156
- try:
157
- response = ollama.chat(
158
- model='phi3.5',
159
- messages=[{"role": "user", "content": prompt}]
160
- )
161
- evaluation_json = json.loads(response['message']['content'])
162
- evaluations.append((
163
- str(video_id),
164
- str(question),
165
- str(answer_llm),
166
- str(evaluation_json.get('Relevance', 'UNKNOWN')),
167
- str(evaluation_json.get('Explanation', 'No explanation provided'))
168
- ))
169
- except Exception as e:
170
- logger.warning(f"Failed to evaluate question: {question}. Error: {str(e)}")
171
- st.warning(f"Failed to evaluate question: {question}")
172
- progress_bar.progress((i + 1) / len(sample))
173
-
174
- # Store RAG evaluations in the database
175
- conn = sqlite3.connect('data/sqlite.db')
176
- cursor = conn.cursor()
177
- cursor.execute('''
178
- CREATE TABLE IF NOT EXISTS rag_evaluations (
179
- video_id TEXT,
180
- question TEXT,
181
- answer TEXT,
182
- relevance TEXT,
183
- explanation TEXT
184
- )
185
- ''')
186
- cursor.executemany('''
187
- INSERT INTO rag_evaluations (video_id, question, answer, relevance, explanation)
188
- VALUES (?, ?, ?, ?, ?)
189
- ''', evaluations)
190
- conn.commit()
191
- conn.close()
192
-
193
- logger.info("Evaluation complete. Results stored in the database.")
194
- st.success("Evaluation complete. Results stored in the database.")
195
- return evaluations
196
-
197
- @st.cache_data
198
- def process_single_video(video_id, embedding_model):
199
- # Check if the video has already been processed with the current embedding model
200
  existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
201
  if existing_index:
202
  logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
@@ -205,9 +96,9 @@ def process_single_video(video_id, embedding_model):
205
  transcript_data = get_transcript(video_id)
206
  if transcript_data is None:
207
  logger.error(f"Failed to retrieve transcript for video {video_id}")
 
208
  return None
209
 
210
- # Store video metadata in the database
211
  video_data = {
212
  'video_id': video_id,
213
  'title': transcript_data['metadata'].get('title', 'Unknown Title'),
@@ -222,67 +113,78 @@ def process_single_video(video_id, embedding_model):
222
  db_handler.add_video(video_data)
223
  except Exception as e:
224
  logger.error(f"Error adding video to database: {str(e)}")
 
225
  return None
226
 
227
- # Process transcript for RAG system
228
  try:
229
  data_processor.process_transcript(video_id, transcript_data)
230
  except Exception as e:
231
  logger.error(f"Error processing transcript: {str(e)}")
 
232
  return None
233
 
234
- # Create Elasticsearch index
235
  index_name = f"video_{video_id}_{embedding_model}".lower()
236
  try:
237
  index_name = data_processor.build_index(index_name)
238
  logger.info(f"Successfully built index: {index_name}")
239
  except Exception as e:
240
  logger.error(f"Error building index: {str(e)}")
 
241
  return None
242
 
243
- # Add embedding model to the database
244
  embedding_model_id = db_handler.add_embedding_model(embedding_model, "Description of the model")
245
 
246
- # Get the video ID from the database
247
  video_db_record = db_handler.get_video_by_youtube_id(video_id)
248
  if video_db_record is None:
249
  logger.error(f"Failed to retrieve video record from database for video {video_id}")
 
250
  return None
251
- video_db_id = video_db_record[0] # Assuming the ID is the first column
252
 
253
- # Store Elasticsearch index information
254
  db_handler.add_elasticsearch_index(video_db_id, index_name, embedding_model_id)
255
 
256
  logger.info(f"Processed and indexed transcript for video {video_id}")
 
257
  return index_name
258
 
259
- @st.cache_data
260
- def process_multiple_videos(video_ids, embedding_model):
261
  indices = []
262
  for video_id in video_ids:
263
- index = process_single_video(video_id, embedding_model)
264
  if index:
265
  indices.append(index)
266
  logger.info(f"Processed and indexed transcripts for {len(indices)} videos")
267
  st.success(f"Processed and indexed transcripts for {len(indices)} videos")
268
  return indices
269
 
 
 
 
 
 
 
 
 
 
 
270
  def main():
271
  st.title("YouTube Transcript RAG System")
272
 
273
- components = init_components()
274
- if not all(components):
275
- st.error("Failed to initialize one or more components. Please check the logs and your configuration.")
276
- return
277
-
278
- db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components
279
 
 
 
 
 
 
 
280
  tab1, tab2, tab3 = st.tabs(["RAG System", "Ground Truth Generation", "Evaluation"])
281
 
282
  with tab1:
283
  st.header("RAG System")
284
 
285
- # Video selection section
 
286
  st.subheader("Select a Video")
287
  videos = db_handler.get_all_videos()
288
  if not videos:
@@ -290,21 +192,15 @@ def main():
290
  else:
291
  video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
292
 
293
- # Allow filtering by channel name
294
  channels = sorted(video_df['channel_name'].unique())
295
  selected_channel = st.selectbox("Filter by Channel", ["All"] + channels)
296
 
297
  if selected_channel != "All":
298
  video_df = video_df[video_df['channel_name'] == selected_channel]
299
 
300
- # Display videos and allow selection
301
  st.dataframe(video_df)
302
  selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0])
303
 
304
- # Embedding model selection
305
- embedding_model = st.selectbox("Select embedding model:", ["all-MiniLM-L6-v2", "all-mpnet-base-v2"])
306
-
307
- # Get the index name for the selected video and embedding model
308
  index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id, embedding_model)
309
 
310
  if index_name:
@@ -312,18 +208,17 @@ def main():
312
  else:
313
  st.warning("No index found for the selected video and embedding model. The index will be built when you search.")
314
 
315
- # Process new video section
316
  st.subheader("Process New Video")
317
  input_type = st.radio("Select input type:", ["Video URL", "Channel URL", "YouTube ID"])
318
  input_value = st.text_input("Enter the URL or ID:")
319
 
320
  if st.button("Process"):
321
  with st.spinner("Processing..."):
322
- data_processor.embedding_model = SentenceTransformer(embedding_model)
323
  if input_type == "Video URL":
324
  video_id = extract_video_id(input_value)
325
  if video_id:
326
- index_name = process_single_video(video_id, embedding_model)
327
  if index_name is None:
328
  st.error(f"Failed to process video {video_id}")
329
  else:
@@ -333,7 +228,7 @@ def main():
333
  elif input_type == "Channel URL":
334
  channel_videos = get_channel_videos(input_value)
335
  if channel_videos:
336
- index_names = process_multiple_videos([video['video_id'] for video in channel_videos], embedding_model)
337
  if not index_names:
338
  st.error("Failed to process any videos from the channel")
339
  else:
@@ -341,13 +236,12 @@ def main():
341
  else:
342
  st.error("Failed to retrieve videos from the channel")
343
  else:
344
- index_name = process_single_video(input_value, embedding_model)
345
  if index_name is None:
346
  st.error(f"Failed to process video {input_value}")
347
  else:
348
  st.success(f"Successfully processed video {input_value}")
349
 
350
- # Query section
351
  st.subheader("Query the RAG System")
352
  query = st.text_input("Enter your query:")
353
  rewrite_method = st.radio("Query rewriting method:", ["None", "Chain of Thought", "ReAct"])
@@ -375,10 +269,9 @@ def main():
375
 
376
  search_method_map = {"Hybrid": "hybrid", "Text-only": "text", "Embedding-only": "embedding"}
377
  try:
378
- # Ensure index is built before searching
379
  if not index_name:
380
  st.info("Building index for the selected video...")
381
- index_name = process_single_video(selected_video_id, embedding_model)
382
  if not index_name:
383
  st.error("Failed to build index for the selected video.")
384
  return
@@ -405,66 +298,49 @@ def main():
405
 
406
  with tab2:
407
  st.header("Ground Truth Generation")
408
- use_existing_transcript = st.checkbox("Use existing transcript")
409
 
410
- if use_existing_transcript:
411
- # Get all available videos (assuming all videos have transcripts)
412
- videos = db_handler.get_all_videos()
413
- if not videos:
414
- st.warning("No videos available. Please process some videos first.")
415
- else:
416
- video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
417
-
418
- # Allow filtering by channel name
419
- channels = sorted(video_df['channel_name'].unique())
420
- selected_channel = st.selectbox("Filter by Channel", ["All"] + channels, key="gt_channel_filter")
421
-
422
- if selected_channel != "All":
423
- video_df = video_df[video_df['channel_name'] == selected_channel]
424
-
425
- # Display videos and allow selection
426
- st.dataframe(video_df)
427
- selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(),
428
- format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0],
429
- key="gt_video_select")
430
-
431
- if st.button("Generate Ground Truth from Existing Transcript"):
432
- with st.spinner("Generating ground truth..."):
433
- # Retrieve the transcript content (you'll need to implement this method)
434
- transcript_data = get_transcript(selected_video_id)
435
- if transcript_data and 'transcript' in transcript_data:
436
- full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
437
- ground_truth_df = generate_ground_truth(video_id=selected_video_id, existing_transcript=full_transcript)
438
- if ground_truth_df is not None:
439
- st.dataframe(ground_truth_df)
440
- csv = ground_truth_df.to_csv(index=False)
441
- st.download_button(
442
- label="Download Ground Truth CSV",
443
- data=csv,
444
- file_name=f"ground_truth_{selected_video_id}.csv",
445
- mime="text/csv",
446
- )
447
- else:
448
- st.error("Failed to retrieve transcript content.")
449
  else:
450
- video_id = st.text_input("Enter YouTube Video ID for ground truth generation:")
451
- if st.button("Generate Ground Truth"):
452
- with st.spinner("Generating ground truth..."):
453
- ground_truth_df = generate_ground_truth(video_id=video_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
  if ground_truth_df is not None:
455
  st.dataframe(ground_truth_df)
456
  csv = ground_truth_df.to_csv(index=False)
457
  st.download_button(
458
- label="Download Ground Truth CSV",
459
  data=csv,
460
- file_name=f"ground_truth_{video_id}.csv",
461
  mime="text/csv",
462
  )
463
 
464
  with tab3:
465
  st.header("RAG Evaluation")
466
 
467
- # Load ground truth data
468
  try:
469
  ground_truth_df = pd.read_csv('data/ground-truth-retrieval.csv')
470
  ground_truth_available = True
@@ -480,7 +356,7 @@ def main():
480
 
481
  if st.button("Run Evaluation"):
482
  with st.spinner("Running evaluation..."):
483
- evaluation_results = evaluate_rag(sample_size)
484
  if evaluation_results:
485
  st.write("Evaluation Results:")
486
  st.dataframe(pd.DataFrame(evaluation_results, columns=['Video ID', 'Question', 'Answer', 'Relevance', 'Explanation']))
@@ -488,7 +364,6 @@ def main():
488
  st.warning("No ground truth data available. Please generate ground truth data first.")
489
  st.button("Run Evaluation", disabled=True)
490
 
491
- # Add a section to generate ground truth if it's not available
492
  if not ground_truth_available:
493
  st.subheader("Generate Ground Truth")
494
  st.write("You need to generate ground truth data before running the evaluation.")
@@ -497,4 +372,7 @@ def main():
497
  st.experimental_rerun()
498
 
499
  if __name__ == "__main__":
 
 
 
500
  main()
 
1
  import streamlit as st
2
  import pandas as pd
3
+ from transcript_extractor import get_transcript, get_youtube_client, extract_video_id, get_channel_videos, test_api_key, initialize_youtube_api
4
  from data_processor import DataProcessor
5
  from database import DatabaseHandler
6
  from rag import RAGSystem
7
  from query_rewriter import QueryRewriter
8
  from evaluation import EvaluationSystem
9
+ from generate_ground_truth import generate_ground_truth, generate_ground_truth_for_all_videos
10
  from sentence_transformers import SentenceTransformer
11
  import os
12
+ import sys
 
 
 
13
  import logging
 
14
 
15
+ logging.basicConfig(level=logging.DEBUG)
16
  logger = logging.getLogger(__name__)
17
 
18
  @st.cache_resource
 
30
  st.error(f"Error initializing components: {str(e)}")
31
  st.error("Please check your configuration and ensure all services are running.")
32
  return None, None, None, None, None
 
 
 
 
 
 
 
 
33
 
34
+ def check_api_key():
35
+ if test_api_key():
36
+ st.success("YouTube API key is valid and working.")
37
+ else:
38
+ st.error("YouTube API key is invalid or not set. Please check your .env file.")
39
+ new_api_key = st.text_input("Enter your YouTube API key:")
40
+ if new_api_key:
41
+ os.environ['YOUTUBE_API_KEY'] = new_api_key
42
+ with open('.env', 'a') as f:
43
+ f.write(f"\nYOUTUBE_API_KEY={new_api_key}")
44
+ st.success("API key saved. Reinitializing YouTube client...")
45
+ get_youtube_client.cache_clear() # Clear the cache to force reinitialization
46
+ if test_api_key():
47
+ st.success("YouTube client reinitialized successfully.")
48
+ else:
49
+ st.error("Failed to reinitialize YouTube client. Please check your API key.")
50
+ st.experimental_rerun()
51
 
52
+ # LLM-as-a-judge prompt template
53
+ prompt_template = """
54
+ You are an expert evaluator for a Youtube transcript assistant.
55
+ Your task is to analyze the relevance of the generated answer to the given question.
56
+ Based on the relevance of the generated answer, you will classify it
57
+ as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".
58
 
59
+ Here is the data for evaluation:
60
 
61
+ Question: {question}
62
+ Generated Answer: {answer_llm}
63
 
64
+ Please analyze the content and context of the generated answer in relation to the question
65
+ and provide your evaluation in the following JSON format:
66
 
67
+ {{
68
+ "Relevance": "NON_RELEVANT",
69
+ "Explanation": "Your explanation here"
70
+ }}
71
 
72
+ OR
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
+ {{
75
+ "Relevance": "PARTLY_RELEVANT",
76
+ "Explanation": "Your explanation here"
77
+ }}
 
 
 
 
 
 
 
 
 
78
 
79
+ OR
 
 
 
 
 
 
 
80
 
81
+ {{
82
+ "Relevance": "RELEVANT",
83
+ "Explanation": "Your explanation here"
84
+ }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ Ensure your response is a valid JSON object with these exact keys and one of the three exact values for "Relevance".
87
+ Do not include any text outside of this JSON object.
88
+ """
 
 
89
 
90
+ def process_single_video(db_handler, data_processor, video_id, embedding_model):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
92
  if existing_index:
93
  logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
 
96
  transcript_data = get_transcript(video_id)
97
  if transcript_data is None:
98
  logger.error(f"Failed to retrieve transcript for video {video_id}")
99
+ st.error(f"Failed to retrieve transcript for video {video_id}. Please check if the video ID is correct and the video has captions available.")
100
  return None
101
 
 
102
  video_data = {
103
  'video_id': video_id,
104
  'title': transcript_data['metadata'].get('title', 'Unknown Title'),
 
113
  db_handler.add_video(video_data)
114
  except Exception as e:
115
  logger.error(f"Error adding video to database: {str(e)}")
116
+ st.error(f"Error adding video {video_id} to database: {str(e)}")
117
  return None
118
 
 
119
  try:
120
  data_processor.process_transcript(video_id, transcript_data)
121
  except Exception as e:
122
  logger.error(f"Error processing transcript: {str(e)}")
123
+ st.error(f"Error processing transcript for video {video_id}: {str(e)}")
124
  return None
125
 
 
126
  index_name = f"video_{video_id}_{embedding_model}".lower()
127
  try:
128
  index_name = data_processor.build_index(index_name)
129
  logger.info(f"Successfully built index: {index_name}")
130
  except Exception as e:
131
  logger.error(f"Error building index: {str(e)}")
132
+ st.error(f"Error building index for video {video_id}: {str(e)}")
133
  return None
134
 
 
135
  embedding_model_id = db_handler.add_embedding_model(embedding_model, "Description of the model")
136
 
 
137
  video_db_record = db_handler.get_video_by_youtube_id(video_id)
138
  if video_db_record is None:
139
  logger.error(f"Failed to retrieve video record from database for video {video_id}")
140
+ st.error(f"Failed to retrieve video record from database for video {video_id}")
141
  return None
142
+ video_db_id = video_db_record[0]
143
 
 
144
  db_handler.add_elasticsearch_index(video_db_id, index_name, embedding_model_id)
145
 
146
  logger.info(f"Processed and indexed transcript for video {video_id}")
147
+ st.success(f"Successfully processed and indexed transcript for video {video_id}")
148
  return index_name
149
 
150
+ def process_multiple_videos(db_handler, data_processor, video_ids, embedding_model):
 
151
  indices = []
152
  for video_id in video_ids:
153
+ index = process_single_video(db_handler, data_processor, video_id, embedding_model)
154
  if index:
155
  indices.append(index)
156
  logger.info(f"Processed and indexed transcripts for {len(indices)} videos")
157
  st.success(f"Processed and indexed transcripts for {len(indices)} videos")
158
  return indices
159
 
160
+ def ensure_video_processed(db_handler, data_processor, video_id, embedding_model):
161
+ index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
162
+ if not index_name:
163
+ st.warning(f"Video {video_id} has not been processed yet. Processing now...")
164
+ index_name = process_single_video(db_handler, data_processor, video_id, embedding_model)
165
+ if not index_name:
166
+ st.error(f"Failed to process video {video_id}. Please check the logs for more information.")
167
+ return False
168
+ return True
169
+
170
  def main():
171
  st.title("YouTube Transcript RAG System")
172
 
173
+ check_api_key()
 
 
 
 
 
174
 
175
+ components = init_components()
176
+ if components:
177
+ db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components
178
+ else:
179
+ st.stop()
180
+
181
  tab1, tab2, tab3 = st.tabs(["RAG System", "Ground Truth Generation", "Evaluation"])
182
 
183
  with tab1:
184
  st.header("RAG System")
185
 
186
+ embedding_model = st.selectbox("Select embedding model:", ["multi-qa-MiniLM-L6-cos-v1", "all-mpnet-base-v2"])
187
+
188
  st.subheader("Select a Video")
189
  videos = db_handler.get_all_videos()
190
  if not videos:
 
192
  else:
193
  video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
194
 
 
195
  channels = sorted(video_df['channel_name'].unique())
196
  selected_channel = st.selectbox("Filter by Channel", ["All"] + channels)
197
 
198
  if selected_channel != "All":
199
  video_df = video_df[video_df['channel_name'] == selected_channel]
200
 
 
201
  st.dataframe(video_df)
202
  selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0])
203
 
 
 
 
 
204
  index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id, embedding_model)
205
 
206
  if index_name:
 
208
  else:
209
  st.warning("No index found for the selected video and embedding model. The index will be built when you search.")
210
 
 
211
  st.subheader("Process New Video")
212
  input_type = st.radio("Select input type:", ["Video URL", "Channel URL", "YouTube ID"])
213
  input_value = st.text_input("Enter the URL or ID:")
214
 
215
  if st.button("Process"):
216
  with st.spinner("Processing..."):
217
+ data_processor.set_embedding_model(embedding_model)
218
  if input_type == "Video URL":
219
  video_id = extract_video_id(input_value)
220
  if video_id:
221
+ index_name = process_single_video(db_handler, data_processor, video_id, embedding_model)
222
  if index_name is None:
223
  st.error(f"Failed to process video {video_id}")
224
  else:
 
228
  elif input_type == "Channel URL":
229
  channel_videos = get_channel_videos(input_value)
230
  if channel_videos:
231
+ index_names = process_multiple_videos(db_handler, data_processor, [video['video_id'] for video in channel_videos], embedding_model)
232
  if not index_names:
233
  st.error("Failed to process any videos from the channel")
234
  else:
 
236
  else:
237
  st.error("Failed to retrieve videos from the channel")
238
  else:
239
+ index_name = process_single_video(db_handler, data_processor, input_value, embedding_model)
240
  if index_name is None:
241
  st.error(f"Failed to process video {input_value}")
242
  else:
243
  st.success(f"Successfully processed video {input_value}")
244
 
 
245
  st.subheader("Query the RAG System")
246
  query = st.text_input("Enter your query:")
247
  rewrite_method = st.radio("Query rewriting method:", ["None", "Chain of Thought", "ReAct"])
 
269
 
270
  search_method_map = {"Hybrid": "hybrid", "Text-only": "text", "Embedding-only": "embedding"}
271
  try:
 
272
  if not index_name:
273
  st.info("Building index for the selected video...")
274
+ index_name = process_single_video(db_handler, data_processor, selected_video_id, embedding_model)
275
  if not index_name:
276
  st.error("Failed to build index for the selected video.")
277
  return
 
298
 
299
  with tab2:
300
  st.header("Ground Truth Generation")
 
301
 
302
+ videos = db_handler.get_all_videos()
303
+ if not videos:
304
+ st.warning("No videos available. Please process some videos first.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  else:
306
+ video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
307
+
308
+ st.dataframe(video_df)
309
+ selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(),
310
+ format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0],
311
+ key="gt_video_select")
312
+
313
+ if st.button("Generate Ground Truth for Selected Video"):
314
+ if ensure_video_processed(db_handler, data_processor, selected_video_id, embedding_model):
315
+ with st.spinner("Generating ground truth..."):
316
+ ground_truth_df = generate_ground_truth(db_handler, data_processor, selected_video_id)
317
+ if ground_truth_df is not None:
318
+ st.dataframe(ground_truth_df)
319
+ csv = ground_truth_df.to_csv(index=False)
320
+ st.download_button(
321
+ label="Download Ground Truth CSV",
322
+ data=csv,
323
+ file_name=f"ground_truth_{selected_video_id}.csv",
324
+ mime="text/csv",
325
+ )
326
+ if st.button("Generate Ground Truth for All Videos"):
327
+ with st.spinner("Processing videos and generating ground truth..."):
328
+ for video_id in video_df['youtube_id']:
329
+ ensure_video_processed(db_handler, data_processor, video_id, embedding_model)
330
+ ground_truth_df = generate_ground_truth_for_all_videos(db_handler, data_processor)
331
  if ground_truth_df is not None:
332
  st.dataframe(ground_truth_df)
333
  csv = ground_truth_df.to_csv(index=False)
334
  st.download_button(
335
+ label="Download Ground Truth CSV (All Videos)",
336
  data=csv,
337
+ file_name="ground_truth_all_videos.csv",
338
  mime="text/csv",
339
  )
340
 
341
  with tab3:
342
  st.header("RAG Evaluation")
343
 
 
344
  try:
345
  ground_truth_df = pd.read_csv('data/ground-truth-retrieval.csv')
346
  ground_truth_available = True
 
356
 
357
  if st.button("Run Evaluation"):
358
  with st.spinner("Running evaluation..."):
359
+ evaluation_results = evaluation_system.evaluate_rag(rag_system, 'data/ground-truth-retrieval.csv', sample_size, prompt_template)
360
  if evaluation_results:
361
  st.write("Evaluation Results:")
362
  st.dataframe(pd.DataFrame(evaluation_results, columns=['Video ID', 'Question', 'Answer', 'Relevance', 'Explanation']))
 
364
  st.warning("No ground truth data available. Please generate ground truth data first.")
365
  st.button("Run Evaluation", disabled=True)
366
 
 
367
  if not ground_truth_available:
368
  st.subheader("Generate Ground Truth")
369
  st.write("You need to generate ground truth data before running the evaluation.")
 
372
  st.experimental_rerun()
373
 
374
  if __name__ == "__main__":
375
+ if not initialize_youtube_api():
376
+ logger.error("Failed to initialize YouTube API. Exiting.")
377
+ sys.exit(1)
378
  main()
app/transcript_extractor.py CHANGED
@@ -3,29 +3,51 @@ from dotenv import load_dotenv
3
  from youtube_transcript_api import YouTubeTranscriptApi
4
  from googleapiclient.discovery import build
5
  from googleapiclient.errors import HttpError
 
 
 
6
  import re
 
 
 
 
 
 
 
 
7
 
8
  # Get the directory of the current script
9
  current_dir = os.path.dirname(os.path.abspath(__file__))
10
  # Construct the path to the .env file (one directory up from the current script)
11
  dotenv_path = os.path.join(os.path.dirname(current_dir), '.env')
12
- print("the .env path is :" + dotenv_path)
13
  # Load environment variables from .env file
14
  load_dotenv(dotenv_path)
15
 
16
  # Get API key from environment variable
17
  API_KEY = os.getenv('YOUTUBE_API_KEY')
18
- print("the api key is :" + API_KEY)
 
19
  if not API_KEY:
20
  raise ValueError("YouTube API key not found. Make sure it's set in your .env file in the parent directory of the 'app' folder.")
21
 
22
- print(f"API_KEY: {API_KEY[:5]}...{API_KEY[-5:]}") # Print first and last 5 characters for verification
 
 
 
 
 
 
 
 
23
 
24
- try:
25
- youtube = build('youtube', 'v3', developerKey=API_KEY)
26
- except Exception as e:
27
- print(f"Error initializing YouTube API client: {str(e)}")
28
- raise
 
 
29
 
30
  def extract_video_id(url):
31
  if not url:
@@ -36,16 +58,22 @@ def extract_video_id(url):
36
  return None
37
 
38
  def get_video_metadata(video_id):
 
39
  try:
40
  request = youtube.videos().list(
41
  part="snippet,contentDetails,statistics",
42
  id=video_id
43
  )
44
  response = request.execute()
45
-
46
  if 'items' in response and len(response['items']) > 0:
47
  video = response['items'][0]
48
  snippet = video['snippet']
 
 
 
 
 
 
49
  return {
50
  'title': snippet['title'],
51
  'author': snippet['channelTitle'],
@@ -53,48 +81,24 @@ def get_video_metadata(video_id):
53
  'view_count': video['statistics'].get('viewCount', '0'),
54
  'like_count': video['statistics'].get('likeCount', '0'),
55
  'comment_count': video['statistics'].get('commentCount', '0'),
56
- 'duration': video['contentDetails']['duration']
 
57
  }
58
  else:
59
- print(f"No video found with ID: {video_id}")
60
  return None
61
- except HttpError as e:
62
- print(f"An HTTP error {e.resp.status} occurred: {e.content}")
63
- return None
64
  except Exception as e:
65
- print(f"An error occurred while fetching video metadata: {str(e)}")
66
  return None
67
-
68
- def get_transcript(video_id):
69
- # Get the directory of the current script
70
- current_dir = os.path.dirname(os.path.abspath(__file__))
71
- # Construct the path to the .env file (one directory up from the current script)
72
- dotenv_path = os.path.join(os.path.dirname(current_dir), '.env')
73
- print("the .env path is :" + dotenv_path)
74
- # Load environment variables from .env file
75
- load_dotenv(dotenv_path)
76
-
77
- # Get API key from environment variable
78
- API_KEY = os.getenv('YOUTUBE_API_KEY')
79
- print("the api key is :" + API_KEY)
80
- if not API_KEY:
81
- raise ValueError("YouTube API key not found. Make sure it's set in your .env file in the parent directory of the 'app' folder.")
82
-
83
- print(f"API_KEY: {API_KEY[:5]}...{API_KEY[-5:]}") # Print first and last 5 characters for verification
84
-
85
- try:
86
- youtube = build('youtube', 'v3', developerKey=API_KEY)
87
- except Exception as e:
88
- print(f"Error initializing YouTube API client: {str(e)}")
89
- raise
90
 
 
91
  if not video_id:
92
  return None
93
  try:
94
  transcript = YouTubeTranscriptApi.get_transcript(video_id)
95
  metadata = get_video_metadata(video_id)
96
- print(f"Metadata for video {video_id}: {metadata}")
97
- print(f"Transcript length for video {video_id}: {len(transcript)}")
98
  if not metadata:
99
  return None
100
  return {
@@ -102,13 +106,14 @@ def get_transcript(video_id):
102
  'metadata': metadata
103
  }
104
  except Exception as e:
105
- print(f"Error extracting transcript for video {video_id}: {str(e)}")
106
  return None
107
 
108
  def get_channel_videos(channel_url):
 
109
  channel_id = extract_channel_id(channel_url)
110
  if not channel_id:
111
- print(f"Invalid channel URL: {channel_url}")
112
  return []
113
  try:
114
  request = youtube.search().list(
@@ -129,10 +134,10 @@ def get_channel_videos(channel_url):
129
  })
130
  return videos
131
  except HttpError as e:
132
- print(f"An HTTP error {e.resp.status} occurred: {e.content}")
133
  return []
134
  except Exception as e:
135
- print(f"An error occurred while fetching channel videos: {str(e)}")
136
  return []
137
 
138
  def extract_channel_id(url):
@@ -141,10 +146,25 @@ def extract_channel_id(url):
141
  return channel_id_match.group(1)
142
  return None
143
 
144
- def process_videos(video_ids):
145
- transcripts = {}
146
- for video_id in video_ids:
147
- transcript_data = get_transcript(video_id)
148
- if transcript_data:
149
- transcripts[video_id] = transcript_data
150
- return transcripts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from youtube_transcript_api import YouTubeTranscriptApi
4
  from googleapiclient.discovery import build
5
  from googleapiclient.errors import HttpError
6
+ import google_auth_oauthlib.flow
7
+ import googleapiclient.discovery
8
+ import googleapiclient.errors
9
  import re
10
+ import logging
11
+ import ssl
12
+ import certifi
13
+ import requests
14
+
15
+ # Set up logging
16
+ logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
17
+ logger = logging.getLogger(__name__)
18
 
19
  # Get the directory of the current script
20
  current_dir = os.path.dirname(os.path.abspath(__file__))
21
  # Construct the path to the .env file (one directory up from the current script)
22
  dotenv_path = os.path.join(os.path.dirname(current_dir), '.env')
23
+ logger.info(f"The .env path is: {dotenv_path}")
24
  # Load environment variables from .env file
25
  load_dotenv(dotenv_path)
26
 
27
  # Get API key from environment variable
28
  API_KEY = os.getenv('YOUTUBE_API_KEY')
29
+ logger.info(f"API_KEY: {API_KEY[:5]}...{API_KEY[-5:]}") # Log first and last 5 characters for verification
30
+
31
  if not API_KEY:
32
  raise ValueError("YouTube API key not found. Make sure it's set in your .env file in the parent directory of the 'app' folder.")
33
 
34
+ def get_youtube_client():
35
+ try:
36
+ # Create a custom session with SSL verification
37
+ session = requests.Session()
38
+ session.verify = certifi.where()
39
+
40
+ # Create a custom HTTP object
41
+ http = googleapiclient.http.build_http()
42
+ http.verify = session.verify
43
 
44
+ # Build the YouTube client with the custom HTTP object
45
+ youtube = build('youtube', 'v3', developerKey=API_KEY, http=http)
46
+ logger.info("YouTube API client initialized successfully")
47
+ return youtube
48
+ except Exception as e:
49
+ logger.error(f"Error initializing YouTube API client: {str(e)}")
50
+ raise
51
 
52
  def extract_video_id(url):
53
  if not url:
 
58
  return None
59
 
60
  def get_video_metadata(video_id):
61
+ youtube = get_youtube_client()
62
  try:
63
  request = youtube.videos().list(
64
  part="snippet,contentDetails,statistics",
65
  id=video_id
66
  )
67
  response = request.execute()
 
68
  if 'items' in response and len(response['items']) > 0:
69
  video = response['items'][0]
70
  snippet = video['snippet']
71
+
72
+ # Get the description and set default if it's blank
73
+ description = snippet.get('description', '').strip()
74
+ if not description:
75
+ description = 'Not Available'
76
+
77
  return {
78
  'title': snippet['title'],
79
  'author': snippet['channelTitle'],
 
81
  'view_count': video['statistics'].get('viewCount', '0'),
82
  'like_count': video['statistics'].get('likeCount', '0'),
83
  'comment_count': video['statistics'].get('commentCount', '0'),
84
+ 'duration': video['contentDetails']['duration'],
85
+ 'description': description # Add the description to the metadata
86
  }
87
  else:
88
+ logger.error(f"No video found with id: {video_id}")
89
  return None
 
 
 
90
  except Exception as e:
91
+ logger.error(f"An error occurred while fetching metadata for video {video_id}: {str(e)}")
92
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
+ def get_transcript(video_id):
95
  if not video_id:
96
  return None
97
  try:
98
  transcript = YouTubeTranscriptApi.get_transcript(video_id)
99
  metadata = get_video_metadata(video_id)
100
+ logger.info(f"Metadata for video {video_id}: {metadata}")
101
+ logger.info(f"Transcript length for video {video_id}: {len(transcript)}")
102
  if not metadata:
103
  return None
104
  return {
 
106
  'metadata': metadata
107
  }
108
  except Exception as e:
109
+ logger.error(f"Error extracting transcript for video {video_id}: {str(e)}")
110
  return None
111
 
112
  def get_channel_videos(channel_url):
113
+ youtube = get_youtube_client()
114
  channel_id = extract_channel_id(channel_url)
115
  if not channel_id:
116
+ logger.error(f"Invalid channel URL: {channel_url}")
117
  return []
118
  try:
119
  request = youtube.search().list(
 
134
  })
135
  return videos
136
  except HttpError as e:
137
+ logger.error(f"An HTTP error {e.resp.status} occurred: {e.content}")
138
  return []
139
  except Exception as e:
140
+ logger.error(f"An error occurred while fetching channel videos: {str(e)}")
141
  return []
142
 
143
  def extract_channel_id(url):
 
146
  return channel_id_match.group(1)
147
  return None
148
 
149
+ def test_api_key():
150
+ youtube = get_youtube_client()
151
+ try:
152
+ request = youtube.videos().list(part="snippet", id="dQw4w9WgXcQ")
153
+ response = request.execute()
154
+ if 'items' in response:
155
+ logger.info("API key is valid and working")
156
+ return True
157
+ else:
158
+ logger.error("API request successful but returned unexpected response")
159
+ return False
160
+ except Exception as e:
161
+ logger.error(f"API key test failed: {str(e)}")
162
+ return False
163
+
164
+ def initialize_youtube_api():
165
+ if test_api_key():
166
+ logger.info("YouTube API initialized successfully")
167
+ return True
168
+ else:
169
+ logger.error("Failed to initialize YouTube API")
170
+ return False
data/sqlite.db CHANGED
Binary files a/data/sqlite.db and b/data/sqlite.db differ
 
docker-compose.yaml CHANGED
@@ -20,14 +20,21 @@ services:
20
  volumes:
21
  - ./data:/app/data
22
  - ./config:/app/config
 
23
 
24
  elasticsearch:
25
- image: docker.elastic.co/elasticsearch/elasticsearch:7.14.0
 
26
  environment:
27
  - discovery.type=single-node
28
- - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
29
  ports:
30
  - "9200:9200"
 
 
 
 
 
31
  volumes:
32
  - esdata:/usr/share/elasticsearch/data
33
 
@@ -50,5 +57,6 @@ services:
50
 
51
  volumes:
52
  esdata:
 
53
  grafana-storage:
54
  ollama_data:
 
20
  volumes:
21
  - ./data:/app/data
22
  - ./config:/app/config
23
+ - ./app:/app/app # Add this line to map your local app directory
24
 
25
  elasticsearch:
26
+ image: docker.elastic.co/elasticsearch/elasticsearch:8.9.0
27
+ container_name: elasticsearch
28
  environment:
29
  - discovery.type=single-node
30
+ - xpack.security.enabled=false
31
  ports:
32
  - "9200:9200"
33
+ - "9300:9300"
34
+ deploy:
35
+ resources:
36
+ limits:
37
+ memory: 2G
38
  volumes:
39
  - esdata:/usr/share/elasticsearch/data
40
 
 
57
 
58
  volumes:
59
  esdata:
60
+ driver: local
61
  grafana-storage:
62
  ollama_data:
requirements.txt CHANGED
@@ -12,4 +12,6 @@ ollama
12
  requests
13
  matplotlib
14
  tqdm
15
- python-dotenv
 
 
 
12
  requests
13
  matplotlib
14
  tqdm
15
+ python-dotenv
16
+ certifi
17
+ httplib2
run-docker-compose-windows.ps1 CHANGED
@@ -5,18 +5,35 @@ $envPath = ".\.env"
5
  if (Test-Path $envPath) {
6
  # Read the .env file
7
  $envContent = Get-Content $envPath
8
-
9
  # Parse the environment variables
10
  foreach ($line in $envContent) {
11
  if ($line -match '^([^=]+)=(.*)$') {
12
  $name = $matches[1]
13
  $value = $matches[2]
14
  [Environment]::SetEnvironmentVariable($name, $value, "Process")
 
15
  }
16
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # Run docker-compose
19
- docker-compose up --build
 
20
  }
21
  else {
22
  Write-Error "The .env file was not found at $envPath"
 
5
  if (Test-Path $envPath) {
6
  # Read the .env file
7
  $envContent = Get-Content $envPath
 
8
  # Parse the environment variables
9
  foreach ($line in $envContent) {
10
  if ($line -match '^([^=]+)=(.*)$') {
11
  $name = $matches[1]
12
  $value = $matches[2]
13
  [Environment]::SetEnvironmentVariable($name, $value, "Process")
14
+ Write-Host "Loaded environment variable: $name"
15
  }
16
  }
17
+
18
+ # Stop existing containers
19
+ Write-Host "Stopping existing containers..."
20
+ docker-compose down
21
+
22
+ # Rebuild the container
23
+ Write-Host "Rebuilding Docker containers..."
24
+ docker-compose build --no-cache app
25
+
26
+ # Start the services
27
+ Write-Host "Starting Docker services..."
28
+ docker-compose up -d
29
+
30
+ # Wait for services to be ready
31
+ Write-Host "Waiting for services to start up..."
32
+ Start-Sleep -Seconds 20
33
 
34
+ # Run the Streamlit app
35
+ Write-Host "Starting Streamlit app..."
36
+ docker-compose exec -T app sh -c "cd /app/app && streamlit run main.py"
37
  }
38
  else {
39
  Write-Error "The .env file was not found at $envPath"