Spaces:
Running
Running
third commit
Browse files- app/data_processor.py +174 -88
- app/evaluation.py +73 -7
- app/generate_ground_truth.py +56 -39
- app/main.py +117 -239
- app/transcript_extractor.py +71 -51
- data/sqlite.db +0 -0
- docker-compose.yaml +10 -2
- requirements.txt +3 -1
- run-docker-compose-windows.ps1 +20 -3
app/data_processor.py
CHANGED
@@ -1,15 +1,13 @@
|
|
1 |
-
import logging
|
2 |
from minsearch import Index
|
3 |
from sentence_transformers import SentenceTransformer
|
4 |
import numpy as np
|
5 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
-
import re
|
7 |
from elasticsearch import Elasticsearch
|
8 |
import os
|
9 |
import json
|
10 |
-
|
|
|
11 |
|
12 |
-
logging.basicConfig(level=logging.
|
13 |
logger = logging.getLogger(__name__)
|
14 |
|
15 |
def clean_text(text):
|
@@ -18,13 +16,17 @@ def clean_text(text):
|
|
18 |
return ""
|
19 |
cleaned = re.sub(r'[^\w\s.,!?]', ' ', text)
|
20 |
cleaned = re.sub(r'\s+', ' ', cleaned).strip()
|
21 |
-
logger.
|
|
|
22 |
return cleaned
|
23 |
|
24 |
class DataProcessor:
|
25 |
def __init__(self, text_fields=["content", "title", "description"],
|
26 |
keyword_fields=["video_id", "author", "upload_date"],
|
27 |
-
embedding_model="
|
|
|
|
|
|
|
28 |
self.text_index = Index(text_fields=text_fields, keyword_fields=keyword_fields)
|
29 |
self.embedding_model = SentenceTransformer(embedding_model)
|
30 |
self.documents = []
|
@@ -39,18 +41,29 @@ class DataProcessor:
|
|
39 |
logger.info(f"DataProcessor initialized with Elasticsearch at {elasticsearch_host}:{elasticsearch_port}")
|
40 |
|
41 |
def process_transcript(self, video_id, transcript_data):
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
return None
|
45 |
|
46 |
metadata = transcript_data['metadata']
|
47 |
transcript = transcript_data['transcript']
|
48 |
|
49 |
-
logger.info(f"Processing transcript for video {video_id}")
|
50 |
logger.info(f"Number of transcript segments: {len(transcript)}")
|
51 |
|
52 |
full_transcript = " ".join([segment.get('text', '') for segment in transcript])
|
|
|
|
|
|
|
53 |
cleaned_transcript = clean_text(full_transcript)
|
|
|
|
|
54 |
|
55 |
if not cleaned_transcript:
|
56 |
logger.warning(f"Empty cleaned transcript for video {video_id}")
|
@@ -59,17 +72,25 @@ class DataProcessor:
|
|
59 |
doc = {
|
60 |
"video_id": video_id,
|
61 |
"content": cleaned_transcript,
|
62 |
-
"segment_id": f"{video_id}_full",
|
63 |
"title": clean_text(metadata.get('title', '')),
|
|
|
64 |
"author": metadata.get('author', ''),
|
65 |
"upload_date": metadata.get('upload_date', ''),
|
|
|
66 |
"view_count": metadata.get('view_count', 0),
|
67 |
"like_count": metadata.get('like_count', 0),
|
68 |
"comment_count": metadata.get('comment_count', 0),
|
69 |
"video_duration": metadata.get('duration', '')
|
70 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
self.documents.append(doc)
|
72 |
-
self.
|
|
|
73 |
|
74 |
logger.info(f"Processed transcript for video {video_id}")
|
75 |
return f"video_{video_id}_{self.embedding_model.get_sentence_embedding_dimension()}"
|
@@ -80,28 +101,52 @@ class DataProcessor:
|
|
80 |
return None
|
81 |
|
82 |
logger.info(f"Building index with {len(self.documents)} documents")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
try:
|
84 |
-
|
|
|
85 |
self.index_built = True
|
86 |
logger.info("Text index built successfully")
|
87 |
except Exception as e:
|
88 |
logger.error(f"Error building text index: {str(e)}")
|
89 |
raise
|
90 |
|
91 |
-
self.embeddings = np.array(self.embeddings)
|
92 |
-
|
93 |
try:
|
94 |
if not self.es.indices.exists(index=index_name):
|
95 |
self.es.indices.create(index=index_name, body={
|
96 |
"mappings": {
|
97 |
"properties": {
|
98 |
-
"embedding": {"type": "dense_vector", "dims": self.embeddings
|
99 |
"content": {"type": "text"},
|
100 |
-
"video_id": {"type": "keyword"},
|
101 |
-
"segment_id": {"type": "keyword"},
|
102 |
"title": {"type": "text"},
|
|
|
|
|
103 |
"author": {"type": "keyword"},
|
104 |
"upload_date": {"type": "date"},
|
|
|
105 |
"view_count": {"type": "integer"},
|
106 |
"like_count": {"type": "integer"},
|
107 |
"comment_count": {"type": "integer"},
|
@@ -122,19 +167,71 @@ class DataProcessor:
|
|
122 |
except Exception as e:
|
123 |
logger.error(f"Error building Elasticsearch index: {str(e)}")
|
124 |
raise
|
|
|
|
|
|
|
125 |
|
126 |
-
def
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
def search(self, query, filter_dict={}, boost_dict={}, num_results=10, method='hybrid', index_name=None):
|
140 |
if not index_name:
|
@@ -147,78 +244,67 @@ class DataProcessor:
|
|
147 |
|
148 |
logger.info(f"Performing {method} search for query: {query} in index: {index_name}")
|
149 |
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
|
|
158 |
|
159 |
def text_search(self, query, filter_dict={}, boost_dict={}, num_results=10, index_name=None):
|
160 |
if not index_name:
|
161 |
logger.error("No index name provided for text search.")
|
162 |
raise ValueError("No index name provided for text search.")
|
163 |
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
|
|
|
|
|
|
176 |
|
177 |
def embedding_search(self, query, num_results=10, index_name=None):
|
178 |
if not index_name:
|
179 |
logger.error("No index name provided for embedding search.")
|
180 |
raise ValueError("No index name provided for embedding search.")
|
181 |
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
"
|
186 |
-
|
187 |
-
"
|
188 |
-
|
|
|
|
|
189 |
}
|
190 |
}
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
for i in range(max(len(text_results), len(embedding_results))):
|
205 |
-
if i < len(text_results):
|
206 |
-
combined.append(text_results[i])
|
207 |
-
if i < len(embedding_results):
|
208 |
-
combined.append(embedding_results[i])
|
209 |
-
|
210 |
-
seen = set()
|
211 |
-
deduped = []
|
212 |
-
for doc in combined:
|
213 |
-
if doc['segment_id'] not in seen:
|
214 |
-
seen.add(doc['segment_id'])
|
215 |
-
deduped.append(doc)
|
216 |
-
|
217 |
-
return deduped[:num_results]
|
218 |
-
|
219 |
-
def process_query(self, query):
|
220 |
-
return clean_text(query)
|
221 |
-
|
222 |
def set_embedding_model(self, model_name):
|
223 |
self.embedding_model = SentenceTransformer(model_name)
|
224 |
logger.info(f"Embedding model set to: {model_name}")
|
|
|
|
|
1 |
from minsearch import Index
|
2 |
from sentence_transformers import SentenceTransformer
|
3 |
import numpy as np
|
|
|
|
|
4 |
from elasticsearch import Elasticsearch
|
5 |
import os
|
6 |
import json
|
7 |
+
import logging
|
8 |
+
import re
|
9 |
|
10 |
+
logging.basicConfig(level=logging.DEBUG)
|
11 |
logger = logging.getLogger(__name__)
|
12 |
|
13 |
def clean_text(text):
|
|
|
16 |
return ""
|
17 |
cleaned = re.sub(r'[^\w\s.,!?]', ' ', text)
|
18 |
cleaned = re.sub(r'\s+', ' ', cleaned).strip()
|
19 |
+
logger.debug(f"Original text length: {len(text)}, Cleaned text length: {len(cleaned)}")
|
20 |
+
logger.debug(f"Cleaned text sample: '{cleaned[:100]}...'")
|
21 |
return cleaned
|
22 |
|
23 |
class DataProcessor:
|
24 |
def __init__(self, text_fields=["content", "title", "description"],
|
25 |
keyword_fields=["video_id", "author", "upload_date"],
|
26 |
+
embedding_model="multi-qa-MiniLM-L6-cos-v1"):
|
27 |
+
self.text_fields = text_fields
|
28 |
+
self.keyword_fields = keyword_fields
|
29 |
+
self.all_fields = text_fields + keyword_fields
|
30 |
self.text_index = Index(text_fields=text_fields, keyword_fields=keyword_fields)
|
31 |
self.embedding_model = SentenceTransformer(embedding_model)
|
32 |
self.documents = []
|
|
|
41 |
logger.info(f"DataProcessor initialized with Elasticsearch at {elasticsearch_host}:{elasticsearch_port}")
|
42 |
|
43 |
def process_transcript(self, video_id, transcript_data):
|
44 |
+
logger.info(f"Processing transcript for video {video_id}")
|
45 |
+
|
46 |
+
if not transcript_data:
|
47 |
+
logger.error(f"Transcript data is None for video {video_id}")
|
48 |
+
return None
|
49 |
+
|
50 |
+
if 'metadata' not in transcript_data or 'transcript' not in transcript_data:
|
51 |
+
logger.error(f"Invalid transcript data structure for video {video_id}")
|
52 |
+
logger.debug(f"Transcript data keys: {transcript_data.keys()}")
|
53 |
return None
|
54 |
|
55 |
metadata = transcript_data['metadata']
|
56 |
transcript = transcript_data['transcript']
|
57 |
|
|
|
58 |
logger.info(f"Number of transcript segments: {len(transcript)}")
|
59 |
|
60 |
full_transcript = " ".join([segment.get('text', '') for segment in transcript])
|
61 |
+
logger.debug(f"Full transcript length before cleaning: {len(full_transcript)}")
|
62 |
+
logger.debug(f"Full transcript sample before cleaning: '{full_transcript[:500]}...'")
|
63 |
+
|
64 |
cleaned_transcript = clean_text(full_transcript)
|
65 |
+
logger.debug(f"Cleaned transcript length: {len(cleaned_transcript)}")
|
66 |
+
logger.debug(f"Cleaned transcript sample: '{cleaned_transcript[:500]}...'")
|
67 |
|
68 |
if not cleaned_transcript:
|
69 |
logger.warning(f"Empty cleaned transcript for video {video_id}")
|
|
|
72 |
doc = {
|
73 |
"video_id": video_id,
|
74 |
"content": cleaned_transcript,
|
|
|
75 |
"title": clean_text(metadata.get('title', '')),
|
76 |
+
"description": clean_text(metadata.get('description', 'Not Available')),
|
77 |
"author": metadata.get('author', ''),
|
78 |
"upload_date": metadata.get('upload_date', ''),
|
79 |
+
"segment_id": f"{video_id}_full",
|
80 |
"view_count": metadata.get('view_count', 0),
|
81 |
"like_count": metadata.get('like_count', 0),
|
82 |
"comment_count": metadata.get('comment_count', 0),
|
83 |
"video_duration": metadata.get('duration', '')
|
84 |
}
|
85 |
+
|
86 |
+
logger.debug(f"Document created for video {video_id}")
|
87 |
+
for field in self.all_fields:
|
88 |
+
logger.debug(f"Document {field} length: {len(str(doc.get(field, '')))}")
|
89 |
+
logger.debug(f"Document {field} sample: '{str(doc.get(field, ''))[:100]}...'")
|
90 |
+
|
91 |
self.documents.append(doc)
|
92 |
+
embedding = self.embedding_model.encode(cleaned_transcript + " " + metadata.get('title', ''))
|
93 |
+
self.embeddings.append(embedding)
|
94 |
|
95 |
logger.info(f"Processed transcript for video {video_id}")
|
96 |
return f"video_{video_id}_{self.embedding_model.get_sentence_embedding_dimension()}"
|
|
|
101 |
return None
|
102 |
|
103 |
logger.info(f"Building index with {len(self.documents)} documents")
|
104 |
+
|
105 |
+
# Fields to include in the fit function
|
106 |
+
index_fields = self.text_fields + self.keyword_fields
|
107 |
+
|
108 |
+
# Create a list of dictionaries with only the fields we want to index
|
109 |
+
docs_to_index = []
|
110 |
+
for doc in self.documents:
|
111 |
+
indexed_doc = {field: doc.get(field, '') for field in index_fields}
|
112 |
+
if all(indexed_doc.values()): # Check if all required fields have values
|
113 |
+
docs_to_index.append(indexed_doc)
|
114 |
+
else:
|
115 |
+
missing_fields = [field for field, value in indexed_doc.items() if not value]
|
116 |
+
logger.warning(f"Document with video_id {doc.get('video_id', 'unknown')} is missing values for fields: {missing_fields}")
|
117 |
+
|
118 |
+
if not docs_to_index:
|
119 |
+
logger.error("No valid documents to index")
|
120 |
+
return None
|
121 |
+
|
122 |
+
logger.info(f"Number of valid documents to index: {len(docs_to_index)}")
|
123 |
+
|
124 |
+
# Log the structure of the first document to be indexed
|
125 |
+
logger.debug("Structure of the first document to be indexed:")
|
126 |
+
logger.debug(json.dumps(docs_to_index[0], indent=2))
|
127 |
+
|
128 |
try:
|
129 |
+
logger.info("Fitting text index")
|
130 |
+
self.text_index.fit(docs_to_index)
|
131 |
self.index_built = True
|
132 |
logger.info("Text index built successfully")
|
133 |
except Exception as e:
|
134 |
logger.error(f"Error building text index: {str(e)}")
|
135 |
raise
|
136 |
|
|
|
|
|
137 |
try:
|
138 |
if not self.es.indices.exists(index=index_name):
|
139 |
self.es.indices.create(index=index_name, body={
|
140 |
"mappings": {
|
141 |
"properties": {
|
142 |
+
"embedding": {"type": "dense_vector", "dims": len(self.embeddings[0]), "index": True, "similarity": "cosine"},
|
143 |
"content": {"type": "text"},
|
|
|
|
|
144 |
"title": {"type": "text"},
|
145 |
+
"description": {"type": "text"},
|
146 |
+
"video_id": {"type": "keyword"},
|
147 |
"author": {"type": "keyword"},
|
148 |
"upload_date": {"type": "date"},
|
149 |
+
"segment_id": {"type": "keyword"},
|
150 |
"view_count": {"type": "integer"},
|
151 |
"like_count": {"type": "integer"},
|
152 |
"comment_count": {"type": "integer"},
|
|
|
167 |
except Exception as e:
|
168 |
logger.error(f"Error building Elasticsearch index: {str(e)}")
|
169 |
raise
|
170 |
+
|
171 |
+
def compute_rrf(self, rank, k=60):
|
172 |
+
return 1 / (k + rank)
|
173 |
|
174 |
+
def hybrid_search(self, query, index_name, num_results=5):
|
175 |
+
if not index_name:
|
176 |
+
logger.error("No index name provided for hybrid search.")
|
177 |
+
raise ValueError("No index name provided for hybrid search.")
|
178 |
+
|
179 |
+
vector = self.embedding_model.encode(query)
|
180 |
+
|
181 |
+
knn_query = {
|
182 |
+
"field": "embedding",
|
183 |
+
"query_vector": vector.tolist(),
|
184 |
+
"k": 10,
|
185 |
+
"num_candidates": 100
|
186 |
+
}
|
187 |
+
|
188 |
+
keyword_query = {
|
189 |
+
"multi_match": {
|
190 |
+
"query": query,
|
191 |
+
"fields": self.text_fields
|
192 |
+
}
|
193 |
+
}
|
194 |
+
|
195 |
+
try:
|
196 |
+
knn_results = self.es.search(
|
197 |
+
index=index_name,
|
198 |
+
body={
|
199 |
+
"knn": knn_query,
|
200 |
+
"size": 10
|
201 |
+
}
|
202 |
+
)['hits']['hits']
|
203 |
+
|
204 |
+
keyword_results = self.es.search(
|
205 |
+
index=index_name,
|
206 |
+
body={
|
207 |
+
"query": keyword_query,
|
208 |
+
"size": 10
|
209 |
+
}
|
210 |
+
)['hits']['hits']
|
211 |
+
|
212 |
+
rrf_scores = {}
|
213 |
+
for rank, hit in enumerate(knn_results):
|
214 |
+
doc_id = hit['_id']
|
215 |
+
rrf_scores[doc_id] = self.compute_rrf(rank + 1)
|
216 |
+
|
217 |
+
for rank, hit in enumerate(keyword_results):
|
218 |
+
doc_id = hit['_id']
|
219 |
+
if doc_id in rrf_scores:
|
220 |
+
rrf_scores[doc_id] += self.compute_rrf(rank + 1)
|
221 |
+
else:
|
222 |
+
rrf_scores[doc_id] = self.compute_rrf(rank + 1)
|
223 |
+
|
224 |
+
reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
|
225 |
+
|
226 |
+
final_results = []
|
227 |
+
for doc_id, score in reranked_docs[:num_results]:
|
228 |
+
doc = self.es.get(index=index_name, id=doc_id)
|
229 |
+
final_results.append(doc['_source'])
|
230 |
+
|
231 |
+
return final_results
|
232 |
+
except Exception as e:
|
233 |
+
logger.error(f"Error in hybrid search: {str(e)}")
|
234 |
+
raise
|
235 |
|
236 |
def search(self, query, filter_dict={}, boost_dict={}, num_results=10, method='hybrid', index_name=None):
|
237 |
if not index_name:
|
|
|
244 |
|
245 |
logger.info(f"Performing {method} search for query: {query} in index: {index_name}")
|
246 |
|
247 |
+
try:
|
248 |
+
if method == 'text':
|
249 |
+
return self.text_search(query, filter_dict, boost_dict, num_results, index_name)
|
250 |
+
elif method == 'embedding':
|
251 |
+
return self.embedding_search(query, num_results, index_name)
|
252 |
+
else: # hybrid search
|
253 |
+
return self.hybrid_search(query, index_name, num_results)
|
254 |
+
except Exception as e:
|
255 |
+
logger.error(f"Error in search method {method}: {str(e)}")
|
256 |
+
raise
|
257 |
|
258 |
def text_search(self, query, filter_dict={}, boost_dict={}, num_results=10, index_name=None):
|
259 |
if not index_name:
|
260 |
logger.error("No index name provided for text search.")
|
261 |
raise ValueError("No index name provided for text search.")
|
262 |
|
263 |
+
try:
|
264 |
+
search_body = {
|
265 |
+
"query": {
|
266 |
+
"multi_match": {
|
267 |
+
"query": query,
|
268 |
+
"fields": self.text_fields
|
269 |
+
}
|
270 |
+
},
|
271 |
+
"size": num_results
|
272 |
+
}
|
273 |
+
response = self.es.search(index=index_name, body=search_body)
|
274 |
+
return [hit['_source'] for hit in response['hits']['hits']]
|
275 |
+
except Exception as e:
|
276 |
+
logger.error(f"Error in text search: {str(e)}")
|
277 |
+
raise
|
278 |
|
279 |
def embedding_search(self, query, num_results=10, index_name=None):
|
280 |
if not index_name:
|
281 |
logger.error("No index name provided for embedding search.")
|
282 |
raise ValueError("No index name provided for embedding search.")
|
283 |
|
284 |
+
try:
|
285 |
+
query_vector = self.embedding_model.encode(query).tolist()
|
286 |
+
script_query = {
|
287 |
+
"script_score": {
|
288 |
+
"query": {"match_all": {}},
|
289 |
+
"script": {
|
290 |
+
"source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
|
291 |
+
"params": {"query_vector": query_vector}
|
292 |
+
}
|
293 |
}
|
294 |
}
|
295 |
+
response = self.es.search(
|
296 |
+
index=index_name,
|
297 |
+
body={
|
298 |
+
"size": num_results,
|
299 |
+
"query": script_query,
|
300 |
+
"_source": {"excludes": ["embedding"]}
|
301 |
+
}
|
302 |
+
)
|
303 |
+
return [hit['_source'] for hit in response['hits']['hits']]
|
304 |
+
except Exception as e:
|
305 |
+
logger.error(f"Error in embedding search: {str(e)}")
|
306 |
+
raise
|
307 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
308 |
def set_embedding_model(self, model_name):
|
309 |
self.embedding_model = SentenceTransformer(model_name)
|
310 |
logger.info(f"Embedding model set to: {model_name}")
|
app/evaluation.py
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
from sklearn.metrics.pairwise import cosine_similarity
|
2 |
import numpy as np
|
|
|
|
|
|
|
3 |
|
4 |
class EvaluationSystem:
|
5 |
def __init__(self, data_processor, database_handler):
|
@@ -7,15 +10,15 @@ class EvaluationSystem:
|
|
7 |
self.db_handler = database_handler
|
8 |
|
9 |
def relevance_scoring(self, query, retrieved_docs, top_k=5):
|
10 |
-
query_embedding = self.data_processor.
|
11 |
-
doc_embeddings = [self.data_processor.
|
12 |
|
13 |
similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
|
14 |
return np.mean(sorted(similarities, reverse=True)[:top_k])
|
15 |
|
16 |
def answer_similarity(self, generated_answer, reference_answer):
|
17 |
-
gen_embedding = self.data_processor.
|
18 |
-
ref_embedding = self.data_processor.
|
19 |
return cosine_similarity([gen_embedding], [ref_embedding])[0][0]
|
20 |
|
21 |
def human_evaluation(self, video_id, query):
|
@@ -34,8 +37,8 @@ class EvaluationSystem:
|
|
34 |
human_scores = []
|
35 |
|
36 |
for query, reference in zip(test_queries, reference_answers):
|
37 |
-
retrieved_docs = rag_system.
|
38 |
-
generated_answer = rag_system.query(
|
39 |
|
40 |
relevance_scores.append(self.relevance_scoring(query, retrieved_docs))
|
41 |
similarity_scores.append(self.answer_similarity(generated_answer, reference))
|
@@ -45,4 +48,67 @@ class EvaluationSystem:
|
|
45 |
"avg_relevance_score": np.mean(relevance_scores),
|
46 |
"avg_similarity_score": np.mean(similarity_scores),
|
47 |
"avg_human_score": np.mean(human_scores)
|
48 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from sklearn.metrics.pairwise import cosine_similarity
|
2 |
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import json
|
5 |
+
import ollama
|
6 |
|
7 |
class EvaluationSystem:
|
8 |
def __init__(self, data_processor, database_handler):
|
|
|
10 |
self.db_handler = database_handler
|
11 |
|
12 |
def relevance_scoring(self, query, retrieved_docs, top_k=5):
|
13 |
+
query_embedding = self.data_processor.embedding_model.encode(query)
|
14 |
+
doc_embeddings = [self.data_processor.embedding_model.encode(doc['content']) for doc in retrieved_docs]
|
15 |
|
16 |
similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
|
17 |
return np.mean(sorted(similarities, reverse=True)[:top_k])
|
18 |
|
19 |
def answer_similarity(self, generated_answer, reference_answer):
|
20 |
+
gen_embedding = self.data_processor.embedding_model.encode(generated_answer)
|
21 |
+
ref_embedding = self.data_processor.embedding_model.encode(reference_answer)
|
22 |
return cosine_similarity([gen_embedding], [ref_embedding])[0][0]
|
23 |
|
24 |
def human_evaluation(self, video_id, query):
|
|
|
37 |
human_scores = []
|
38 |
|
39 |
for query, reference in zip(test_queries, reference_answers):
|
40 |
+
retrieved_docs = rag_system.data_processor.search(query, num_results=5, method='hybrid', index_name=index_name)
|
41 |
+
generated_answer, _ = rag_system.query(query, search_method='hybrid', index_name=index_name)
|
42 |
|
43 |
relevance_scores.append(self.relevance_scoring(query, retrieved_docs))
|
44 |
similarity_scores.append(self.answer_similarity(generated_answer, reference))
|
|
|
48 |
"avg_relevance_score": np.mean(relevance_scores),
|
49 |
"avg_similarity_score": np.mean(similarity_scores),
|
50 |
"avg_human_score": np.mean(human_scores)
|
51 |
+
}
|
52 |
+
|
53 |
+
def llm_as_judge(self, question, generated_answer, prompt_template):
|
54 |
+
prompt = prompt_template.format(question=question, answer_llm=generated_answer)
|
55 |
+
|
56 |
+
try:
|
57 |
+
response = ollama.chat(
|
58 |
+
model='phi3.5',
|
59 |
+
messages=[{"role": "user", "content": prompt}]
|
60 |
+
)
|
61 |
+
evaluation = json.loads(response['message']['content'])
|
62 |
+
return evaluation
|
63 |
+
except Exception as e:
|
64 |
+
print(f"Error in LLM evaluation: {str(e)}")
|
65 |
+
return None
|
66 |
+
|
67 |
+
def evaluate_rag(self, rag_system, ground_truth_file, sample_size=200, prompt_template=None):
|
68 |
+
try:
|
69 |
+
ground_truth = pd.read_csv(ground_truth_file)
|
70 |
+
except FileNotFoundError:
|
71 |
+
print("Ground truth file not found. Please generate ground truth data first.")
|
72 |
+
return None
|
73 |
+
|
74 |
+
sample = ground_truth.sample(n=min(sample_size, len(ground_truth)), random_state=1)
|
75 |
+
evaluations = []
|
76 |
+
|
77 |
+
for _, row in sample.iterrows():
|
78 |
+
question = row['question']
|
79 |
+
video_id = row['video_id']
|
80 |
+
|
81 |
+
index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id, "multi-qa-MiniLM-L6-cos-v1")
|
82 |
+
|
83 |
+
if not index_name:
|
84 |
+
print(f"No index found for video {video_id}. Skipping this question.")
|
85 |
+
continue
|
86 |
+
|
87 |
+
try:
|
88 |
+
answer_llm, _ = rag_system.query(question, search_method='hybrid', index_name=index_name)
|
89 |
+
except ValueError as e:
|
90 |
+
print(f"Error querying RAG system: {str(e)}")
|
91 |
+
continue
|
92 |
+
|
93 |
+
if prompt_template:
|
94 |
+
evaluation = self.llm_as_judge(question, answer_llm, prompt_template)
|
95 |
+
if evaluation:
|
96 |
+
evaluations.append((
|
97 |
+
str(video_id),
|
98 |
+
str(question),
|
99 |
+
str(answer_llm),
|
100 |
+
str(evaluation.get('Relevance', 'UNKNOWN')),
|
101 |
+
str(evaluation.get('Explanation', 'No explanation provided'))
|
102 |
+
))
|
103 |
+
else:
|
104 |
+
# Fallback to cosine similarity if no prompt template is provided
|
105 |
+
similarity = self.answer_similarity(answer_llm, row.get('reference_answer', ''))
|
106 |
+
evaluations.append((
|
107 |
+
str(video_id),
|
108 |
+
str(question),
|
109 |
+
str(answer_llm),
|
110 |
+
f"Similarity: {similarity}",
|
111 |
+
"Cosine similarity used for evaluation"
|
112 |
+
))
|
113 |
+
|
114 |
+
return evaluations
|
app/generate_ground_truth.py
CHANGED
@@ -1,27 +1,16 @@
|
|
1 |
-
import os
|
2 |
import pandas as pd
|
3 |
import json
|
4 |
-
from youtube_transcript_api import YouTubeTranscriptApi
|
5 |
from tqdm import tqdm
|
6 |
-
import
|
7 |
-
|
8 |
-
OLLAMA_HOST = os.getenv('OLLAMA_HOST', 'localhost')
|
9 |
-
OLLAMA_PORT = os.getenv('OLLAMA_PORT', '11434')
|
10 |
-
|
11 |
-
def get_transcript(video_id):
|
12 |
-
try:
|
13 |
-
transcript = YouTubeTranscriptApi.get_transcript(video_id)
|
14 |
-
return " ".join([entry['text'] for entry in transcript])
|
15 |
-
except Exception as e:
|
16 |
-
print(f"Error extracting transcript for video {video_id}: {str(e)}")
|
17 |
-
return None
|
18 |
|
19 |
def generate_questions(transcript):
|
20 |
prompt_template = """
|
21 |
You are an AI assistant tasked with generating questions based on a YouTube video transcript.
|
22 |
-
Formulate 10 questions that a user might ask based on the provided transcript.
|
23 |
Make the questions specific to the content of the transcript.
|
24 |
The questions should be complete and not too short. Use as few words as possible from the transcript.
|
|
|
25 |
|
26 |
The transcript:
|
27 |
|
@@ -34,34 +23,62 @@ def generate_questions(transcript):
|
|
34 |
|
35 |
prompt = prompt_template.format(transcript=transcript)
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
else:
|
45 |
-
print(f"
|
46 |
return None
|
47 |
|
48 |
-
|
49 |
-
video_id = "zjkBMFhNj_g"
|
50 |
-
transcript = get_transcript(video_id)
|
51 |
|
52 |
-
if
|
53 |
-
|
54 |
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
else:
|
62 |
-
print("Failed to
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
else:
|
64 |
-
print("Failed to generate
|
65 |
-
|
66 |
-
if __name__ == "__main__":
|
67 |
-
main()
|
|
|
|
|
1 |
import pandas as pd
|
2 |
import json
|
|
|
3 |
from tqdm import tqdm
|
4 |
+
import ollama
|
5 |
+
from transcript_extractor import get_transcript
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
def generate_questions(transcript):
|
8 |
prompt_template = """
|
9 |
You are an AI assistant tasked with generating questions based on a YouTube video transcript.
|
10 |
+
Formulate at least 10 questions that a user might ask based on the provided transcript.
|
11 |
Make the questions specific to the content of the transcript.
|
12 |
The questions should be complete and not too short. Use as few words as possible from the transcript.
|
13 |
+
It is important that the questions are relevant to the content of the transcript and are at least 10 in number.
|
14 |
|
15 |
The transcript:
|
16 |
|
|
|
23 |
|
24 |
prompt = prompt_template.format(transcript=transcript)
|
25 |
|
26 |
+
try:
|
27 |
+
response = ollama.chat(
|
28 |
+
model='phi3.5',
|
29 |
+
messages=[{"role": "user", "content": prompt}]
|
30 |
+
)
|
31 |
+
return json.loads(response['message']['content'])
|
32 |
+
except Exception as e:
|
33 |
+
print(f"Error generating questions: {str(e)}")
|
34 |
+
return None
|
35 |
+
|
36 |
+
def generate_ground_truth(db_handler, data_processor, video_id):
|
37 |
+
transcript_data = get_transcript(video_id)
|
38 |
+
if transcript_data and 'transcript' in transcript_data:
|
39 |
+
full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
|
40 |
+
# Process the transcript
|
41 |
+
data_processor.process_transcript(video_id, transcript_data)
|
42 |
else:
|
43 |
+
print(f"Failed to retrieve transcript for video {video_id}")
|
44 |
return None
|
45 |
|
46 |
+
questions = generate_questions(full_transcript)
|
|
|
|
|
47 |
|
48 |
+
if questions and 'questions' in questions:
|
49 |
+
df = pd.DataFrame([(video_id, q) for q in questions['questions']], columns=['video_id', 'question'])
|
50 |
|
51 |
+
csv_path = 'data/ground-truth-retrieval.csv'
|
52 |
+
df.to_csv(csv_path, index=False)
|
53 |
+
print(f"Ground truth data saved to {csv_path}")
|
54 |
+
return df
|
55 |
+
else:
|
56 |
+
print("Failed to generate questions.")
|
57 |
+
return None
|
58 |
+
|
59 |
+
def generate_ground_truth_for_all_videos(db_handler, data_processor):
|
60 |
+
videos = db_handler.get_all_videos()
|
61 |
+
all_questions = []
|
62 |
+
|
63 |
+
for video in tqdm(videos, desc="Generating ground truth"):
|
64 |
+
video_id = video[0] # Assuming the video ID is the first element in the tuple
|
65 |
+
transcript_data = get_transcript(video_id)
|
66 |
+
if transcript_data and 'transcript' in transcript_data:
|
67 |
+
full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
|
68 |
+
# Process the transcript
|
69 |
+
data_processor.process_transcript(video_id, transcript_data)
|
70 |
+
questions = generate_questions(full_transcript)
|
71 |
+
if questions and 'questions' in questions:
|
72 |
+
all_questions.extend([(video_id, q) for q in questions['questions']])
|
73 |
else:
|
74 |
+
print(f"Failed to retrieve transcript for video {video_id}")
|
75 |
+
|
76 |
+
if all_questions:
|
77 |
+
df = pd.DataFrame(all_questions, columns=['video_id', 'question'])
|
78 |
+
csv_path = 'data/ground-truth-retrieval.csv'
|
79 |
+
df.to_csv(csv_path, index=False)
|
80 |
+
print(f"Ground truth data for all videos saved to {csv_path}")
|
81 |
+
return df
|
82 |
else:
|
83 |
+
print("Failed to generate questions for any video.")
|
84 |
+
return None
|
|
|
|
app/main.py
CHANGED
@@ -1,21 +1,18 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
-
from transcript_extractor import
|
4 |
from data_processor import DataProcessor
|
5 |
from database import DatabaseHandler
|
6 |
from rag import RAGSystem
|
7 |
from query_rewriter import QueryRewriter
|
8 |
from evaluation import EvaluationSystem
|
|
|
9 |
from sentence_transformers import SentenceTransformer
|
10 |
import os
|
11 |
-
import
|
12 |
-
import requests
|
13 |
-
from tqdm import tqdm
|
14 |
-
import sqlite3
|
15 |
import logging
|
16 |
-
import ollama
|
17 |
|
18 |
-
logging.basicConfig(level=logging.
|
19 |
logger = logging.getLogger(__name__)
|
20 |
|
21 |
@st.cache_resource
|
@@ -33,170 +30,64 @@ def init_components():
|
|
33 |
st.error(f"Error initializing components: {str(e)}")
|
34 |
st.error("Please check your configuration and ensure all services are running.")
|
35 |
return None, None, None, None, None
|
36 |
-
|
37 |
-
components = init_components()
|
38 |
-
if components:
|
39 |
-
db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components
|
40 |
-
else:
|
41 |
-
st.stop()
|
42 |
-
|
43 |
-
# Ground Truth Generation
|
44 |
|
45 |
-
def
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
-
|
56 |
|
57 |
-
|
|
|
58 |
|
59 |
-
|
60 |
-
|
61 |
|
62 |
-
|
|
|
|
|
|
|
63 |
|
64 |
-
|
65 |
-
response = ollama.chat(
|
66 |
-
model='phi3.5',
|
67 |
-
messages=[{"role": "user", "content": prompt}]
|
68 |
-
)
|
69 |
-
print("Printing the response from OLLAMA: " + response['message']['content'])
|
70 |
-
return json.loads(response['message']['content'])
|
71 |
-
except Exception as e:
|
72 |
-
logger.error(f"Error generating questions: {str(e)}")
|
73 |
-
return None
|
74 |
-
|
75 |
-
def generate_ground_truth(video_id=None, existing_transcript=None):
|
76 |
-
if video_id is None and existing_transcript is None:
|
77 |
-
st.error("Please provide either a video ID or an existing transcript.")
|
78 |
-
return None
|
79 |
-
|
80 |
-
if video_id:
|
81 |
-
transcript_data = get_transcript(video_id)
|
82 |
-
if transcript_data and 'transcript' in transcript_data:
|
83 |
-
full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
|
84 |
-
else:
|
85 |
-
logger.error("Failed to retrieve transcript for the provided video ID.")
|
86 |
-
st.error("Failed to retrieve transcript for the provided video ID.")
|
87 |
-
return None
|
88 |
-
else:
|
89 |
-
full_transcript = existing_transcript
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
os.makedirs('data', exist_ok=True)
|
97 |
-
df.to_csv('data/ground-truth-retrieval.csv', index=False)
|
98 |
-
st.success("Ground truth data generated and saved to data/ground-truth-retrieval.csv")
|
99 |
-
return df
|
100 |
-
else:
|
101 |
-
logger.error("Failed to generate questions.")
|
102 |
-
st.error("Failed to generate questions.")
|
103 |
-
return None
|
104 |
|
105 |
-
|
106 |
-
def evaluate_rag(sample_size=200):
|
107 |
-
try:
|
108 |
-
ground_truth = pd.read_csv('data/ground-truth-retrieval.csv')
|
109 |
-
except FileNotFoundError:
|
110 |
-
logger.error("Ground truth file not found. Please generate ground truth data first.")
|
111 |
-
st.error("Ground truth file not found. Please generate ground truth data first.")
|
112 |
-
return None
|
113 |
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
You are an expert evaluator for a Youtube transcript assistant.
|
119 |
-
Your task is to analyze the relevance of the generated answer to the given question.
|
120 |
-
Based on the relevance of the generated answer, you will classify it
|
121 |
-
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".
|
122 |
-
|
123 |
-
Here is the data for evaluation:
|
124 |
-
|
125 |
-
Question: {question}
|
126 |
-
Generated Answer: {answer_llm}
|
127 |
-
|
128 |
-
Please analyze the content and context of the generated answer in relation to the question
|
129 |
-
and provide your evaluation in parsable JSON without using code blocks:
|
130 |
-
|
131 |
-
{{
|
132 |
-
"Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
|
133 |
-
"Explanation": "[Provide a brief explanation for your evaluation]"
|
134 |
-
}}
|
135 |
-
""".strip()
|
136 |
-
|
137 |
-
progress_bar = st.progress(0)
|
138 |
-
for i, (_, row) in enumerate(sample.iterrows()):
|
139 |
-
question = row['question']
|
140 |
-
video_id = row['video_id']
|
141 |
-
|
142 |
-
# Get the index name for the video (you might need to adjust this based on your setup)
|
143 |
-
index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id, "all-MiniLM-L6-v2") # Assuming you're using this embedding model
|
144 |
-
|
145 |
-
if not index_name:
|
146 |
-
logger.warning(f"No index found for video {video_id}. Skipping this question.")
|
147 |
-
continue
|
148 |
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
logger.error(f"Error querying RAG system: {str(e)}")
|
153 |
-
continue
|
154 |
|
155 |
-
|
156 |
-
try:
|
157 |
-
response = ollama.chat(
|
158 |
-
model='phi3.5',
|
159 |
-
messages=[{"role": "user", "content": prompt}]
|
160 |
-
)
|
161 |
-
evaluation_json = json.loads(response['message']['content'])
|
162 |
-
evaluations.append((
|
163 |
-
str(video_id),
|
164 |
-
str(question),
|
165 |
-
str(answer_llm),
|
166 |
-
str(evaluation_json.get('Relevance', 'UNKNOWN')),
|
167 |
-
str(evaluation_json.get('Explanation', 'No explanation provided'))
|
168 |
-
))
|
169 |
-
except Exception as e:
|
170 |
-
logger.warning(f"Failed to evaluate question: {question}. Error: {str(e)}")
|
171 |
-
st.warning(f"Failed to evaluate question: {question}")
|
172 |
-
progress_bar.progress((i + 1) / len(sample))
|
173 |
-
|
174 |
-
# Store RAG evaluations in the database
|
175 |
-
conn = sqlite3.connect('data/sqlite.db')
|
176 |
-
cursor = conn.cursor()
|
177 |
-
cursor.execute('''
|
178 |
-
CREATE TABLE IF NOT EXISTS rag_evaluations (
|
179 |
-
video_id TEXT,
|
180 |
-
question TEXT,
|
181 |
-
answer TEXT,
|
182 |
-
relevance TEXT,
|
183 |
-
explanation TEXT
|
184 |
-
)
|
185 |
-
''')
|
186 |
-
cursor.executemany('''
|
187 |
-
INSERT INTO rag_evaluations (video_id, question, answer, relevance, explanation)
|
188 |
-
VALUES (?, ?, ?, ?, ?)
|
189 |
-
''', evaluations)
|
190 |
-
conn.commit()
|
191 |
-
conn.close()
|
192 |
-
|
193 |
-
logger.info("Evaluation complete. Results stored in the database.")
|
194 |
-
st.success("Evaluation complete. Results stored in the database.")
|
195 |
-
return evaluations
|
196 |
-
|
197 |
-
@st.cache_data
|
198 |
-
def process_single_video(video_id, embedding_model):
|
199 |
-
# Check if the video has already been processed with the current embedding model
|
200 |
existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
|
201 |
if existing_index:
|
202 |
logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
|
@@ -205,9 +96,9 @@ def process_single_video(video_id, embedding_model):
|
|
205 |
transcript_data = get_transcript(video_id)
|
206 |
if transcript_data is None:
|
207 |
logger.error(f"Failed to retrieve transcript for video {video_id}")
|
|
|
208 |
return None
|
209 |
|
210 |
-
# Store video metadata in the database
|
211 |
video_data = {
|
212 |
'video_id': video_id,
|
213 |
'title': transcript_data['metadata'].get('title', 'Unknown Title'),
|
@@ -222,67 +113,78 @@ def process_single_video(video_id, embedding_model):
|
|
222 |
db_handler.add_video(video_data)
|
223 |
except Exception as e:
|
224 |
logger.error(f"Error adding video to database: {str(e)}")
|
|
|
225 |
return None
|
226 |
|
227 |
-
# Process transcript for RAG system
|
228 |
try:
|
229 |
data_processor.process_transcript(video_id, transcript_data)
|
230 |
except Exception as e:
|
231 |
logger.error(f"Error processing transcript: {str(e)}")
|
|
|
232 |
return None
|
233 |
|
234 |
-
# Create Elasticsearch index
|
235 |
index_name = f"video_{video_id}_{embedding_model}".lower()
|
236 |
try:
|
237 |
index_name = data_processor.build_index(index_name)
|
238 |
logger.info(f"Successfully built index: {index_name}")
|
239 |
except Exception as e:
|
240 |
logger.error(f"Error building index: {str(e)}")
|
|
|
241 |
return None
|
242 |
|
243 |
-
# Add embedding model to the database
|
244 |
embedding_model_id = db_handler.add_embedding_model(embedding_model, "Description of the model")
|
245 |
|
246 |
-
# Get the video ID from the database
|
247 |
video_db_record = db_handler.get_video_by_youtube_id(video_id)
|
248 |
if video_db_record is None:
|
249 |
logger.error(f"Failed to retrieve video record from database for video {video_id}")
|
|
|
250 |
return None
|
251 |
-
video_db_id = video_db_record[0]
|
252 |
|
253 |
-
# Store Elasticsearch index information
|
254 |
db_handler.add_elasticsearch_index(video_db_id, index_name, embedding_model_id)
|
255 |
|
256 |
logger.info(f"Processed and indexed transcript for video {video_id}")
|
|
|
257 |
return index_name
|
258 |
|
259 |
-
|
260 |
-
def process_multiple_videos(video_ids, embedding_model):
|
261 |
indices = []
|
262 |
for video_id in video_ids:
|
263 |
-
index = process_single_video(video_id, embedding_model)
|
264 |
if index:
|
265 |
indices.append(index)
|
266 |
logger.info(f"Processed and indexed transcripts for {len(indices)} videos")
|
267 |
st.success(f"Processed and indexed transcripts for {len(indices)} videos")
|
268 |
return indices
|
269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
270 |
def main():
|
271 |
st.title("YouTube Transcript RAG System")
|
272 |
|
273 |
-
|
274 |
-
if not all(components):
|
275 |
-
st.error("Failed to initialize one or more components. Please check the logs and your configuration.")
|
276 |
-
return
|
277 |
-
|
278 |
-
db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components
|
279 |
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
tab1, tab2, tab3 = st.tabs(["RAG System", "Ground Truth Generation", "Evaluation"])
|
281 |
|
282 |
with tab1:
|
283 |
st.header("RAG System")
|
284 |
|
285 |
-
|
|
|
286 |
st.subheader("Select a Video")
|
287 |
videos = db_handler.get_all_videos()
|
288 |
if not videos:
|
@@ -290,21 +192,15 @@ def main():
|
|
290 |
else:
|
291 |
video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
|
292 |
|
293 |
-
# Allow filtering by channel name
|
294 |
channels = sorted(video_df['channel_name'].unique())
|
295 |
selected_channel = st.selectbox("Filter by Channel", ["All"] + channels)
|
296 |
|
297 |
if selected_channel != "All":
|
298 |
video_df = video_df[video_df['channel_name'] == selected_channel]
|
299 |
|
300 |
-
# Display videos and allow selection
|
301 |
st.dataframe(video_df)
|
302 |
selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0])
|
303 |
|
304 |
-
# Embedding model selection
|
305 |
-
embedding_model = st.selectbox("Select embedding model:", ["all-MiniLM-L6-v2", "all-mpnet-base-v2"])
|
306 |
-
|
307 |
-
# Get the index name for the selected video and embedding model
|
308 |
index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id, embedding_model)
|
309 |
|
310 |
if index_name:
|
@@ -312,18 +208,17 @@ def main():
|
|
312 |
else:
|
313 |
st.warning("No index found for the selected video and embedding model. The index will be built when you search.")
|
314 |
|
315 |
-
# Process new video section
|
316 |
st.subheader("Process New Video")
|
317 |
input_type = st.radio("Select input type:", ["Video URL", "Channel URL", "YouTube ID"])
|
318 |
input_value = st.text_input("Enter the URL or ID:")
|
319 |
|
320 |
if st.button("Process"):
|
321 |
with st.spinner("Processing..."):
|
322 |
-
data_processor.
|
323 |
if input_type == "Video URL":
|
324 |
video_id = extract_video_id(input_value)
|
325 |
if video_id:
|
326 |
-
index_name = process_single_video(video_id, embedding_model)
|
327 |
if index_name is None:
|
328 |
st.error(f"Failed to process video {video_id}")
|
329 |
else:
|
@@ -333,7 +228,7 @@ def main():
|
|
333 |
elif input_type == "Channel URL":
|
334 |
channel_videos = get_channel_videos(input_value)
|
335 |
if channel_videos:
|
336 |
-
index_names = process_multiple_videos([video['video_id'] for video in channel_videos], embedding_model)
|
337 |
if not index_names:
|
338 |
st.error("Failed to process any videos from the channel")
|
339 |
else:
|
@@ -341,13 +236,12 @@ def main():
|
|
341 |
else:
|
342 |
st.error("Failed to retrieve videos from the channel")
|
343 |
else:
|
344 |
-
index_name = process_single_video(input_value, embedding_model)
|
345 |
if index_name is None:
|
346 |
st.error(f"Failed to process video {input_value}")
|
347 |
else:
|
348 |
st.success(f"Successfully processed video {input_value}")
|
349 |
|
350 |
-
# Query section
|
351 |
st.subheader("Query the RAG System")
|
352 |
query = st.text_input("Enter your query:")
|
353 |
rewrite_method = st.radio("Query rewriting method:", ["None", "Chain of Thought", "ReAct"])
|
@@ -375,10 +269,9 @@ def main():
|
|
375 |
|
376 |
search_method_map = {"Hybrid": "hybrid", "Text-only": "text", "Embedding-only": "embedding"}
|
377 |
try:
|
378 |
-
# Ensure index is built before searching
|
379 |
if not index_name:
|
380 |
st.info("Building index for the selected video...")
|
381 |
-
index_name = process_single_video(selected_video_id, embedding_model)
|
382 |
if not index_name:
|
383 |
st.error("Failed to build index for the selected video.")
|
384 |
return
|
@@ -405,66 +298,49 @@ def main():
|
|
405 |
|
406 |
with tab2:
|
407 |
st.header("Ground Truth Generation")
|
408 |
-
use_existing_transcript = st.checkbox("Use existing transcript")
|
409 |
|
410 |
-
|
411 |
-
|
412 |
-
videos
|
413 |
-
if not videos:
|
414 |
-
st.warning("No videos available. Please process some videos first.")
|
415 |
-
else:
|
416 |
-
video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
|
417 |
-
|
418 |
-
# Allow filtering by channel name
|
419 |
-
channels = sorted(video_df['channel_name'].unique())
|
420 |
-
selected_channel = st.selectbox("Filter by Channel", ["All"] + channels, key="gt_channel_filter")
|
421 |
-
|
422 |
-
if selected_channel != "All":
|
423 |
-
video_df = video_df[video_df['channel_name'] == selected_channel]
|
424 |
-
|
425 |
-
# Display videos and allow selection
|
426 |
-
st.dataframe(video_df)
|
427 |
-
selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(),
|
428 |
-
format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0],
|
429 |
-
key="gt_video_select")
|
430 |
-
|
431 |
-
if st.button("Generate Ground Truth from Existing Transcript"):
|
432 |
-
with st.spinner("Generating ground truth..."):
|
433 |
-
# Retrieve the transcript content (you'll need to implement this method)
|
434 |
-
transcript_data = get_transcript(selected_video_id)
|
435 |
-
if transcript_data and 'transcript' in transcript_data:
|
436 |
-
full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
|
437 |
-
ground_truth_df = generate_ground_truth(video_id=selected_video_id, existing_transcript=full_transcript)
|
438 |
-
if ground_truth_df is not None:
|
439 |
-
st.dataframe(ground_truth_df)
|
440 |
-
csv = ground_truth_df.to_csv(index=False)
|
441 |
-
st.download_button(
|
442 |
-
label="Download Ground Truth CSV",
|
443 |
-
data=csv,
|
444 |
-
file_name=f"ground_truth_{selected_video_id}.csv",
|
445 |
-
mime="text/csv",
|
446 |
-
)
|
447 |
-
else:
|
448 |
-
st.error("Failed to retrieve transcript content.")
|
449 |
else:
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
454 |
if ground_truth_df is not None:
|
455 |
st.dataframe(ground_truth_df)
|
456 |
csv = ground_truth_df.to_csv(index=False)
|
457 |
st.download_button(
|
458 |
-
label="Download Ground Truth CSV",
|
459 |
data=csv,
|
460 |
-
file_name=
|
461 |
mime="text/csv",
|
462 |
)
|
463 |
|
464 |
with tab3:
|
465 |
st.header("RAG Evaluation")
|
466 |
|
467 |
-
# Load ground truth data
|
468 |
try:
|
469 |
ground_truth_df = pd.read_csv('data/ground-truth-retrieval.csv')
|
470 |
ground_truth_available = True
|
@@ -480,7 +356,7 @@ def main():
|
|
480 |
|
481 |
if st.button("Run Evaluation"):
|
482 |
with st.spinner("Running evaluation..."):
|
483 |
-
evaluation_results = evaluate_rag(sample_size)
|
484 |
if evaluation_results:
|
485 |
st.write("Evaluation Results:")
|
486 |
st.dataframe(pd.DataFrame(evaluation_results, columns=['Video ID', 'Question', 'Answer', 'Relevance', 'Explanation']))
|
@@ -488,7 +364,6 @@ def main():
|
|
488 |
st.warning("No ground truth data available. Please generate ground truth data first.")
|
489 |
st.button("Run Evaluation", disabled=True)
|
490 |
|
491 |
-
# Add a section to generate ground truth if it's not available
|
492 |
if not ground_truth_available:
|
493 |
st.subheader("Generate Ground Truth")
|
494 |
st.write("You need to generate ground truth data before running the evaluation.")
|
@@ -497,4 +372,7 @@ def main():
|
|
497 |
st.experimental_rerun()
|
498 |
|
499 |
if __name__ == "__main__":
|
|
|
|
|
|
|
500 |
main()
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
+
from transcript_extractor import get_transcript, get_youtube_client, extract_video_id, get_channel_videos, test_api_key, initialize_youtube_api
|
4 |
from data_processor import DataProcessor
|
5 |
from database import DatabaseHandler
|
6 |
from rag import RAGSystem
|
7 |
from query_rewriter import QueryRewriter
|
8 |
from evaluation import EvaluationSystem
|
9 |
+
from generate_ground_truth import generate_ground_truth, generate_ground_truth_for_all_videos
|
10 |
from sentence_transformers import SentenceTransformer
|
11 |
import os
|
12 |
+
import sys
|
|
|
|
|
|
|
13 |
import logging
|
|
|
14 |
|
15 |
+
logging.basicConfig(level=logging.DEBUG)
|
16 |
logger = logging.getLogger(__name__)
|
17 |
|
18 |
@st.cache_resource
|
|
|
30 |
st.error(f"Error initializing components: {str(e)}")
|
31 |
st.error("Please check your configuration and ensure all services are running.")
|
32 |
return None, None, None, None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
+
def check_api_key():
|
35 |
+
if test_api_key():
|
36 |
+
st.success("YouTube API key is valid and working.")
|
37 |
+
else:
|
38 |
+
st.error("YouTube API key is invalid or not set. Please check your .env file.")
|
39 |
+
new_api_key = st.text_input("Enter your YouTube API key:")
|
40 |
+
if new_api_key:
|
41 |
+
os.environ['YOUTUBE_API_KEY'] = new_api_key
|
42 |
+
with open('.env', 'a') as f:
|
43 |
+
f.write(f"\nYOUTUBE_API_KEY={new_api_key}")
|
44 |
+
st.success("API key saved. Reinitializing YouTube client...")
|
45 |
+
get_youtube_client.cache_clear() # Clear the cache to force reinitialization
|
46 |
+
if test_api_key():
|
47 |
+
st.success("YouTube client reinitialized successfully.")
|
48 |
+
else:
|
49 |
+
st.error("Failed to reinitialize YouTube client. Please check your API key.")
|
50 |
+
st.experimental_rerun()
|
51 |
|
52 |
+
# LLM-as-a-judge prompt template
|
53 |
+
prompt_template = """
|
54 |
+
You are an expert evaluator for a Youtube transcript assistant.
|
55 |
+
Your task is to analyze the relevance of the generated answer to the given question.
|
56 |
+
Based on the relevance of the generated answer, you will classify it
|
57 |
+
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".
|
58 |
|
59 |
+
Here is the data for evaluation:
|
60 |
|
61 |
+
Question: {question}
|
62 |
+
Generated Answer: {answer_llm}
|
63 |
|
64 |
+
Please analyze the content and context of the generated answer in relation to the question
|
65 |
+
and provide your evaluation in the following JSON format:
|
66 |
|
67 |
+
{{
|
68 |
+
"Relevance": "NON_RELEVANT",
|
69 |
+
"Explanation": "Your explanation here"
|
70 |
+
}}
|
71 |
|
72 |
+
OR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
+
{{
|
75 |
+
"Relevance": "PARTLY_RELEVANT",
|
76 |
+
"Explanation": "Your explanation here"
|
77 |
+
}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
+
OR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
+
{{
|
82 |
+
"Relevance": "RELEVANT",
|
83 |
+
"Explanation": "Your explanation here"
|
84 |
+
}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
+
Ensure your response is a valid JSON object with these exact keys and one of the three exact values for "Relevance".
|
87 |
+
Do not include any text outside of this JSON object.
|
88 |
+
"""
|
|
|
|
|
89 |
|
90 |
+
def process_single_video(db_handler, data_processor, video_id, embedding_model):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
|
92 |
if existing_index:
|
93 |
logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
|
|
|
96 |
transcript_data = get_transcript(video_id)
|
97 |
if transcript_data is None:
|
98 |
logger.error(f"Failed to retrieve transcript for video {video_id}")
|
99 |
+
st.error(f"Failed to retrieve transcript for video {video_id}. Please check if the video ID is correct and the video has captions available.")
|
100 |
return None
|
101 |
|
|
|
102 |
video_data = {
|
103 |
'video_id': video_id,
|
104 |
'title': transcript_data['metadata'].get('title', 'Unknown Title'),
|
|
|
113 |
db_handler.add_video(video_data)
|
114 |
except Exception as e:
|
115 |
logger.error(f"Error adding video to database: {str(e)}")
|
116 |
+
st.error(f"Error adding video {video_id} to database: {str(e)}")
|
117 |
return None
|
118 |
|
|
|
119 |
try:
|
120 |
data_processor.process_transcript(video_id, transcript_data)
|
121 |
except Exception as e:
|
122 |
logger.error(f"Error processing transcript: {str(e)}")
|
123 |
+
st.error(f"Error processing transcript for video {video_id}: {str(e)}")
|
124 |
return None
|
125 |
|
|
|
126 |
index_name = f"video_{video_id}_{embedding_model}".lower()
|
127 |
try:
|
128 |
index_name = data_processor.build_index(index_name)
|
129 |
logger.info(f"Successfully built index: {index_name}")
|
130 |
except Exception as e:
|
131 |
logger.error(f"Error building index: {str(e)}")
|
132 |
+
st.error(f"Error building index for video {video_id}: {str(e)}")
|
133 |
return None
|
134 |
|
|
|
135 |
embedding_model_id = db_handler.add_embedding_model(embedding_model, "Description of the model")
|
136 |
|
|
|
137 |
video_db_record = db_handler.get_video_by_youtube_id(video_id)
|
138 |
if video_db_record is None:
|
139 |
logger.error(f"Failed to retrieve video record from database for video {video_id}")
|
140 |
+
st.error(f"Failed to retrieve video record from database for video {video_id}")
|
141 |
return None
|
142 |
+
video_db_id = video_db_record[0]
|
143 |
|
|
|
144 |
db_handler.add_elasticsearch_index(video_db_id, index_name, embedding_model_id)
|
145 |
|
146 |
logger.info(f"Processed and indexed transcript for video {video_id}")
|
147 |
+
st.success(f"Successfully processed and indexed transcript for video {video_id}")
|
148 |
return index_name
|
149 |
|
150 |
+
def process_multiple_videos(db_handler, data_processor, video_ids, embedding_model):
|
|
|
151 |
indices = []
|
152 |
for video_id in video_ids:
|
153 |
+
index = process_single_video(db_handler, data_processor, video_id, embedding_model)
|
154 |
if index:
|
155 |
indices.append(index)
|
156 |
logger.info(f"Processed and indexed transcripts for {len(indices)} videos")
|
157 |
st.success(f"Processed and indexed transcripts for {len(indices)} videos")
|
158 |
return indices
|
159 |
|
160 |
+
def ensure_video_processed(db_handler, data_processor, video_id, embedding_model):
|
161 |
+
index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
|
162 |
+
if not index_name:
|
163 |
+
st.warning(f"Video {video_id} has not been processed yet. Processing now...")
|
164 |
+
index_name = process_single_video(db_handler, data_processor, video_id, embedding_model)
|
165 |
+
if not index_name:
|
166 |
+
st.error(f"Failed to process video {video_id}. Please check the logs for more information.")
|
167 |
+
return False
|
168 |
+
return True
|
169 |
+
|
170 |
def main():
|
171 |
st.title("YouTube Transcript RAG System")
|
172 |
|
173 |
+
check_api_key()
|
|
|
|
|
|
|
|
|
|
|
174 |
|
175 |
+
components = init_components()
|
176 |
+
if components:
|
177 |
+
db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components
|
178 |
+
else:
|
179 |
+
st.stop()
|
180 |
+
|
181 |
tab1, tab2, tab3 = st.tabs(["RAG System", "Ground Truth Generation", "Evaluation"])
|
182 |
|
183 |
with tab1:
|
184 |
st.header("RAG System")
|
185 |
|
186 |
+
embedding_model = st.selectbox("Select embedding model:", ["multi-qa-MiniLM-L6-cos-v1", "all-mpnet-base-v2"])
|
187 |
+
|
188 |
st.subheader("Select a Video")
|
189 |
videos = db_handler.get_all_videos()
|
190 |
if not videos:
|
|
|
192 |
else:
|
193 |
video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
|
194 |
|
|
|
195 |
channels = sorted(video_df['channel_name'].unique())
|
196 |
selected_channel = st.selectbox("Filter by Channel", ["All"] + channels)
|
197 |
|
198 |
if selected_channel != "All":
|
199 |
video_df = video_df[video_df['channel_name'] == selected_channel]
|
200 |
|
|
|
201 |
st.dataframe(video_df)
|
202 |
selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0])
|
203 |
|
|
|
|
|
|
|
|
|
204 |
index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id, embedding_model)
|
205 |
|
206 |
if index_name:
|
|
|
208 |
else:
|
209 |
st.warning("No index found for the selected video and embedding model. The index will be built when you search.")
|
210 |
|
|
|
211 |
st.subheader("Process New Video")
|
212 |
input_type = st.radio("Select input type:", ["Video URL", "Channel URL", "YouTube ID"])
|
213 |
input_value = st.text_input("Enter the URL or ID:")
|
214 |
|
215 |
if st.button("Process"):
|
216 |
with st.spinner("Processing..."):
|
217 |
+
data_processor.set_embedding_model(embedding_model)
|
218 |
if input_type == "Video URL":
|
219 |
video_id = extract_video_id(input_value)
|
220 |
if video_id:
|
221 |
+
index_name = process_single_video(db_handler, data_processor, video_id, embedding_model)
|
222 |
if index_name is None:
|
223 |
st.error(f"Failed to process video {video_id}")
|
224 |
else:
|
|
|
228 |
elif input_type == "Channel URL":
|
229 |
channel_videos = get_channel_videos(input_value)
|
230 |
if channel_videos:
|
231 |
+
index_names = process_multiple_videos(db_handler, data_processor, [video['video_id'] for video in channel_videos], embedding_model)
|
232 |
if not index_names:
|
233 |
st.error("Failed to process any videos from the channel")
|
234 |
else:
|
|
|
236 |
else:
|
237 |
st.error("Failed to retrieve videos from the channel")
|
238 |
else:
|
239 |
+
index_name = process_single_video(db_handler, data_processor, input_value, embedding_model)
|
240 |
if index_name is None:
|
241 |
st.error(f"Failed to process video {input_value}")
|
242 |
else:
|
243 |
st.success(f"Successfully processed video {input_value}")
|
244 |
|
|
|
245 |
st.subheader("Query the RAG System")
|
246 |
query = st.text_input("Enter your query:")
|
247 |
rewrite_method = st.radio("Query rewriting method:", ["None", "Chain of Thought", "ReAct"])
|
|
|
269 |
|
270 |
search_method_map = {"Hybrid": "hybrid", "Text-only": "text", "Embedding-only": "embedding"}
|
271 |
try:
|
|
|
272 |
if not index_name:
|
273 |
st.info("Building index for the selected video...")
|
274 |
+
index_name = process_single_video(db_handler, data_processor, selected_video_id, embedding_model)
|
275 |
if not index_name:
|
276 |
st.error("Failed to build index for the selected video.")
|
277 |
return
|
|
|
298 |
|
299 |
with tab2:
|
300 |
st.header("Ground Truth Generation")
|
|
|
301 |
|
302 |
+
videos = db_handler.get_all_videos()
|
303 |
+
if not videos:
|
304 |
+
st.warning("No videos available. Please process some videos first.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
else:
|
306 |
+
video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
|
307 |
+
|
308 |
+
st.dataframe(video_df)
|
309 |
+
selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(),
|
310 |
+
format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0],
|
311 |
+
key="gt_video_select")
|
312 |
+
|
313 |
+
if st.button("Generate Ground Truth for Selected Video"):
|
314 |
+
if ensure_video_processed(db_handler, data_processor, selected_video_id, embedding_model):
|
315 |
+
with st.spinner("Generating ground truth..."):
|
316 |
+
ground_truth_df = generate_ground_truth(db_handler, data_processor, selected_video_id)
|
317 |
+
if ground_truth_df is not None:
|
318 |
+
st.dataframe(ground_truth_df)
|
319 |
+
csv = ground_truth_df.to_csv(index=False)
|
320 |
+
st.download_button(
|
321 |
+
label="Download Ground Truth CSV",
|
322 |
+
data=csv,
|
323 |
+
file_name=f"ground_truth_{selected_video_id}.csv",
|
324 |
+
mime="text/csv",
|
325 |
+
)
|
326 |
+
if st.button("Generate Ground Truth for All Videos"):
|
327 |
+
with st.spinner("Processing videos and generating ground truth..."):
|
328 |
+
for video_id in video_df['youtube_id']:
|
329 |
+
ensure_video_processed(db_handler, data_processor, video_id, embedding_model)
|
330 |
+
ground_truth_df = generate_ground_truth_for_all_videos(db_handler, data_processor)
|
331 |
if ground_truth_df is not None:
|
332 |
st.dataframe(ground_truth_df)
|
333 |
csv = ground_truth_df.to_csv(index=False)
|
334 |
st.download_button(
|
335 |
+
label="Download Ground Truth CSV (All Videos)",
|
336 |
data=csv,
|
337 |
+
file_name="ground_truth_all_videos.csv",
|
338 |
mime="text/csv",
|
339 |
)
|
340 |
|
341 |
with tab3:
|
342 |
st.header("RAG Evaluation")
|
343 |
|
|
|
344 |
try:
|
345 |
ground_truth_df = pd.read_csv('data/ground-truth-retrieval.csv')
|
346 |
ground_truth_available = True
|
|
|
356 |
|
357 |
if st.button("Run Evaluation"):
|
358 |
with st.spinner("Running evaluation..."):
|
359 |
+
evaluation_results = evaluation_system.evaluate_rag(rag_system, 'data/ground-truth-retrieval.csv', sample_size, prompt_template)
|
360 |
if evaluation_results:
|
361 |
st.write("Evaluation Results:")
|
362 |
st.dataframe(pd.DataFrame(evaluation_results, columns=['Video ID', 'Question', 'Answer', 'Relevance', 'Explanation']))
|
|
|
364 |
st.warning("No ground truth data available. Please generate ground truth data first.")
|
365 |
st.button("Run Evaluation", disabled=True)
|
366 |
|
|
|
367 |
if not ground_truth_available:
|
368 |
st.subheader("Generate Ground Truth")
|
369 |
st.write("You need to generate ground truth data before running the evaluation.")
|
|
|
372 |
st.experimental_rerun()
|
373 |
|
374 |
if __name__ == "__main__":
|
375 |
+
if not initialize_youtube_api():
|
376 |
+
logger.error("Failed to initialize YouTube API. Exiting.")
|
377 |
+
sys.exit(1)
|
378 |
main()
|
app/transcript_extractor.py
CHANGED
@@ -3,29 +3,51 @@ from dotenv import load_dotenv
|
|
3 |
from youtube_transcript_api import YouTubeTranscriptApi
|
4 |
from googleapiclient.discovery import build
|
5 |
from googleapiclient.errors import HttpError
|
|
|
|
|
|
|
6 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
# Get the directory of the current script
|
9 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
10 |
# Construct the path to the .env file (one directory up from the current script)
|
11 |
dotenv_path = os.path.join(os.path.dirname(current_dir), '.env')
|
12 |
-
|
13 |
# Load environment variables from .env file
|
14 |
load_dotenv(dotenv_path)
|
15 |
|
16 |
# Get API key from environment variable
|
17 |
API_KEY = os.getenv('YOUTUBE_API_KEY')
|
18 |
-
|
|
|
19 |
if not API_KEY:
|
20 |
raise ValueError("YouTube API key not found. Make sure it's set in your .env file in the parent directory of the 'app' folder.")
|
21 |
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
|
|
29 |
|
30 |
def extract_video_id(url):
|
31 |
if not url:
|
@@ -36,16 +58,22 @@ def extract_video_id(url):
|
|
36 |
return None
|
37 |
|
38 |
def get_video_metadata(video_id):
|
|
|
39 |
try:
|
40 |
request = youtube.videos().list(
|
41 |
part="snippet,contentDetails,statistics",
|
42 |
id=video_id
|
43 |
)
|
44 |
response = request.execute()
|
45 |
-
|
46 |
if 'items' in response and len(response['items']) > 0:
|
47 |
video = response['items'][0]
|
48 |
snippet = video['snippet']
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
return {
|
50 |
'title': snippet['title'],
|
51 |
'author': snippet['channelTitle'],
|
@@ -53,48 +81,24 @@ def get_video_metadata(video_id):
|
|
53 |
'view_count': video['statistics'].get('viewCount', '0'),
|
54 |
'like_count': video['statistics'].get('likeCount', '0'),
|
55 |
'comment_count': video['statistics'].get('commentCount', '0'),
|
56 |
-
'duration': video['contentDetails']['duration']
|
|
|
57 |
}
|
58 |
else:
|
59 |
-
|
60 |
return None
|
61 |
-
except HttpError as e:
|
62 |
-
print(f"An HTTP error {e.resp.status} occurred: {e.content}")
|
63 |
-
return None
|
64 |
except Exception as e:
|
65 |
-
|
66 |
return None
|
67 |
-
|
68 |
-
def get_transcript(video_id):
|
69 |
-
# Get the directory of the current script
|
70 |
-
current_dir = os.path.dirname(os.path.abspath(__file__))
|
71 |
-
# Construct the path to the .env file (one directory up from the current script)
|
72 |
-
dotenv_path = os.path.join(os.path.dirname(current_dir), '.env')
|
73 |
-
print("the .env path is :" + dotenv_path)
|
74 |
-
# Load environment variables from .env file
|
75 |
-
load_dotenv(dotenv_path)
|
76 |
-
|
77 |
-
# Get API key from environment variable
|
78 |
-
API_KEY = os.getenv('YOUTUBE_API_KEY')
|
79 |
-
print("the api key is :" + API_KEY)
|
80 |
-
if not API_KEY:
|
81 |
-
raise ValueError("YouTube API key not found. Make sure it's set in your .env file in the parent directory of the 'app' folder.")
|
82 |
-
|
83 |
-
print(f"API_KEY: {API_KEY[:5]}...{API_KEY[-5:]}") # Print first and last 5 characters for verification
|
84 |
-
|
85 |
-
try:
|
86 |
-
youtube = build('youtube', 'v3', developerKey=API_KEY)
|
87 |
-
except Exception as e:
|
88 |
-
print(f"Error initializing YouTube API client: {str(e)}")
|
89 |
-
raise
|
90 |
|
|
|
91 |
if not video_id:
|
92 |
return None
|
93 |
try:
|
94 |
transcript = YouTubeTranscriptApi.get_transcript(video_id)
|
95 |
metadata = get_video_metadata(video_id)
|
96 |
-
|
97 |
-
|
98 |
if not metadata:
|
99 |
return None
|
100 |
return {
|
@@ -102,13 +106,14 @@ def get_transcript(video_id):
|
|
102 |
'metadata': metadata
|
103 |
}
|
104 |
except Exception as e:
|
105 |
-
|
106 |
return None
|
107 |
|
108 |
def get_channel_videos(channel_url):
|
|
|
109 |
channel_id = extract_channel_id(channel_url)
|
110 |
if not channel_id:
|
111 |
-
|
112 |
return []
|
113 |
try:
|
114 |
request = youtube.search().list(
|
@@ -129,10 +134,10 @@ def get_channel_videos(channel_url):
|
|
129 |
})
|
130 |
return videos
|
131 |
except HttpError as e:
|
132 |
-
|
133 |
return []
|
134 |
except Exception as e:
|
135 |
-
|
136 |
return []
|
137 |
|
138 |
def extract_channel_id(url):
|
@@ -141,10 +146,25 @@ def extract_channel_id(url):
|
|
141 |
return channel_id_match.group(1)
|
142 |
return None
|
143 |
|
144 |
-
def
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from youtube_transcript_api import YouTubeTranscriptApi
|
4 |
from googleapiclient.discovery import build
|
5 |
from googleapiclient.errors import HttpError
|
6 |
+
import google_auth_oauthlib.flow
|
7 |
+
import googleapiclient.discovery
|
8 |
+
import googleapiclient.errors
|
9 |
import re
|
10 |
+
import logging
|
11 |
+
import ssl
|
12 |
+
import certifi
|
13 |
+
import requests
|
14 |
+
|
15 |
+
# Set up logging
|
16 |
+
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
|
19 |
# Get the directory of the current script
|
20 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
21 |
# Construct the path to the .env file (one directory up from the current script)
|
22 |
dotenv_path = os.path.join(os.path.dirname(current_dir), '.env')
|
23 |
+
logger.info(f"The .env path is: {dotenv_path}")
|
24 |
# Load environment variables from .env file
|
25 |
load_dotenv(dotenv_path)
|
26 |
|
27 |
# Get API key from environment variable.
API_KEY = os.getenv('YOUTUBE_API_KEY')
# Guard BEFORE logging: slicing a missing (None) key would raise TypeError
# instead of the intended, actionable ValueError.
if not API_KEY:
    raise ValueError("YouTube API key not found. Make sure it's set in your .env file in the parent directory of the 'app' folder.")
# Log only a redacted fingerprint of the key (first/last 5 chars), never the full value.
logger.info(f"API_KEY: {API_KEY[:5]}...{API_KEY[-5:]}")  # Log first and last 5 characters for verification
|
33 |
|
34 |
def get_youtube_client():
    """Build and return a YouTube Data API v3 client.

    SSL verification is pinned to certifi's CA bundle via httplib2's
    ``ca_certs`` argument — the supported mechanism. The previous approach
    (setting a ``verify`` attribute on the http object, mirroring a
    ``requests.Session``) was a silent no-op: httplib2 ignores unknown
    attributes and ``requests`` is never involved in googleapiclient calls.

    Returns:
        googleapiclient.discovery.Resource: a ready-to-use YouTube client.

    Raises:
        Exception: re-raised after logging if client construction fails.
    """
    try:
        import httplib2  # already a project dependency (see requirements.txt)
        # Point httplib2 at certifi's CA bundle for certificate verification.
        http = httplib2.Http(ca_certs=certifi.where())
        # Build the YouTube client with the custom HTTP object.
        youtube = build('youtube', 'v3', developerKey=API_KEY, http=http)
        logger.info("YouTube API client initialized successfully")
        return youtube
    except Exception as e:
        logger.error(f"Error initializing YouTube API client: {str(e)}")
        raise
51 |
|
52 |
def extract_video_id(url):
|
53 |
if not url:
|
|
|
58 |
return None
|
59 |
|
60 |
def get_video_metadata(video_id):
|
61 |
+
youtube = get_youtube_client()
|
62 |
try:
|
63 |
request = youtube.videos().list(
|
64 |
part="snippet,contentDetails,statistics",
|
65 |
id=video_id
|
66 |
)
|
67 |
response = request.execute()
|
|
|
68 |
if 'items' in response and len(response['items']) > 0:
|
69 |
video = response['items'][0]
|
70 |
snippet = video['snippet']
|
71 |
+
|
72 |
+
# Get the description and set default if it's blank
|
73 |
+
description = snippet.get('description', '').strip()
|
74 |
+
if not description:
|
75 |
+
description = 'Not Available'
|
76 |
+
|
77 |
return {
|
78 |
'title': snippet['title'],
|
79 |
'author': snippet['channelTitle'],
|
|
|
81 |
'view_count': video['statistics'].get('viewCount', '0'),
|
82 |
'like_count': video['statistics'].get('likeCount', '0'),
|
83 |
'comment_count': video['statistics'].get('commentCount', '0'),
|
84 |
+
'duration': video['contentDetails']['duration'],
|
85 |
+
'description': description # Add the description to the metadata
|
86 |
}
|
87 |
else:
|
88 |
+
logger.error(f"No video found with id: {video_id}")
|
89 |
return None
|
|
|
|
|
|
|
90 |
except Exception as e:
|
91 |
+
logger.error(f"An error occurred while fetching metadata for video {video_id}: {str(e)}")
|
92 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
+
def get_transcript(video_id):
|
95 |
if not video_id:
|
96 |
return None
|
97 |
try:
|
98 |
transcript = YouTubeTranscriptApi.get_transcript(video_id)
|
99 |
metadata = get_video_metadata(video_id)
|
100 |
+
logger.info(f"Metadata for video {video_id}: {metadata}")
|
101 |
+
logger.info(f"Transcript length for video {video_id}: {len(transcript)}")
|
102 |
if not metadata:
|
103 |
return None
|
104 |
return {
|
|
|
106 |
'metadata': metadata
|
107 |
}
|
108 |
except Exception as e:
|
109 |
+
logger.error(f"Error extracting transcript for video {video_id}: {str(e)}")
|
110 |
return None
|
111 |
|
112 |
def get_channel_videos(channel_url):
|
113 |
+
youtube = get_youtube_client()
|
114 |
channel_id = extract_channel_id(channel_url)
|
115 |
if not channel_id:
|
116 |
+
logger.error(f"Invalid channel URL: {channel_url}")
|
117 |
return []
|
118 |
try:
|
119 |
request = youtube.search().list(
|
|
|
134 |
})
|
135 |
return videos
|
136 |
except HttpError as e:
|
137 |
+
logger.error(f"An HTTP error {e.resp.status} occurred: {e.content}")
|
138 |
return []
|
139 |
except Exception as e:
|
140 |
+
logger.error(f"An error occurred while fetching channel videos: {str(e)}")
|
141 |
return []
|
142 |
|
143 |
def extract_channel_id(url):
|
|
|
146 |
return channel_id_match.group(1)
|
147 |
return None
|
148 |
|
149 |
def test_api_key():
    """Smoke-test the configured API key with one lightweight videos.list call.

    Fetches metadata for a well-known public video and reports whether the
    call succeeded. Returns True when the key works, False otherwise.
    """
    youtube = get_youtube_client()
    try:
        # One cheap request is enough to validate the key end-to-end.
        response = youtube.videos().list(part="snippet", id="dQw4w9WgXcQ").execute()
    except Exception as e:
        logger.error(f"API key test failed: {str(e)}")
        return False
    if 'items' in response:
        logger.info("API key is valid and working")
        return True
    logger.error("API request successful but returned unexpected response")
    return False
|
163 |
+
|
164 |
def initialize_youtube_api():
    """Verify YouTube API connectivity at startup; return True on success."""
    ok = test_api_key()
    if ok:
        logger.info("YouTube API initialized successfully")
    else:
        logger.error("Failed to initialize YouTube API")
    return ok
|
data/sqlite.db
CHANGED
Binary files a/data/sqlite.db and b/data/sqlite.db differ
|
|
docker-compose.yaml
CHANGED
@@ -20,14 +20,21 @@ services:
|
|
20 |
volumes:
|
21 |
- ./data:/app/data
|
22 |
- ./config:/app/config
|
|
|
23 |
|
24 |
elasticsearch:
|
25 |
-
image: docker.elastic.co/elasticsearch/elasticsearch:
|
|
|
26 |
environment:
|
27 |
- discovery.type=single-node
|
28 |
-
-
|
29 |
ports:
|
30 |
- "9200:9200"
|
|
|
|
|
|
|
|
|
|
|
31 |
volumes:
|
32 |
- esdata:/usr/share/elasticsearch/data
|
33 |
|
@@ -50,5 +57,6 @@ services:
|
|
50 |
|
51 |
volumes:
|
52 |
esdata:
|
|
|
53 |
grafana-storage:
|
54 |
ollama_data:
|
|
|
20 |
volumes:
|
21 |
- ./data:/app/data
|
22 |
- ./config:/app/config
|
23 |
+
- ./app:/app/app # Add this line to map your local app directory
|
24 |
|
25 |
elasticsearch:
|
26 |
+
image: docker.elastic.co/elasticsearch/elasticsearch:8.9.0
|
27 |
+
container_name: elasticsearch
|
28 |
environment:
|
29 |
- discovery.type=single-node
|
30 |
+
- xpack.security.enabled=false
|
31 |
ports:
|
32 |
- "9200:9200"
|
33 |
+
- "9300:9300"
|
34 |
+
deploy:
|
35 |
+
resources:
|
36 |
+
limits:
|
37 |
+
memory: 2G
|
38 |
volumes:
|
39 |
- esdata:/usr/share/elasticsearch/data
|
40 |
|
|
|
57 |
|
58 |
volumes:
|
59 |
esdata:
|
60 |
+
driver: local
|
61 |
grafana-storage:
|
62 |
ollama_data:
|
requirements.txt
CHANGED
@@ -12,4 +12,6 @@ ollama
|
|
12 |
requests
|
13 |
matplotlib
|
14 |
tqdm
|
15 |
-
python-dotenv
|
|
|
|
|
|
12 |
requests
|
13 |
matplotlib
|
14 |
tqdm
|
15 |
+
python-dotenv
|
16 |
+
certifi
|
17 |
+
httplib2
|
run-docker-compose-windows.ps1
CHANGED
@@ -5,18 +5,35 @@ $envPath = ".\.env"
|
|
5 |
if (Test-Path $envPath) {
|
6 |
# Read the .env file
|
7 |
$envContent = Get-Content $envPath
|
8 |
-
|
9 |
# Parse the environment variables
|
10 |
foreach ($line in $envContent) {
|
11 |
if ($line -match '^([^=]+)=(.*)$') {
|
12 |
$name = $matches[1]
|
13 |
$value = $matches[2]
|
14 |
[Environment]::SetEnvironmentVariable($name, $value, "Process")
|
|
|
15 |
}
|
16 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
# Run
|
19 |
-
|
|
|
20 |
}
|
21 |
else {
|
22 |
Write-Error "The .env file was not found at $envPath"
|
|
|
5 |
if (Test-Path $envPath) {
|
6 |
# Read the .env file
|
7 |
$envContent = Get-Content $envPath
|
|
|
8 |
# Parse the environment variables
|
9 |
foreach ($line in $envContent) {
|
10 |
if ($line -match '^([^=]+)=(.*)$') {
|
11 |
$name = $matches[1]
|
12 |
$value = $matches[2]
|
13 |
[Environment]::SetEnvironmentVariable($name, $value, "Process")
|
14 |
+
Write-Host "Loaded environment variable: $name"
|
15 |
}
|
16 |
}
|
17 |
+
|
18 |
+
# Stop existing containers
|
19 |
+
Write-Host "Stopping existing containers..."
|
20 |
+
docker-compose down
|
21 |
+
|
22 |
+
# Rebuild the container
|
23 |
+
Write-Host "Rebuilding Docker containers..."
|
24 |
+
docker-compose build --no-cache app
|
25 |
+
|
26 |
+
# Start the services
|
27 |
+
Write-Host "Starting Docker services..."
|
28 |
+
docker-compose up -d
|
29 |
+
|
30 |
+
# Wait for services to be ready
|
31 |
+
Write-Host "Waiting for services to start up..."
|
32 |
+
Start-Sleep -Seconds 20
|
33 |
|
34 |
+
# Run the Streamlit app
|
35 |
+
Write-Host "Starting Streamlit app..."
|
36 |
+
docker-compose exec -T app sh -c "cd /app/app && streamlit run main.py"
|
37 |
}
|
38 |
else {
|
39 |
Write-Error "The .env file was not found at $envPath"
|