In [None]:
# Dependencies
!pip install tiktoken
!pip install groq
!pip install langchain-groq
!pip install pinecone
!pip install langchain-community



In [None]:
# Mount Dataset from Google Drive
from google.colab import drive
drive.mount('/content/drive')
!unzip /content/drive/MyDrive/IMDB_Dataset.zip

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Archive:  /content/drive/MyDrive/IMDB_Dataset.zip
replace IMDB_Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: IMDB_Dataset.csv        


In [None]:
# Import Libraries
import io
import os
from contextlib import redirect_stdout
from getpass import getpass

import pandas as pd
import tiktoken
from groq import Groq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Pinecone as PineconeVectorStore
from langchain_core.documents import Document
from langchain_groq import ChatGroq
from pinecone import Pinecone, ServerlessSpec


In [None]:
#Groq API Key
# Take the key from the environment and only prompt when it is missing, so the
# secret never has to be typed into (and saved with) the notebook source.
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")

In [None]:
# Load and preprocess data
# Read the extracted CSV and discard every row that has a missing value.
imdb = pd.read_csv('/content/IMDB_Dataset.csv').dropna()

In [None]:
# Create combined information column
def _format_movie(row):
    """Render one dataframe row as the single text string used for embedding."""
    return (
        f"Title: {row['movie_name']}. "
        f"Overview: {row['description']} "
        f"Genres: {row['genre']} "
        f"Certificate: {row['certificate']} "
        f"Director: {row['director']} "
        f"Actors: {row['star']}"
    )


imdb['combined_info'] = imdb.apply(_format_movie, axis=1)

In [None]:
# View Dataframe with combined info
# Displays the first rows of the frame as this cell's output.
# NOTE(review): the saved output of this cell already lists the n_tokens and
# vector columns, which are only added by a later cell - it appears to have
# been captured after running cells out of order.
imdb.head()

Unnamed: 0,movie_name,certificate,genre,rating,description,director,star,combined_info,n_tokens,vector
0,Black Panther: Wakanda Forever,PG-13,"Action, Adventure, Drama",6.9,The people of Wakanda fight to protect their h...,Ryan Coogler,"Letitia Wright, \nLupita Nyong'o, \nDanai Guri...",Title: Black Panther: Wakanda Forever. Overvie...,80,"[-0.026855934411287308, 0.05683119595050812, -..."
1,Avatar: The Way of Water,PG-13,"Action, Adventure, Fantasy",7.8,Jake Sully lives with his newfound family form...,James Cameron,"Sam Worthington, \nZoe Saldana, \nSigourney We...",Title: Avatar: The Way of Water. Overview: Jak...,99,"[-0.080960214138031, 0.012261789292097092, 0.0..."
2,Plane,R,"Action, Thriller",6.5,A pilot finds himself caught in a war zone aft...,Jean-François Richet,"Gerard Butler, \nMike Colter, \nTony Goldwyn, ...",Title: Plane. Overview: A pilot finds himself ...,67,"[0.02219153568148613, -0.0190700925886631, -0...."
3,Everything Everywhere All at Once,R,"Action, Adventure, Comedy",8.0,A middle-aged Chinese immigrant is swept up in...,"Dan Kwan, \nDaniel Scheinert","Michelle Yeoh, \nStephanie Hsu, \nJamie Lee Cu...",Title: Everything Everywhere All at Once. Over...,90,"[0.038551680743694305, -0.023787647485733032, ..."
5,Ant-Man and the Wasp: Quantumania,PG-13,"Action, Adventure, Comedy",6.6,"Scott Lang and Hope Van Dyne, along with Hank ...",Peyton Reed,"Paul Rudd, \nEvangeline Lilly, \nJonathan Majo...",Title: Ant-Man and the Wasp: Quantumania. Over...,100,"[-0.07330965995788574, -0.046703167259693146, ..."


In [None]:
# Token encoding
# NOTE(review): cl100k_base and the 8000-token limit look like leftovers from an
# OpenAI embedding setup; the MiniLM model below uses its own tokenizer -
# confirm this filter is still wanted.
encoding = tiktoken.get_encoding('cl100k_base')
max_tokens = 8000

# Omit descriptions that are too long to embed
imdb["n_tokens"] = imdb.combined_info.apply(lambda x: len(encoding.encode(x)))
# .copy() gives the filtered frame its own data, so the column assignment
# below never targets a slice of the unfiltered frame.
imdb = imdb[imdb.n_tokens <= max_tokens].copy()

# Use HuggingFace embeddings instead of OpenAI
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Generate embeddings
# Embed every row in one batched call instead of one model call per row.
# The result is wrapped in a Series carrying imdb's own index because the index
# has gaps after dropna(), and each cell must hold one complete vector.
imdb["vector"] = pd.Series(
    embeddings.embed_documents(imdb.combined_info.tolist()),
    index=imdb.index,
)

  embeddings = HuggingFaceEmbeddings(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
#View Dataframe after creating Embeddings
# Displays the first rows again, now including the n_tokens and vector columns
# added by the embedding cell.
imdb.head()

Unnamed: 0,movie_name,certificate,genre,rating,description,director,star,combined_info,n_tokens,vector
0,Black Panther: Wakanda Forever,PG-13,"Action, Adventure, Drama",6.9,The people of Wakanda fight to protect their h...,Ryan Coogler,"Letitia Wright, \nLupita Nyong'o, \nDanai Guri...",Title: Black Panther: Wakanda Forever. Overvie...,80,"[-0.026855934411287308, 0.05683119595050812, -..."
1,Avatar: The Way of Water,PG-13,"Action, Adventure, Fantasy",7.8,Jake Sully lives with his newfound family form...,James Cameron,"Sam Worthington, \nZoe Saldana, \nSigourney We...",Title: Avatar: The Way of Water. Overview: Jak...,99,"[-0.080960214138031, 0.012261789292097092, 0.0..."
2,Plane,R,"Action, Thriller",6.5,A pilot finds himself caught in a war zone aft...,Jean-François Richet,"Gerard Butler, \nMike Colter, \nTony Goldwyn, ...",Title: Plane. Overview: A pilot finds himself ...,67,"[0.02219153568148613, -0.0190700925886631, -0...."
3,Everything Everywhere All at Once,R,"Action, Adventure, Comedy",8.0,A middle-aged Chinese immigrant is swept up in...,"Dan Kwan, \nDaniel Scheinert","Michelle Yeoh, \nStephanie Hsu, \nJamie Lee Cu...",Title: Everything Everywhere All at Once. Over...,90,"[0.038551680743694305, -0.023787647485733032, ..."
5,Ant-Man and the Wasp: Quantumania,PG-13,"Action, Adventure, Comedy",6.6,"Scott Lang and Hope Van Dyne, along with Hank ...",Peyton Reed,"Paul Rudd, \nEvangeline Lilly, \nJonathan Majo...",Title: Ant-Man and the Wasp: Quantumania. Over...,100,"[-0.07330965995788574, -0.046703167259693146, ..."


In [None]:
# Get dimensions of Embeddings
# The length of the first stored vector is used as the dimensionality; it is
# reused later when the Pinecone index is created.
first_vector = imdb['vector'].iloc[0]
embedding_dimensions = len(first_vector)
print(f"Embedding Dimensions: {embedding_dimensions}")

Embedding Dimensions: 384


In [None]:
# Save preprocessed data
# Pickles the whole dataframe (text columns plus embedding vectors) to
# imdb.pkl, resolved against the current working directory.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# back from a trusted location.
imdb.to_pickle('imdb.pkl')

In [None]:
# Save Pickle File to Drive
!cp /content/imdb.pkl /content/drive/MyDrive/imdb.pkl
print(os.listdir('/content/drive/MyDrive'))

['Colab Notebooks', 'english-handwritten-characters-dataset', 'Sleepy.ai Project Proposal.gdoc', 'Untitled document (2).gdoc', '21K-4839 IS (SEED) Lab 1.zip', 'FYP_I_Proposal_Update_1.gdoc', 'FYP_I_Proposal_Update_1 (1).gdoc', 'Untitled document (1).gdoc', 'Final Year Project Proposal (Smart Sub_ Intelligent Football Substitution Planning).gdoc', 'Final Year Project Proposal (Smart Sub_ Intelligent Football Substitution Recommender System) (1) (1) (1).gdoc', 'Final Year Project Proposal (Smart Sub_ Intelligent Football Substitution Recommender System) (1) (1).gdoc', 'Final Year Project Proposal (Smart Sub: Intelligent Football Substitution Recommender System).gdoc', 'CV (Fahad Yousuf).gdoc', 'FYP WORKFLOW.gdoc', 'football-dataset', 'deedy-cv (1).gdoc', 'Untitled document.gdoc', 'deedy-cv.gdoc', 'Fahad Yousuf (Curriculum Vitae).gdoc', 'Fahad Yousuf (Curriculum Vitae) (1).pdf', 'Fahad Yousuf (Curriculum Vitae).pdf', '(Fahad Yousuf) Role of Civil Engagement in Bringing Social Change.gdoc'

In [None]:
# Set the API key as an environment variable
# Keep a key that is already exported and only prompt when it is missing, so
# the secret is never stored in the notebook. The variable is still set in the
# environment because the vector store below is not handed a key explicitly.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ")

# Initialize Pinecone client
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Create or connect to a Pinecone index
index_name = "imdb-index"

# list_indexes() yields index descriptions rather than plain strings, so the
# membership test has to run against the list of names; otherwise an existing
# index is never detected and create_index is attempted again.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embedding_dimensions,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
else:
    print(f"Index '{index_name}' already exists.")

# Prepare documents
documents = [
    Document(
        page_content=row['combined_info'],
        metadata={
            'title': row['movie_name'],
            # retrieve_ranked_context sorts hits on this key; without it every
            # hit falls back to a rating of 0 and the ranking is a no-op.
            'rating': float(row['rating']),
        }
    ) for _, row in imdb.iterrows()
]

# Create vector store with Pinecone
# Stable ids (the dataframe index labels) make a re-run overwrite the same
# vectors; without explicit ids each run is expected to upsert a fresh copy of
# every movie under new generated ids.
# NOTE(review): vectors uploaded by earlier runs keep their old ids and lack the
# rating metadata - clear the index once if it was populated before this change.
docsearch = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embeddings,
    index_name=index_name,
    ids=[str(row_label) for row_label in imdb.index],
)


In [None]:
# Define LLM using Groq (Llama model)
# The key comes from the groq_api_key variable set in the "Groq API Key" cell,
# so there is a single place to provide it instead of a second literal here.
# NOTE(review): confirm this model id is still served by Groq before relying on it.
llm = ChatGroq(
    model_name="llama3-8b-8192",  # or another Llama model available
    api_key=groq_api_key,
    temperature=0  # deterministic-leaning output for repeatable recommendations
)

In [None]:
# Define custom prompt template
# The text has two placeholders: {context} receives the retrieved movie
# descriptions and {question} receives the user's request.
template = """You are a movie recommender system that helps users find movie that match their preferences.
Use the following pieces of context to answer the question at the end.
For each question, suggest three movies, with a short description of the plot and the reason why the user might like it.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Your response:"""

# Wrap the raw text so it can be formatted with the two named variables.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

def retrieve_ranked_context(query, k=100, top_n=3):
    """Build the prompt context from the best-rated movies similar to ``query``.

    Args:
        query: Free-text user request used for the similarity search.
        k: Number of nearest matches fetched from the vector store.
        top_n: Number of highest-rated matches kept for the context.

    Returns:
        The ``page_content`` of the ``top_n`` best-rated matches, joined with
        newlines. Empty string when the search returns nothing.
    """
    def _rating(doc):
        # A missing or non-numeric rating counts as 0 so one bad record cannot
        # abort the whole ranking.
        try:
            return float(doc.metadata.get("rating", 0))
        except (TypeError, ValueError):
            return 0.0

    results = docsearch.similarity_search(query, k=k)
    # sorted() is stable, so documents with equal ratings keep the order in
    # which the similarity search returned them.
    ranked_results = sorted(results, key=_rating, reverse=True)
    return "\n".join(doc.page_content for doc in ranked_results[:top_n])

# Create QA chain
# Retrieval QA over the Pinecone store's default retriever: answers are
# generated by llm using the custom PROMPT, and the source documents are
# returned together with the result.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=docsearch.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)

def get_movie_recommendations(query):
    """Return the model's recommendation text for ``query``.

    The rating-ranked context from ``retrieve_ranked_context`` is formatted into
    ``PROMPT`` and sent straight to ``llm``. Passing that context to the
    RetrievalQA chain as an extra ``"context"`` input had no effect: the chain
    only reads ``"query"`` and runs its own retrieval, so the ranking was
    silently discarded.

    Args:
        query: Free-text user request.

    Returns:
        The text content of the model's reply.
    """
    # Swallow anything the retrieval step prints so only the answer is shown.
    with io.StringIO() as buf, redirect_stdout(buf):
        context = retrieve_ranked_context(query)
        response = llm.invoke(PROMPT.format(context=context, question=query))

    return response.content



In [None]:
# Example usage
if __name__ == "__main__":
    # Strip surrounding whitespace and skip the vector search / LLM call when
    # nothing was typed, since an empty query cannot be matched meaningfully.
    query = input("What's Your Query?\n").strip()
    if query:
        recommendations = get_movie_recommendations(query)
        print(recommendations)
    else:
        print("Please enter a query to get recommendations.")

What's Your Query?
Recommend me some good movies using the dataset I provided to you
Based on the dataset you provided, I'll recommend three movies that match your preferences. Since I don't have any specific information about your preferences, I'll try to suggest movies that belong to a mix of genres (comedy, crime, action, sci-fi, thriller, and drama).

Here are three movie recommendations:

1. **District 9**: This sci-fi thriller won critical acclaim and explores themes of segregation, humanity, and action. The movie's unique blend of action, mystery, and social commentary might appeal to viewers looking for a thought-provoking film.
2. **One Way**: This unrated action-comedy seems to have a mix of fast-paced action and humor, which could be an appealing combination for fans of the genres. The movie's production values and performances might also be worth checking out.
3. **America America**: This crime-drama from India might offer a gripping story with complex characters, morality,