#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Install the notebook's dependencies through the IPython shell escape.
# NOTE(review): `docling` is not imported anywhere in the visible code — confirm it is still needed.
get_ipython().system('pip install docling chromadb sentence-transformers')


# In[2]:


# PyMuPDF (imported as `fitz`), tqdm and spaCy, plus the Italian spaCy pipeline
# that is loaded later with spacy.load("it_core_news_sm").
get_ipython().system('pip install pymupdf tqdm spacy')
get_ipython().system('python -m spacy download it_core_news_sm')


# In[3]:


# Hugging Face transformers, used further down for the local text-generation model.
get_ipython().system('pip install transformers')


# In[4]:


import fitz  # PyMuPDF
from tqdm.auto import tqdm
import pandas as pd

def text_formatter(text: str) -> str:
    """Clean the raw text of one PDF page.

    Line breaks are flattened to spaces, dotted leaders (two or more
    consecutive dots) are replaced by a space, and the document's recurring
    boilerplate is removed: the "Pagina N di M" footer, the document title
    and the "IO_XX_00_XX ISTRUZIONE OPERATIVA 22/10/2024" header. Runs of
    spaces/tabs are collapsed last, so the removals do not leave gaps.

    Args:
        text: Raw text as extracted from a page.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    import re

    # Flatten line breaks so boilerplate split across lines can still match.
    text = text.replace("\n", " ")
    # Dotted leaders (e.g. in a table of contents) become a single gap.
    text = re.sub(r"\.{2,}", " ", text)
    # Page footer, e.g. "Pagina 3 di 10".
    text = re.sub(r"Pagina\s+\d+\s+di\s+\d+", "", text, flags=re.IGNORECASE)
    # Recurring document title.
    text = re.sub(r"Creazione\s+VM\s+su\s+Cloud\s+INSIEL", "", text)
    # Recurring page header. The gaps are matched with \s+ because the amount
    # of whitespace between the tokens is not stable; a pattern with a fixed
    # number of literal spaces would silently stop matching.
    text = re.sub(r"IO_XX_00_XX\s+ISTRUZIONE\s+OPERATIVA\s+22/10/2024", "", text)
    # Collapse whitespace only after all removals, so none of them leaves a
    # run of spaces behind.
    text = re.sub(r"[ \t]{2,}", " ", text)
    return text.strip()

def open_and_read_pdf(pdf_path: str):
    """Read a PDF and return cleaned text plus simple size statistics per page.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one dict per page, in document order, with the keys:
        ``page_number`` (1-based), ``page_char_count``, ``page_word_count``
        (whitespace-separated words), ``page_token_estimate`` (character
        count divided by 4, rounded down) and ``text`` (output of
        ``text_formatter``).
    """
    pages = []
    # The context manager closes the document even if extracting a page
    # raises, instead of leaving the file handle open.
    with fitz.open(pdf_path) as doc:
        numbered_pages = enumerate(doc, start=1)
        for page_number, page in tqdm(numbered_pages, total=len(doc), desc="📄 Lettura pagine PDF"):
            text = text_formatter(page.get_text())
            pages.append({
                "page_number": page_number,
                "page_char_count": len(text),
                "page_word_count": len(text.split()),
                "page_token_estimate": len(text) // 4,
                "text": text,
            })
    return pages

pdf_path = "data/insiel.pdf"  # Change this if your file lives elsewhere
# Executed when the cell/script runs: one dict per PDF page (cleaned text plus counts).
pages_and_texts = open_and_read_pdf(pdf_path)


# In[5]:


import spacy
nlp = spacy.load("it_core_news_sm")

# Number of consecutive sentences grouped into one chunk.
# Defined once here instead of being reassigned on every loop iteration.
CHUNK_SIZE = 10

# Split each page's text into sentences, then group the sentences into
# lists of at most CHUNK_SIZE items (the last group of a page may be shorter).
for page in tqdm(pages_and_texts, desc="✂️ Split in frasi"):
    doc = nlp(page["text"])
    # Drop sentences that are empty after stripping whitespace.
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    page["sentence_chunks"] = [
        sentences[start:start + CHUNK_SIZE]
        for start in range(0, len(sentences), CHUNK_SIZE)
    ]


# In[6]:


# Spot-check a single extracted page (list index 65).
# NOTE(review): raises IndexError when the PDF yields fewer than 66 pages.
pages_and_texts[65]


# In[7]:


# One row per page; the row index is reused, as a string, as the record id.
df = pd.DataFrame(pages_and_texts)
df["chunk_id"] = df.index.astype(str)

# Show the last rows
df.tail()


# In[8]:


# (number of pages, number of columns)
df.shape


# In[9]:


# Pages whose estimated token count is below this threshold are left out of
# the index. A single constant keeps the count and the filter in agreement.
MIN_PAGE_TOKENS = 60

# Per-column count of the pages that fall below the threshold.
df[df['page_token_estimate'] < MIN_PAGE_TOKENS].count()


# In[10]:


# Exact complement of the selection above: a page with exactly
# MIN_PAGE_TOKENS estimated tokens is kept rather than silently dropped
# without having been counted as short.
final = df[df['page_token_estimate'] >= MIN_PAGE_TOKENS]


# In[11]:


# Summary statistics of the numeric columns for the retained pages.
final.describe().round(2)


# In[12]:


# Both packages are already installed by the first install cell of this file;
# this line repeats that installation.
get_ipython().system('pip install sentence-transformers chromadb')


# In[13]:


from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

# Page-level inputs for the vector store: the text to embed, its string id
# and the page number it came from.
texts = final["text"].tolist()
chunk_ids = final["chunk_id"].tolist()
metadatas = [{"page": int(page_no)} for page_no in final["page_number"]]

# Sentence-embedding model used for both the pages and, later, the queries.
embedding_model = SentenceTransformer(
    "sentence-transformers/distiluse-base-multilingual-cased-v1"
)

# One embedding vector per retained page.
embeddings = embedding_model.encode(texts, show_progress_bar=True)


# In[14]:


import chromadb

# Client persisted on disk under ./vectorstore
client = chromadb.PersistentClient(path="./vectorstore")

# Reuse the collection if it already exists from a previous run
collection = client.get_or_create_collection("insiel_chunks")

# upsert instead of add: the store is persistent and the ids are the page
# row indexes, so re-running this cell would otherwise try to add ids that
# already exist. upsert inserts new ids and refreshes existing ones, which
# makes the cell safe to re-run.
# NOTE(review): ids from an earlier run that are no longer produced are not
# removed by this call — confirm whether stale entries matter here.
collection.upsert(
    documents=texts,
    embeddings=embeddings.tolist(),
    metadatas=metadatas,
    ids=chunk_ids,
)


# In[16]:


# Disabled exploratory retrieval cell, kept for reference only.
# The two lines defining `query` and `query_embedding` had been disabled while
# the `collection.query(...)` call was left active, so a top-to-bottom run
# failed with NameError on `query_embedding`. The whole cell is now disabled
# consistently; the question-answering cell further down performs the same
# retrieval with its own `query_embedding`.
#
# query = input("Domanda: ")
# query_embedding = embedding_model.encode([query])
# results = collection.query(
#     query_embeddings=query_embedding,
#     n_results=3  # can be raised to 5, 10, etc.
# )


# In[17]:


# Disabled result-preview code: it is wrapped in a bare string literal, so it
# is evaluated as a string expression and never executed.
"""for i, (doc, meta) in enumerate(zip(results["documents"][0], results["metadatas"][0])):
    print(f"\n🔹 RISULTATO {i+1} (pagina {meta['page']}):")
    print(doc[:500] + "...\n---")
"""


# In[18]:


import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Hugging Face model id of the local generative model.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load the weights and keep them on the CPU.
model = AutoModelForCausalLM.from_pretrained(model_id)
model = model.to(torch.device("cpu"))
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Text-generation pipeline used to produce the answers; each call generates
# at most 200 new tokens.
rag_chat = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    device=-1,
)


# In[ ]:


def generate_rag_response_local(query, retrieved_chunks):
    """Answer a question with the local text-generation pipeline.

    The retrieved chunks are joined with blank lines into a context section
    and embedded, together with the question, in an Italian instruction
    prompt that is sent to the module-level ``rag_chat`` pipeline.

    Args:
        query: The user's question.
        retrieved_chunks: Iterable of context strings to ground the answer on.

    Returns:
        The generated answer text, without the prompt and stripped of
        surrounding whitespace.
    """
    context = "\n\n".join(retrieved_chunks)

    # NOTE(review): the prompt uses [INST] ... [/INST] markers; confirm this
    # matches the chat format the loaded model expects.
    prompt = f"""[INST] Usa solo le informazioni fornite nel contesto qui sotto per rispondere alla domanda, la risposta deve finire sempre con un punto. 
Se la risposta non è presente, di' chiaramente che non è specificato nel documento.

Contesto:
{context}

Domanda: {query}
Risposta: [/INST]
"""
    # Ask the pipeline for the completion only. With the default (prompt +
    # completion), cutting at the last "Risposta:" left the prompt's trailing
    # "[/INST]" at the start of every answer, and mis-cut whenever the
    # completion itself contained "Risposta:".
    completion = rag_chat(prompt, return_full_text=False)[0]["generated_text"]
    answer = completion.strip()

    # The model sometimes repeats the answer label; drop it if it leads.
    label = "Risposta:"
    if answer.startswith(label):
        answer = answer[len(label):].strip()
    return answer


# In[ ]:


# 🧠 Ask for the question
query = input("Domanda: ")

# 🔎 Embed the query with the sentence-transformers model (NOT the generative model!)
query_embedding = embedding_model.encode([query])

# 🔍 Retrieve the most similar chunks from Chroma.
# The embedding is passed as a plain nested list, the same way the page
# embeddings are converted with .tolist() before being stored, so the call
# does not depend on the client accepting NumPy arrays.
results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=3,
)

# 🧱 Extract the context chunks (results for the first and only query)
retrieved_chunks = results["documents"][0]

# 🤖 Generate the answer with the local open-source model
response = generate_rag_response_local(query, retrieved_chunks)

# 🖨️ Show the answer
print("🤖 Risposta:\n", response)


# In[ ]:


# Notebook display: the context strings that were passed to the generator.
retrieved_chunks


# In[ ]:


# Notebook display: the full response returned by collection.query(...).
results


# In[ ]: