javiervzpucp committed · verified
Commit 8c421fb · 1 Parent(s): 60244ac
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ .streamlit/secrets.toml
README.md CHANGED
@@ -1,11 +1 @@
- ---
- title: RAG SA
- emoji: 👀
- colorFrom: indigo
- colorTo: indigo
- sdk: static
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # RAG-glottolog
app.py ADDED
@@ -0,0 +1,31 @@
+ # -*- coding: utf-8 -*-
+ """
+ Created on Wed Apr 9 10:56:02 2025
+
+ @author: jveraz
+ """
+
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ app = FastAPI()
+
+ MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     torch_dtype=torch.float16,
+     device_map="auto"
+ )
+
+ class QueryInput(BaseModel):
+     inputs: str
+
+ @app.post("/")
+ async def generate(query: QueryInput):
+     input_ids = tokenizer(query.inputs, return_tensors="pt").input_ids.to(model.device)
+     output_ids = model.generate(input_ids, max_new_tokens=200)
+     generated = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+     return {"generated_text": generated}
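A minimal client sketch for the endpoint above, assuming the FastAPI app is served locally (e.g. via `uvicorn app:app --port 8000`); the host, port, and prompt text are illustrative, not part of the commit:

# Hypothetical client for the generate endpoint defined in app.py above.
# Host and port are assumptions; adjust to wherever the service is deployed.
import requests

payload = {"inputs": "[INST] Name three South American language families. [/INST]"}
resp = requests.post("http://localhost:8000/", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["generated_text"])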
embed_matrix.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:447c4325d930de44070f9def3ad15bf54a93d39956587fc20525cfc5060f4a50
+ size 1695872
embed_matrix_hybrid.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f1fbd97a4a97523185aab51c4fad790aff36c8acd6710c5607f4acea9be0b96b
+ size 3382400
embed_matrix_hybrid_graphsage.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4711ff26a44c4938f94a79169dd56b7adda7668e1b6dd89f718059092ca5c50
+ size 3382400
grafo_embed.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:420fe64e2680f0d3c4f7f90b891f21b56cce89784ea104b3a2878e8b845ce451
+ size 4470194
grafo_embed_hybrid.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:776ba033f208a5266c6a5b076189d96a9651993cdcd2297f5b55ed33a98dcf6d
+ size 8320820
grafo_embed_hybrid_graphsage.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3993c376704872f90798a447434da0487db268d904ca4517e26c519b07bf1ac1
+ size 7958567
grafo_ttl_hibrido.ttl ADDED
The diff for this file is too large to render. See raw diff
 
grafo_ttl_hibrido_graphsage.ttl ADDED
The diff for this file is too large to render. See raw diff
 
grafo_ttl_no_hibrido.ttl ADDED
The diff for this file is too large to render. See raw diff
 
id_map.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4e5e2ca4af562078ca2cc94a56fc409ac1fd0312f514a2835117568bc89b034
+ size 6088
id_map_hybrid.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f011f1929fabc31724e935b9eac34ae691b530367d1b831bb2d92e78b555280a
+ size 12189
id_map_hybrid_graphsage.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ebb6de3e35fea86da236cd9b12a1b6d6a1d4867941c25955b401c3bf857ed5f6
+ size 12189
rag_hf.py ADDED
@@ -0,0 +1,253 @@
+ # rag_interface.py (with numpy instead of faiss)
+ import streamlit as st
+ import pickle
+ import numpy as np
+ import rdflib
+ import torch
+ import datetime
+ import os
+ import requests
+ from rdflib import Graph as RDFGraph, Namespace
+ from sentence_transformers import SentenceTransformer
+ from dotenv import load_dotenv
+
+ # === CONFIGURATION ===
+ load_dotenv()
+
+ MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
+ EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ EX = Namespace("http://example.org/lang/")
+
+ st.set_page_config(
+     page_title="Vanishing Voices: Language Atlas",
+     page_icon="🌍",
+     layout="wide",
+     initial_sidebar_state="expanded"
+ )
+
+ # Custom CSS
+ st.markdown("""
+ <style>
+ .header {
+     color: #2c3e50;
+     border-bottom: 2px solid #3498db;
+     padding-bottom: 10px;
+     margin-bottom: 1.5rem;
+ }
+ .info-box {
+     background-color: #e8f4fc;
+     border-radius: 8px;
+     padding: 1rem;
+     margin-bottom: 1.5rem;
+     border-left: 4px solid #3498db;
+ }
+ .sidebar-section {
+     margin-bottom: 2rem;
+ }
+ .sidebar-title {
+     color: #2c3e50;
+     font-size: 1.1rem;
+     font-weight: 600;
+     margin-bottom: 0.5rem;
+     border-bottom: 1px solid #eee;
+     padding-bottom: 0.5rem;
+ }
+ .method-card {
+     background-color: #f8f9fa;
+     border-radius: 8px;
+     padding: 0.8rem;
+     margin-bottom: 0.8rem;
+     border-left: 3px solid #3498db;
+ }
+ .method-title {
+     font-weight: 600;
+     color: #3498db;
+     margin-bottom: 0.3rem;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+ @st.cache_resource(show_spinner="Loading models and indexes...")
+ def load_all_components():
+     embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
+     methods = {}
+     for label, suffix, ttl, matrix_path in [
+         ("Standard", "", "grafo_ttl_no_hibrido.ttl", "embed_matrix.npy"),
+         ("Hybrid", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
+         ("GraphSAGE", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
+     ]:
+         with open(f"id_map{suffix}.pkl", "rb") as f:
+             id_map = pickle.load(f)
+         with open(f"grafo_embed{suffix}.pickle", "rb") as f:
+             G = pickle.load(f)
+         matrix = np.load(matrix_path)
+         rdf = RDFGraph()
+         rdf.parse(ttl, format="ttl")
+         methods[label] = (matrix, id_map, G, rdf)
+     return methods, embedder
+
+ methods, embedder = load_all_components()
+
+ # === CORE FUNCTIONS ===
+ def get_top_k(matrix, id_map, query, k):
+     vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
+     vec = vec.cpu().numpy().astype("float32")
+     sims = np.dot(matrix, vec) / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec) + 1e-10)
+     top_k_idx = np.argsort(sims)[-k:][::-1]
+     return [id_map[i] for i in top_k_idx]
+
+ def get_context(G, lang_id):
+     node = G.nodes.get(lang_id, {})
+     lines = [f"**Language:** {node.get('label', lang_id)}"]
+     if node.get("wikipedia_summary"):
+         lines.append(f"**Wikipedia:** {node['wikipedia_summary']}")
+     if node.get("wikidata_description"):
+         lines.append(f"**Wikidata:** {node['wikidata_description']}")
+     if node.get("wikidata_countries"):
+         lines.append(f"**Countries:** {node['wikidata_countries']}")
+     return "\n\n".join(lines)
+
+ def query_rdf(rdf, lang_id):
+     q = f"""
+     PREFIX ex: <http://example.org/lang/>
+     SELECT ?property ?value WHERE {{ ex:{lang_id} ?property ?value }}
+     """
+     try:
+         return [
+             (str(row[0]).split("/")[-1], str(row[1]))
+             for row in rdf.query(q)
+         ]
+     except Exception as e:
+         return [("error", str(e))]
+
+ def generate_response(matrix, id_map, G, rdf, user_question, k=3):
+     ids = get_top_k(matrix, id_map, user_question, k)
+     context = [get_context(G, i) for i in ids]
+     rdf_facts = []
+     for i in ids:
+         rdf_facts.extend([f"{p}: {v}" for p, v in query_rdf(rdf, i)])
+     prompt = f"""<s>[INST]
+ You are an expert in South American indigenous languages.
+ Use strictly and only the information below to answer the user question in **English**.
+ - Do not infer or assume facts that are not explicitly stated.
+ - If the answer is unknown or insufficient, say "I cannot answer with the available data."
+ - Limit your answer to 100 words.
+
+
+ ### CONTEXT:
+ {chr(10).join(context)}
+
+ ### RDF RELATIONS:
+ {chr(10).join(rdf_facts)}
+
+ ### QUESTION:
+ {user_question}
+
+ Answer:
+ [/INST]"""
+     try:
+         res = requests.post(
+             f"https://api-inference.huggingface.co/models/{MODEL_ID}",
+             headers={"Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}", "Content-Type": "application/json"},
+             json={"inputs": prompt}, timeout=30
+         )
+         out = res.json()
+         if isinstance(out, list) and "generated_text" in out[0]:
+             return out[0]["generated_text"].replace(prompt.strip(), "").strip(), ids, context, rdf_facts
+         return str(out), ids, context, rdf_facts
+     except Exception as e:
+         return str(e), ids, context, rdf_facts
+
+ # === MAIN FUNCTION ===
+ def main():
+     st.markdown("""
+     <h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
+     <div class='info-box'>
+     <b>Linguistic Emergency:</b> Over 40% of South America's indigenous languages face extinction.
+     This tool documents these cultural treasures before they disappear forever.
+     </div>
+     """, unsafe_allow_html=True)
+
+     with st.sidebar:
+         st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
+
+         with st.container():
+             st.markdown('<div class="sidebar-title">About This Tool</div>', unsafe_allow_html=True)
+             st.markdown("""
+             <div class="method-card">
+                 <div class="method-title">Standard Search</div>
+                 Semantic retrieval based on text-only embeddings. Identifies languages using purely linguistic similarity from Wikipedia summaries and labels.
+             </div>
+             <div class="method-card">
+                 <div class="method-title">Hybrid Search</div>
+                 Combines semantic embeddings with structured data from knowledge graphs. Enriches language representation with contextual facts.
+             </div>
+             <div class="method-card">
+                 <div class="method-title">GraphSAGE Search</div>
+                 Leverages deep graph neural networks to learn relational patterns across languages. Captures complex cultural and genealogical connections.
+             </div>
+             """, unsafe_allow_html=True)
+
+         with st.container():
+             st.markdown('<div class="sidebar-title">Research Settings</div>', unsafe_allow_html=True)
+             k = st.slider("Languages to analyze per query", 1, 10, 3)
+             st.markdown("**Display Options:**")
+             show_ids = st.checkbox("Language IDs", value=True, key="show_ids")
+             show_ctx = st.checkbox("Cultural Context", value=True, key="show_ctx")
+             show_rdf = st.checkbox("RDF Relations", value=True, key="show_rdf")
+
+         with st.container():
+             st.markdown('<div class="sidebar-title">Data Sources</div>', unsafe_allow_html=True)
+             st.markdown("""
+             - Glottolog
+             - Wikidata
+             - Wikipedia
+             - Ethnologue
+             """)
+
+     query = st.text_input("Ask about indigenous languages:", "Which Amazonian languages are most at risk?")
+
+     if st.button("Analyze with All Methods") and query:
+         col1, col2, col3 = st.columns(3)
+         results = {}
+         for col, (label, method) in zip([col1, col2, col3], methods.items()):
+             with col:
+                 st.subheader(f"{label} Analysis")
+                 start = datetime.datetime.now()
+                 response, lang_ids, context, rdf_data = generate_response(*method, query, k)
+                 duration = (datetime.datetime.now() - start).total_seconds()
+                 st.markdown(response)
+                 st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
+                 if show_ids:
+                     st.markdown("**Language Identifiers:**")
+                     st.code("\n".join(lang_ids))
+                 if show_ctx:
+                     st.markdown("**Cultural Context:**")
+                     st.markdown("\n\n---\n\n".join(context))
+                 if show_rdf:
+                     st.markdown("**RDF Knowledge:**")
+                     st.code("\n".join(rdf_data))
+                 results[label] = response
+
+         log = f"""
+ [{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]
+ QUERY: {query}
+ STANDARD:
+ {results.get('Standard', '')}
+
+ HYBRID:
+ {results.get('Hybrid', '')}
+
+ GRAPH-SAGE:
+ {results.get('GraphSAGE', '')}
+ {'='*60}
+ """
+         try:
+             with open("language_analysis_logs.txt", "a", encoding="utf-8") as f:
+                 f.write(log)
+         except Exception as e:
+             st.warning(f"Failed to log: {str(e)}")
+
+ if __name__ == "__main__":
+     main()
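A standalone sketch of the numpy-based retrieval that get_top_k() performs (cosine similarity over the saved embedding matrix, in place of a FAISS index, as the file's opening comment notes). File names match the artifacts added in this commit; the query string is illustrative:

# Illustrative retrieval sketch; assumes embed_matrix.npy rows align with id_map.pkl entries.
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("intfloat/multilingual-e5-base")
matrix = np.load("embed_matrix.npy")                # (n_languages, dim)
with open("id_map.pkl", "rb") as f:
    id_map = pickle.load(f)                         # row index -> language id

vec = embedder.encode("query: Quechuan languages")  # E5 models expect a "query: " prefix
sims = matrix @ vec / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec) + 1e-10)
top = np.argsort(sims)[-3:][::-1]                   # indices of the 3 most similar rows
print([id_map[i] for i in top])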
requirements.txt ADDED
@@ -0,0 +1,97 @@
+ absl-py==2.2.2
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.11.16
+ aiosignal==1.3.2
+ altair==5.5.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ attrs==25.3.0
+ blinker==1.9.0
+ cachetools==5.5.2
+ certifi==2025.1.31
+ charset-normalizer==3.4.1
+ click==8.1.8
+ colorama==0.4.6
+ contourpy==1.3.1
+ cycler==0.12.1
+ et_xmlfile==2.0.0
+ faiss-cpu==1.10.0
+ fastapi==0.115.12
+ filelock==3.18.0
+ fonttools==4.56.0
+ frozenlist==1.5.0
+ fsspec==2025.3.2
+ gensim==4.3.3
+ gitdb==4.0.12
+ GitPython==3.1.44
+ huggingface-hub==0.30.1
+ idna==3.10
+ Jinja2==3.1.6
+ joblib==1.4.2
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ kiwisolver==1.4.8
+ Levenshtein==0.27.1
+ MarkupSafe==3.0.2
+ matplotlib==3.10.1
+ mpmath==1.3.0
+ multidict==6.3.2
+ narwhals==1.33.0
+ networkx==3.4.2
+ nltk==3.9.1
+ node2vec==0.5.0
+ numpy==1.26.4
+ openpyxl==3.1.5
+ ordpy==1.1.5
+ packaging==24.2
+ pandas==2.2.3
+ pillow==11.1.0
+ propcache==0.3.1
+ protobuf==5.29.4
+ psutil==7.0.0
+ pyarrow==19.0.1
+ pydantic==2.11.3
+ pydantic_core==2.33.1
+ pydeck==0.9.1
+ pyparsing==3.2.3
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.0
+ python-math==0.0.1
+ pytz==2025.2
+ PyYAML==6.0.2
+ RapidFuzz==3.12.2
+ rdflib==7.1.4
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.3
+ rouge_score==0.1.2
+ rpds-py==0.24.0
+ safetensors==0.5.3
+ scikit-learn==1.6.1
+ scipy==1.13.1
+ sentence-transformers==4.0.1
+ setuptools==75.8.0
+ six==1.17.0
+ smart-open==7.1.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ starlette==0.46.1
+ streamlit==1.44.1
+ sympy==1.13.1
+ tenacity==9.1.2
+ threadpoolctl==3.6.0
+ tokenizers==0.21.1
+ toml==0.10.2
+ torch==2.6.0
+ torch-geometric==2.6.1
+ tornado==6.4.2
+ tqdm==4.67.1
+ transformers==4.50.3
+ typing-inspection==0.4.0
+ typing_extensions==4.13.0
+ tzdata==2025.2
+ urllib3==2.3.0
+ watchdog==6.0.0
+ wheel==0.45.1
+ wrapt==1.17.2
+ yarl==1.19.0