javiervzpucp committed · verified
Commit 8c421fb · 1 Parent(s): 60244ac
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ .streamlit/secrets.toml
README.md CHANGED
@@ -1,11 +1 @@
- ---
- title: RAG SA
- emoji: 👀
- colorFrom: indigo
- colorTo: indigo
- sdk: static
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # RAG-glottolog
app.py ADDED
@@ -0,0 +1,31 @@
+ # -*- coding: utf-8 -*-
+ """
+ Created on Wed Apr 9 10:56:02 2025
+
+ @author: jveraz
+ """
+
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ app = FastAPI()
+
+ MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     torch_dtype=torch.float16,
+     device_map="auto"
+ )
+
+ class QueryInput(BaseModel):
+     inputs: str
+
+ @app.post("/")
+ async def generate(query: QueryInput):
+     input_ids = tokenizer(query.inputs, return_tensors="pt").input_ids.to(model.device)
+     output_ids = model.generate(input_ids, max_new_tokens=200)
+     generated = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+     return {"generated_text": generated}
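A minimal client sketch for the endpoint above, assuming the FastAPI app is served locally (e.g. via `uvicorn app:app --port 8000`); the host, port, and prompt text are illustrative, not part of the commit:

# Hypothetical client for the generate endpoint defined in app.py above.
# Host and port are assumptions; adjust to wherever the service is deployed.
import requests

payload = {"inputs": "[INST] Name three South American language families. [/INST]"}
resp = requests.post("http://localhost:8000/", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["generated_text"])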
embed_matrix.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:447c4325d930de44070f9def3ad15bf54a93d39956587fc20525cfc5060f4a50
+ size 1695872
embed_matrix_hybrid.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f1fbd97a4a97523185aab51c4fad790aff36c8acd6710c5607f4acea9be0b96b
+ size 3382400
embed_matrix_hybrid_graphsage.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4711ff26a44c4938f94a79169dd56b7adda7668e1b6dd89f718059092ca5c50
+ size 3382400
grafo_embed.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:420fe64e2680f0d3c4f7f90b891f21b56cce89784ea104b3a2878e8b845ce451
+ size 4470194
grafo_embed_hybrid.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:776ba033f208a5266c6a5b076189d96a9651993cdcd2297f5b55ed33a98dcf6d
+ size 8320820
grafo_embed_hybrid_graphsage.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3993c376704872f90798a447434da0487db268d904ca4517e26c519b07bf1ac1
+ size 7958567
grafo_ttl_hibrido.ttl ADDED
The diff for this file is too large to render. See raw diff
 
grafo_ttl_hibrido_graphsage.ttl ADDED
The diff for this file is too large to render. See raw diff
 
grafo_ttl_no_hibrido.ttl ADDED
The diff for this file is too large to render. See raw diff
 
id_map.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4e5e2ca4af562078ca2cc94a56fc409ac1fd0312f514a2835117568bc89b034
+ size 6088
id_map_hybrid.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f011f1929fabc31724e935b9eac34ae691b530367d1b831bb2d92e78b555280a
+ size 12189
id_map_hybrid_graphsage.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ebb6de3e35fea86da236cd9b12a1b6d6a1d4867941c25955b401c3bf857ed5f6
+ size 12189
rag_hf.py ADDED
@@ -0,0 +1,253 @@
+ # rag_interface.py (with numpy instead of faiss)
+ import streamlit as st
+ import pickle
+ import numpy as np
+ import rdflib
+ import torch
+ import datetime
+ import os
+ import requests
+ from rdflib import Graph as RDFGraph, Namespace
+ from sentence_transformers import SentenceTransformer
+ from dotenv import load_dotenv
+
+ # === CONFIGURATION ===
+ load_dotenv()
+
+ MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
+ EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ EX = Namespace("http://example.org/lang/")
+
+ st.set_page_config(
+     page_title="Vanishing Voices: Language Atlas",
+     page_icon="🌍",
+     layout="wide",
+     initial_sidebar_state="expanded"
+ )
+
+ # Custom CSS
+ st.markdown("""
+ <style>
+ .header {
+     color: #2c3e50;
+     border-bottom: 2px solid #3498db;
+     padding-bottom: 10px;
+     margin-bottom: 1.5rem;
+ }
+ .info-box {
+     background-color: #e8f4fc;
+     border-radius: 8px;
+     padding: 1rem;
+     margin-bottom: 1.5rem;
+     border-left: 4px solid #3498db;
+ }
+ .sidebar-section {
+     margin-bottom: 2rem;
+ }
+ .sidebar-title {
+     color: #2c3e50;
+     font-size: 1.1rem;
+     font-weight: 600;
+     margin-bottom: 0.5rem;
+     border-bottom: 1px solid #eee;
+     padding-bottom: 0.5rem;
+ }
+ .method-card {
+     background-color: #f8f9fa;
+     border-radius: 8px;
+     padding: 0.8rem;
+     margin-bottom: 0.8rem;
+     border-left: 3px solid #3498db;
+ }
+ .method-title {
+     font-weight: 600;
+     color: #3498db;
+     margin-bottom: 0.3rem;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+ @st.cache_resource(show_spinner="Loading models and indexes...")
+ def load_all_components():
+     embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
+     methods = {}
+     for label, suffix, ttl, matrix_path in [
+         ("Standard", "", "grafo_ttl_no_hibrido.ttl", "embed_matrix.npy"),
+         ("Hybrid", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
+         ("GraphSAGE", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
+     ]:
+         with open(f"id_map{suffix}.pkl", "rb") as f:
+             id_map = pickle.load(f)
+         with open(f"grafo_embed{suffix}.pickle", "rb") as f:
+             G = pickle.load(f)
+         matrix = np.load(matrix_path)
+         rdf = RDFGraph()
+         rdf.parse(ttl, format="ttl")
+         methods[label] = (matrix, id_map, G, rdf)
+     return methods, embedder
+
+ methods, embedder = load_all_components()
+
+ # === CORE FUNCTIONS ===
+ def get_top_k(matrix, id_map, query, k):
+     vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
+     vec = vec.cpu().numpy().astype("float32")
+     sims = np.dot(matrix, vec) / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec) + 1e-10)
+     top_k_idx = np.argsort(sims)[-k:][::-1]
+     return [id_map[i] for i in top_k_idx]
+
+ def get_context(G, lang_id):
+     node = G.nodes.get(lang_id, {})
+     lines = [f"**Language:** {node.get('label', lang_id)}"]
+     if node.get("wikipedia_summary"):
+         lines.append(f"**Wikipedia:** {node['wikipedia_summary']}")
+     if node.get("wikidata_description"):
+         lines.append(f"**Wikidata:** {node['wikidata_description']}")
+     if node.get("wikidata_countries"):
+         lines.append(f"**Countries:** {node['wikidata_countries']}")
+     return "\n\n".join(lines)
+
+ def query_rdf(rdf, lang_id):
+     q = f"""
+     PREFIX ex: <http://example.org/lang/>
+     SELECT ?property ?value WHERE {{ ex:{lang_id} ?property ?value }}
+     """
+     try:
+         return [
+             (str(row[0]).split("/")[-1], str(row[1]))
+             for row in rdf.query(q)
+         ]
+     except Exception as e:
+         return [("error", str(e))]
+
+ def generate_response(matrix, id_map, G, rdf, user_question, k=3):
+     ids = get_top_k(matrix, id_map, user_question, k)
+     context = [get_context(G, i) for i in ids]
+     rdf_facts = []
+     for i in ids:
+         rdf_facts.extend([f"{p}: {v}" for p, v in query_rdf(rdf, i)])
+     prompt = f"""<s>[INST]
+ You are an expert in South American indigenous languages.
+ Use strictly and only the information below to answer the user question in **English**.
+ - Do not infer or assume facts that are not explicitly stated.
+ - If the answer is unknown or insufficient, say "I cannot answer with the available data."
+ - Limit your answer to 100 words.
+
+
+ ### CONTEXT:
+ {chr(10).join(context)}
+
+ ### RDF RELATIONS:
+ {chr(10).join(rdf_facts)}
+
+ ### QUESTION:
+ {user_question}
+
+ Answer:
+ [/INST]"""
+     try:
+         res = requests.post(
+             f"https://api-inference.huggingface.co/models/{MODEL_ID}",
+             headers={"Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}", "Content-Type": "application/json"},
+             json={"inputs": prompt}, timeout=30
+         )
+         out = res.json()
+         if isinstance(out, list) and "generated_text" in out[0]:
+             return out[0]["generated_text"].replace(prompt.strip(), "").strip(), ids, context, rdf_facts
+         return str(out), ids, context, rdf_facts
+     except Exception as e:
+         return str(e), ids, context, rdf_facts
+
+ # === MAIN FUNCTION ===
+ def main():
+     st.markdown("""
+     <h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
+     <div class='info-box'>
+     <b>Linguistic Emergency:</b> Over 40% of South America's indigenous languages face extinction.
+     This tool documents these cultural treasures before they disappear forever.
+     </div>
+     """, unsafe_allow_html=True)
+
+     with st.sidebar:
+         st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
+
+         with st.container():
+             st.markdown('<div class="sidebar-title">About This Tool</div>', unsafe_allow_html=True)
+             st.markdown("""
+             <div class="method-card">
+                 <div class="method-title">Standard Search</div>
+                 Semantic retrieval based on text-only embeddings. Identifies languages using purely linguistic similarity from Wikipedia summaries and labels.
+             </div>
+             <div class="method-card">
+                 <div class="method-title">Hybrid Search</div>
+                 Combines semantic embeddings with structured data from knowledge graphs. Enriches language representation with contextual facts.
+             </div>
+             <div class="method-card">
+                 <div class="method-title">GraphSAGE Search</div>
+                 Leverages deep graph neural networks to learn relational patterns across languages. Captures complex cultural and genealogical connections.
+             </div>
+             """, unsafe_allow_html=True)
+
+         with st.container():
+             st.markdown('<div class="sidebar-title">Research Settings</div>', unsafe_allow_html=True)
+             k = st.slider("Languages to analyze per query", 1, 10, 3)
+             st.markdown("**Display Options:**")
+             show_ids = st.checkbox("Language IDs", value=True, key="show_ids")
+             show_ctx = st.checkbox("Cultural Context", value=True, key="show_ctx")
+             show_rdf = st.checkbox("RDF Relations", value=True, key="show_rdf")
+
+         with st.container():
+             st.markdown('<div class="sidebar-title">Data Sources</div>', unsafe_allow_html=True)
+             st.markdown("""
+             - Glottolog
+             - Wikidata
+             - Wikipedia
+             - Ethnologue
+             """)
+
+     query = st.text_input("Ask about indigenous languages:", "Which Amazonian languages are most at risk?")
+
+     if st.button("Analyze with All Methods") and query:
+         col1, col2, col3 = st.columns(3)
+         results = {}
+         for col, (label, method) in zip([col1, col2, col3], methods.items()):
+             with col:
+                 st.subheader(f"{label} Analysis")
+                 start = datetime.datetime.now()
+                 response, lang_ids, context, rdf_data = generate_response(*method, query, k)
+                 duration = (datetime.datetime.now() - start).total_seconds()
+                 st.markdown(response)
+                 st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
+                 if show_ids:
+                     st.markdown("**Language Identifiers:**")
+                     st.code("\n".join(lang_ids))
+                 if show_ctx:
+                     st.markdown("**Cultural Context:**")
+                     st.markdown("\n\n---\n\n".join(context))
+                 if show_rdf:
+                     st.markdown("**RDF Knowledge:**")
+                     st.code("\n".join(rdf_data))
+                 results[label] = response
+
+         log = f"""
+ [{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]
+ QUERY: {query}
+ STANDARD:
+ {results.get('Standard', '')}
+
+ HYBRID:
+ {results.get('Hybrid', '')}
+
+ GRAPH-SAGE:
+ {results.get('GraphSAGE', '')}
+ {'='*60}
+ """
+         try:
+             with open("language_analysis_logs.txt", "a", encoding="utf-8") as f:
+                 f.write(log)
+         except Exception as e:
+             st.warning(f"Failed to log: {str(e)}")
+
+ if __name__ == "__main__":
+     main()
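A standalone sketch of the numpy-based retrieval that get_top_k() performs (cosine similarity over the saved embedding matrix, in place of a FAISS index, as the file's opening comment notes). File names match the artifacts added in this commit; the query string is illustrative:

# Illustrative retrieval sketch; assumes embed_matrix.npy rows align with id_map.pkl entries.
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("intfloat/multilingual-e5-base")
matrix = np.load("embed_matrix.npy")                # (n_languages, dim)
with open("id_map.pkl", "rb") as f:
    id_map = pickle.load(f)                         # row index -> language id

vec = embedder.encode("query: Quechuan languages")  # E5 models expect a "query: " prefix
sims = matrix @ vec / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec) + 1e-10)
top = np.argsort(sims)[-3:][::-1]                   # indices of the 3 most similar rows
print([id_map[i] for i in top])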
requirements.txt ADDED
@@ -0,0 +1,97 @@
+ absl-py==2.2.2
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.11.16
+ aiosignal==1.3.2
+ altair==5.5.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ attrs==25.3.0
+ blinker==1.9.0
+ cachetools==5.5.2
+ certifi==2025.1.31
+ charset-normalizer==3.4.1
+ click==8.1.8
+ colorama==0.4.6
+ contourpy==1.3.1
+ cycler==0.12.1
+ et_xmlfile==2.0.0
+ faiss-cpu==1.10.0
+ fastapi==0.115.12
+ filelock==3.18.0
+ fonttools==4.56.0
+ frozenlist==1.5.0
+ fsspec==2025.3.2
+ gensim==4.3.3
+ gitdb==4.0.12
+ GitPython==3.1.44
+ huggingface-hub==0.30.1
+ idna==3.10
+ Jinja2==3.1.6
+ joblib==1.4.2
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ kiwisolver==1.4.8
+ Levenshtein==0.27.1
+ MarkupSafe==3.0.2
+ matplotlib==3.10.1
+ mpmath==1.3.0
+ multidict==6.3.2
+ narwhals==1.33.0
+ networkx==3.4.2
+ nltk==3.9.1
+ node2vec==0.5.0
+ numpy==1.26.4
+ openpyxl==3.1.5
+ ordpy==1.1.5
+ packaging==24.2
+ pandas==2.2.3
+ pillow==11.1.0
+ propcache==0.3.1
+ protobuf==5.29.4
+ psutil==7.0.0
+ pyarrow==19.0.1
+ pydantic==2.11.3
+ pydantic_core==2.33.1
+ pydeck==0.9.1
+ pyparsing==3.2.3
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.1.0
+ python-math==0.0.1
+ pytz==2025.2
+ PyYAML==6.0.2
+ RapidFuzz==3.12.2
+ rdflib==7.1.4
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.3
+ rouge_score==0.1.2
+ rpds-py==0.24.0
+ safetensors==0.5.3
+ scikit-learn==1.6.1
+ scipy==1.13.1
+ sentence-transformers==4.0.1
+ setuptools==75.8.0
+ six==1.17.0
+ smart-open==7.1.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ starlette==0.46.1
+ streamlit==1.44.1
+ sympy==1.13.1
+ tenacity==9.1.2
+ threadpoolctl==3.6.0
+ tokenizers==0.21.1
+ toml==0.10.2
+ torch==2.6.0
+ torch-geometric==2.6.1
+ tornado==6.4.2
+ tqdm==4.67.1
+ transformers==4.50.3
+ typing-inspection==0.4.0
+ typing_extensions==4.13.0
+ tzdata==2025.2
+ urllib3==2.3.0
+ watchdog==6.0.0
+ wheel==0.45.1
+ wrapt==1.17.2
+ yarl==1.19.0