Spaces:
Running
Running
Trying StarCoder and organizing documentation (#1)
Browse files- Trying StarCoder and organizing documentation (d32640f4879b52935532d37cd5416587c1bdee6f)
Co-authored-by: Xilena Atenea Rojas Salazar <[email protected]>
- app.py +150 -99
- documentacion.md +9 -0
- prompt_docs.md +23 -0
- requirements.txt +14 -5
app.py
CHANGED
@@ -1,99 +1,150 @@
|
|
1 |
-
#
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
import
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
#
|
40 |
-
#
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
return pd.DataFrame()
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
#
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ---------------------------------------------------------------------------------
|
2 |
+
# Aplicación principal para cargar el modelo, generar prompts y explicar los datos
|
3 |
+
# ---------------------------------------------------------------------------------
|
4 |
+
|
5 |
+
import streamlit as st # type: ignore
|
6 |
+
import os
|
7 |
+
import re
|
8 |
+
import pandas as pd # type: ignore
|
9 |
+
from dotenv import load_dotenv # type: ignore # Para cambios locales
|
10 |
+
from supabase import create_client, Client # type: ignore
|
11 |
+
from transformers import pipeline
|
12 |
+
|
13 |
+
from pandasai import SmartDataframe # type: ignore
|
14 |
+
from pandasai.llm.starcoder import Starcoder # type: ignore
|
15 |
+
|
16 |
+
# ---------------------------------------------------------------------------------
|
17 |
+
# Funciones auxiliares
|
18 |
+
# ---------------------------------------------------------------------------------
|
19 |
+
|
20 |
+
# Función para extracción de código Python del output del modelo
|
21 |
+
def extract_code(llm_output):
    """Return the contents of the first ```python fenced block in the text.

    Args:
        llm_output: Raw text produced by the model.

    Returns:
        The code between the opening ```python fence and its closing fence
        (without the fence lines), or ``None`` when no such block is found
        or when ``llm_output`` is not a string.
    """
    # A non-string (e.g. None from a failed generation) has no code to extract;
    # re.search would raise TypeError on it.
    if not isinstance(llm_output, str):
        return None

    # Tolerate trailing spaces/tabs after the opening fence, indentation
    # before the closing fence, and CRLF line endings around the code body.
    code_match = re.search(
        r"```python[ \t]*\r?\n(.*?)\r?\n[ \t]*```",
        llm_output,
        re.DOTALL,
    )
    return code_match.group(1) if code_match else None
|
26 |
+
|
27 |
+
# Función para generar prompts de gráficos comparativos
|
28 |
+
# Ejemplo de prompt generado:
|
29 |
+
# generate_graph_prompt("Germany", "France", "fertility rate", 2020, 2030)
|
30 |
+
def generate_graph_prompt(country1, country2, metric, start_year, end_year):
    """Build the LLM prompt requesting a comparative line graph.

    Args:
        country1: First country to plot.
        country2: Second country to plot.
        metric: Name of the metric whose trend is plotted.
        start_year: First year of the requested range.
        end_year: Last year of the requested range.

    Returns:
        A prompt string that starts and ends with a newline and contains three
        sentences: the data context, the matplotlib request, and the request
        for an end-user explanation.
    """
    # NOTE(review): any per-line indentation carried by the original
    # triple-quoted literal is not visible in this view; lines are emitted
    # flush-left as shown.
    context_line = (
        f"You have access to a database of European countries with data on {metric}, "
        "labor force participation, population, and their predictions for future years."
    )
    task_line = (
        "Generate Python code using matplotlib to create a line graph showing "
        f"the trend of {metric} for {country1} and {country2} "
        f"from {start_year} to {end_year}."
    )
    explanation_line = (
        "Also, provide a concise explanation of what this graph represents "
        "for an end user who might not be familiar with the data."
    )
    return "\n" + "\n".join((context_line, task_line, explanation_line)) + "\n"
|
37 |
+
|
38 |
+
# ---------------------------------------------------------------------------------
|
39 |
+
# Configuración de conexión a Supabase
|
40 |
+
# ---------------------------------------------------------------------------------
|
41 |
+
|
42 |
+
# Load environment variables from a .env file (used for local development).
load_dotenv()

# Supabase credentials (kept under "Secrets" in Streamlit).
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")

# Create the Supabase client only when both credentials are present.
# Otherwise leave it as None so load_data() can report
# "Supabase client not initialized" through its existing `if supabase:` check.
# NOTE(review): create_client is expected to reject missing credentials -
# confirm against the installed supabase-py version.
supabase: "Client | None" = (
    create_client(SUPABASE_URL, SUPABASE_KEY)
    if SUPABASE_URL and SUPABASE_KEY
    else None
)
|
51 |
+
|
52 |
+
# Función para cargar datos de una tabla de Supabase
|
53 |
+
# Tablas posibles: fertility, geo data, labor, population, predictions
|
54 |
+
def load_data(table):
    """Fetch every row of a Supabase table and return it as a DataFrame.

    Args:
        table: Name of the Supabase table to query.

    Returns:
        A ``pandas.DataFrame`` built from ``response.data`` on success. An
        empty DataFrame is returned on every failure path (client not
        initialized, response without ``data``, or an exception raised while
        querying), so callers always receive a DataFrame.
    """
    try:
        if not supabase:
            st.error("Supabase client not initialized. Check environment variables.")
            return pd.DataFrame()

        response = supabase.from_(table).select("*").execute()
        print(f"Response object: {response}")  # Inspect the whole object
        print(f"Response type: {type(response)}")  # Check the object type

        # Look for the attributes that carry either the data or an error.
        if hasattr(response, 'data'):
            print(f"Response data: {response.data}")
            return pd.DataFrame(response.data)

        if hasattr(response, 'status_code'):
            print(f"Response status code: {response.status_code}")
            # Return explicitly: falling through here would return None and
            # break callers that use the result as a DataFrame.
            st.error(f"Error fetching data: status code {response.status_code}")
            return pd.DataFrame()

        if hasattr(response, '_error'):  # Older client versions
            print(f"Older error attribute: {response._error}")
            st.error(f"Error fetching data: {response._error}")
            return pd.DataFrame()

        st.info("Response object does not have 'data' or known error attributes. Check the logs.")
        return pd.DataFrame()
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing.
        st.error(f"An error occurred during data loading: {e}")
        return pd.DataFrame()
|
81 |
+
|
82 |
+
# ---------------------------------------------------------------------------------
|
83 |
+
# Cargar datos iniciales
|
84 |
+
# ---------------------------------------------------------------------------------
|
85 |
+
|
86 |
+
# Load the initial dataset from the "labor" table; load_data() returns a
# pandas DataFrame built from the query response.
data = load_data("labor")
|
88 |
+
|
89 |
+
# TODO: La idea es luego usar todas las tablas, cuando ya funcione.
|
90 |
+
# Se puede si el modelo funciona con las gráficas, sino que toca mejorarlo
|
91 |
+
# porque serían consultas más complejas.
|
92 |
+
# labor_data = load_data("labor")
|
93 |
+
# fertility_data = load_data("fertility")
|
94 |
+
# population_data = load_data("population")
|
95 |
+
# predictions_data = load_data("predictions")
|
96 |
+
|
97 |
+
# ---------------------------------------------------------------------------------
|
98 |
+
# Inicializar modelo LLM
|
99 |
+
# ---------------------------------------------------------------------------------
|
100 |
+
|
101 |
+
# # Pendiente cambiar Keys dependiendo del modelo que escojamos
|
102 |
+
# model_name = "google/flan-t5-small" # Probando modelos
|
103 |
+
# generator = pipeline("text-generation", model=model_name)
|
104 |
+
|
105 |
+
# ---------------------------------------------------------------------------------
|
106 |
+
# Inicializar PandasAI con StarCoder
|
107 |
+
# ---------------------------------------------------------------------------------
|
108 |
+
|
109 |
+
# Set up the StarCoder model from Hugging Face; the API token is read from
# the HUGGINGFACE_TOKEN environment variable.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
llm = Starcoder(api_token=huggingface_token)

# Wrap the loaded data in a PandasAI SmartDataframe configured with that LLM.
sdf = SmartDataframe(data, config={"llm": llm})  # PandasAI-ready DataFrame.
|
114 |
+
|
115 |
+
# ---------------------------------------------------------------------------------
|
116 |
+
# Configuración de la app en Streamlit
|
117 |
+
# ---------------------------------------------------------------------------------
|
118 |
+
|
119 |
+
# App title
st.title("_Europe GraphGen_ :blue[Graph generator] :flag-eu:")

# Text input where the user describes the desired graph, plus the button
# that triggers generation.
user_input = st.text_input("What graphics do you have in mind")
generate_button = st.button("Generate")
|
125 |
+
|
126 |
+
# Button event handling: run only when "Generate" was pressed and the user
# typed a request. (A single guard; the condition used to be checked twice.)
if generate_button and user_input:
    # Disabled connection diagnostics, kept for debugging:
    # if data.empty and supabase is not None:
    #     st.warning("Successfully connected to Supabase, but no data was loaded (either the table is empty or there was a query issue). Check the error message above if any.")
    # elif not data.empty:
    #     st.success("Successfully connected to Supabase and loaded data!")
    #     st.dataframe(data.head())  # Show a small sample of the DataFrame
    # elif supabase is None:
    #     st.error("Failed to initialize Supabase client. Check environment variables in Settings.")
    # else:
    #     st.info("Attempted to load data. Check for any error messages above.")

    # Show the first rows of the loaded data.
    st.dataframe(data.head())

    # Process the user's input with PandasAI.
    with st.spinner('Generating answer...'):
        try:
            answer = sdf.chat(user_input)
            st.write(answer)
        except Exception as e:
            # UI boundary: report the failure in the page instead of crashing.
            st.error(f"Error generating answer: {e}")
|
148 |
+
|
149 |
+
|
150 |
+
# TODO: Output estructurado si vemos que es necesario.
|
documentacion.md
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
1. Se logró establecer la conexión entre Supabase y Hugging Face. Al principio la consulta retornaba una lista vacía porque, por seguridad, Supabase tiene RLS ("row-level security") activado por defecto; al deshabilitar esta opción en el proyecto de Supabase donde estaba el problema, se logró solucionar y tener acceso.
|
2 |
+
2. Se descartó la opción de desactivar RLS por motivos de seguridad en la tabla, y en su lugar se crearon políticas de RLS para permitir la lectura a anon, que es el rol utilizado por Supabase para el acceso público autenticado a través de la clave pública (anon key). Así podremos acceder a través de la API pública y leer los datos permitidos sin comprometer la seguridad del resto del sistema.
|
3 |
+
3. ¿Qué modelos pensamos en usar?
|
4 |
+
- google/flan-t5-small : ....@Camilo ¿por qué?
|
5 |
+
- StarCoder: Recomendado en https://huggingface.co/tasks/text-generation. Paper para detalles: https://arxiv.org/pdf/2305.06161 (parte relevante: "Improving Code Generation with Prompting")
|
6 |
+
4. Librerías que tuvimos en cuenta (usamos y no usamos):
|
7 |
+
- https://github.com/DashyDashOrg/pandas-llm: No usamos porque actualmente está diseñado principalmente para funcionar con modelos de OpenAI como GPT-3.5 y GPT-4.
|
8 |
+
No es una herramienta multi-LLM como PandasAI (que sí permite trabajar con varios proveedores). pandas-llm en su implementación base espera una API compatible con OpenAI.
|
9 |
+
- PandasAI: https://pypi.org/project/pandasai/, https://pandasai-docs.readthedocs.io/en/latest/API/llms/
|
prompt_docs.md
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
TO DO: Escribir propósito de esta documentación para contexto.
|
2 |
+
---
|
3 |
+
### First try
|
4 |
+
|
5 |
+
PROMPT:
|
6 |
+
```
|
7 |
+
```
|
8 |
+
|
9 |
+
RESULTS:
|
10 |
+
|
11 |
+
---
|
12 |
+
|
13 |
+
### Second try
|
14 |
+
|
15 |
+
PROMPT:
|
16 |
+
```
|
17 |
+
```
|
18 |
+
|
19 |
+
RESULTS:
|
20 |
+
|
21 |
+
---
|
22 |
+
|
23 |
+
|
requirements.txt
CHANGED
@@ -1,5 +1,14 @@
|
|
1 |
-
supabase
|
2 |
-
transformers
|
3 |
-
matplotlib
|
4 |
-
pandas
|
5 |
-
torch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# supabase
|
2 |
+
# transformers
|
3 |
+
# matplotlib
|
4 |
+
# pandas
|
5 |
+
# torch
|
6 |
+
# pandasai
|
7 |
+
|
8 |
+
streamlit>=1.30.0
|
9 |
+
pandas>=2.2.0
|
10 |
+
numpy>=1.25.0
|
11 |
+
python-dotenv>=1.0.0
|
12 |
+
transformers>=4.37.2
|
13 |
+
pandasai>=2.0.0
|
14 |
+
supabase>=2.0.0
|