import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import os

def initialize_model():
    """Initialize a small and fast model for CPU."""
    # Use a tiny model optimized for CPU inference
    model_id = "facebook/opt-125m"  # small model (125M parameters)
    # model_id = "GEB-AGI/geb-1.3b"  # larger alternative (1.3B parameters), slower on CPU
    try:
        # Initialize the pipeline directly - more efficient than loading the model separately
        pipe = pipeline(
            "text-generation",
            model=model_id,
            device_map="cpu",
            model_kwargs={"low_cpu_mem_usage": True}
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        return pipe, tokenizer
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise

def generate_response(pipe, tokenizer, prompt, conversation_history):
    """Generate a model response for the given prompt."""
    try:
        # Format the conversation context from previous turns
        context = ""
        for turn in conversation_history[-3:]:  # only use the last 3 turns for efficiency
            context += f"Human: {turn['user']}\nAssistant: {turn['assistant']}\n"

        # Create the full prompt
        full_prompt = f"{context}Human: {prompt}\nAssistant:"

        # Generate a response with conservative parameters
        response = pipe(
            full_prompt,
            max_new_tokens=50,  # limit response length
            temperature=0.7,
            top_p=0.9,
            num_return_sequences=1,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
        )[0]['generated_text']

        # Extract only the assistant's reply after the final "Assistant:" marker
        try:
            assistant_response = response.split("Assistant:")[-1].strip()
            if not assistant_response:
                return "I apologize, but I couldn't generate a proper response."
            return assistant_response
        except Exception:
            return response.split(prompt)[-1].strip()
    except Exception as e:
        return f"An error occurred: {str(e)}"

def main():
    st.set_page_config(page_title="LLM Chat Interface", page_icon="🤖")
    st.title("💬 Quick Chat Assistant")
    # Initialize session state
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "model_loaded" not in st.session_state:
        st.session_state.model_loaded = False

    # Initialize the model (only once per session)
    if not st.session_state.model_loaded:
        with st.spinner("Loading the model... (this should take just a few seconds)"):
            try:
                pipe, tokenizer = initialize_model()
                st.session_state.pipe = pipe
                st.session_state.tokenizer = tokenizer
                st.session_state.model_loaded = True
            except Exception as e:
                st.error(f"Error loading model: {str(e)}")
                return

    # Display previous chat messages
    for message in st.session_state.chat_history:
        with st.chat_message("user"):
            st.write(message["user"])
        with st.chat_message("assistant"):
            st.write(message["assistant"])

    # Chat input
    if prompt := st.chat_input("Ask me anything!"):
        # Display the user message
        with st.chat_message("user"):
            st.write(prompt)

        # Generate and display the assistant response
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                # Generate from the existing history first, then append the completed
                # turn, so the current prompt is not duplicated in the context
                response = generate_response(
                    st.session_state.pipe,
                    st.session_state.tokenizer,
                    prompt,
                    st.session_state.chat_history
                )
                st.write(response)
                st.session_state.chat_history.append({"user": prompt, "assistant": response})

        # Keep only the last 5 turns
        if len(st.session_state.chat_history) > 5:
            st.session_state.chat_history = st.session_state.chat_history[-5:]
    # Sidebar
    with st.sidebar:
        if st.button("Clear Chat"):
            st.session_state.chat_history = []
            st.rerun()
        st.markdown("---")
        st.markdown("""
        ### Chat Info
        - Using OPT-125M model
        - Optimized for quick responses
        - Best for short conversations
        """)


if __name__ == "__main__":
    main()
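
# Usage note (assumes this file is saved as app.py and that the streamlit,
# transformers, and torch packages are installed):
#   streamlit run app.py
# On a Hugging Face Space using the Streamlit SDK, the app starts automatically
# once app.py and a requirements.txt listing those packages are present.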