import streamlit as st
from transformers import pipeline


def initialize_model():
    """Initialize a small, CPU-friendly text-generation model."""
    # A tiny model (125M parameters) keeps load time and latency low on CPU.
    model_id = "facebook/opt-125m"
    # model_id = "GEB-AGI/geb-1.3b"  # larger alternative, noticeably slower on CPU
    try:
        # Build the pipeline directly - more efficient than loading the model
        # and tokenizer separately, since it bundles both.
        pipe = pipeline(
            "text-generation",
            model=model_id,
            device_map="cpu",
            model_kwargs={"low_cpu_mem_usage": True},
        )
        # Reuse the tokenizer the pipeline already loaded.
        return pipe, pipe.tokenizer
    except Exception as e:
        print(f"Error loading model: {e}")
        raise
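

# Note: in multi-user deployments, Streamlit's st.cache_resource can share a
# single loaded pipeline across sessions instead of the per-session guard used
# in main() below. A minimal sketch (not wired in; load_pipeline is our
# illustrative name, not part of this app):
#
# @st.cache_resource
# def load_pipeline(model_id: str):
#     return pipeline("text-generation", model=model_id, device_map="cpu")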


def generate_response(pipe, tokenizer, prompt, conversation_history):
    """Generate a model response for the current prompt."""
    try:
        # Format the conversation context from recent turns.
        context = ""
        for turn in conversation_history[-3:]:  # last 3 turns only, for efficiency
            context += f"Human: {turn['user']}\nAssistant: {turn['assistant']}\n"

        # Create the full prompt.
        full_prompt = f"{context}Human: {prompt}\nAssistant:"

        # Generate a response with conservative sampling parameters.
        response = pipe(
            full_prompt,
            max_new_tokens=50,  # limit response length
            temperature=0.7,
            top_p=0.9,
            num_return_sequences=1,
            do_sample=True,
            # Test for None explicitly: a truthiness check would misfire if
            # pad_token_id were 0.
            pad_token_id=(
                tokenizer.pad_token_id
                if tokenizer.pad_token_id is not None
                else tokenizer.eos_token_id
            ),
        )[0]["generated_text"]

        # Keep only the text after the final "Assistant:" marker.
        assistant_response = response.split("Assistant:")[-1].strip()
        if not assistant_response:
            return "I apologize, but I couldn't generate a proper response."
        return assistant_response
    except Exception as e:
        return f"An error occurred: {e}"


def main():
    st.set_page_config(page_title="LLM Chat Interface", page_icon="🤖")
    st.title("💬 Quick Chat Assistant")
    # Initialize session state
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "model_loaded" not in st.session_state:
        st.session_state.model_loaded = False

    # Initialize the model (only once per session)
    if not st.session_state.model_loaded:
        with st.spinner("Loading the model... (this should take just a few seconds)"):
            try:
                pipe, tokenizer = initialize_model()
                st.session_state.pipe = pipe
                st.session_state.tokenizer = tokenizer
                st.session_state.model_loaded = True
            except Exception as e:
                st.error(f"Error loading model: {e}")
                return

    # Display prior chat messages
    for message in st.session_state.chat_history:
        with st.chat_message("user"):
            st.write(message["user"])
        with st.chat_message("assistant"):
            st.write(message["assistant"])
    # Chat input
    if prompt := st.chat_input("Ask me anything!"):
        # Display the user message
        with st.chat_message("user"):
            st.write(prompt)

        # Generate and display the assistant response
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                # Pass the history *before* this turn so the current prompt is
                # not duplicated in the generation context, then record the
                # completed turn.
                response = generate_response(
                    st.session_state.pipe,
                    st.session_state.tokenizer,
                    prompt,
                    st.session_state.chat_history,
                )
                st.write(response)
                st.session_state.chat_history.append(
                    {"user": prompt, "assistant": response}
                )

        # Keep only the last 5 turns
        if len(st.session_state.chat_history) > 5:
            st.session_state.chat_history = st.session_state.chat_history[-5:]
    # Sidebar
    with st.sidebar:
        if st.button("Clear Chat"):
            st.session_state.chat_history = []
            st.rerun()
        st.markdown("---")
        # Markdown lines stay flush left: st.markdown treats 4+ leading spaces
        # as a code block.
        st.markdown("""
### Chat Info
- Using the OPT-125M model
- Optimized for quick responses
- Best for short conversations
""")


if __name__ == "__main__":
    main()
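
# To run locally:
#   streamlit run app.py
# Requires streamlit and transformers; passing device_map also needs accelerate.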