import streamlit as st
from transformers import pipeline, AutoTokenizer


def initialize_model():
    """Initialize a small and fast model for CPU."""
    # Using a tiny model optimized for CPU (125M parameters)
    model_id = "facebook/opt-125m"

    try:
        # Initialize the pipeline directly - more efficient than loading the model separately
        pipe = pipeline(
            "text-generation",
            model=model_id,
            device_map="cpu",
            model_kwargs={"low_cpu_mem_usage": True},
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        return pipe, tokenizer
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise e


def generate_response(pipe, tokenizer, prompt, conversation_history):
    """Generate a model response for the given prompt and recent history."""
    try:
        # Format conversation context from the most recent turns
        context = ""
        for turn in conversation_history[-3:]:  # Only use last 3 turns for efficiency
            context += f"Human: {turn['user']}\nAssistant: {turn['assistant']}\n"

        # Create the full prompt
        full_prompt = f"{context}Human: {prompt}\nAssistant:"

        # Generate response with conservative parameters
        response = pipe(
            full_prompt,
            max_new_tokens=50,  # Limit response length
            temperature=0.7,
            top_p=0.9,
            num_return_sequences=1,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
            if tokenizer.pad_token_id is not None
            else tokenizer.eos_token_id,
        )[0]["generated_text"]

        # Extract only the assistant's response
        try:
            assistant_response = response.split("Assistant:")[-1].strip()
            if not assistant_response:
                return "I apologize, but I couldn't generate a proper response."
            return assistant_response
        except Exception:
            return response.split(prompt)[-1].strip()

    except Exception as e:
        return f"An error occurred: {str(e)}"


def main():
    st.set_page_config(page_title="LLM Chat Interface", page_icon="🤖")
    st.title("💬 Quick Chat Assistant")

    # Initialize session state
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "model_loaded" not in st.session_state:
        st.session_state.model_loaded = False

    # Initialize model (only once)
    if not st.session_state.model_loaded:
        with st.spinner("Loading the model... (this should take just a few seconds)"):
            try:
                pipe, tokenizer = initialize_model()
                st.session_state.pipe = pipe
                st.session_state.tokenizer = tokenizer
                st.session_state.model_loaded = True
            except Exception as e:
                st.error(f"Error loading model: {str(e)}")
                return

    # Display chat messages
    for message in st.session_state.chat_history:
        with st.chat_message("user"):
            st.write(message["user"])
        with st.chat_message("assistant"):
            st.write(message["assistant"])

    # Chat input
    if prompt := st.chat_input("Ask me anything!"):
        # Display user message
        with st.chat_message("user"):
            st.write(prompt)

        # Generate and display assistant response
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                # Generate from the existing history, then append the completed turn
                # (appending the new turn first would duplicate the current prompt in the context)
                response = generate_response(
                    st.session_state.pipe,
                    st.session_state.tokenizer,
                    prompt,
                    st.session_state.chat_history,
                )
                st.write(response)
                st.session_state.chat_history.append(
                    {"user": prompt, "assistant": response}
                )

        # Keep only last 5 turns
        if len(st.session_state.chat_history) > 5:
            st.session_state.chat_history = st.session_state.chat_history[-5:]

    # Sidebar
    with st.sidebar:
        if st.button("Clear Chat"):
            st.session_state.chat_history = []
            st.rerun()

        st.markdown("---")
        st.markdown(
            """
            ### Chat Info
            - Using OPT-125M model
            - Optimized for quick responses
            - Best for short conversations
            """
        )


if __name__ == "__main__":
    main()