import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import os

def initialize_model():
    """Initialize a small and fast model for CPU."""
    # Use a tiny model optimized for CPU inference
    model_id = "facebook/opt-125m"  # small model (125M parameters)
    # model_id = "GEB-AGI/geb-1.3b"  # larger alternative (1.3B parameters), slower on CPU
    try:
        # Initialize the pipeline directly - more efficient than loading the model separately
        pipe = pipeline(
            "text-generation",
            model=model_id,
            device_map="cpu",
            model_kwargs={"low_cpu_mem_usage": True}
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        return pipe, tokenizer
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise

def generate_response(pipe, tokenizer, prompt, conversation_history):
    """Generate a model response for the given prompt."""
    try:
        # Format the conversation context from previous turns
        context = ""
        for turn in conversation_history[-3:]:  # only use the last 3 turns for efficiency
            context += f"Human: {turn['user']}\nAssistant: {turn['assistant']}\n"

        # Create the full prompt
        full_prompt = f"{context}Human: {prompt}\nAssistant:"

        # Generate a response with conservative parameters
        response = pipe(
            full_prompt,
            max_new_tokens=50,  # limit response length
            temperature=0.7,
            top_p=0.9,
            num_return_sequences=1,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
        )[0]['generated_text']

        # Extract only the assistant's reply after the final "Assistant:" marker
        try:
            assistant_response = response.split("Assistant:")[-1].strip()
            if not assistant_response:
                return "I apologize, but I couldn't generate a proper response."
            return assistant_response
        except Exception:
            return response.split(prompt)[-1].strip()
    except Exception as e:
        return f"An error occurred: {str(e)}"

def main():
    st.set_page_config(page_title="LLM Chat Interface", page_icon="🤖")
    st.title("💬 Quick Chat Assistant")
    # Initialize session state
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "model_loaded" not in st.session_state:
        st.session_state.model_loaded = False

    # Initialize the model (only once per session)
    if not st.session_state.model_loaded:
        with st.spinner("Loading the model... (this should take just a few seconds)"):
            try:
                pipe, tokenizer = initialize_model()
                st.session_state.pipe = pipe
                st.session_state.tokenizer = tokenizer
                st.session_state.model_loaded = True
            except Exception as e:
                st.error(f"Error loading model: {str(e)}")
                return

    # Display previous chat messages
    for message in st.session_state.chat_history:
        with st.chat_message("user"):
            st.write(message["user"])
        with st.chat_message("assistant"):
            st.write(message["assistant"])

    # Chat input
    if prompt := st.chat_input("Ask me anything!"):
        # Display the user message
        with st.chat_message("user"):
            st.write(prompt)

        # Generate and display the assistant response
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                # Generate from the existing history first, then append the completed
                # turn, so the current prompt is not duplicated in the context
                response = generate_response(
                    st.session_state.pipe,
                    st.session_state.tokenizer,
                    prompt,
                    st.session_state.chat_history
                )
                st.write(response)
                st.session_state.chat_history.append({"user": prompt, "assistant": response})

        # Keep only the last 5 turns
        if len(st.session_state.chat_history) > 5:
            st.session_state.chat_history = st.session_state.chat_history[-5:]
    # Sidebar
    with st.sidebar:
        if st.button("Clear Chat"):
            st.session_state.chat_history = []
            st.rerun()
        st.markdown("---")
        st.markdown("""
        ### Chat Info
        - Using OPT-125M model
        - Optimized for quick responses
        - Best for short conversations
        """)


if __name__ == "__main__":
    main()
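
# Usage note (assumes this file is saved as app.py and that the streamlit,
# transformers, and torch packages are installed):
#   streamlit run app.py
# On a Hugging Face Space using the Streamlit SDK, the app starts automatically
# once app.py and a requirements.txt listing those packages are present.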