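"""Gradio demo app for naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B.

Loads the model and tokenizer on CPU, runs a short warm-up generation, and
serves a Korean-language chat UI through gr.ChatInterface (intended for a
Hugging Face Space on the free CPU tier).
"""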
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os
import datetime
import time

# --- Configuration ---
MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
MAX_NEW_TOKENS = 512
CPU_THREAD_COUNT = 4  # Adjust if needed

# --- Optional: Set CPU Threads ---
# torch.set_num_threads(CPU_THREAD_COUNT)
# os.environ["OMP_NUM_THREADS"] = str(CPU_THREAD_COUNT)
# os.environ["MKL_NUM_THREADS"] = str(CPU_THREAD_COUNT)
print("--- Environment Setup ---")
print(f"PyTorch version: {torch.__version__}")
print(f"Running on device: cpu")
print(f"Torch Threads: {torch.get_num_threads()}")
# --- Model and Tokenizer Loading ---
print(f"--- Loading Model: {MODEL_ID} ---")
print("This might take a few minutes, especially on the first launch...")
model = None
tokenizer = None
load_successful = False
stop_token_ids_list = [] # Initialize stop_token_ids_list

try:
    start_load_time = time.time()
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        device_map="cpu",
        # force_download=True  # Keep commented unless cache issues reappear
    )
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        # force_download=True  # Keep commented
    )
    model.eval()
    load_time = time.time() - start_load_time
    print(f"--- Model and Tokenizer Loaded Successfully on CPU in {load_time:.2f} seconds ---")
    load_successful = True

    # --- Stop Token Configuration ---
    stop_token_strings = ["<|endofturn|>", "<|stop|>"]
    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]

    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
        temp_stop_ids.append(tokenizer.eos_token_id)
    elif tokenizer.eos_token_id is None:
        print("Warning: tokenizer.eos_token_id is None. Cannot add to stop tokens.")

    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]  # Assign to the global scope variable

    if not stop_token_ids_list:
        print("Warning: Could not find any stop token IDs. Using default EOS if available, otherwise generation might not stop correctly.")
        if tokenizer.eos_token_id is not None:
            stop_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Error: No stop tokens found, including default EOS. Generation may run indefinitely.")
            # Consider raising an error or setting a default if this is critical

    print(f"Using Stop Token IDs: {stop_token_ids_list}")

except Exception as e:
    print(f"!!! Error loading model: {e}")
    if 'model' in locals() and model is not None: del model
    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
    gc.collect()
    # Raise Gradio error to display in the Space UI if loading fails
    raise gr.Error(f"Failed to load the model {MODEL_ID}. Cannot start the application. Error: {e}")


# --- System Prompt Definition ---
def get_system_prompt():
    # (Korean) "The AI language model is named 'CLOVA X' and was created by NAVER.
    #  It must answer the user's questions kindly, in detail, and in Korean."
    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
    return (
        f"- AI μ–Έμ–΄λͺ¨λΈμ˜ 이름은 \"CLOVA X\" 이며 λ„€μ΄λ²„μ—μ„œ λ§Œλ“€μ—ˆλ‹€.\n"
        # f"- μ˜€λŠ˜μ€ {current_date}이닀.\n"  # Uncomment if needed ("Today is {current_date}.")
        f"- μ‚¬μš©μžμ˜ μ§ˆλ¬Έμ— λŒ€ν•΄ μΉœμ ˆν•˜κ³  μžμ„Έν•˜κ²Œ ν•œκ΅­μ–΄λ‘œ λ‹΅λ³€ν•΄μ•Ό ν•œλ‹€."
    )


# --- Warm-up Function ---
def warmup_model():
    if not load_successful or model is None or tokenizer is None:
        print("Skipping warmup: Model not loaded successfully.")
        return

    print("--- Starting Model Warm-up ---")
    try:
        start_warmup_time = time.time()
        warmup_message = "μ•ˆλ…•ν•˜μ„Έμš”"  # "Hello"
        system_prompt = get_system_prompt()
        warmup_chat = [
            {"role": "tool_list", "content": ""},
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": warmup_message}
        ]

        inputs = tokenizer.apply_chat_template(
            warmup_chat,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to("cpu")

        # Check if stop_token_ids_list is empty and handle appropriately
        gen_kwargs = {
            "max_new_tokens": 10,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": False
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Warmup Warning: No stop tokens defined for generation.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        # Optional: Decode warmup response for verification
        # response = tokenizer.decode(output_ids[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
        # print(f"Warm-up response (decoded): {response}")

        del inputs
        del output_ids
        gc.collect()
        warmup_time = time.time() - start_warmup_time
        print(f"--- Model Warm-up Completed in {warmup_time:.2f} seconds ---")
    except Exception as e:
        print(f"!!! Error during model warm-up: {e}")
    finally:
        gc.collect()


# --- Inference Function ---
def predict(message, history):
    """
    Generates a response using HyperCLOVAX.
    Assumes 'history' is in the Gradio 'messages' format: List[Dict].
    """
    if model is None or tokenizer is None:
        return "였λ₯˜: λͺ¨λΈμ΄ λ‘œλ“œλ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."  # "Error: the model is not loaded."

    system_prompt = get_system_prompt()

    # Start with the system prompt
    chat_history_formatted = [
        {"role": "tool_list", "content": ""},  # As required by the model card
        {"role": "system", "content": system_prompt}
    ]

    # Append history (list of {'role': 'user'/'assistant', 'content': '...'})
    if isinstance(history, list):  # Check if history is a list
        for turn in history:
            # Validate turn format
            if isinstance(turn, dict) and "role" in turn and "content" in turn:
                chat_history_formatted.append(turn)
            # Handle the older tuple format if necessary (though less likely now)
            elif isinstance(turn, (list, tuple)) and len(turn) == 2:
                print(f"Warning: Received history item in tuple format: {turn}. Converting to messages format.")
                chat_history_formatted.append({"role": "user", "content": turn[0]})
                if turn[1]:  # Ensure the assistant message exists
                    chat_history_formatted.append({"role": "assistant", "content": turn[1]})
            else:
                print(f"Warning: Skipping unexpected history format item: {turn}")

    # Append the latest user message
    chat_history_formatted.append({"role": "user", "content": message})

    inputs = None
    output_ids = None

    try:
        inputs = tokenizer.apply_chat_template(
            chat_history_formatted,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to("cpu")
        input_length = inputs['input_ids'].shape[1]
        print(f"\nInput tokens: {input_length}")
    except Exception as e:
        print(f"!!! Error applying chat template: {e}")
        # "Error: a problem occurred while processing the input format."
        return f"였λ₯˜: μž…λ ₯ ν˜•μ‹μ„ μ²˜λ¦¬ν•˜λŠ” 쀑 λ¬Έμ œκ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. ({e})"

    try:
        print("Generating response...")
        generation_start_time = time.time()

        # Prepare generation arguments, handling an empty stop_token_ids_list
        gen_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Generation Warning: No stop tokens defined.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        generation_time = time.time() - generation_start_time
        print(f"Generation complete in {generation_time:.2f} seconds.")
    except Exception as e:
        print(f"!!! Error during model generation: {e}")
        if inputs is not None: del inputs
        if output_ids is not None: del output_ids
        gc.collect()
        # "Error: a problem occurred while generating the response."
        return f"였λ₯˜: 응닡을 μƒμ„±ν•˜λŠ” 쀑 λ¬Έμ œκ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. ({e})"

    # Decode the response
    response = "였λ₯˜: 응닡 생성에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€."  # "Error: failed to generate a response."
    if output_ids is not None:
        try:
            new_tokens = output_ids[0, input_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"Output tokens: {len(new_tokens)}")
            del new_tokens
        except Exception as e:
            print(f"!!! Error decoding response: {e}")
            # "Error: a problem occurred while decoding the response."
            response = "였λ₯˜: 응닡을 λ””μ½”λ”©ν•˜λŠ” 쀑 λ¬Έμ œκ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€."

    # Clean up memory
    if inputs is not None: del inputs
    if output_ids is not None: del output_ids
    gc.collect()
    print("Memory cleaned.")

    return response


# --- Gradio Interface Setup ---
print("--- Setting up Gradio Interface ---")

# No need to create a separate Chatbot component beforehand
# chatbot_component = gr.Chatbot(...)  # REMOVED

examples = [
    ["넀이버 ν΄λ‘œλ°”XλŠ” λ¬΄μ—‡μΈκ°€μš”?"],  # "What is NAVER CLOVA X?"
    ["μŠˆλ’°λ”©κ±° 방정식과 μ–‘μžμ—­ν•™μ˜ 관계λ₯Ό μ„€λͺ…ν•΄μ£Όμ„Έμš”."],  # "Explain the relationship between the SchrΓΆdinger equation and quantum mechanics."
    ["λ”₯λŸ¬λ‹ λͺ¨λΈ ν•™μŠ΅ 과정을 λ‹¨κ³„λ³„λ‘œ μ•Œλ €μ€˜."],  # "Walk me through the deep learning training process step by step."
    ["μ œμ£Όλ„ μ—¬ν–‰ κ³„νšμ„ μ„Έμš°κ³  μžˆλŠ”λ°, 3λ°• 4일 μΆ”μ²œ μ½”μŠ€ μ’€ μ§œμ€„λž˜?"],  # "I'm planning a Jeju Island trip; can you suggest a 3-night, 4-day itinerary?"
]

# Let ChatInterface manage its own internal Chatbot component
demo = gr.ChatInterface(
    fn=predict,  # Link the prediction function
    # chatbot=chatbot_component,  # REMOVED
    title="πŸ‡°πŸ‡· 넀이버 HyperCLOVA X SEED (0.5B) 데λͺ¨",  # "NAVER HyperCLOVA X SEED (0.5B) Demo"
    # Korean UI text: model ID, environment (Hugging Face free CPU, 16GB RAM),
    # a notice that CPU inference may be slow (warm-up completed), and the token limit.
    description=(
        f"**λͺ¨λΈ:** {MODEL_ID}\n"
        f"**ν™˜κ²½:** Hugging Face 무료 CPU (16GB RAM)\n"
        f"**주의:** CPUμ—μ„œ μ‹€ν–‰λ˜λ―€λ‘œ 응닡 생성에 λ‹€μ†Œ μ‹œκ°„μ΄ 걸릴 수 μžˆμŠ΅λ‹ˆλ‹€. (μ›œμ—… μ™„λ£Œ)\n"
        f"μ΅œλŒ€ 생성 토큰 μˆ˜λŠ” {MAX_NEW_TOKENS}개둜 μ œν•œλ©λ‹ˆλ‹€."
    ),
    examples=examples,
    cache_examples=False,
    theme="soft",
)

# --- Application Launch ---
if __name__ == "__main__":
    if load_successful:
        warmup_model()
    else:
        print("Skipping warm-up because model loading failed.")

    print("--- Launching Gradio App ---")
    demo.queue().launch(
        # share=True  # Uncomment for public link
        # server_name="0.0.0.0"  # Uncomment for local network access
    )
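
# A minimal sketch of querying the running app programmatically, assuming the
# gradio_client package and ChatInterface's default "/chat" endpoint; the Space
# ID below is a placeholder, not the real one.
#
# from gradio_client import Client
# client = Client("<username>/<space-name>")  # placeholder Space ID
# print(client.predict("μ•ˆλ…•ν•˜μ„Έμš”", api_name="/chat"))  # "Hello"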