import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os
import datetime
import time
import spaces  # Import spaces module for GPU acceleration
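# Note: on ZeroGPU Spaces a GPU is generally only attached while a function decorated
# with @spaces.GPU is executing; module-level code (such as the model load below) runs on CPU.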
# --- Configuration ---
MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
MAX_NEW_TOKENS = 512
USE_GPU = True  # Enable GPU usage

# Hugging Face token: read from the environment variable
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    print("Warning: the HF_TOKEN environment variable is not set. Private models may be inaccessible.")
# --- Environment setup ---
print("--- Environment Setup ---")
device = torch.device("cuda" if torch.cuda.is_available() and USE_GPU else "cpu")
print(f"PyTorch version: {torch.__version__}")
print(f"Running on device: {device}")
print(f"Torch Threads: {torch.get_num_threads()}")
print(f"HF_TOKEN set: {'yes' if HF_TOKEN else 'no'}")
# Custom CSS for improved UI
custom_css = """
.gradio-container {
    max-width: 850px !important;
    margin: auto;
}
.gr-chat {
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
.user-message {
    background-color: #f0f7ff !important;
    border-radius: 8px;
}
.assistant-message {
    background-color: #f9f9f9 !important;
    border-radius: 8px;
}
.gr-button.primary-button {
    background-color: #1f4e79 !important;
}
.gr-form {
    padding: 20px;
    border-radius: 10px;
    box-shadow: 0 2px 6px rgba(0, 0, 0, 0.05);
}
#intro-message {
    text-align: center;
    margin-bottom: 20px;
    padding: 15px;
    background: linear-gradient(135deg, #e8f4ff 0%, #f0f7ff 100%);
    border-radius: 10px;
    border-left: 4px solid #1f4e79;
}
.footer {
    text-align: center;
    margin-top: 20px;
    font-size: 0.8em;
    color: #666;
}
"""
# --- Model and Tokenizer Loading ---
print(f"--- Loading Model: {MODEL_ID} ---")
print("This might take a few minutes, especially on the first launch...")
model = None
tokenizer = None
load_successful = False
stop_token_ids_list = []  # Initialize stop_token_ids_list
try:
    start_load_time = time.time()

    # Load the tokenizer
    tokenizer_kwargs = {
        "trust_remote_code": True
    }
    # Pass the token only if HF_TOKEN is set
    if HF_TOKEN:
        tokenizer_kwargs["token"] = HF_TOKEN
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        **tokenizer_kwargs
    )

    # Load the model
    model_kwargs = {
        "trust_remote_code": True,
        "device_map": "auto" if device.type == "cuda" else "cpu",
        "torch_dtype": torch.float16 if device.type == "cuda" else torch.float32,
    }
    # Pass the token only if HF_TOKEN is set
    if HF_TOKEN:
        model_kwargs["token"] = HF_TOKEN
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        **model_kwargs
    )
    model.eval()

    load_time = time.time() - start_load_time
    print(f"--- Model and Tokenizer Loaded Successfully in {load_time:.2f} seconds ---")
    load_successful = True
    # --- Stop Token Configuration ---
    stop_token_strings = ["<|endofturn|>", "<|stop|>"]
    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]

    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
        temp_stop_ids.append(tokenizer.eos_token_id)
    elif tokenizer.eos_token_id is None:
        print("Warning: tokenizer.eos_token_id is None. Cannot add to stop tokens.")

    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]

    if not stop_token_ids_list:
        print("Warning: Could not find any stop token IDs. Using default EOS if available, otherwise generation might not stop correctly.")
        if tokenizer.eos_token_id is not None:
            stop_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Error: No stop tokens found, including default EOS. Generation may run indefinitely.")

    print(f"Using Stop Token IDs: {stop_token_ids_list}")
except Exception as e:
    print(f"!!! Error loading model: {e}")
    if 'model' in locals() and model is not None: del model
    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
    gc.collect()
    # Raise a Gradio error to display in the Space UI if loading fails
    raise gr.Error(f"Failed to load the model {MODEL_ID}. Cannot start the application. Error: {e}")
# --- System Prompt Definition ---
def get_system_prompt():
    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
    return (
        f"- The AI language model's name is \"CLOVA X\" and it was created by NAVER.\n"
        f"- Today is {current_date}.\n"
        f"- Answer the user's questions kindly, in detail, and in Korean."
    )
# --- Warm-up Function ---
def warmup_model():
    if not load_successful or model is None or tokenizer is None:
        print("Skipping warmup: Model not loaded successfully.")
        return

    print("--- Starting Model Warm-up ---")
    try:
        start_warmup_time = time.time()
        warmup_message = "Hello"
        system_prompt = get_system_prompt()
        warmup_chat = [
            {"role": "tool_list", "content": ""},
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": warmup_message}
        ]
        inputs = tokenizer.apply_chat_template(
            warmup_chat,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(device)

        # Check if stop_token_ids_list is empty and handle appropriately
        gen_kwargs = {
            "max_new_tokens": 10,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": False
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Warmup Warning: No stop tokens defined for generation.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        del inputs
        del output_ids
        gc.collect()
        warmup_time = time.time() - start_warmup_time
        print(f"--- Model Warm-up Completed in {warmup_time:.2f} seconds ---")
    except Exception as e:
        print(f"!!! Error during model warm-up: {e}")
    finally:
        gc.collect()
# --- Inference Function with GPU decorator ---
# The spaces.GPU() decorator requests ZeroGPU hardware for the duration of each call.
@spaces.GPU()
def predict(message, history):
    """
    Generates a response using HyperCLOVAX.
    Assumes 'history' is in the Gradio ChatInterface pair format: a list of [user, assistant] pairs.
    """
    if model is None or tokenizer is None:
        return "Error: the model is not loaded."

    system_prompt = get_system_prompt()

    # Start with the system prompt
    chat_history_formatted = [
        {"role": "tool_list", "content": ""},  # As required by the model card
        {"role": "system", "content": system_prompt}
    ]

    # Process history in the ChatInterface pair format (list of [user, assistant])
    if isinstance(history, list):
        for user_msg, assistant_msg in history:
            chat_history_formatted.append({"role": "user", "content": user_msg})
            if assistant_msg:  # Check it is not None or empty
                chat_history_formatted.append({"role": "assistant", "content": assistant_msg})

    # Append the latest user message
    chat_history_formatted.append({"role": "user", "content": message})
    inputs = None
    output_ids = None
    try:
        inputs = tokenizer.apply_chat_template(
            chat_history_formatted,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(device)
        input_length = inputs['input_ids'].shape[1]
        print(f"\nInput tokens: {input_length}")
    except Exception as e:
        print(f"!!! Error applying chat template: {e}")
        return f"Error: a problem occurred while formatting the input. ({e})"
    try:
        print("Generating response...")
        generation_start_time = time.time()

        # Prepare generation arguments, handling an empty stop_token_ids_list
        gen_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Generation Warning: No stop tokens defined.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        generation_time = time.time() - generation_start_time
        print(f"Generation complete in {generation_time:.2f} seconds.")
    except Exception as e:
        print(f"!!! Error during model generation: {e}")
        if inputs is not None: del inputs
        if output_ids is not None: del output_ids
        gc.collect()
        return f"Error: a problem occurred while generating the response. ({e})"
    # Decode the response
    response = "Error: failed to generate a response."
    if output_ids is not None:
        try:
            new_tokens = output_ids[0, input_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"Output tokens: {len(new_tokens)}")
            del new_tokens
        except Exception as e:
            print(f"!!! Error decoding response: {e}")
            response = "Error: a problem occurred while decoding the response."

    # Clean up memory
    if inputs is not None: del inputs
    if output_ids is not None: del output_ids
    gc.collect()
    print("Memory cleaned.")

    return response
# --- Gradio Interface Setup ---
print("--- Setting up Gradio Interface ---")
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("""
    # NAVER hyperclovax: HyperCLOVAX-SEED-Text-Instruct-0.5B
    """, elem_id="intro-message")

    # Using the standard ChatInterface (compatible with all Gradio versions)
    chatbot = gr.ChatInterface(
        fn=predict,
        examples=[
            ["What is NAVER CLOVA X?"],
            ["Explain the relationship between the Schrödinger equation and quantum mechanics."],
            ["Walk me through the steps of training a deep learning model."],
            ["I'm planning a trip to Jeju Island. Can you suggest a 3-night, 4-day itinerary?"],
            ["What are the five most important events in Korean history?"],
            ["Explain AI ethics."],
        ],
        cache_examples=False,
    )
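    # Note: by default this ChatInterface passes history as [user, assistant] pairs, which is
    # what predict() unpacks above; if your Gradio version defaults to the "messages" format,
    # set type="tuples" here or adapt predict() to read role/content dicts.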
    with gr.Accordion("Model Info", open=False):
        gr.Markdown(f"""
        - **Model**: {MODEL_ID}
        - **Environment**: running in a shared ZeroGPU environment
        - **Token limit**: generation is capped at {MAX_NEW_TOKENS} new tokens.
        - **Hardware**: running on {'GPU' if device.type == 'cuda' else 'CPU'}
        """)

    gr.Markdown(
        "© 2025 NAVER HyperCLOVA X demo | Powered by Hugging Face & ZeroGPU",
        elem_classes="footer"
    )
# --- Application Launch ---
if __name__ == "__main__":
    if load_successful:
        warmup_model()
    else:
        print("Skipping warm-up because model loading failed.")

    print("--- Launching Gradio App ---")
    demo.queue().launch(
        # share=True,  # Uncomment for public link
        server_name="0.0.0.0"  # Enable external access
    )