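"""Gradio demo app for naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B.

Loads the model and tokenizer on CPU, runs a short warm-up generation, and
serves a Korean-language chat UI through gr.ChatInterface (intended for a
Hugging Face Space on the free CPU tier).
"""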
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os
import datetime
import time

# --- Configuration ---
MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
MAX_NEW_TOKENS = 512
CPU_THREAD_COUNT = 4  # Adjust if needed

# --- Optional: Set CPU Threads ---
# torch.set_num_threads(CPU_THREAD_COUNT)
# os.environ["OMP_NUM_THREADS"] = str(CPU_THREAD_COUNT)
# os.environ["MKL_NUM_THREADS"] = str(CPU_THREAD_COUNT)
print("--- Environment Setup ---")
print(f"PyTorch version: {torch.__version__}")
print(f"Running on device: cpu")
print(f"Torch Threads: {torch.get_num_threads()}")
# --- Model and Tokenizer Loading ---
print(f"--- Loading Model: {MODEL_ID} ---")
print("This might take a few minutes, especially on the first launch...")
model = None
tokenizer = None
load_successful = False
stop_token_ids_list = [] # Initialize stop_token_ids_list

try:
    start_load_time = time.time()
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        device_map="cpu",
        # force_download=True  # Keep commented unless cache issues reappear
    )
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        # force_download=True  # Keep commented
    )
    model.eval()
    load_time = time.time() - start_load_time
    print(f"--- Model and Tokenizer Loaded Successfully on CPU in {load_time:.2f} seconds ---")
    load_successful = True

    # --- Stop Token Configuration ---
    stop_token_strings = ["<|endofturn|>", "<|stop|>"]
    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]

    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
        temp_stop_ids.append(tokenizer.eos_token_id)
    elif tokenizer.eos_token_id is None:
        print("Warning: tokenizer.eos_token_id is None. Cannot add to stop tokens.")

    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]  # Assign to the global scope variable

    if not stop_token_ids_list:
        print("Warning: Could not find any stop token IDs. Using default EOS if available, otherwise generation might not stop correctly.")
        if tokenizer.eos_token_id is not None:
            stop_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Error: No stop tokens found, including default EOS. Generation may run indefinitely.")
            # Consider raising an error or setting a default if this is critical

    print(f"Using Stop Token IDs: {stop_token_ids_list}")

except Exception as e:
    print(f"!!! Error loading model: {e}")
    if 'model' in locals() and model is not None: del model
    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
    gc.collect()
    # Raise Gradio error to display in the Space UI if loading fails
    raise gr.Error(f"Failed to load the model {MODEL_ID}. Cannot start the application. Error: {e}")


# --- System Prompt Definition ---
def get_system_prompt():
    # (Korean) "The AI language model is named 'CLOVA X' and was created by NAVER.
    #  It must answer the user's questions kindly, in detail, and in Korean."
    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
    return (
        f"- AI μ–Έμ–΄λͺ¨λΈμ˜ 이름은 \"CLOVA X\" 이며 λ„€μ΄λ²„μ—μ„œ λ§Œλ“€μ—ˆλ‹€.\n"
        # f"- μ˜€λŠ˜μ€ {current_date}이닀.\n"  # Uncomment if needed ("Today is {current_date}.")
        f"- μ‚¬μš©μžμ˜ μ§ˆλ¬Έμ— λŒ€ν•΄ μΉœμ ˆν•˜κ³  μžμ„Έν•˜κ²Œ ν•œκ΅­μ–΄λ‘œ λ‹΅λ³€ν•΄μ•Ό ν•œλ‹€."
    )


# --- Warm-up Function ---
def warmup_model():
    if not load_successful or model is None or tokenizer is None:
        print("Skipping warmup: Model not loaded successfully.")
        return

    print("--- Starting Model Warm-up ---")
    try:
        start_warmup_time = time.time()
        warmup_message = "μ•ˆλ…•ν•˜μ„Έμš”"  # "Hello"
        system_prompt = get_system_prompt()
        warmup_chat = [
            {"role": "tool_list", "content": ""},
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": warmup_message}
        ]

        inputs = tokenizer.apply_chat_template(
            warmup_chat,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to("cpu")

        # Check if stop_token_ids_list is empty and handle appropriately
        gen_kwargs = {
            "max_new_tokens": 10,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": False
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Warmup Warning: No stop tokens defined for generation.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        # Optional: Decode warmup response for verification
        # response = tokenizer.decode(output_ids[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
        # print(f"Warm-up response (decoded): {response}")

        del inputs
        del output_ids
        gc.collect()
        warmup_time = time.time() - start_warmup_time
        print(f"--- Model Warm-up Completed in {warmup_time:.2f} seconds ---")
    except Exception as e:
        print(f"!!! Error during model warm-up: {e}")
    finally:
        gc.collect()


# --- Inference Function ---
def predict(message, history):
    """
    Generates a response using HyperCLOVAX.
    Assumes 'history' is in the Gradio 'messages' format: List[Dict].
    """
    if model is None or tokenizer is None:
        return "였λ₯˜: λͺ¨λΈμ΄ λ‘œλ“œλ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."  # "Error: the model is not loaded."

    system_prompt = get_system_prompt()

    # Start with the system prompt
    chat_history_formatted = [
        {"role": "tool_list", "content": ""},  # As required by the model card
        {"role": "system", "content": system_prompt}
    ]

    # Append history (list of {'role': 'user'/'assistant', 'content': '...'})
    if isinstance(history, list):  # Check if history is a list
        for turn in history:
            # Validate turn format
            if isinstance(turn, dict) and "role" in turn and "content" in turn:
                chat_history_formatted.append(turn)
            # Handle the older tuple format if necessary (though less likely now)
            elif isinstance(turn, (list, tuple)) and len(turn) == 2:
                print(f"Warning: Received history item in tuple format: {turn}. Converting to messages format.")
                chat_history_formatted.append({"role": "user", "content": turn[0]})
                if turn[1]:  # Ensure the assistant message exists
                    chat_history_formatted.append({"role": "assistant", "content": turn[1]})
            else:
                print(f"Warning: Skipping unexpected history format item: {turn}")

    # Append the latest user message
    chat_history_formatted.append({"role": "user", "content": message})

    inputs = None
    output_ids = None

    try:
        inputs = tokenizer.apply_chat_template(
            chat_history_formatted,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to("cpu")
        input_length = inputs['input_ids'].shape[1]
        print(f"\nInput tokens: {input_length}")
    except Exception as e:
        print(f"!!! Error applying chat template: {e}")
        # "Error: a problem occurred while processing the input format."
        return f"였λ₯˜: μž…λ ₯ ν˜•μ‹μ„ μ²˜λ¦¬ν•˜λŠ” 쀑 λ¬Έμ œκ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. ({e})"

    try:
        print("Generating response...")
        generation_start_time = time.time()

        # Prepare generation arguments, handling an empty stop_token_ids_list
        gen_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Generation Warning: No stop tokens defined.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        generation_time = time.time() - generation_start_time
        print(f"Generation complete in {generation_time:.2f} seconds.")
    except Exception as e:
        print(f"!!! Error during model generation: {e}")
        if inputs is not None: del inputs
        if output_ids is not None: del output_ids
        gc.collect()
        # "Error: a problem occurred while generating the response."
        return f"였λ₯˜: 응닡을 μƒμ„±ν•˜λŠ” 쀑 λ¬Έμ œκ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. ({e})"

    # Decode the response
    response = "였λ₯˜: 응닡 생성에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€."  # "Error: failed to generate a response."
    if output_ids is not None:
        try:
            new_tokens = output_ids[0, input_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"Output tokens: {len(new_tokens)}")
            del new_tokens
        except Exception as e:
            print(f"!!! Error decoding response: {e}")
            # "Error: a problem occurred while decoding the response."
            response = "였λ₯˜: 응닡을 λ””μ½”λ”©ν•˜λŠ” 쀑 λ¬Έμ œκ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€."

    # Clean up memory
    if inputs is not None: del inputs
    if output_ids is not None: del output_ids
    gc.collect()
    print("Memory cleaned.")

    return response


# --- Gradio Interface Setup ---
print("--- Setting up Gradio Interface ---")

# No need to create a separate Chatbot component beforehand
# chatbot_component = gr.Chatbot(...)  # REMOVED

examples = [
    ["넀이버 ν΄λ‘œλ°”XλŠ” λ¬΄μ—‡μΈκ°€μš”?"],  # "What is NAVER CLOVA X?"
    ["μŠˆλ’°λ”©κ±° 방정식과 μ–‘μžμ—­ν•™μ˜ 관계λ₯Ό μ„€λͺ…ν•΄μ£Όμ„Έμš”."],  # "Explain the relationship between the SchrΓΆdinger equation and quantum mechanics."
    ["λ”₯λŸ¬λ‹ λͺ¨λΈ ν•™μŠ΅ 과정을 λ‹¨κ³„λ³„λ‘œ μ•Œλ €μ€˜."],  # "Walk me through the deep learning training process step by step."
    ["μ œμ£Όλ„ μ—¬ν–‰ κ³„νšμ„ μ„Έμš°κ³  μžˆλŠ”λ°, 3λ°• 4일 μΆ”μ²œ μ½”μŠ€ μ’€ μ§œμ€„λž˜?"],  # "I'm planning a Jeju Island trip; can you suggest a 3-night, 4-day itinerary?"
]

# Let ChatInterface manage its own internal Chatbot component
demo = gr.ChatInterface(
    fn=predict,  # Link the prediction function
    # chatbot=chatbot_component,  # REMOVED
    title="πŸ‡°πŸ‡· 넀이버 HyperCLOVA X SEED (0.5B) 데λͺ¨",  # "NAVER HyperCLOVA X SEED (0.5B) Demo"
    # Korean UI text: model ID, environment (Hugging Face free CPU, 16GB RAM),
    # a notice that CPU inference may be slow (warm-up completed), and the token limit.
    description=(
        f"**λͺ¨λΈ:** {MODEL_ID}\n"
        f"**ν™˜κ²½:** Hugging Face 무료 CPU (16GB RAM)\n"
        f"**주의:** CPUμ—μ„œ μ‹€ν–‰λ˜λ―€λ‘œ 응닡 생성에 λ‹€μ†Œ μ‹œκ°„μ΄ 걸릴 수 μžˆμŠ΅λ‹ˆλ‹€. (μ›œμ—… μ™„λ£Œ)\n"
        f"μ΅œλŒ€ 생성 토큰 μˆ˜λŠ” {MAX_NEW_TOKENS}개둜 μ œν•œλ©λ‹ˆλ‹€."
    ),
    examples=examples,
    cache_examples=False,
    theme="soft",
)

# --- Application Launch ---
if __name__ == "__main__":
    if load_successful:
        warmup_model()
    else:
        print("Skipping warm-up because model loading failed.")

    print("--- Launching Gradio App ---")
    demo.queue().launch(
        # share=True  # Uncomment for public link
        # server_name="0.0.0.0"  # Uncomment for local network access
    )
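
# A minimal sketch of querying the running app programmatically, assuming the
# gradio_client package and ChatInterface's default "/chat" endpoint; the Space
# ID below is a placeholder, not the real one.
#
# from gradio_client import Client
# client = Client("<username>/<space-name>")  # placeholder Space ID
# print(client.predict("μ•ˆλ…•ν•˜μ„Έμš”", api_name="/chat"))  # "Hello"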