import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os
import datetime
import time
import spaces  # Import spaces module for GPU acceleration

# --- Configuration ---
MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
MAX_NEW_TOKENS = 512
USE_GPU = True  # Enable GPU usage

# Hugging Face 토큰 설정 - 환경 변수에서 가져오기
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    print("경고: HF_TOKEN 환경 변수가 설정되지 않았습니다. 비공개 모델에 접근할 수 없을 수 있습니다.")

# --- Environment setup ---
print("--- Environment Setup ---")
device = torch.device("cuda" if torch.cuda.is_available() and USE_GPU else "cpu")
print(f"PyTorch version: {torch.__version__}")
print(f"Running on device: {device}")
print(f"Torch Threads: {torch.get_num_threads()}")
print(f"HF_TOKEN 설정 여부: {'있음' if HF_TOKEN else '없음'}")

# Custom CSS for improved UI
custom_css = """
.gradio-container {
    max-width: 850px !important;
    margin: auto;
}
.gr-chat {
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
.user-message {
    background-color: #f0f7ff !important;
    border-radius: 8px;
}
.assistant-message {
    background-color: #f9f9f9 !important;
    border-radius: 8px;
}
.gr-button.primary-button {
    background-color: #1f4e79 !important;
}
.gr-form {
    padding: 20px;
    border-radius: 10px;
    box-shadow: 0 2px 6px rgba(0, 0, 0, 0.05);
}
#intro-message {
    text-align: center;
    margin-bottom: 20px;
    padding: 15px;
    background: linear-gradient(135deg, #e8f4ff 0%, #f0f7ff 100%);
    border-radius: 10px;
    border-left: 4px solid #1f4e79;
}
.footer {
    text-align: center;
    margin-top: 20px;
    font-size: 0.8em;
    color: #666;
}
"""

# --- Model and Tokenizer Loading ---
print(f"--- Loading Model: {MODEL_ID} ---")
print("This might take a few minutes, especially on the first launch...")

model = None
tokenizer = None
load_successful = False
stop_token_ids_list = []  # Initialize stop_token_ids_list

try:
    start_load_time = time.time()
    
    # 토크나이저 로딩
    tokenizer_kwargs = {
        "trust_remote_code": True
    }
    
    # HF_TOKEN이 설정되어 있으면 추가
    if HF_TOKEN:
        tokenizer_kwargs["token"] = HF_TOKEN
    
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        **tokenizer_kwargs
    )
    
    # 모델 로딩
    model_kwargs = {
        "trust_remote_code": True,
        "device_map": "auto" if device.type == "cuda" else "cpu",
        "torch_dtype": torch.float16 if device.type == "cuda" else torch.float32,
    }
    
    # HF_TOKEN이 설정되어 있으면 추가
    if HF_TOKEN:
        model_kwargs["token"] = HF_TOKEN
    
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        **model_kwargs
    )
    
    model.eval()
    load_time = time.time() - start_load_time
    print(f"--- Model and Tokenizer Loaded Successfully in {load_time:.2f} seconds ---")
    load_successful = True

    # --- Stop Token Configuration ---
    stop_token_strings = ["<|endofturn|>", "<|stop|>"]
    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]

    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
        temp_stop_ids.append(tokenizer.eos_token_id)
    elif tokenizer.eos_token_id is None:
         print("Warning: tokenizer.eos_token_id is None. Cannot add to stop tokens.")

    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]

    if not stop_token_ids_list:
        print("Warning: Could not find any stop token IDs. Using default EOS if available, otherwise generation might not stop correctly.")
        if tokenizer.eos_token_id is not None:
            stop_token_ids_list = [tokenizer.eos_token_id]
        else:
             print("Error: No stop tokens found, including default EOS. Generation may run indefinitely.")

    print(f"Using Stop Token IDs: {stop_token_ids_list}")

except Exception as e:
    print(f"!!! Error loading model: {e}")
    if 'model' in locals() and model is not None: del model
    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
    gc.collect()
    # Raise Gradio error to display in the Space UI if loading fails
    raise gr.Error(f"Failed to load the model {MODEL_ID}. Cannot start the application. Error: {e}")

# --- System Prompt Definition ---
def get_system_prompt():
    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
    return (
        f"- AI 언어모델의 이름은 \"CLOVA X\" 이며 네이버에서 만들었다.\n"
        f"- 오늘은 {current_date}이다.\n"
        f"- 사용자의 질문에 대해 친절하고 자세하게 한국어로 답변해야 한다."
    )

# --- Warm-up Function ---
def warmup_model():
    if not load_successful or model is None or tokenizer is None:
        print("Skipping warmup: Model not loaded successfully.")
        return

    print("--- Starting Model Warm-up ---")
    try:
        start_warmup_time = time.time()
        warmup_message = "안녕하세요"
        system_prompt = get_system_prompt()
        warmup_chat = [
            {"role": "tool_list", "content": ""},
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": warmup_message}
        ]

        inputs = tokenizer.apply_chat_template(
            warmup_chat,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(device)

        # Check if stop_token_ids_list is empty and handle appropriately
        gen_kwargs = {
            "max_new_tokens": 10,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": False
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Warmup Warning: No stop tokens defined for generation.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        del inputs
        del output_ids
        gc.collect()
        warmup_time = time.time() - start_warmup_time
        print(f"--- Model Warm-up Completed in {warmup_time:.2f} seconds ---")

    except Exception as e:
        print(f"!!! Error during model warm-up: {e}")
    finally:
        gc.collect()

# --- Inference Function with GPU decorator ---
@spaces.GPU()  # Important: Add the spaces.GPU() decorator for ZeroGPU
def predict(message, history):
    """
    Generates response using HyperCLOVAX.
    Assumes 'history' is in the Gradio 'messages' format: List[Dict].
    """
    if model is None or tokenizer is None:
         return "오류: 모델이 로드되지 않았습니다."

    system_prompt = get_system_prompt()

    # Start with system prompt
    chat_history_formatted = [
        {"role": "tool_list", "content": ""}, # As required by model card
        {"role": "system", "content": system_prompt}
    ]

    # Process history based on Gradio ChatInterface format (list of tuples)
    if isinstance(history, list):
        for user_msg, assistant_msg in history:
            chat_history_formatted.append({"role": "user", "content": user_msg})
            if assistant_msg:  # Check if not None or empty
                chat_history_formatted.append({"role": "assistant", "content": assistant_msg})

    # Append the latest user message
    chat_history_formatted.append({"role": "user", "content": message})

    inputs = None
    output_ids = None

    try:
        inputs = tokenizer.apply_chat_template(
            chat_history_formatted,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(device)
        input_length = inputs['input_ids'].shape[1]
        print(f"\nInput tokens: {input_length}")

    except Exception as e:
        print(f"!!! Error applying chat template: {e}")
        return f"오류: 입력 형식을 처리하는 중 문제가 발생했습니다. ({e})"

    try:
        print("Generating response...")
        generation_start_time = time.time()

        # Prepare generation arguments, handling empty stop_token_ids_list
        gen_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
        }
        if stop_token_ids_list:
             gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
             print("Generation Warning: No stop tokens defined.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        generation_time = time.time() - generation_start_time
        print(f"Generation complete in {generation_time:.2f} seconds.")

    except Exception as e:
        print(f"!!! Error during model generation: {e}")
        if inputs is not None: del inputs
        if output_ids is not None: del output_ids
        gc.collect()
        return f"오류: 응답을 생성하는 중 문제가 발생했습니다. ({e})"

    # Decode the response
    response = "오류: 응답 생성에 실패했습니다."
    if output_ids is not None:
        try:
            new_tokens = output_ids[0, input_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"Output tokens: {len(new_tokens)}")
            del new_tokens
        except Exception as e:
            print(f"!!! Error decoding response: {e}")
            response = "오류: 응답을 디코딩하는 중 문제가 발생했습니다."

    # Clean up memory
    if inputs is not None: del inputs
    if output_ids is not None: del output_ids
    gc.collect()
    print("Memory cleaned.")

    return response

# --- Gradio Interface Setup ---
print("--- Setting up Gradio Interface ---")

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("""
    # NAVER hyperclovax: HyperCLOVAX-SEED-Text-Instruct-0.5B 
    
    """, elem_id="intro-message")
    
    # Using standard ChatInterface (compatible with all Gradio versions)
    chatbot = gr.ChatInterface(
        fn=predict,
        examples=[
            ["네이버 클로바X는 무엇인가요?"],
            ["슈뢰딩거 방정식과 양자역학의 관계를 설명해주세요."],
            ["딥러닝 모델 학습 과정을 단계별로 알려줘."],
            ["제주도 여행 계획을 세우고 있는데, 3박 4일 추천 코스 좀 짜줄래?"],
            ["한국 역사에서 가장 중요한 사건 5가지는 무엇인가요?"],
            ["인공지능 윤리에 대해 설명해주세요."],
        ],
        cache_examples=False,
    )
    
    with gr.Accordion("모델 정보", open=False):
        gr.Markdown(f"""
        - **모델**: {MODEL_ID}
        - **환경**: ZeroGPU 공유 환경에서 실행 중
        - **토큰 제한**: 최대 생성 토큰 수는 {MAX_NEW_TOKENS}개로 제한됩니다.
        - **하드웨어**: {"GPU" if device.type == "cuda" else "CPU"} 환경에서 실행 중
        """)
    
    gr.Markdown(
        "© 2025 네이버 HyperCLOVA X 데모 | Powered by Hugging Face & ZeroGPU", 
        elem_classes="footer"
    )

# --- Application Launch ---
if __name__ == "__main__":
    if load_successful:
        warmup_model()
    else:
        print("Skipping warm-up because model loading failed.")

    print("--- Launching Gradio App ---")
    demo.queue().launch(
        # share=True # Uncomment for public link
        server_name="0.0.0.0" # Enable external access
    )