ErenalpCet committed (verified)
Commit 1dec77f · Parent(s): 8c32d82

Update app.py

Files changed (1): app.py (+54 −24)
app.py CHANGED
@@ -1,43 +1,35 @@
 import gradio as gr
 import transformers
 import torch
-from transformers import pipeline, BitsAndBytesConfig
+from transformers import pipeline
 from duckduckgo_search import DDGS
 import re
 import time
-from huggingface_hub import HfApi
 from spaces import GPU
 
 # --- Constants and Configuration ---
-MODEL_ID = "nvidia/Llama-3.1-Nemotron-8B-UltraLong-4M-Instruct"
+MODEL_ID = "Qwen/Qwen3-4B"
 MAX_GPU_MEMORY = "60GiB"
 
 # --- Model Loading ---
 @GPU(memory=60)
 def load_model():
-    """Load the LLM model optimized for A100 GPU using 4-bit quantization."""
-    print(f"Attempting to load model: {MODEL_ID} with 4-bit quantization")
+    """Load the Qwen3-4B model without quantization for full precision."""
+    print(f"Attempting to load model: {MODEL_ID} without quantization")
     try:
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.bfloat16,
-            bnb_4bit_use_double_quant=True,
-        )
         pipe = pipeline(
             "text-generation",
             model=MODEL_ID,
-            torch_dtype=torch.bfloat16,
+            torch_dtype=torch.bfloat16, # Full precision, no quantization
             device_map="auto",
             model_kwargs={
-                "quantization_config": quantization_config,
                 "use_cache": True,
             }
         )
-        print(f"Model {MODEL_ID} loaded successfully on device: {pipe.device} (using 4-bit quantization)")
+        print(f"Model {MODEL_ID} loaded successfully on device: {pipe.device} (full precision)")
         return pipe
     except Exception as e:
-        print(f"FATAL Error loading model '{MODEL_ID}' (check memory/config): {e}")
+        print(f"FATAL Error loading model '{MODEL_ID}': {e}")
         raise e
 
 # --- Web Search ---
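For comparison, here is a minimal sketch of the same full-precision load spelled out with the explicit Auto* classes instead of the pipeline() helper. It is illustrative only (not part of the commit) and assumes settings equivalent to the ones above.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen3-4B"

# bf16 weights, no 4-bit quantization; accelerate decides device placement.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)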
@@ -158,7 +150,7 @@ def parse_llm_output(full_output, input_prompt_list):
 
 @GPU(memory=60)
 def generate_enhanced_persona(name, bio_text, context=""):
-    """Use the LLM to enhance the persona profile."""
+    """Use the LLM to enhance the persona profile with thinking disabled."""
     pipe = load_model()
     print(f"Generating enhanced persona for {name}...")
     enhancement_prompt = [
@@ -166,8 +158,23 @@ def generate_enhanced_persona(name, bio_text, context=""):
         {"role": "user", "content": f"""Synthesize the following information about '{name}' into a character profile. Context: {context} Information Found:\n{bio_text}\n\nCreate the profile based *only* on the text above."""}
     ]
     try:
+        tokenizer = pipe.tokenizer
+        text = tokenizer.apply_chat_template(
+            enhancement_prompt,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False # Disable thinking mode
+        )
+        model_inputs = tokenizer([text], return_tensors="pt").to(pipe.model.device)
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
-            outputs = pipe(enhancement_prompt, max_new_tokens=512, do_sample=True, temperature=0.7, top_p=0.9)
+            outputs = pipe(
+                model_inputs,
+                max_new_tokens=512,
+                do_sample=True,
+                temperature=0.7, # Recommended for non-thinking mode
+                top_p=0.8, # Recommended for non-thinking mode
+                pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
+            )
         parsed_output = parse_llm_output(outputs, enhancement_prompt)
         print("Enhanced persona generated.")
         return parsed_output if parsed_output else f"Could not generate profile based on:\n{bio_text}"
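For reference, the same non-thinking flow can also be driven end to end with model.generate() on the tokenized inputs. This is a minimal sketch (not part of the commit), reusing the tokenizer and model from the load sketch above and the sampling values the diff recommends for non-thinking mode (temperature 0.7, top_p 0.8).

messages = [
    {"role": "user", "content": "Introduce yourself in one sentence."}
]
# Render the chat template with Qwen3 thinking mode disabled.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated = model.generate(
    **model_inputs,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.8,
)
# Drop the prompt tokens and keep only the newly generated continuation.
new_tokens = generated[0][model_inputs["input_ids"].shape[-1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))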
@@ -178,7 +185,7 @@ def generate_enhanced_persona(name, bio_text, context=""):
 
 @GPU(memory=60)
 def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
-    """Generate an optimized system prompt for the persona."""
+    """Generate an optimized system prompt for the persona with thinking disabled."""
     pipe = load_model()
     print(f"Generating system prompt for {name}...")
     fallback_prompt = f"""You are simulating the character '{name}'. Act and respond according to this profile:\n{enhanced_profile}\nAdditional context for the simulation: {context}\n---\nMaintain this persona consistently. Respond naturally based on the profile. Do not mention that you are an AI or a simulation. If asked about details not in the profile, you can be evasive or state you don't know/remember, consistent with the persona."""
@@ -187,8 +194,23 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
         {"role": "user", "content": f"""Create a system prompt for an AI to simulate the character '{name}'. Context for simulation: {context} Character Profile:\n{enhanced_profile}\n\nGenerate the system prompt based *only* on the profile and context provided."""}
     ]
     try:
+        tokenizer = pipe.tokenizer
+        text = tokenizer.apply_chat_template(
+            prompt,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False # Disable thinking mode
+        )
+        model_inputs = tokenizer([text], return_tensors="pt").to(pipe.model.device)
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
-            outputs = pipe(prompt, max_new_tokens=300, do_sample=True, temperature=0.6)
+            outputs = pipe(
+                model_inputs,
+                max_new_tokens=300,
+                do_sample=True,
+                temperature=0.7, # Recommended for non-thinking mode
+                top_p=0.8, # Recommended for non-thinking mode
+                pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
+            )
         parsed_output = parse_llm_output(outputs, prompt)
         print("System prompt generated.")
         return parsed_output if parsed_output else fallback_prompt
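The template-and-tokenize steps above repeat across generate_enhanced_persona, generate_system_prompt_with_llm, and generate_response; a hypothetical helper (purely illustrative, not in the commit) could express the shared pattern once:

def build_non_thinking_inputs(pipe, messages):
    """Render a chat history with Qwen3 thinking mode disabled, then tokenize it."""
    tokenizer = pipe.tokenizer
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    return tokenizer([text], return_tensors="pt").to(pipe.model.device)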
@@ -199,19 +221,27 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
 
 @GPU(memory=60)
 def generate_response(messages):
-    """Generate a response using the LLM."""
+    """Generate a response using the LLM with thinking disabled."""
     pipe = load_model()
     print("Generating response...")
     if not messages:
         return "Error: No message history provided."
     try:
+        tokenizer = pipe.tokenizer
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False # Disable thinking mode
+        )
+        model_inputs = tokenizer([text], return_tensors="pt").to(pipe.model.device)
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
             outputs = pipe(
-                messages,
+                model_inputs,
                 max_new_tokens=512,
                 do_sample=True,
-                top_p=0.9,
-                temperature=0.7,
+                top_p=0.8, # Recommended for non-thinking mode
+                temperature=0.7, # Recommended for non-thinking mode
                 pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
             )
         parsed_output = parse_llm_output(outputs, messages)
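A hypothetical call shape for generate_response(), assuming the history assembled elsewhere in app.py is the usual list of role/content dicts with the persona system prompt first (the names and contents below are placeholders, not taken from this commit):

history = [
    {"role": "system", "content": "You are simulating the character 'Ada'. Stay in persona."},
    {"role": "user", "content": "Who are you?"},
    {"role": "assistant", "content": "I'm Ada. Pleased to meet you."},
    {"role": "user", "content": "What do you do for a living?"},
]
reply = generate_response(history)
print(reply)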
@@ -300,7 +330,7 @@ def create_interface():
     .send-button { background-color: #2c3e50 !important; color: white !important; }
     .persona-button { background-color: #4ca1af !important; color: white !important; }
     .system-prompt-display { background-color: #f5f5f5; border-radius: 8px; padding: 15px; margin-top: 15px; border: 1px solid #e0e0e0; font-family: monospace; white-space: pre-wrap; word-wrap: break-word; }
-    .footer { text-align: center; margin-top: 20px; font-size: 0.9rem; color: #666; }
+    .footer { text-align: center; margin-top: 20px; font-size: 0.9em; color: #666; }
     .typing-indicator { color: #aaa; font-style: italic; }
     """
     with gr.Blocks(css=css, title="AI Persona Simulator") as interface:
 