ErenalpCet committed on
Commit 5a3a162 · verified · 1 Parent(s): 67e28db

Update app.py

Files changed (1)
  1. app.py +17 -20
app.py CHANGED
@@ -8,19 +8,19 @@ import time
 from spaces import GPU
 
 # --- Constants and Configuration ---
-MODEL_ID = "Qwen/Qwen3-4B"
+MODEL_ID = "google/gemma-3-1b-it" # Updated to Gemma 3 1B
 MAX_GPU_MEMORY = "60GiB"
 
 # --- Model Loading ---
 @GPU(memory=60)
 def load_model():
-    """Load the Qwen3-4B model without quantization for full precision."""
+    """Load the Gemma 3 1B model without quantization for full precision."""
     print(f"Attempting to load model: {MODEL_ID} without quantization")
     try:
         pipe = pipeline(
             "text-generation",
             model=MODEL_ID,
-            torch_dtype=torch.bfloat16, # Full precision, no quantization
+            torch_dtype=torch.bfloat16, # Full precision, compatible with Gemma
             device_map="auto",
             model_kwargs={
                 "use_cache": True,
@@ -150,7 +150,7 @@ def parse_llm_output(full_output, input_prompt_list):
 
 @GPU(memory=60)
 def generate_enhanced_persona(name, bio_text, context=""):
-    """Use the LLM to enhance the persona profile with thinking disabled."""
+    """Use the LLM to enhance the persona profile."""
     pipe = load_model()
     print(f"Generating enhanced persona for {name}...")
     enhancement_prompt = [
@@ -161,17 +161,16 @@ def generate_enhanced_persona(name, bio_text, context=""):
     tokenizer = pipe.tokenizer
     text = tokenizer.apply_chat_template(
         enhancement_prompt,
-        tokenize=False,
         add_generation_prompt=True,
-        enable_thinking=False # Disable thinking mode
+        tokenize=False
     )
     with torch.amp.autocast('cuda', dtype=torch.bfloat16):
         outputs = pipe(
             text,
             max_new_tokens=512,
             do_sample=True,
-            temperature=0.7, # Recommended for non-thinking mode
-            top_p=0.8, # Recommended for non-thinking mode
+            temperature=0.7,
+            top_p=0.8,
             pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
         )
     parsed_output = parse_llm_output(outputs, enhancement_prompt)
@@ -184,7 +183,7 @@ def generate_enhanced_persona(name, bio_text, context=""):
 
 @GPU(memory=60)
 def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
-    """Generate an optimized system prompt for the persona with thinking disabled."""
+    """Generate an optimized system prompt for the persona."""
     pipe = load_model()
     print(f"Generating system prompt for {name}...")
     fallback_prompt = f"""You are simulating the character '{name}'. Act and respond according to this profile:\n{enhanced_profile}\nAdditional context for the simulation: {context}\n---\nMaintain this persona consistently. Respond naturally based on the profile. Do not mention that you are an AI or a simulation. If asked about details not in the profile, you can be evasive or state you don't know/remember, consistent with the persona."""
@@ -196,17 +195,16 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
     tokenizer = pipe.tokenizer
     text = tokenizer.apply_chat_template(
         prompt,
-        tokenize=False,
         add_generation_prompt=True,
-        enable_thinking=False # Disable thinking mode
+        tokenize=False
    )
     with torch.amp.autocast('cuda', dtype=torch.bfloat16):
         outputs = pipe(
             text,
             max_new_tokens=300,
             do_sample=True,
-            temperature=0.7, # Recommended for non-thinking mode
-            top_p=0.8, # Recommended for non-thinking mode
+            temperature=0.7,
+            top_p=0.8,
             pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
         )
     parsed_output = parse_llm_output(outputs, prompt)
@@ -219,7 +217,7 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
 
 @GPU(memory=60)
 def generate_response(messages):
-    """Generate a response using the LLM with thinking disabled."""
+    """Generate a response using the LLM."""
     pipe = load_model()
     print("Generating response...")
     if not messages:
@@ -228,17 +226,16 @@ def generate_response(messages):
     tokenizer = pipe.tokenizer
     text = tokenizer.apply_chat_template(
         messages,
-        tokenize=False,
         add_generation_prompt=True,
-        enable_thinking=False # Disable thinking mode
+        tokenize=False
     )
     with torch.amp.autocast('cuda', dtype=torch.bfloat16):
         outputs = pipe(
             text,
             max_new_tokens=512,
             do_sample=True,
-            top_p=0.8, # Recommended for non-thinking mode
-            temperature=0.7, # Recommended for non-thinking mode
+            top_p=0.8,
+            temperature=0.7,
             pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
         )
     parsed_output = parse_llm_output(outputs, messages)
@@ -292,7 +289,7 @@ class PersonaChat:
             yield status, self.enhanced_profile, self.enhanced_profile, []
         self.system_prompt = generate_system_prompt_with_llm(name, profile_for_prompt, context)
         # Clean tokenizer artifacts from system prompt
-        self.system_prompt = re.sub(r'<\|im_end\|>|<\|im_start\|>|<think>.*?</think>|^assistant\s*', '', self.system_prompt).strip()
+        self.system_prompt = re.sub(r'<\|im_end\|>|<\|im_start\|>|^assistant\s*', '', self.system_prompt).strip()
         self.messages = [{"role": "system", "content": self.system_prompt}]
         print(f"set_persona: Final yield with messages (not sent to Chatbot): {self.messages}")
         # Yield empty history for Chatbot to avoid system message issues
@@ -329,7 +326,7 @@ def create_interface():
     .gradio-container { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
     .main-container { max-width: 1200px; margin: auto; padding: 0; }
     .header { background: linear-gradient(90deg, #2c3e50, #4ca1af); color: white; padding: 20px; border-radius: 10px 10px 0 0; margin-bottom: 20px; text-align: center; }
-    .setup-section { background-color: #f9f9f9; border-radius: 10px; padding: 20px; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); margin-bottom: 20px; }
+    .setup-section { background-color: #f9f9f9; border-radius: 10px; padding: 20px; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); margin-bottom: 20px; }
     .chat-section { background-color: white; border-radius: 10px; padding: 20px; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); }
     .status-bar { background: #e9ecef; padding: 10px 15px; border-radius: 5px; margin: 15px 0; font-weight: 500; border: 1px solid #ced4da; }
     .chat-container { border: 1px solid #eaeaea; border-radius: 10px; height: 500px !important; overflow-y: auto; background-color: #ffffff; padding: 10px; }
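
For reference, a minimal standalone sketch of the generation path this commit leaves in place: build the text-generation pipeline for the new MODEL_ID and render the chat template without the Qwen-specific enable_thinking flag, which Gemma's template does not use. The example messages are made up, and the @GPU decorator, autocast context, error handling, and parse_llm_output from app.py are omitted here; the model ID, dtype, and sampling settings mirror the diff.

import torch
from transformers import pipeline

MODEL_ID = "google/gemma-3-1b-it"  # value introduced by this commit

# Load the pipeline with the same dtype/device settings as app.py.
pipe = pipeline(
    "text-generation",
    model=MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    model_kwargs={"use_cache": True},
)

# Hypothetical conversation, just to exercise the chat template.
messages = [
    {"role": "system", "content": "You are simulating the character 'Ada'."},
    {"role": "user", "content": "Introduce yourself in one sentence."},
]

# enable_thinking is consumed by Qwen3's chat template and has no effect
# with Gemma's, so the commit simply drops the argument.
text = pipe.tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

outputs = pipe(
    text,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.8,
    pad_token_id=pipe.tokenizer.eos_token_id,
)
print(outputs[0]["generated_text"])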