ErenalpCet committed on
Commit 5a3a162 · verified · 1 Parent(s): 67e28db

Update app.py

Files changed (1)
  1. app.py +17 -20
app.py CHANGED
@@ -8,19 +8,19 @@ import time
 from spaces import GPU
 
 # --- Constants and Configuration ---
-MODEL_ID = "Qwen/Qwen3-4B"
+MODEL_ID = "google/gemma-3-1b-it" # Updated to Gemma 3 1B
 MAX_GPU_MEMORY = "60GiB"
 
 # --- Model Loading ---
 @GPU(memory=60)
 def load_model():
-    """Load the Qwen3-4B model without quantization for full precision."""
+    """Load the Gemma 3 1B model without quantization for full precision."""
     print(f"Attempting to load model: {MODEL_ID} without quantization")
     try:
         pipe = pipeline(
             "text-generation",
             model=MODEL_ID,
-            torch_dtype=torch.bfloat16, # Full precision, no quantization
+            torch_dtype=torch.bfloat16, # Full precision, compatible with Gemma
             device_map="auto",
             model_kwargs={
                 "use_cache": True,
@@ -150,7 +150,7 @@ def parse_llm_output(full_output, input_prompt_list):
 
 @GPU(memory=60)
 def generate_enhanced_persona(name, bio_text, context=""):
-    """Use the LLM to enhance the persona profile with thinking disabled."""
+    """Use the LLM to enhance the persona profile."""
     pipe = load_model()
     print(f"Generating enhanced persona for {name}...")
     enhancement_prompt = [
@@ -161,17 +161,16 @@ def generate_enhanced_persona(name, bio_text, context=""):
     tokenizer = pipe.tokenizer
     text = tokenizer.apply_chat_template(
         enhancement_prompt,
-        tokenize=False,
         add_generation_prompt=True,
-        enable_thinking=False # Disable thinking mode
+        tokenize=False
     )
     with torch.amp.autocast('cuda', dtype=torch.bfloat16):
         outputs = pipe(
             text,
             max_new_tokens=512,
             do_sample=True,
-            temperature=0.7, # Recommended for non-thinking mode
-            top_p=0.8, # Recommended for non-thinking mode
+            temperature=0.7,
+            top_p=0.8,
             pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
         )
     parsed_output = parse_llm_output(outputs, enhancement_prompt)
@@ -184,7 +183,7 @@ def generate_enhanced_persona(name, bio_text, context=""):
 
 @GPU(memory=60)
 def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
-    """Generate an optimized system prompt for the persona with thinking disabled."""
+    """Generate an optimized system prompt for the persona."""
     pipe = load_model()
     print(f"Generating system prompt for {name}...")
     fallback_prompt = f"""You are simulating the character '{name}'. Act and respond according to this profile:\n{enhanced_profile}\nAdditional context for the simulation: {context}\n---\nMaintain this persona consistently. Respond naturally based on the profile. Do not mention that you are an AI or a simulation. If asked about details not in the profile, you can be evasive or state you don't know/remember, consistent with the persona."""
@@ -196,17 +195,16 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
     tokenizer = pipe.tokenizer
     text = tokenizer.apply_chat_template(
         prompt,
-        tokenize=False,
         add_generation_prompt=True,
-        enable_thinking=False # Disable thinking mode
+        tokenize=False
    )
     with torch.amp.autocast('cuda', dtype=torch.bfloat16):
         outputs = pipe(
             text,
             max_new_tokens=300,
             do_sample=True,
-            temperature=0.7, # Recommended for non-thinking mode
-            top_p=0.8, # Recommended for non-thinking mode
+            temperature=0.7,
+            top_p=0.8,
             pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
         )
     parsed_output = parse_llm_output(outputs, prompt)
@@ -219,7 +217,7 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
 
 @GPU(memory=60)
 def generate_response(messages):
-    """Generate a response using the LLM with thinking disabled."""
+    """Generate a response using the LLM."""
     pipe = load_model()
     print("Generating response...")
     if not messages:
@@ -228,17 +226,16 @@ def generate_response(messages):
     tokenizer = pipe.tokenizer
     text = tokenizer.apply_chat_template(
         messages,
-        tokenize=False,
         add_generation_prompt=True,
-        enable_thinking=False # Disable thinking mode
+        tokenize=False
     )
     with torch.amp.autocast('cuda', dtype=torch.bfloat16):
         outputs = pipe(
             text,
             max_new_tokens=512,
             do_sample=True,
-            top_p=0.8, # Recommended for non-thinking mode
-            temperature=0.7, # Recommended for non-thinking mode
+            top_p=0.8,
+            temperature=0.7,
             pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
         )
     parsed_output = parse_llm_output(outputs, messages)
@@ -292,7 +289,7 @@ class PersonaChat:
             yield status, self.enhanced_profile, self.enhanced_profile, []
         self.system_prompt = generate_system_prompt_with_llm(name, profile_for_prompt, context)
         # Clean tokenizer artifacts from system prompt
-        self.system_prompt = re.sub(r'<\|im_end\|>|<\|im_start\|>|<think>.*?</think>|^assistant\s*', '', self.system_prompt).strip()
+        self.system_prompt = re.sub(r'<\|im_end\|>|<\|im_start\|>|^assistant\s*', '', self.system_prompt).strip()
         self.messages = [{"role": "system", "content": self.system_prompt}]
         print(f"set_persona: Final yield with messages (not sent to Chatbot): {self.messages}")
         # Yield empty history for Chatbot to avoid system message issues
@@ -329,7 +326,7 @@ def create_interface():
     .gradio-container { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
     .main-container { max-width: 1200px; margin: auto; padding: 0; }
     .header { background: linear-gradient(90deg, #2c3e50, #4ca1af); color: white; padding: 20px; border-radius: 10px 10px 0 0; margin-bottom: 20px; text-align: center; }
-    .setup-section { background-color: #f9f9f9; border-radius: 10px; padding: 20px; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); margin-bottom: 20px; }
+    .setup-section { background-color: #f9f9f9; border-radius: 10px; padding: 20px; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); margin-bottom: 20px; }
     .chat-section { background-color: white; border-radius: 10px; padding: 20px; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); }
     .status-bar { background: #e9ecef; padding: 10px 15px; border-radius: 5px; margin: 15px 0; font-weight: 500; border: 1px solid #ced4da; }
     .chat-container { border: 1px solid #eaeaea; border-radius: 10px; height: 500px !important; overflow-y: auto; background-color: #ffffff; padding: 10px; }
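
For reference, a minimal standalone sketch of the generation path this commit leaves in place: build the text-generation pipeline for the new MODEL_ID and render the chat template without the Qwen-specific enable_thinking flag, which Gemma's template does not use. The example messages are made up, and the @GPU decorator, autocast context, error handling, and parse_llm_output from app.py are omitted here; the model ID, dtype, and sampling settings mirror the diff.

import torch
from transformers import pipeline

MODEL_ID = "google/gemma-3-1b-it"  # value introduced by this commit

# Load the pipeline with the same dtype/device settings as app.py.
pipe = pipeline(
    "text-generation",
    model=MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    model_kwargs={"use_cache": True},
)

# Hypothetical conversation, just to exercise the chat template.
messages = [
    {"role": "system", "content": "You are simulating the character 'Ada'."},
    {"role": "user", "content": "Introduce yourself in one sentence."},
]

# enable_thinking is consumed by Qwen3's chat template and has no effect
# with Gemma's, so the commit simply drops the argument.
text = pipe.tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

outputs = pipe(
    text,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.8,
    pad_token_id=pipe.tokenizer.eos_token_id,
)
print(outputs[0]["generated_text"])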