ErenalpCet committed (verified)
Commit 1dec77f · Parent(s): 8c32d82

Update app.py

Files changed (1): app.py (+54 −24)
app.py CHANGED
@@ -1,43 +1,35 @@
 import gradio as gr
 import transformers
 import torch
-from transformers import pipeline, BitsAndBytesConfig
+from transformers import pipeline
 from duckduckgo_search import DDGS
 import re
 import time
-from huggingface_hub import HfApi
 from spaces import GPU
 
 # --- Constants and Configuration ---
-MODEL_ID = "nvidia/Llama-3.1-Nemotron-8B-UltraLong-4M-Instruct"
+MODEL_ID = "Qwen/Qwen3-4B"
 MAX_GPU_MEMORY = "60GiB"
 
 # --- Model Loading ---
 @GPU(memory=60)
 def load_model():
-    """Load the LLM model optimized for A100 GPU using 4-bit quantization."""
-    print(f"Attempting to load model: {MODEL_ID} with 4-bit quantization")
+    """Load the Qwen3-4B model without quantization for full precision."""
+    print(f"Attempting to load model: {MODEL_ID} without quantization")
     try:
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.bfloat16,
-            bnb_4bit_use_double_quant=True,
-        )
         pipe = pipeline(
             "text-generation",
             model=MODEL_ID,
-            torch_dtype=torch.bfloat16,
+            torch_dtype=torch.bfloat16, # Full precision, no quantization
             device_map="auto",
             model_kwargs={
-                "quantization_config": quantization_config,
                 "use_cache": True,
             }
         )
-        print(f"Model {MODEL_ID} loaded successfully on device: {pipe.device} (using 4-bit quantization)")
+        print(f"Model {MODEL_ID} loaded successfully on device: {pipe.device} (full precision)")
         return pipe
     except Exception as e:
-        print(f"FATAL Error loading model '{MODEL_ID}' (check memory/config): {e}")
+        print(f"FATAL Error loading model '{MODEL_ID}': {e}")
         raise e
 
 # --- Web Search ---
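For comparison, here is a minimal sketch of the same full-precision load spelled out with the explicit Auto* classes instead of the pipeline() helper. It is illustrative only (not part of the commit) and assumes settings equivalent to the ones above.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen3-4B"

# bf16 weights, no 4-bit quantization; accelerate decides device placement.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)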
@@ -158,7 +150,7 @@ def parse_llm_output(full_output, input_prompt_list):
 
 @GPU(memory=60)
 def generate_enhanced_persona(name, bio_text, context=""):
-    """Use the LLM to enhance the persona profile."""
+    """Use the LLM to enhance the persona profile with thinking disabled."""
     pipe = load_model()
     print(f"Generating enhanced persona for {name}...")
     enhancement_prompt = [
@@ -166,8 +158,23 @@ def generate_enhanced_persona(name, bio_text, context=""):
         {"role": "user", "content": f"""Synthesize the following information about '{name}' into a character profile. Context: {context} Information Found:\n{bio_text}\n\nCreate the profile based *only* on the text above."""}
     ]
     try:
+        tokenizer = pipe.tokenizer
+        text = tokenizer.apply_chat_template(
+            enhancement_prompt,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False # Disable thinking mode
+        )
+        model_inputs = tokenizer([text], return_tensors="pt").to(pipe.model.device)
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
-            outputs = pipe(enhancement_prompt, max_new_tokens=512, do_sample=True, temperature=0.7, top_p=0.9)
+            outputs = pipe(
+                model_inputs,
+                max_new_tokens=512,
+                do_sample=True,
+                temperature=0.7, # Recommended for non-thinking mode
+                top_p=0.8, # Recommended for non-thinking mode
+                pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
+            )
         parsed_output = parse_llm_output(outputs, enhancement_prompt)
         print("Enhanced persona generated.")
         return parsed_output if parsed_output else f"Could not generate profile based on:\n{bio_text}"
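For reference, the same non-thinking flow can also be driven end to end with model.generate() on the tokenized inputs. This is a minimal sketch (not part of the commit), reusing the tokenizer and model from the load sketch above and the sampling values the diff recommends for non-thinking mode (temperature 0.7, top_p 0.8).

messages = [
    {"role": "user", "content": "Introduce yourself in one sentence."}
]
# Render the chat template with Qwen3 thinking mode disabled.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated = model.generate(
    **model_inputs,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.8,
)
# Drop the prompt tokens and keep only the newly generated continuation.
new_tokens = generated[0][model_inputs["input_ids"].shape[-1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))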
@@ -178,7 +185,7 @@ def generate_enhanced_persona(name, bio_text, context=""):
 
 @GPU(memory=60)
 def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
-    """Generate an optimized system prompt for the persona."""
+    """Generate an optimized system prompt for the persona with thinking disabled."""
     pipe = load_model()
     print(f"Generating system prompt for {name}...")
     fallback_prompt = f"""You are simulating the character '{name}'. Act and respond according to this profile:\n{enhanced_profile}\nAdditional context for the simulation: {context}\n---\nMaintain this persona consistently. Respond naturally based on the profile. Do not mention that you are an AI or a simulation. If asked about details not in the profile, you can be evasive or state you don't know/remember, consistent with the persona."""
@@ -187,8 +194,23 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
         {"role": "user", "content": f"""Create a system prompt for an AI to simulate the character '{name}'. Context for simulation: {context} Character Profile:\n{enhanced_profile}\n\nGenerate the system prompt based *only* on the profile and context provided."""}
     ]
     try:
+        tokenizer = pipe.tokenizer
+        text = tokenizer.apply_chat_template(
+            prompt,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False # Disable thinking mode
+        )
+        model_inputs = tokenizer([text], return_tensors="pt").to(pipe.model.device)
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
-            outputs = pipe(prompt, max_new_tokens=300, do_sample=True, temperature=0.6)
+            outputs = pipe(
+                model_inputs,
+                max_new_tokens=300,
+                do_sample=True,
+                temperature=0.7, # Recommended for non-thinking mode
+                top_p=0.8, # Recommended for non-thinking mode
+                pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
+            )
         parsed_output = parse_llm_output(outputs, prompt)
         print("System prompt generated.")
         return parsed_output if parsed_output else fallback_prompt
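The template-and-tokenize steps above repeat across generate_enhanced_persona, generate_system_prompt_with_llm, and generate_response; a hypothetical helper (purely illustrative, not in the commit) could express the shared pattern once:

def build_non_thinking_inputs(pipe, messages):
    """Render a chat history with Qwen3 thinking mode disabled, then tokenize it."""
    tokenizer = pipe.tokenizer
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    return tokenizer([text], return_tensors="pt").to(pipe.model.device)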
@@ -199,19 +221,27 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
 
 @GPU(memory=60)
 def generate_response(messages):
-    """Generate a response using the LLM."""
+    """Generate a response using the LLM with thinking disabled."""
     pipe = load_model()
     print("Generating response...")
     if not messages:
         return "Error: No message history provided."
     try:
+        tokenizer = pipe.tokenizer
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False # Disable thinking mode
+        )
+        model_inputs = tokenizer([text], return_tensors="pt").to(pipe.model.device)
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
             outputs = pipe(
-                messages,
+                model_inputs,
                 max_new_tokens=512,
                 do_sample=True,
-                top_p=0.9,
-                temperature=0.7,
+                top_p=0.8, # Recommended for non-thinking mode
+                temperature=0.7, # Recommended for non-thinking mode
                 pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
             )
         parsed_output = parse_llm_output(outputs, messages)
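A hypothetical call shape for generate_response(), assuming the history assembled elsewhere in app.py is the usual list of role/content dicts with the persona system prompt first (the names and contents below are placeholders, not taken from this commit):

history = [
    {"role": "system", "content": "You are simulating the character 'Ada'. Stay in persona."},
    {"role": "user", "content": "Who are you?"},
    {"role": "assistant", "content": "I'm Ada. Pleased to meet you."},
    {"role": "user", "content": "What do you do for a living?"},
]
reply = generate_response(history)
print(reply)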
@@ -300,7 +330,7 @@ def create_interface():
     .send-button { background-color: #2c3e50 !important; color: white !important; }
     .persona-button { background-color: #4ca1af !important; color: white !important; }
     .system-prompt-display { background-color: #f5f5f5; border-radius: 8px; padding: 15px; margin-top: 15px; border: 1px solid #e0e0e0; font-family: monospace; white-space: pre-wrap; word-wrap: break-word; }
-    .footer { text-align: center; margin-top: 20px; font-size: 0.9rem; color: #666; }
+    .footer { text-align: center; margin-top: 20px; font-size: 0.9em; color: #666; }
     .typing-indicator { color: #aaa; font-style: italic; }
     """
     with gr.Blocks(css=css, title="AI Persona Simulator") as interface:
 