Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,43 +1,35 @@
 import gradio as gr
 import transformers
 import torch
-from transformers import pipeline
 from duckduckgo_search import DDGS
 import re
 import time
-from huggingface_hub import HfApi
 from spaces import GPU
 
 # --- Constants and Configuration ---
-MODEL_ID = "
 MAX_GPU_MEMORY = "60GiB"
 
 # --- Model Loading ---
 @GPU(memory=60)
 def load_model():
-    """Load the
-    print(f"Attempting to load model: {MODEL_ID}
     try:
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.bfloat16,
-            bnb_4bit_use_double_quant=True,
-        )
         pipe = pipeline(
             "text-generation",
             model=MODEL_ID,
-            torch_dtype=torch.bfloat16,
             device_map="auto",
             model_kwargs={
-                "quantization_config": quantization_config,
                 "use_cache": True,
             }
         )
-        print(f"Model {MODEL_ID} loaded successfully on device: {pipe.device} (
         return pipe
     except Exception as e:
-        print(f"FATAL Error loading model '{MODEL_ID}'
         raise e
 
 # --- Web Search ---
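For reference, the removed 4-bit loading path corresponds to a standalone sketch like the one below. The old MODEL_ID value is truncated in the diff, so a placeholder is used, and BitsAndBytesConfig is assumed to have been imported from transformers in the old revision.

import torch
from transformers import BitsAndBytesConfig, pipeline

MODEL_ID = "<old-model-id>"  # placeholder; the original value is truncated in the diff

# 4-bit NF4 quantization settings, matching the removed lines
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

pipe = pipeline(
    "text-generation",
    model=MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    model_kwargs={
        "quantization_config": quantization_config,  # forwarded to from_pretrained
        "use_cache": True,
    },
)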
@@ -158,7 +150,7 @@ def parse_llm_output(full_output, input_prompt_list):
 
 @GPU(memory=60)
 def generate_enhanced_persona(name, bio_text, context=""):
-    """Use the LLM to enhance the persona profile."""
     pipe = load_model()
     print(f"Generating enhanced persona for {name}...")
     enhancement_prompt = [
@@ -166,8 +158,23 @@ def generate_enhanced_persona(name, bio_text, context=""):
         {"role": "user", "content": f"""Synthesize the following information about '{name}' into a character profile. Context: {context} Information Found:\n{bio_text}\n\nCreate the profile based *only* on the text above."""}
     ]
     try:
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
-            outputs = pipe(
         parsed_output = parse_llm_output(outputs, enhancement_prompt)
         print("Enhanced persona generated.")
         return parsed_output if parsed_output else f"Could not generate profile based on:\n{bio_text}"
@@ -178,7 +185,7 @@ def generate_enhanced_persona(name, bio_text, context=""):
 
 @GPU(memory=60)
 def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
-    """Generate an optimized system prompt for the persona."""
     pipe = load_model()
     print(f"Generating system prompt for {name}...")
     fallback_prompt = f"""You are simulating the character '{name}'. Act and respond according to this profile:\n{enhanced_profile}\nAdditional context for the simulation: {context}\n---\nMaintain this persona consistently. Respond naturally based on the profile. Do not mention that you are an AI or a simulation. If asked about details not in the profile, you can be evasive or state you don't know/remember, consistent with the persona."""
@@ -187,8 +194,23 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
         {"role": "user", "content": f"""Create a system prompt for an AI to simulate the character '{name}'. Context for simulation: {context} Character Profile:\n{enhanced_profile}\n\nGenerate the system prompt based *only* on the profile and context provided."""}
     ]
     try:
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
-            outputs = pipe(
         parsed_output = parse_llm_output(outputs, prompt)
         print("System prompt generated.")
         return parsed_output if parsed_output else fallback_prompt
@@ -199,19 +221,27 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
 
 @GPU(memory=60)
 def generate_response(messages):
-    """Generate a response using the LLM."""
     pipe = load_model()
     print("Generating response...")
     if not messages:
         return "Error: No message history provided."
     try:
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
             outputs = pipe(
-
                 max_new_tokens=512,
                 do_sample=True,
-                top_p=0.
-                temperature=0.7,
                 pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
             )
         parsed_output = parse_llm_output(outputs, messages)
@@ -300,7 +330,7 @@ def create_interface():
     .send-button { background-color: #2c3e50 !important; color: white !important; }
     .persona-button { background-color: #4ca1af !important; color: white !important; }
     .system-prompt-display { background-color: #f5f5f5; border-radius: 8px; padding: 15px; margin-top: 15px; border: 1px solid #e0e0e0; font-family: monospace; white-space: pre-wrap; word-wrap: break-word; }
-    .footer { text-align: center; margin-top: 20px; font-size: 0.
     .typing-indicator { color: #aaa; font-style: italic; }
     """
     with gr.Blocks(css=css, title="AI Persona Simulator") as interface:
@@ -1,43 +1,35 @@
 import gradio as gr
 import transformers
 import torch
+from transformers import pipeline
 from duckduckgo_search import DDGS
 import re
 import time
 from spaces import GPU
 
 # --- Constants and Configuration ---
+MODEL_ID = "Qwen/Qwen3-4B"
 MAX_GPU_MEMORY = "60GiB"
 
 # --- Model Loading ---
 @GPU(memory=60)
 def load_model():
+    """Load the Qwen3-4B model without quantization for full precision."""
+    print(f"Attempting to load model: {MODEL_ID} without quantization")
     try:
         pipe = pipeline(
             "text-generation",
             model=MODEL_ID,
+            torch_dtype=torch.bfloat16,  # Full precision, no quantization
             device_map="auto",
             model_kwargs={
                 "use_cache": True,
             }
         )
+        print(f"Model {MODEL_ID} loaded successfully on device: {pipe.device} (full precision)")
         return pipe
     except Exception as e:
+        print(f"FATAL Error loading model '{MODEL_ID}': {e}")
         raise e
 
 # --- Web Search ---
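Taken on its own, the new loading path reduces to the short sketch below, assuming a GPU with enough memory to hold Qwen3-4B in bfloat16 (the MAX_GPU_MEMORY constant and @GPU(memory=60) decorator above suggest roughly 60 GiB is budgeted).

import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen3-4B",
    torch_dtype=torch.bfloat16,   # full-precision bf16 weights, no quantization
    device_map="auto",
    model_kwargs={"use_cache": True},
)
print(pipe.device)  # e.g. cuda:0 when a GPU is visible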
@@ -158,7 +150,7 @@ def parse_llm_output(full_output, input_prompt_list):
 
 @GPU(memory=60)
 def generate_enhanced_persona(name, bio_text, context=""):
+    """Use the LLM to enhance the persona profile with thinking disabled."""
     pipe = load_model()
     print(f"Generating enhanced persona for {name}...")
     enhancement_prompt = [
@@ -166,8 +158,23 @@ def generate_enhanced_persona(name, bio_text, context=""):
         {"role": "user", "content": f"""Synthesize the following information about '{name}' into a character profile. Context: {context} Information Found:\n{bio_text}\n\nCreate the profile based *only* on the text above."""}
     ]
     try:
+        tokenizer = pipe.tokenizer
+        text = tokenizer.apply_chat_template(
+            enhancement_prompt,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False  # Disable thinking mode
+        )
+        model_inputs = tokenizer([text], return_tensors="pt").to(pipe.model.device)
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
+            outputs = pipe(
+                model_inputs,
+                max_new_tokens=512,
+                do_sample=True,
+                temperature=0.7,  # Recommended for non-thinking mode
+                top_p=0.8,  # Recommended for non-thinking mode
+                pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
+            )
         parsed_output = parse_llm_output(outputs, enhancement_prompt)
         print("Enhanced persona generated.")
         return parsed_output if parsed_output else f"Could not generate profile based on:\n{bio_text}"
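The new template step can be exercised in isolation. Qwen3's chat template accepts an enable_thinking flag, and the rendered string can be passed straight to the text-generation pipeline, which tokenizes internally (it takes a prompt string or a chat message list rather than pre-tokenized tensors). A minimal sketch with a toy message follows; the message content is illustrative only.

import torch
from transformers import pipeline

pipe = pipeline("text-generation", model="Qwen/Qwen3-4B",
                torch_dtype=torch.bfloat16, device_map="auto")

messages = [{"role": "user", "content": "Summarize this bio into a character profile: ..."}]

# Render the chat with Qwen3's template; enable_thinking=False disables thinking mode.
prompt_text = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
)

# The pipeline accepts the rendered string and handles tokenization itself.
outputs = pipe(prompt_text, max_new_tokens=512, do_sample=True,
               temperature=0.7, top_p=0.8, return_full_text=False)
print(outputs[0]["generated_text"])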
@@ -178,7 +185,7 @@ def generate_enhanced_persona(name, bio_text, context=""):
 
 @GPU(memory=60)
 def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
+    """Generate an optimized system prompt for the persona with thinking disabled."""
     pipe = load_model()
     print(f"Generating system prompt for {name}...")
     fallback_prompt = f"""You are simulating the character '{name}'. Act and respond according to this profile:\n{enhanced_profile}\nAdditional context for the simulation: {context}\n---\nMaintain this persona consistently. Respond naturally based on the profile. Do not mention that you are an AI or a simulation. If asked about details not in the profile, you can be evasive or state you don't know/remember, consistent with the persona."""
@@ -187,8 +194,23 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
         {"role": "user", "content": f"""Create a system prompt for an AI to simulate the character '{name}'. Context for simulation: {context} Character Profile:\n{enhanced_profile}\n\nGenerate the system prompt based *only* on the profile and context provided."""}
     ]
     try:
+        tokenizer = pipe.tokenizer
+        text = tokenizer.apply_chat_template(
+            prompt,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False  # Disable thinking mode
+        )
+        model_inputs = tokenizer([text], return_tensors="pt").to(pipe.model.device)
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
+            outputs = pipe(
+                model_inputs,
+                max_new_tokens=300,
+                do_sample=True,
+                temperature=0.7,  # Recommended for non-thinking mode
+                top_p=0.8,  # Recommended for non-thinking mode
+                pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
+            )
         parsed_output = parse_llm_output(outputs, prompt)
         print("System prompt generated.")
         return parsed_output if parsed_output else fallback_prompt
@@ -199,19 +221,27 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
 
 @GPU(memory=60)
 def generate_response(messages):
+    """Generate a response using the LLM with thinking disabled."""
     pipe = load_model()
     print("Generating response...")
     if not messages:
         return "Error: No message history provided."
     try:
+        tokenizer = pipe.tokenizer
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False  # Disable thinking mode
+        )
+        model_inputs = tokenizer([text], return_tensors="pt").to(pipe.model.device)
         with torch.amp.autocast('cuda', dtype=torch.bfloat16):
             outputs = pipe(
+                model_inputs,
                 max_new_tokens=512,
                 do_sample=True,
+                top_p=0.8,  # Recommended for non-thinking mode
+                temperature=0.7,  # Recommended for non-thinking mode
                 pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
             )
         parsed_output = parse_llm_output(outputs, messages)
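generate_enhanced_persona, generate_system_prompt_with_llm, and generate_response now repeat the same template-then-sample sequence, so the shared steps could be factored into one helper. A sketch follows, using a hypothetical _generate_chat name; it is not taken from the commit.

import torch

def _generate_chat(pipe, chat_messages, max_new_tokens=512):
    # Render the chat with thinking disabled, as the three functions above do.
    prompt_text = pipe.tokenizer.apply_chat_template(
        chat_messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
    )
    # Sample with the settings the commit uses for non-thinking mode.
    with torch.amp.autocast('cuda', dtype=torch.bfloat16):
        return pipe(
            prompt_text,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.8,
            pad_token_id=pipe.tokenizer.eos_token_id,
        )

# generate_response(messages) would then reduce to roughly:
#     outputs = _generate_chat(load_model(), messages)
#     return parse_llm_output(outputs, messages)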
@@ -300,7 +330,7 @@ def create_interface():
     .send-button { background-color: #2c3e50 !important; color: white !important; }
     .persona-button { background-color: #4ca1af !important; color: white !important; }
     .system-prompt-display { background-color: #f5f5f5; border-radius: 8px; padding: 15px; margin-top: 15px; border: 1px solid #e0e0e0; font-family: monospace; white-space: pre-wrap; word-wrap: break-word; }
+    .footer { text-align: center; margin-top: 20px; font-size: 0.9em; color: #666; }
     .typing-indicator { color: #aaa; font-style: italic; }
     """
     with gr.Blocks(css=css, title="AI Persona Simulator") as interface: