Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -8,19 +8,19 @@ import time
 from spaces import GPU
 
 # --- Constants and Configuration ---
-MODEL_ID = "
+MODEL_ID = "google/gemma-3-1b-it" # Updated to Gemma 3 1B
 MAX_GPU_MEMORY = "60GiB"
 
 # --- Model Loading ---
 @GPU(memory=60)
 def load_model():
-    """Load the
+    """Load the Gemma 3 1B model without quantization for full precision."""
     print(f"Attempting to load model: {MODEL_ID} without quantization")
     try:
         pipe = pipeline(
             "text-generation",
             model=MODEL_ID,
-            torch_dtype=torch.bfloat16, # Full precision,
+            torch_dtype=torch.bfloat16, # Full precision, compatible with Gemma
             device_map="auto",
             model_kwargs={
                 "use_cache": True,
@@ -150,7 +150,7 @@ def parse_llm_output(full_output, input_prompt_list):
 
 @GPU(memory=60)
 def generate_enhanced_persona(name, bio_text, context=""):
-    """Use the LLM to enhance the persona profile
+    """Use the LLM to enhance the persona profile."""
     pipe = load_model()
     print(f"Generating enhanced persona for {name}...")
     enhancement_prompt = [
@@ -161,17 +161,16 @@ def generate_enhanced_persona(name, bio_text, context=""):
     tokenizer = pipe.tokenizer
     text = tokenizer.apply_chat_template(
         enhancement_prompt,
-        tokenize=False,
         add_generation_prompt=True,
-
+        tokenize=False
     )
     with torch.amp.autocast('cuda', dtype=torch.bfloat16):
         outputs = pipe(
             text,
             max_new_tokens=512,
             do_sample=True,
-            temperature=0.7,
-            top_p=0.8,
+            temperature=0.7,
+            top_p=0.8,
             pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
         )
     parsed_output = parse_llm_output(outputs, enhancement_prompt)
@@ -184,7 +183,7 @@ def generate_enhanced_persona(name, bio_text, context=""):
 
 @GPU(memory=60)
 def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
-    """Generate an optimized system prompt for the persona
+    """Generate an optimized system prompt for the persona."""
     pipe = load_model()
     print(f"Generating system prompt for {name}...")
     fallback_prompt = f"""You are simulating the character '{name}'. Act and respond according to this profile:\n{enhanced_profile}\nAdditional context for the simulation: {context}\n---\nMaintain this persona consistently. Respond naturally based on the profile. Do not mention that you are an AI or a simulation. If asked about details not in the profile, you can be evasive or state you don't know/remember, consistent with the persona."""
@@ -196,17 +195,16 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
     tokenizer = pipe.tokenizer
     text = tokenizer.apply_chat_template(
         prompt,
-        tokenize=False,
         add_generation_prompt=True,
-
+        tokenize=False
     )
     with torch.amp.autocast('cuda', dtype=torch.bfloat16):
         outputs = pipe(
             text,
             max_new_tokens=300,
             do_sample=True,
-            temperature=0.7,
-            top_p=0.8,
+            temperature=0.7,
+            top_p=0.8,
             pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
         )
     parsed_output = parse_llm_output(outputs, prompt)
@@ -219,7 +217,7 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
 
 @GPU(memory=60)
 def generate_response(messages):
-    """Generate a response using the LLM
+    """Generate a response using the LLM."""
     pipe = load_model()
     print("Generating response...")
     if not messages:
@@ -228,17 +226,16 @@ def generate_response(messages):
     tokenizer = pipe.tokenizer
     text = tokenizer.apply_chat_template(
         messages,
-        tokenize=False,
         add_generation_prompt=True,
-
+        tokenize=False
    )
    with torch.amp.autocast('cuda', dtype=torch.bfloat16):
        outputs = pipe(
            text,
            max_new_tokens=512,
            do_sample=True,
-            top_p=0.8,
-            temperature=0.7,
+            top_p=0.8,
+            temperature=0.7,
            pad_token_id=pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id else None
        )
    parsed_output = parse_llm_output(outputs, messages)
@@ -292,7 +289,7 @@ class PersonaChat:
         yield status, self.enhanced_profile, self.enhanced_profile, []
         self.system_prompt = generate_system_prompt_with_llm(name, profile_for_prompt, context)
         # Clean tokenizer artifacts from system prompt
-        self.system_prompt = re.sub(r'<\|
+        self.system_prompt = re.sub(r'<\|im_tailored\|>|<\|im_start\|>|^assistant\s*', '', self.system_prompt).strip()
         self.messages = [{"role": "system", "content": self.system_prompt}]
         print(f"set_persona: Final yield with messages (not sent to Chatbot): {self.messages}")
         # Yield empty history for Chatbot to avoid system message issues
@@ -329,7 +326,7 @@ def create_interface():
     .gradio-container { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
     .main-container { max-width: 1200px; margin: auto; padding: 0; }
     .header { background: linear-gradient(90deg, #2c3e50, #4ca1af); color: white; padding: 20px; border-radius: 10px 10px 0 0; margin-bottom: 20px; text-align: center; }
-    .setup-section {
+    .setup-section { background-color: #f9f9f9; border-radius: 10px; padding: 20px; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); margin-bottom: 20px; }
     .chat-section { background-color: white; border-radius: 10px; padding: 20px; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); }
     .status-bar { background: #e9ecef; padding: 10px 15px; border-radius: 5px; margin: 15px 0; font-weight: 500; border: 1px solid #ced4da; }
     .chat-container { border: 1px solid #eaeaea; border-radius: 10px; height: 500px !important; overflow-y: auto; background-color: #ffffff; padding: 10px; }
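
For reference, a minimal standalone sketch of the configuration this commit lands on, runnable outside the Space (so without the spaces @GPU decorator). It assumes a recent transformers release with Gemma 3 support, a CUDA device with enough memory, and access to the gated google/gemma-3-1b-it checkpoint.

# Standalone sketch, not the Space's app.py: build the same text-generation
# pipeline the commit configures and run a small smoke test.
import torch
from transformers import pipeline

MODEL_ID = "google/gemma-3-1b-it"

pipe = pipeline(
    "text-generation",
    model=MODEL_ID,
    torch_dtype=torch.bfloat16,  # full precision, no quantization, as in load_model()
    device_map="auto",
    model_kwargs={"use_cache": True},
)

print(pipe("Hello, who are you?", max_new_tokens=20)[0]["generated_text"])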
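
Continuing from that pipe object, a sketch of the chat-template path the diff touches: the message list is rendered to a single prompt string (tokenize=False, add_generation_prompt=True), generation runs under bfloat16 autocast with the same sampling settings, and the echoed prompt is stripped in the spirit of the app's parse_llm_output(). The example message is a placeholder, not one of the app's real prompts.

# Assumes `pipe` from the sketch above; single-turn placeholder prompt.
import torch

messages = [{"role": "user", "content": "Describe the character Ada in two sentences."}]

text = pipe.tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

with torch.amp.autocast('cuda', dtype=torch.bfloat16):
    outputs = pipe(
        text,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.8,
        pad_token_id=pipe.tokenizer.eos_token_id,
    )

# The pipeline echoes the templated prompt; keep only the newly generated tail.
print(outputs[0]["generated_text"][len(text):].strip())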
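
Finally, a standalone illustration of the artifact cleanup the commit adds in PersonaChat.set_persona. The raw_output string is a made-up example of the kind of chat-template residue the regex targets; note that ^assistant only matches at the very start of the string.

import re

# Hypothetical raw LLM output carrying chat-template residue.
raw_output = "assistant\nYou are Ada, a meticulous archivist who answers in short, dry sentences.<|im_start|>"

# Same cleanup expression the commit adds before storing the system prompt.
system_prompt = re.sub(r'<\|im_tailored\|>|<\|im_start\|>|^assistant\s*', '', raw_output).strip()
print(system_prompt)
# -> You are Ada, a meticulous archivist who answers in short, dry sentences.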