Update app.py

app.py CHANGED
@@ -142,7 +142,7 @@ def parse_llm_output(full_output, input_prompt_list):
     cleaned_text = re.sub(r'^<\/?s?>', '', cleaned_text).strip()
     cleaned_text = re.sub(r'^(assistant|ASSISTANT|System|SYSTEM)[:\s]*', '', cleaned_text).strip()
     if not cleaned_text and generated_text:
-        print("
+        print("Wireturning original generation.")
         return generated_text
     if last_input_content and last_occurrence_index == -1:
         print("Warning: Could not find last input prompt in LLM output. Returning cleaned full output.")
@@ -165,10 +165,9 @@ def generate_enhanced_persona(name, bio_text, context=""):
         add_generation_prompt=True,
         enable_thinking=False # Disable thinking mode
     )
-    model_inputs = tokenizer([text], return_tensors="pt").to(pipe.model.device)
     with torch.amp.autocast('cuda', dtype=torch.bfloat16):
         outputs = pipe(
-
+            text,
             max_new_tokens=512,
             do_sample=True,
             temperature=0.7, # Recommended for non-thinking mode
@@ -201,10 +200,9 @@ def generate_system_prompt_with_llm(name, enhanced_profile, context=""):
         add_generation_prompt=True,
         enable_thinking=False # Disable thinking mode
     )
-    model_inputs = tokenizer([text], return_tensors="pt").to(pipe.model.device)
     with torch.amp.autocast('cuda', dtype=torch.bfloat16):
         outputs = pipe(
-
+            text,
             max_new_tokens=300,
             do_sample=True,
             temperature=0.7, # Recommended for non-thinking mode
@@ -234,10 +232,9 @@ def generate_response(messages):
         add_generation_prompt=True,
         enable_thinking=False # Disable thinking mode
     )
-    model_inputs = tokenizer([text], return_tensors="pt").to(pipe.model.device)
     with torch.amp.autocast('cuda', dtype=torch.bfloat16):
         outputs = pipe(
-
+            text,
             max_new_tokens=512,
             do_sample=True,
             top_p=0.8, # Recommended for non-thinking mode
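Note: the three generation hunks above converge on the same calling pattern, so a minimal runnable sketch of it is included here for reference. The model id, example messages, and pipeline/tokenizer setup are illustrative assumptions, not taken from app.py; only the apply_chat_template arguments, the autocast block, and the sampling parameters mirror what the diff shows.

# Hedged sketch of the post-change generation path. Model id, messages and
# setup below are assumptions for illustration, not copied from app.py.
import torch
from transformers import pipeline

model_id = "Qwen/Qwen3-8B"  # assumption: a chat model whose template accepts enable_thinking
pipe = pipeline("text-generation", model=model_id, torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = pipe.tokenizer

messages = [{"role": "user", "content": "Describe this persona in two sentences."}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # Disable thinking mode, as in the diff
)

# The templated string is passed straight to the pipeline; the removed
# manual tokenizer([text], return_tensors="pt") step is no longer needed,
# because the pipeline tokenizes internally.
with torch.amp.autocast('cuda', dtype=torch.bfloat16):
    outputs = pipe(
        text,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,  # Recommended for non-thinking mode
        top_p=0.8,        # Recommended for non-thinking mode
        return_full_text=False,  # assumption: let the pipeline strip the prompt itself
    )

print(outputs[0]["generated_text"])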
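For completeness, a sketch of the cleanup-and-fallback logic that the first hunk's print statement sits in, reconstructed only from the lines visible above. The exact warning text is truncated in the diff, so the message below is approximated, and everything outside the visible lines of parse_llm_output is an assumption.

import re

def clean_generated_text(generated_text):
    # Sketch of the cleanup step around the edited print; only the regexes
    # and the fallback condition come from the hunk, the rest is assumed.
    cleaned_text = generated_text
    cleaned_text = re.sub(r'^<\/?s?>', '', cleaned_text).strip()  # drop a leading <s>/</s> token
    cleaned_text = re.sub(r'^(assistant|ASSISTANT|System|SYSTEM)[:\s]*', '', cleaned_text).strip()  # drop a leading role prefix
    if not cleaned_text and generated_text:
        # Cleaning stripped everything away, so fall back to the raw output.
        # The warning text is approximated; it is truncated in the diff view.
        print("Warning: cleaning produced an empty string, returning original generation.")
        return generated_text
    return cleaned_text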