Update app.py
app.py CHANGED
@@ -214,9 +214,7 @@ def call_edit_image(image, prompt, show_thinking, cfg_text_scale, cfg_img_scale,
 
 DEFAULT_WELCOME_MESSAGE = {
     "role": "assistant",
-    "content": [
-        {"type": "text", "content": "Hello! I am BAGEL, your multimodal assistant. How can I help you today? Select a mode and enter your prompt."}
-    ],
+    "content": "Hello! I am BAGEL, your multimodal assistant. How can I help you today? Select a mode and enter your prompt.",
     "key": "welcome"
 }
 
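The hunk above flattens the welcome message's "content" from a list of typed parts to a plain string, which is the simplest form gr.Chatbot(type="messages") accepts. A minimal sketch of how such a message renders, assuming Gradio 4.x or later; the demo wiring is illustrative, not from app.py:

import gradio as gr

# Same shape as the new DEFAULT_WELCOME_MESSAGE: a role plus a plain-string
# content (the app's extra "key" field is omitted here for brevity).
welcome = {
    "role": "assistant",
    "content": "Hello! I am BAGEL, your multimodal assistant.",
}

with gr.Blocks() as demo:
    # type="messages" takes a list of {"role", "content"} dicts; a plain
    # string needs no nested list of typed parts.
    chatbot = gr.Chatbot(value=[welcome], type="messages")

if __name__ == "__main__":
    demo.launch()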
@@ -264,7 +262,8 @@ class GradioApp:
         # This is simplified; best-gradio-ui.py stores settings per conversation
         current_turn_settings = {
             "mode": mode,
-
+            # Store PIL image directly if needed, or handle path carefully
+            "image_input": image_input, # Now storing the PIL image or None
             # TTI
             "tti_show_thinking": tti_show_thinking, "tti_cfg_text_scale": tti_cfg_text_scale, "tti_cfg_interval": tti_cfg_interval, "tti_timestep_shift": tti_timestep_shift, "tti_num_timesteps": tti_num_timesteps, "tti_cfg_renorm_min": tti_cfg_renorm_min, "tti_cfg_renorm_type": tti_cfg_renorm_type, "tti_max_think_token_n": tti_max_think_token_n, "tti_do_sample": tti_do_sample, "tti_text_temperature": tti_text_temperature, "tti_seed": tti_seed, "tti_image_ratio": tti_image_ratio,
             # Edit
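Storing image_input in the settings dict works because an input declared as gr.Image(type="pil") hands the callback a PIL.Image.Image (or None when empty), not a temp-file path. A small sketch under that assumption; remember_image and conv_id are illustrative names, not app.py code:

import gradio as gr

conversation_contexts = {}

def remember_image(image, conv_id="demo"):
    # image is a PIL.Image.Image or None because the component below
    # was created with type="pil"; it can sit in an in-memory dict.
    conversation_contexts.setdefault(conv_id, {})["image_input"] = image
    return f"stored: {type(image).__name__}"

with gr.Blocks() as demo:
    img = gr.Image(type="pil")
    status = gr.Textbox()
    img.change(remember_image, inputs=img, outputs=status)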
@@ -274,20 +273,31 @@ class GradioApp:
         }
         self.conversation_contexts[self.current_conversation_id]["settings"] = current_turn_settings
 
-
+        user_content_list = []
         if text_input:
-
+            user_content_list.append({"type": "text", "text": text_input})
         if image_input and mode in ["Image Edit", "Image Understanding"]:
-            #
-            #
-
+            # For 'messages' format, images are typically handled by passing them as part of a list of content dicts.
+            # Gradio's Chatbot with type='messages' can render PIL Images or file paths directly in the 'content' list.
+            user_content_list.append({"type": "image", "image": image_input}) # Assuming image_input is PIL
 
-
-
+        # Construct the user message for history
+        # If only text, content can be a string. If mixed, it's a list of dicts.
+        user_message_for_history = {
+            "role": "user",
+            "content": text_input if not image_input else user_content_list,
+            "key": str(uuid.uuid4())
+        }
+        if not text_input and image_input:
+            user_message_for_history["content"] = user_content_list
+        elif not user_content_list:
+            # Handle case where there's no input at all, though prior checks should prevent this.
+            gr.Warning("No input provided.")
+            return self._get_current_history(), gr.update(value=None), gr.update(value=None)
 
 
-        history.append(
-        history.append({"role": "assistant", "content":
+        history.append(user_message_for_history)
+        history.append({"role": "assistant", "content": "Processing...", "key": str(uuid.uuid4())})
 
         yield history, gr.update(value=None), gr.update(value=None) # chatbot, text_input, image_input (clear inputs)
 
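The handler above is a generator: it appends the user turn and a "Processing..." stub, yields once so the UI repaints immediately, then overwrites the stub with the real answer. A text-only sketch of that pattern; respond is an illustrative name and the sleep stands in for the model call:

import time
import gradio as gr

def respond(text_input, history):
    history.append({"role": "user", "content": text_input})
    # Stub the user sees while the real work runs.
    history.append({"role": "assistant", "content": "Processing..."})
    yield history, gr.update(value=None)  # clear the textbox right away
    time.sleep(1)  # stand-in for call_text_to_image / call_edit_image
    history[-1]["content"] = f"Echo: {text_input}"
    yield history, gr.update(value=None)

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages", value=[])
    box = gr.Textbox()
    box.submit(respond, inputs=[box, chatbot], outputs=[chatbot, box])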
@@ -297,7 +307,8 @@ class GradioApp:
             output_text = None
             thinking_text = None
 
-
+            # image_input is already a PIL image from the gr.Image component with type="pil"
+            pil_image_input = image_input
 
             if mode == "Text to Image":
                 output_image, thinking_text = call_text_to_image(text_input, tti_show_thinking, tti_cfg_text_scale, tti_cfg_interval, tti_timestep_shift, tti_num_timesteps, tti_cfg_renorm_min, tti_cfg_renorm_type, tti_max_think_token_n, tti_do_sample, tti_text_temperature, tti_seed, tti_image_ratio)
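The pil_image_input alias only holds if the input component was declared with type="pil"; other type values change what the callback receives. A quick contrast, with illustrative variable names:

import gradio as gr

pil_in = gr.Image(type="pil")        # callback receives PIL.Image.Image or None
path_in = gr.Image(type="filepath")  # callback receives a str path or None
array_in = gr.Image(type="numpy")    # callback receives a numpy.ndarray or None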
@@ -323,17 +334,18 @@ class GradioApp:
 
             bot_response_content = []
             if thinking_text:
-
+                # For 'messages' type, each part of the content is a dict in a list
+                bot_response_content.append({"type": "text", "text": f"**Thinking Process:**\n{thinking_text}"})
             if output_text:
-                bot_response_content.append({"type": "text", "
-            if output_image:
-                bot_response_content.append({"type": "image", "
+                bot_response_content.append({"type": "text", "text": output_text})
+            if output_image: # output_image should be a PIL Image
+                bot_response_content.append({"type": "image", "image": output_image})
 
             if not bot_response_content:
-                bot_response_content.append({"type": "text", "
+                bot_response_content.append({"type": "text", "text": "(No output generated)"})
 
-
-            history[-1]["
+            # Update the last message (which was "Processing...")
+            history[-1]["content"] = bot_response_content[0]["text"] if len(bot_response_content) == 1 and bot_response_content[0]["type"] == "text" else bot_response_content
 
         except Exception as e:
             print(f"Error during processing: {e}")