KingNish committed on
Commit
71c41c2
·
verified ·
1 Parent(s): 0c38e11

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -21
app.py CHANGED
@@ -214,9 +214,7 @@ def call_edit_image(image, prompt, show_thinking, cfg_text_scale, cfg_img_scale,
214
 
215
  DEFAULT_WELCOME_MESSAGE = {
216
  "role": "assistant",
217
- "content": [
218
- {"type": "text", "content": "Hello! I am BAGEL, your multimodal assistant. How can I help you today? Select a mode and enter your prompt."}
219
- ],
220
  "key": "welcome"
221
  }
222
 
@@ -264,7 +262,8 @@ class GradioApp:
264
  # This is simplified; best-gradio-ui.py stores settings per conversation
265
  current_turn_settings = {
266
  "mode": mode,
267
- "image_input_path": image_input.name if image_input else None, # Store path if image is uploaded
 
268
  # TTI
269
  "tti_show_thinking": tti_show_thinking, "tti_cfg_text_scale": tti_cfg_text_scale, "tti_cfg_interval": tti_cfg_interval, "tti_timestep_shift": tti_timestep_shift, "tti_num_timesteps": tti_num_timesteps, "tti_cfg_renorm_min": tti_cfg_renorm_min, "tti_cfg_renorm_type": tti_cfg_renorm_type, "tti_max_think_token_n": tti_max_think_token_n, "tti_do_sample": tti_do_sample, "tti_text_temperature": tti_text_temperature, "tti_seed": tti_seed, "tti_image_ratio": tti_image_ratio,
270
  # Edit
@@ -274,20 +273,31 @@ class GradioApp:
274
  }
275
  self.conversation_contexts[self.current_conversation_id]["settings"] = current_turn_settings
276
 
277
- user_message_content = []
278
  if text_input:
279
- user_message_content.append({"type": "text", "content": text_input})
280
  if image_input and mode in ["Image Edit", "Image Understanding"]:
281
- # Gradio chatbot can display images directly if they are file paths or PIL Images
282
- # For simplicity, let's assume image_input is a PIL Image or path that gr.Image can handle
283
- user_message_content.append({"type": "image", "content": image_input})
284
 
285
- if not user_message_content:
286
- user_message_content.append({"type": "text", "content": "(No text prompt provided for image operation)"})
 
 
 
 
 
 
 
 
 
 
 
287
 
288
 
289
- history.append({"role": "user", "content": user_message_content, "key": str(uuid.uuid4())})
290
- history.append({"role": "assistant", "content": [{"type": "text", "content": "Processing..."}], "key": str(uuid.uuid4()), "loading": True})
291
 
292
  yield history, gr.update(value=None), gr.update(value=None) # chatbot, text_input, image_input (clear inputs)
293
 
@@ -297,7 +307,8 @@ class GradioApp:
297
  output_text = None
298
  thinking_text = None
299
 
300
- pil_image_input = Image.open(image_input.name) if image_input else None
 
301
 
302
  if mode == "Text to Image":
303
  output_image, thinking_text = call_text_to_image(text_input, tti_show_thinking, tti_cfg_text_scale, tti_cfg_interval, tti_timestep_shift, tti_num_timesteps, tti_cfg_renorm_min, tti_cfg_renorm_type, tti_max_think_token_n, tti_do_sample, tti_text_temperature, tti_seed, tti_image_ratio)
@@ -323,17 +334,18 @@ class GradioApp:
323
 
324
  bot_response_content = []
325
  if thinking_text:
326
- bot_response_content.append({"type": "text", "content": f"**Thinking Process:**\n{thinking_text}"})
 
327
  if output_text:
328
- bot_response_content.append({"type": "text", "content": output_text})
329
- if output_image:
330
- bot_response_content.append({"type": "image", "content": output_image})
331
 
332
  if not bot_response_content:
333
- bot_response_content.append({"type": "text", "content": "(No output generated)"})
334
 
335
- history[-1]["content"] = bot_response_content
336
- history[-1]["loading"] = False
337
 
338
  except Exception as e:
339
  print(f"Error during processing: {e}")
 
214
 
215
  DEFAULT_WELCOME_MESSAGE = {
216
  "role": "assistant",
217
+ "content": "Hello! I am BAGEL, your multimodal assistant. How can I help you today? Select a mode and enter your prompt.",
 
 
218
  "key": "welcome"
219
  }
220
 
 
262
  # This is simplified; best-gradio-ui.py stores settings per conversation
263
  current_turn_settings = {
264
  "mode": mode,
265
+ # Store PIL image directly if needed, or handle path carefully
266
+ "image_input": image_input, # Now storing the PIL image or None
267
  # TTI
268
  "tti_show_thinking": tti_show_thinking, "tti_cfg_text_scale": tti_cfg_text_scale, "tti_cfg_interval": tti_cfg_interval, "tti_timestep_shift": tti_timestep_shift, "tti_num_timesteps": tti_num_timesteps, "tti_cfg_renorm_min": tti_cfg_renorm_min, "tti_cfg_renorm_type": tti_cfg_renorm_type, "tti_max_think_token_n": tti_max_think_token_n, "tti_do_sample": tti_do_sample, "tti_text_temperature": tti_text_temperature, "tti_seed": tti_seed, "tti_image_ratio": tti_image_ratio,
269
  # Edit
 
273
  }
274
  self.conversation_contexts[self.current_conversation_id]["settings"] = current_turn_settings
275
 
276
+ user_content_list = []
277
  if text_input:
278
+ user_content_list.append({"type": "text", "text": text_input})
279
  if image_input and mode in ["Image Edit", "Image Understanding"]:
280
+ # For 'messages' format, images are typically handled by passing them as part of a list of content dicts.
281
+ # Gradio's Chatbot with type='messages' can render PIL Images or file paths directly in the 'content' list.
282
+ user_content_list.append({"type": "image", "image": image_input}) # Assuming image_input is PIL
283
 
284
+ # Construct the user message for history
285
+ # If only text, content can be a string. If mixed, it's a list of dicts.
286
+ user_message_for_history = {
287
+ "role": "user",
288
+ "content": text_input if not image_input else user_content_list,
289
+ "key": str(uuid.uuid4())
290
+ }
291
+ if not text_input and image_input:
292
+ user_message_for_history["content"] = user_content_list
293
+ elif not user_content_list:
294
+ # Handle case where there's no input at all, though prior checks should prevent this.
295
+ gr.Warning("No input provided.")
296
+ return self._get_current_history(), gr.update(value=None), gr.update(value=None)
297
 
298
 
299
+ history.append(user_message_for_history)
300
+ history.append({"role": "assistant", "content": "Processing...", "key": str(uuid.uuid4())})
301
 
302
  yield history, gr.update(value=None), gr.update(value=None) # chatbot, text_input, image_input (clear inputs)
303
 
 
307
  output_text = None
308
  thinking_text = None
309
 
310
+ # image_input is already a PIL image from the gr.Image component with type="pil"
311
+ pil_image_input = image_input
312
 
313
  if mode == "Text to Image":
314
  output_image, thinking_text = call_text_to_image(text_input, tti_show_thinking, tti_cfg_text_scale, tti_cfg_interval, tti_timestep_shift, tti_num_timesteps, tti_cfg_renorm_min, tti_cfg_renorm_type, tti_max_think_token_n, tti_do_sample, tti_text_temperature, tti_seed, tti_image_ratio)
 
334
 
335
  bot_response_content = []
336
  if thinking_text:
337
+ # For 'messages' type, each part of the content is a dict in a list
338
+ bot_response_content.append({"type": "text", "text": f"**Thinking Process:**\n{thinking_text}"})
339
  if output_text:
340
+ bot_response_content.append({"type": "text", "text": output_text})
341
+ if output_image: # output_image should be a PIL Image
342
+ bot_response_content.append({"type": "image", "image": output_image})
343
 
344
  if not bot_response_content:
345
+ bot_response_content.append({"type": "text", "text": "(No output generated)"})
346
 
347
+ # Update the last message (which was "Processing...")
348
+ history[-1]["content"] = bot_response_content[0]["text"] if len(bot_response_content) == 1 and bot_response_content[0]["type"] == "text" else bot_response_content
349
 
350
  except Exception as e:
351
  print(f"Error during processing: {e}")