Update app.py
app.py CHANGED
@@ -214,9 +214,7 @@ def call_edit_image(image, prompt, show_thinking, cfg_text_scale, cfg_img_scale,
 
 DEFAULT_WELCOME_MESSAGE = {
     "role": "assistant",
-    "content": [
-        {"type": "text", "content": "Hello! I am BAGEL, your multimodal assistant. How can I help you today? Select a mode and enter your prompt."}
-    ],
+    "content": "Hello! I am BAGEL, your multimodal assistant. How can I help you today? Select a mode and enter your prompt.",
     "key": "welcome"
 }
 
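The hunk above flattens the welcome message's "content" from a list of typed parts to a plain string, which is the simplest form gr.Chatbot(type="messages") accepts. A minimal sketch of how such a message renders, assuming Gradio 4.x or later; the demo wiring is illustrative, not from app.py:

import gradio as gr

# Same shape as the new DEFAULT_WELCOME_MESSAGE: a role plus a plain-string
# content (the app's extra "key" field is omitted here for brevity).
welcome = {
    "role": "assistant",
    "content": "Hello! I am BAGEL, your multimodal assistant.",
}

with gr.Blocks() as demo:
    # type="messages" takes a list of {"role", "content"} dicts; a plain
    # string needs no nested list of typed parts.
    chatbot = gr.Chatbot(value=[welcome], type="messages")

if __name__ == "__main__":
    demo.launch()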
@@ -264,7 +262,8 @@ class GradioApp:
         # This is simplified; best-gradio-ui.py stores settings per conversation
         current_turn_settings = {
             "mode": mode,
-
+            # Store PIL image directly if needed, or handle path carefully
+            "image_input": image_input, # Now storing the PIL image or None
             # TTI
             "tti_show_thinking": tti_show_thinking, "tti_cfg_text_scale": tti_cfg_text_scale, "tti_cfg_interval": tti_cfg_interval, "tti_timestep_shift": tti_timestep_shift, "tti_num_timesteps": tti_num_timesteps, "tti_cfg_renorm_min": tti_cfg_renorm_min, "tti_cfg_renorm_type": tti_cfg_renorm_type, "tti_max_think_token_n": tti_max_think_token_n, "tti_do_sample": tti_do_sample, "tti_text_temperature": tti_text_temperature, "tti_seed": tti_seed, "tti_image_ratio": tti_image_ratio,
             # Edit
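Storing image_input in the settings dict works because an input declared as gr.Image(type="pil") hands the callback a PIL.Image.Image (or None when empty), not a temp-file path. A small sketch under that assumption; remember_image and conv_id are illustrative names, not app.py code:

import gradio as gr

conversation_contexts = {}

def remember_image(image, conv_id="demo"):
    # image is a PIL.Image.Image or None because the component below
    # was created with type="pil"; it can sit in an in-memory dict.
    conversation_contexts.setdefault(conv_id, {})["image_input"] = image
    return f"stored: {type(image).__name__}"

with gr.Blocks() as demo:
    img = gr.Image(type="pil")
    status = gr.Textbox()
    img.change(remember_image, inputs=img, outputs=status)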
@@ -274,20 +273,31 @@ class GradioApp:
         }
         self.conversation_contexts[self.current_conversation_id]["settings"] = current_turn_settings
 
-
+        user_content_list = []
         if text_input:
-
+            user_content_list.append({"type": "text", "text": text_input})
         if image_input and mode in ["Image Edit", "Image Understanding"]:
-            #
-            #
-
+            # For 'messages' format, images are typically handled by passing them as part of a list of content dicts.
+            # Gradio's Chatbot with type='messages' can render PIL Images or file paths directly in the 'content' list.
+            user_content_list.append({"type": "image", "image": image_input}) # Assuming image_input is PIL
 
-
-
+        # Construct the user message for history
+        # If only text, content can be a string. If mixed, it's a list of dicts.
+        user_message_for_history = {
+            "role": "user",
+            "content": text_input if not image_input else user_content_list,
+            "key": str(uuid.uuid4())
+        }
+        if not text_input and image_input:
+            user_message_for_history["content"] = user_content_list
+        elif not user_content_list:
+            # Handle case where there's no input at all, though prior checks should prevent this.
+            gr.Warning("No input provided.")
+            return self._get_current_history(), gr.update(value=None), gr.update(value=None)
 
 
-        history.append(
-        history.append({"role": "assistant", "content":
+        history.append(user_message_for_history)
+        history.append({"role": "assistant", "content": "Processing...", "key": str(uuid.uuid4())})
 
         yield history, gr.update(value=None), gr.update(value=None) # chatbot, text_input, image_input (clear inputs)
 
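The handler above is a generator: it appends the user turn and a "Processing..." stub, yields once so the UI repaints immediately, then overwrites the stub with the real answer. A text-only sketch of that pattern; respond is an illustrative name and the sleep stands in for the model call:

import time
import gradio as gr

def respond(text_input, history):
    history.append({"role": "user", "content": text_input})
    # Stub the user sees while the real work runs.
    history.append({"role": "assistant", "content": "Processing..."})
    yield history, gr.update(value=None)  # clear the textbox right away
    time.sleep(1)  # stand-in for call_text_to_image / call_edit_image
    history[-1]["content"] = f"Echo: {text_input}"
    yield history, gr.update(value=None)

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages", value=[])
    box = gr.Textbox()
    box.submit(respond, inputs=[box, chatbot], outputs=[chatbot, box])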
@@ -297,7 +307,8 @@ class GradioApp:
             output_text = None
             thinking_text = None
 
-
+            # image_input is already a PIL image from the gr.Image component with type="pil"
+            pil_image_input = image_input
 
             if mode == "Text to Image":
                 output_image, thinking_text = call_text_to_image(text_input, tti_show_thinking, tti_cfg_text_scale, tti_cfg_interval, tti_timestep_shift, tti_num_timesteps, tti_cfg_renorm_min, tti_cfg_renorm_type, tti_max_think_token_n, tti_do_sample, tti_text_temperature, tti_seed, tti_image_ratio)
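The pil_image_input alias only holds if the input component was declared with type="pil"; other type values change what the callback receives. A quick contrast, with illustrative variable names:

import gradio as gr

pil_in = gr.Image(type="pil")        # callback receives PIL.Image.Image or None
path_in = gr.Image(type="filepath")  # callback receives a str path or None
array_in = gr.Image(type="numpy")    # callback receives a numpy.ndarray or None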
@@ -323,17 +334,18 @@ class GradioApp:
 
             bot_response_content = []
             if thinking_text:
-
+                # For 'messages' type, each part of the content is a dict in a list
+                bot_response_content.append({"type": "text", "text": f"**Thinking Process:**\n{thinking_text}"})
             if output_text:
-                bot_response_content.append({"type": "text", "
-            if output_image:
-                bot_response_content.append({"type": "image", "
+                bot_response_content.append({"type": "text", "text": output_text})
+            if output_image: # output_image should be a PIL Image
+                bot_response_content.append({"type": "image", "image": output_image})
 
             if not bot_response_content:
-                bot_response_content.append({"type": "text", "
+                bot_response_content.append({"type": "text", "text": "(No output generated)"})
 
-
-            history[-1]["
+            # Update the last message (which was "Processing...")
+            history[-1]["content"] = bot_response_content[0]["text"] if len(bot_response_content) == 1 and bot_response_content[0]["type"] == "text" else bot_response_content
 
         except Exception as e:
             print(f"Error during processing: {e}")