Update inferencer.py
inferencer.py  CHANGED  (+10 -10)
@@ -228,8 +228,8 @@ class InterleaveInferencer:
         image_shapes=(1024, 1024), # Default, can be overridden by actual input image
     ):
         gen_context = self.init_gen_context()
-        cfg_text_context =
-        cfg_img_context =
+        cfg_text_context = deepcopy(gen_context)
+        cfg_img_context = deepcopy(gen_context)
 
         current_image_shapes = image_shapes
 
@@ -243,15 +243,16 @@
 
         for input_term in input_lists:
             if isinstance(input_term, str):
+                cfg_text_context = deepcopy(gen_context)
                 gen_context = self.update_context_text(input_term, gen_context)
-                cfg_text_context = self.update_context_text(input_term, cfg_text_context)
                 cfg_img_context = self.update_context_text(input_term, cfg_img_context)
+
             elif isinstance(input_term, Image.Image):
-
-
-
-                cfg_text_context =
-
+                input_term = self.vae_transform.resize_transform(pil_img2rgb(input_term))
+                gen_context = self.update_context_image(input_term, gen_context, vae=not understanding_output)
+                image_shapes = input_term.size[::-1]
+                cfg_text_context = deepcopy(gen_context)
+
             else:
                 raise ValueError(f"Unsupported input type: {type(input_term)}")
 
@@ -266,10 +267,9 @@ class InterleaveInferencer:
         full_thought_text = "".join(thought_text_parts)
         if full_thought_text: # Only update if thought was generated
             gen_context = self.update_context_text(full_thought_text, gen_context)
-            cfg_text_context = self.update_context_text(full_thought_text, cfg_text_context)
 
         img = self.gen_image(
-            image_shape=
+            image_shape=image_shapes,
             gen_context=gen_context,
             cfg_text_precontext=cfg_text_context,
             cfg_img_precontext=cfg_img_context,