Spaces:

KingNish
/

Bagel-7B-Demo

Running on Zero

App Files Files Community

KingNish committed 2 days ago

Commit

c2cc633

verified ·

1 Parent(s): c40e1ba

Update inferencer.py

Browse files

Files changed (1) hide show

inferencer.py +48 -50

inferencer.py CHANGED Viewed

@@ -233,57 +233,55 @@ class InterleaveInferencer:
         current_image_shapes = image_shapes
-        # Use torch.cuda.amp.autocast if available, otherwise a simple context manager
-        # For simplicity, assuming it's handled externally or not strictly needed for this snippet
-        # with torch.autocast(device_type="cuda", enabled=True, dtype=torch.bfloat16):
-        if think:
-            system_prompt = VLM_THINK_SYSTEM_PROMPT if understanding_output else GEN_THINK_SYSTEM_PROMPT
-            gen_context = self.update_context_text(system_prompt, gen_context)
-            cfg_text_context = self.update_context_text(system_prompt, cfg_text_context)
-            cfg_img_context = self.update_context_text(system_prompt, cfg_img_context)
-        for input_term in input_lists:
-            if isinstance(input_term, str):
-                gen_context = self.update_context_text(input_term, gen_context)
-                cfg_text_context = self.update_context_text(input_term, cfg_text_context)
-                cfg_img_context = self.update_context_text(input_term, cfg_img_context)
-            elif isinstance(input_term, Image.Image):
-                current_image_shapes = input_term.size[::-1] # H, W
-                use_vae_for_input_image = not understanding_output
-                gen_context = self.update_context_image(input_term, gen_context, vae=use_vae_for_input_image, vit=True)
-                cfg_text_context = self.update_context_image(input_term, cfg_text_context, vae=use_vae_for_input_image, vit=True)
-                # cfg_img_context does not typically see input images
-            else:
-                raise ValueError(f"Unsupported input type: {type(input_term)}")
-        if understanding_output: # Generate text
-            yield from self.gen_text(gen_context, max_length=max_think_token_n, do_sample=do_sample, temperature=temperature)
-        else: # Generate image
             if think:
-                thought_text_parts = []
-                for part in self.gen_text(gen_context, max_length=max_think_token_n, do_sample=do_sample, temperature=temperature):
-                    yield part # Stream the thought
-                    thought_text_parts.append(part)
-                full_thought_text = "".join(thought_text_parts)
-                if full_thought_text: # Only update if thought was generated
-                    gen_context = self.update_context_text(full_thought_text, gen_context)
-                    cfg_text_context = self.update_context_text(full_thought_text, cfg_text_context)
-            img = self.gen_image(
-                image_shape=current_image_shapes,
-                gen_context=gen_context,
-                cfg_text_precontext=cfg_text_context,
-                cfg_img_precontext=cfg_img_context,
-                cfg_text_scale=cfg_text_scale,
-                cfg_img_scale=cfg_img_scale,
-                cfg_interval=cfg_interval,
-                timestep_shift=timestep_shift,
-                num_timesteps=num_timesteps,
-                cfg_renorm_min=cfg_renorm_min,
-                cfg_renorm_type=cfg_renorm_type,
-            )
-            yield img
     def __call__(
         self,

         current_image_shapes = image_shapes
+        with torch.autocast(device_type="cuda", enabled=True, dtype=torch.bfloat16):
             if think:
+                system_prompt = VLM_THINK_SYSTEM_PROMPT if understanding_output else GEN_THINK_SYSTEM_PROMPT
+                gen_context = self.update_context_text(system_prompt, gen_context)
+                cfg_text_context = self.update_context_text(system_prompt, cfg_text_context)
+                cfg_img_context = self.update_context_text(system_prompt, cfg_img_context)
+            for input_term in input_lists:
+                if isinstance(input_term, str):
+                    gen_context = self.update_context_text(input_term, gen_context)
+                    cfg_text_context = self.update_context_text(input_term, cfg_text_context)
+                    cfg_img_context = self.update_context_text(input_term, cfg_img_context)
+                elif isinstance(input_term, Image.Image):
+                    current_image_shapes = input_term.size[::-1] # H, W
+                    use_vae_for_input_image = not understanding_output
+                    gen_context = self.update_context_image(input_term, gen_context, vae=use_vae_for_input_image, vit=True)
+                    cfg_text_context = self.update_context_image(input_term, cfg_text_context, vae=use_vae_for_input_image, vit=True)
+                    # cfg_img_context does not typically see input images
+                else:
+                    raise ValueError(f"Unsupported input type: {type(input_term)}")
+            if understanding_output: # Generate text
+                yield from self.gen_text(gen_context, max_length=max_think_token_n, do_sample=do_sample, temperature=temperature)
+            else: # Generate image
+                if think:
+                    thought_text_parts = []
+                    for part in self.gen_text(gen_context, max_length=max_think_token_n, do_sample=do_sample, temperature=temperature):
+                        yield part # Stream the thought
+                        thought_text_parts.append(part)
+                    full_thought_text = "".join(thought_text_parts)
+                    if full_thought_text: # Only update if thought was generated
+                        gen_context = self.update_context_text(full_thought_text, gen_context)
+                        cfg_text_context = self.update_context_text(full_thought_text, cfg_text_context)
+                img = self.gen_image(
+                    image_shape=current_image_shapes,
+                    gen_context=gen_context,
+                    cfg_text_precontext=cfg_text_context,
+                    cfg_img_precontext=cfg_img_context,
+                    cfg_text_scale=cfg_text_scale,
+                    cfg_img_scale=cfg_img_scale,
+                    cfg_interval=cfg_interval,
+                    timestep_shift=timestep_shift,
+                    num_timesteps=num_timesteps,
+                    cfg_renorm_min=cfg_renorm_min,
+                    cfg_renorm_type=cfg_renorm_type,
+                )
+                yield img
     def __call__(
         self,