Fix code to comport with newer Transformers library
#41
by ctranslate2-4you · opened

modeling_GOT.py  CHANGED  +39 -54
@@ -393,59 +393,46 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
     def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
-        # Omit tokens covered by past_key_values
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=input_ids.device)
+
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
                 cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
-                max_cache_length = past_key_values.get_max_length()
+                current_length = cache_length
+                max_cache_shape = past_key_values.get_max_cache_shape()
+                max_cache_length = max_cache_shape[1] if max_cache_shape else None
             else:
-                cache_length = past_length = past_key_values[0][0].shape[2]
+                cache_length = past_key_values[0][0].shape[2]
+                current_length = cache_length
                 max_cache_length = None
 
-            # Keep only the unprocessed tokens:
-            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
-            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
-            # input)
-            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
-                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
-            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
-            # input_ids based on the past_length.
-            elif past_length < input_ids.shape[1]:
-                input_ids = input_ids[:, past_length:]
-            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
-
-            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
-            if (
-                max_cache_length is not None
-                and attention_mask is not None
-                and cache_length + input_ids.shape[1] > max_cache_length
-            ):
-                attention_mask = attention_mask[:, -max_cache_length:]
+            if attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - cache_length):]
+            elif cache_length < input_ids.shape[1]:
+                input_ids = input_ids[:, cache_length:]
+
+            if max_cache_length is not None and attention_mask is not None:
+                if cache_length + input_ids.shape[1] > max_cache_length:
+                    attention_mask = attention_mask[:, -max_cache_length:]
 
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
             if past_key_values:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
+                position_ids = position_ids[:, -input_ids.shape[1]:]
+
+        model_inputs = {
+            "input_ids": input_ids if inputs_embeds is None or past_key_values is not None else None,
+            "inputs_embeds": inputs_embeds if past_key_values is None else None,
+            "past_key_values": past_key_values,
+            "position_ids": position_ids,
+            "attention_mask": attention_mask,
+            "images": kwargs.get("images", None),
+            "use_cache": kwargs.get("use_cache", True)
+        }
 
-        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and past_key_values is None:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-        else:
-            model_inputs = {"input_ids": input_ids}
-
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": kwargs.get("use_cache"),
-                "attention_mask": attention_mask,
-                "images": kwargs.get("images", None),
-            }
-        )
         return model_inputs
 
     def initialize_vision_tokenizer(
@@ -536,7 +523,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
         conv_mpt = Conversation(
             system="""<|im_start|>system
-        You should follow the instructions carefully and explain your answers in detail.""",
+You should follow the instructions carefully and explain your answers in detail.""",
             # system = None,
             roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
             version="mpt",
@@ -728,7 +715,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         return processed_images
 
 
-    def chat_crop(self, tokenizer, image_file, ocr_type, render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag = False):
+    def chat_crop(self, tokenizer, image_file, ocr_type, render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag=False):
         # Model
         self.disable_torch_init()
         multi_page=False
@@ -778,21 +765,18 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                 image_tensor_1 = image_processor_high(image)
                 image_list.append(image_tensor_1)
 
-
         image_list = torch.stack(image_list)
 
-        print('====new images batch size======: \n',image_list.shape)
-
+        # print('====new images batch size======: \n',image_list.shape)
 
         if use_im_start_end:
             qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len*ll + DEFAULT_IM_END_TOKEN + '\n' + qs
         else:
             qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
 
-
         conv_mpt = Conversation(
             system="""<|im_start|>system
-        You should follow the instructions carefully and explain your answers in detail.""",
+You should follow the instructions carefully and explain your answers in detail.""",
             # system = None,
             roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
             version="mpt",
@@ -811,8 +795,8 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             print(prompt)
 
         inputs = tokenizer([prompt])
-
         input_ids = torch.as_tensor(inputs.input_ids).cuda()
+        attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=input_ids.device)
 
         stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
         keywords = [stop_str]
@@ -824,25 +808,26 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                 output_ids = self.generate(
                     input_ids,
                     images=[image_list.half().cuda()],
+                    attention_mask=attention_mask,
                     do_sample=False,
-                    num_beams = 1,
-                    # no_repeat_ngram_size = 20,
                     streamer=streamer,
+                    num_beams=1,
                     max_new_tokens=4096,
                     stopping_criteria=[stopping_criteria]
-                    )
+                )
+
         else:
             with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
                     images=[image_list.half().cuda()],
+                    attention_mask=attention_mask,
                     do_sample=False,
-                    num_beams = 1,
-                    # no_repeat_ngram_size = 20,
                     # streamer=streamer,
+                    num_beams=1,
                     max_new_tokens=4096,
                     stopping_criteria=[stopping_criteria]
-                    )
+                )
 
         outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
 
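For background on the cache handling rewritten in the first hunk: newer Transformers releases report the cached length through Cache.get_seq_length(), while the seen_tokens attribute and get_max_length() that the old code read are deprecated or gone. A minimal probe of that API, assuming transformers >= 4.36 with DynamicCache exported; the hasattr checks are only illustration, not part of the patch:

    from transformers import DynamicCache

    cache = DynamicCache()
    # Newer accessor the patched code relies on: 0 for an empty cache.
    print(cache.get_seq_length())
    # Older accessors the patch stops using; probe rather than call, since they
    # may be deprecated or absent depending on the installed version.
    print(hasattr(cache, "seen_tokens"), hasattr(cache, "get_max_length"))
    # Accessor the patch queries for a maximum cache size, where available.
    print(hasattr(cache, "get_max_cache_shape"))

The default DynamicCache has no fixed capacity, so the max_cache_length cropping branch mainly matters for bounded cache types such as StaticCache.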
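The explicit attention_mask added in chat_crop (and the all-ones fallback in prepare_inputs_for_generation) is valid because the prompt built there is a single, unpadded sequence: every position is a real token, and the position ids derived from the mask are simply 0..N-1. A small sketch with made-up token ids (not from the GOT tokenizer):

    import torch

    input_ids = torch.tensor([[11, 22, 33, 44, 55]])               # hypothetical unpadded prompt
    attention_mask = torch.ones_like(input_ids, dtype=torch.long)  # every position is a real token
    position_ids = attention_mask.long().cumsum(-1) - 1            # 0, 1, 2, 3, 4
    position_ids.masked_fill_(attention_mask == 0, 1)              # no-op here; matters with left padding
    print(position_ids)  # tensor([[0, 1, 2, 3, 4]])

With left-padded batches the mask contains zeros and the cumsum shifts the positions accordingly; passing the mask to generate() explicitly also helps avoid its missing-attention-mask warnings.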
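A quick end-to-end check of the patched chat_crop path, assuming the usage pattern from the GOT-OCR2_0 model card; the checkpoint id and image path below are placeholders, so substitute the repo that carries this patch:

    from transformers import AutoModel, AutoTokenizer

    model_id = "ucaslcl/GOT-OCR2_0"  # placeholder; point this at the repo the PR targets
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        device_map="cuda",
        use_safetensors=True,
        pad_token_id=tokenizer.eos_token_id,
    ).eval().cuda()

    # chat_crop is the method touched above; ocr_type="format" requests formatted OCR output.
    result = model.chat_crop(tokenizer, "page.jpg", ocr_type="format")
    print(result)

On a newer Transformers install this exercises both the updated cache handling and the explicit attention_mask path added in this diff.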