Text Generation · Transformers · PyTorch · chatts · feature-extraction · conversational · custom_code
xiezhe24 committed (verified)
Commit 1dd465d · Parent: db0db5e

Update modeling_qwen2.py (#7)


- Update modeling_qwen2.py (1f8297368338fdac2bf4306e7f6dd98d26d20d46)

Files changed (1):
  1. modeling_qwen2.py +9 -2
modeling_qwen2.py CHANGED:

@@ -1450,6 +1450,9 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
             attention_mask=attention_mask
         )
 
+    def _extract_past_from_model_output(self, outputs: ModelOutput):
+        return "past_key_values", outputs.past_key_values
+
     def _update_model_kwargs_for_generation(
         self,
         outputs: ModelOutput,
@@ -1505,8 +1508,12 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
-                max_cache_length = past_key_values.get_max_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = (
+                    past_key_values.get_max_length()
+                    if hasattr(past_key_values, "get_max_length")
+                    else past_key_values.get_max_cache_shape()
+                )
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None
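For context, a plausible reading of this patch (not an official note from the repo): `Cache.get_max_length()` was deprecated in newer transformers releases in favor of `Cache.get_max_cache_shape()`, so the `hasattr` guard lets this custom model code run on either side of that rename. Likewise, the `_extract_past_from_model_output` override pins the `(cache_name, cache)` return shape that the model's custom `_update_model_kwargs_for_generation` consumes, insulating it from signature drift in that private `GenerationMixin` hook. Below is a minimal, self-contained sketch of the same guard pattern; the helper name `max_cache_length_compat` is illustrative, not from the repo.

```python
from typing import Optional

from transformers.cache_utils import DynamicCache


def max_cache_length_compat(cache) -> Optional[int]:
    # Older transformers expose Cache.get_max_length(); newer releases
    # replace it with Cache.get_max_cache_shape(). Probe for the legacy
    # accessor first, exactly as the patched modeling_qwen2.py does.
    if hasattr(cache, "get_max_length"):
        return cache.get_max_length()
    return cache.get_max_cache_shape()


cache = DynamicCache()
# A DynamicCache grows on demand and has no fixed capacity, so either
# accessor reports no maximum.
print(max_cache_length_compat(cache))  # None
```

The same probe-then-fall-back shape is a common way to keep `custom_code` checkpoints loadable across a range of transformers versions, since remote code cannot pin the library version installed on the user's machine.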