Update processing_qwen2_ts.py to allow text-only processing

This change is required for two purposes:
1. Text-only inference (no timeseries input).
2. The vLLM v1 engine with prompt caching. In this case, the vLLM engine processes the text and multimodal parts of the prompt separately, so the processor must accept text without timeseries input.

Files changed (1) hide show

processing_qwen2_ts.py +8 -11

processing_qwen2_ts.py CHANGED Viewed

@@ -19,11 +19,7 @@ import torch
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.processing_utils import ProcessorMixin
-from transformers.tokenization_utils_base import (
-    PreTokenizedInput,
-    TextInput,
-    PaddingStrategy,
-)
 def sp_encoding(timeseries: np.ndarray, eots_token: bool = True) -> Tuple[np.ndarray, str, dict]:
     """
@@ -70,8 +66,8 @@ class Qwen2TSProcessor(ProcessorMixin):
     def __call__(
         self,
-        text: List[str],
-        timeseries: List[List[np.ndarray]],
         padding: Union[bool, str, PaddingStrategy] = False,
         padding_side: str = 'left',
         vllm_flag: bool = False,
@@ -92,6 +88,8 @@ class Qwen2TSProcessor(ProcessorMixin):
         """
         if type(text) == str:
             text = [text]
         encoded_ts_arrays = []
         reconstructed_prompts = []
@@ -139,10 +137,9 @@ class Qwen2TSProcessor(ProcessorMixin):
             tokenizer_outputs = self.tokenizer(reconstructed_prompts, padding=padding, padding_side=padding_side, **kwargs)
         # Create the final output
-        outputs = {
-            "timeseries": concatenated_ts
-        }
-        outputs.update(tokenizer_outputs)
         return BatchFeature(data=outputs)

 from transformers.feature_extraction_utils import BatchFeature
 from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PaddingStrategy
 def sp_encoding(timeseries: np.ndarray, eots_token: bool = True) -> Tuple[np.ndarray, str, dict]:
     """
     def __call__(
         self,
+        text: Union[str, List[str]],
+        timeseries: Optional[List[List[np.ndarray]]] = None,
         padding: Union[bool, str, PaddingStrategy] = False,
         padding_side: str = 'left',
         vllm_flag: bool = False,
         """
         if type(text) == str:
             text = [text]
+        if timeseries is None:
+            timeseries = []
         encoded_ts_arrays = []
         reconstructed_prompts = []
             tokenizer_outputs = self.tokenizer(reconstructed_prompts, padding=padding, padding_side=padding_side, **kwargs)
         # Create the final output
+        outputs = tokenizer_outputs
+        if concatenated_ts is not None:
+            outputs["timeseries"] = concatenated_ts
         return BatchFeature(data=outputs)