farzadab committed
Commit 779bcda · verified · 1 Parent(s): 6eadb3e

Upload 4 files

Files changed (3):
  1. processor_config.json +1 -1
  2. ultravox_model.py +48 -46
  3. ultravox_processing.py +172 -91
processor_config.json CHANGED
@@ -5,7 +5,7 @@
   "auto_map": {
     "AutoProcessor": "ultravox_processing.UltravoxProcessor"
   },
-  "encoder_ds_factor": 320,
+  "encoder_ds_factor": 2,
   "processor_class": "UltravoxProcessor",
   "stack_factor": 8
 }
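For intuition, the new value lines up with the reworked token-length formula in ultravox_processing.py below: `encoder_ds_factor` now means the encoder's frame-level downsampling (Whisper halves the mel-frame count), not the old raw-samples-per-frame ratio of 320, and `audio_lens` is counted in mel frames. A minimal arithmetic sketch, assuming 16 kHz input and Whisper's 160-sample hop length:

```python
import math

# Token count for a hypothetical 10 s clip under both conventions.
# Assumes 16 kHz audio and Whisper's hop_length of 160 samples per mel frame.
sample_rate, hop_length, stack_factor = 16_000, 160, 8
num_samples = 10 * sample_rate              # 160_000 raw samples
num_mel_frames = num_samples // hop_length  # 1_000 mel frames

# Old formula: encoder_ds_factor = 320 applied to raw sample counts.
old_tokens = math.ceil(round(num_samples / 320) / stack_factor)

# New formula: encoder_ds_factor = 2 applied to mel-frame lengths (audio_lens).
new_tokens = math.ceil(num_mel_frames / (2 * stack_factor))

print(old_tokens, new_tokens)  # 63 63 -- same token budget, different bookkeeping
```

Both conventions land on the same number of placeholder tokens; only the unit in which `audio_lens` is expressed changes.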
ultravox_model.py CHANGED
@@ -1,6 +1,6 @@
 import logging
 import re
-from typing import Any, Dict, Optional, Set, Tuple, Union
+from typing import Any, Dict, Generator, Optional, Set, Tuple, Union
 
 import peft
 import torch
@@ -10,6 +10,7 @@ import transformers
 import transformers.activations
 import transformers.modeling_outputs
 import transformers.models
+from transformers.generation.utils import GenerationMixin
 from transformers.models.whisper import modeling_whisper as whisper
 
 # We must use relative import in this directory to allow uploading to HF Hub
@@ -19,7 +20,7 @@ from .ultravox_config import LossFunction
 from .ultravox_config import UltravoxConfig
 
 
-class UltravoxModel(transformers.LlamaPreTrainedModel):
+class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
     """
     The Ultravox model which consists of an audio encoder and a language model.
 
@@ -57,10 +58,8 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
 
         # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
         # FSDP throws an error if some of the layer types are not found in the model.
-        # This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"]
-        self._no_split_modules = (self.language_model._no_split_modules or []) + (
-            self.audio_tower._no_split_modules or []
-        )
+        # This would be something like ["LlamaDecoderLayer"] as we don't split audio encoder layers.
+        self._no_split_modules = self.language_model._no_split_modules
 
         self.loss_config = LossConfig()
         self.post_init()
@@ -147,6 +146,24 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         )
         return {"loss": kl_loss}
 
+    def _audio_iter(
+        self, audio_batch_size: torch.Tensor
+    ) -> Generator[Tuple[int, int], None, None]:
+        """
+        Iterate over the audio batch size and yield the batch index and audio index of each audio item.
+
+        Args:
+            audio_batch_size: A tensor of shape (B,) where B is the batch size.
+
+        Returns:
+            A generator that yields a tuple of (batch index, audio index) for each audio item.
+        """
+        audio_index = 0
+        for i_b, batch_count in enumerate(audio_batch_size):
+            for _ in range(batch_count):
+                yield i_b, audio_index
+                audio_index += 1
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -188,23 +205,22 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         # B x T -> B x T x D
         inputs_embeds = self.get_input_embeddings().forward(input_ids)
 
-        if audio_values is not None:
+        if audio_values is not None and len(audio_values) > 0:
             assert (
                 audio_token_start_idx is not None
                 and audio_token_len is not None
+                and audio_lens is not None
                 and audio_batch_size is not None
-            ), "audio_token_start_idx and audio_token_len and audio_batch_size must be provided if audio_values are provided."
+            ), "audio_token_start_idx/audio_token_len/audio_lens must be provided if audio_values are provided."
             assert (
                 len(audio_token_start_idx)
                 == len(audio_token_len)
-                == len(audio_batch_size)
-            ), "audio_token_start_idx and audio_token_len and audio_batch_size must have the same batch size."
-            assert (
-                audio_lens is not None
-            ), "audio_lens must be provided if audio_values are provided"
-            assert len(audio_lens) == len(
-                audio_values
-            ), "audio_lens must have the same batch size as audio_values."
+                == len(audio_lens)
+                == len(audio_values)
+            ), "audio_token_start_idx/audio_token_len/audio_lens/audio_values must have the same batch size."
+            assert len(audio_batch_size) == len(
+                inputs_embeds
+            ), "audio_batch_size and inputs_embeds must have the same batch size."
 
             # B x A/3200 x (D=max-audio-length-in-batch)
             audio_tower_output = self.audio_tower.forward(
@@ -215,24 +231,11 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
             audio_embeds = self.multi_modal_projector.forward(audio_tower_output)
 
             # combine audio and text embeddings
-            # audio_embeds is (B_a X T X D)
-            # inputs_embeds is (B_i X T X D)
-            # B_a >= B_i because B_a includes all audio chunks.
-            # B_i == audio_token_start_idx.shape[0] == audio_token_len.shape[0] == audio_batch_size.shape[0]
-            audio_ind = 0
-            for i, (start, length, batch_size) in enumerate(
-                zip(audio_token_start_idx, audio_token_len, audio_batch_size)
-            ):
-                # audio_embeds is [B1 x T1 x D_hidden, B2 x T2 x D_hidden, ...]
-                # audio.shape (T1 + T2 + ..., D_hidden)
-                audio = torch.cat(
-                    [audio_embeds[k] for k in range(audio_ind, audio_ind + batch_size)],
-                    dim=0,
-                )
-                length = min(length, audio.shape[1])
-                inputs_embeds[i, start : start + length] = audio[:length]
-
-                audio_ind += batch_size
+            for i_b, i_a in self._audio_iter(audio_batch_size):
+                start_idx = audio_token_start_idx[i_a]
+                token_len = audio_token_len[i_a]
+                item_embedding = audio_embeds[i_a][:token_len]
+                inputs_embeds[i_b][start_idx : start_idx + token_len] = item_embedding
 
         lm_output = self.language_model.forward(
             inputs_embeds=inputs_embeds,
@@ -424,13 +427,17 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         if state_dict is None:
             state_dict = super().state_dict()
 
-        named_params = dict(self.named_parameters())
+        trainable_params = {k for k, v in self.named_parameters() if v.requires_grad}
+        # normalize the keys to match the original model
+        # Example: audio_tower.base_model.model.layers.0._fsdp_wrapped_module.self_attn.k_proj.lora_B.default.weight
+        trainable_params = {
+            k.replace("_fsdp_wrapped_module.", "") for k in trainable_params
+        }
 
         state_dict = {
             k: v
             for k, v in state_dict.items()
-            if k in self.keep_params
-            or (k in named_params and named_params[k].requires_grad)
+            if k in self.keep_params or k in trainable_params
         }
 
         return state_dict
@@ -476,7 +483,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
 
 # TODO: refactor common parts to a shared module
 def is_cache_empty(
-    past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]]
+    past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]],
 ) -> bool:
     """
     Check if the cache is empty.
@@ -512,12 +519,8 @@ def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:
 
 class StackAudioFrames(nn.Module):
     """
-    Stack the audio embedding frames to reduce the sequence length by a factor of `stack_factor`.
-
-    The number of output frames will be `ceil(T / stack_factor) + 1` where `T` is the number of input frames.
-    NOTE: the extra +1 is intentional: in case the number of audio tokens are over-estimated by the processor,
-    we want to make sure `processor.audio_token_replacement` (i.e. EOS) doesn't get leaked into the middle of embeddings.
-    In most cases this extra padding will get removed in the model's forward function so it has no effect.
+    Stack the audio embedding frames to reduce the sequence length by a factor
+    of `stack_factor`.
     """
 
     def __init__(self, stack_factor: int = 8):
@@ -527,7 +530,7 @@ class StackAudioFrames(nn.Module):
     def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor:
         B, T, C = audio_embeds.shape
         T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor
-        audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T + self.stack_factor))
+        audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T))
         B, T, C = audio_embeds.shape
         audio_embeds = audio_embeds.view(
             B, T // self.stack_factor, C * self.stack_factor
@@ -700,7 +703,6 @@ class ModifiedWhisperEncoder(
             attention_mask = self.get_extended_attention_mask(
                 attention_mask,
                 None,
-                device=hidden_states.device,
                 dtype=hidden_states.dtype,
            )
 
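To make the new merge path concrete, here is a small standalone sketch (toy shapes, not taken from the repository) of how `audio_batch_size` routes projected audio chunks back to their text rows; `audio_iter` below mirrors the `_audio_iter` helper added in this commit:

```python
import torch

def audio_iter(audio_batch_size: torch.Tensor):
    # Mirrors UltravoxModel._audio_iter: yields (text row index, audio chunk index).
    audio_index = 0
    for i_b, batch_count in enumerate(audio_batch_size):
        for _ in range(int(batch_count)):
            yield i_b, audio_index
            audio_index += 1

# Toy batch: 2 text rows; row 0 owns 2 audio chunks, row 1 owns 1.
D = 4
inputs_embeds = torch.zeros(2, 20, D)            # (text batch, text length, hidden)
audio_embeds = torch.randn(3, 6, D)              # (total chunks, audio frames, hidden)
audio_batch_size = torch.tensor([2, 1])          # chunks per text row, sums to 3
audio_token_start_idx = torch.tensor([3, 9, 5])  # one entry per chunk
audio_token_len = torch.tensor([6, 4, 6])        # one entry per chunk

for i_b, i_a in audio_iter(audio_batch_size):
    start_idx = audio_token_start_idx[i_a]
    token_len = audio_token_len[i_a]
    inputs_embeds[i_b][start_idx : start_idx + token_len] = audio_embeds[i_a][:token_len]
```

Each chunk keeps its own start index and token length, so a long clip split by the processor simply writes several consecutive chunks into the same text row.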
ultravox_processing.py CHANGED
@@ -1,5 +1,5 @@
 import dataclasses
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 import torch
@@ -15,8 +15,13 @@ class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
     include_alt_fields: bool = False
 
     def __call__(self, features, *args, **kwargs):
-        audio_values = [f.pop("audio_values", None) for f in features]
-        audio_lens = [f.pop("audio_lens", None) for f in features]
+        audio_values = [x for f in features for x in f.pop("audio_values", [])]
+        audio_lens = [x for f in features for x in f.pop("audio_lens", [])]
+        audio_token_len = [x for f in features for x in f.pop("audio_token_len", [])]
+        audio_token_start_idx = [
+            x for f in features for x in f.pop("audio_token_start_idx", [])
+        ]
+
         if self.include_alt_fields:
             # these fields are hard-coded in the transformer data collator, so they need special handling before calling the super method
             alt_features = [
@@ -35,10 +40,14 @@ class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
             batch["alt_attention_mask"] = alt_batch["attention_mask"]
             batch["alt_labels"] = alt_batch["labels"]
 
+        batch["audio_token_start_idx"] = torch.stack(audio_token_start_idx)
+        batch["audio_lens"] = torch.stack(audio_lens)
+        batch["audio_token_len"] = torch.stack(audio_token_len)
+
         # Pad the last dimension of all audio_values to the same length, with 0s on the right.
-        if audio_values and audio_values[0] is not None:
+        if audio_values:
             max_len = max([x.shape[-1] for x in audio_values])
-            batch["audio_values"] = torch.cat(
+            batch["audio_values"] = torch.stack(
                 [F.pad(x, (0, max_len - x.shape[-1])) for x in audio_values]
             )
             if self.tokenizer.padding_side == "left":
@@ -46,11 +55,12 @@ class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
                     [f["input_ids"].shape[-1] for f in features]
                 )
                 displacement = batch["input_ids"].shape[-1] - input_ids_lens
+                displacement = displacement.repeat_interleave(
+                    batch["audio_batch_size"].squeeze(-1)
+                )
                 batch["audio_token_start_idx"] += displacement.to(
                     batch["audio_token_start_idx"].device
                 )
-        # batch["audio_lens"].shape = (B,)
-        batch["audio_lens"] = torch.cat(audio_lens)
         return batch
 
 
@@ -64,11 +74,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
     """
 
     attributes = ["audio_processor", "tokenizer"]
-    audio_processor_class = (
-        "Wav2Vec2Processor",
-        "SeamlessM4TFeatureExtractor",
-        "WhisperProcessor",
-    )
+    audio_processor_class = ("WhisperProcessor",)
    tokenizer_class = (
        "PreTrainedTokenizer",
        "PreTrainedTokenizerFast",
@@ -82,7 +88,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         audio_processor=None,
         tokenizer=None,
         audio_padding: str = "longest",
-        encoder_ds_factor: int = 320,
+        encoder_ds_factor: int = 2,
         stack_factor: int = 8,
         audio_placeholder: str = "<|audio|>",
         # Defaults to whisper encoder context size
@@ -93,8 +99,8 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             audio_processor: The audio processor for the audio encoder.
             tokenizer: The tokenizer for the language model.
             audio_padding: The padding strategy for the audio encoder.
-            encoder_ds_factor: The downsample factor of the audio encoder.
             stack_factor: The factor by which the audio encoder output is stacked in the multimodal projector.
+            encoder_ds_factor: The downsampling factor of the audio encoder.
             audio_placeholder: The placeholder for the audio in the text.
             audio_context_size: The maximum number of frames that the audio encoder can handle.
         """
@@ -102,11 +108,12 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         self.encoder_ds_factor = encoder_ds_factor
         self.stack_factor = stack_factor
         self.audio_placeholder = audio_placeholder
-        self.audio_token_replacement = tokenizer.eos_token
         self.audio_context_size = audio_context_size
         assert (
-            self.audio_token_replacement is not None
+            tokenizer.eos_token is not None
         ), "The tokenizer has no EOS token. Cannot recover."
+        self.vocab = tokenizer.get_vocab()
+        self.audio_token_replacement = tokenizer.eos_token
         if tokenizer.pad_token_id is None:
             tokenizer.pad_token_id = tokenizer.eos_token_id
 
@@ -120,7 +127,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         audio_processor = transformers.AutoProcessor.from_pretrained(
             config.audio_model_id
             or config.audio_config._name_or_path
-            or "facebook/wav2vec2-base-960h"
+            or "openai/whisper-tiny"
         )
 
         tokenizer = transformers.AutoTokenizer.from_pretrained(
@@ -135,65 +142,100 @@
             stack_factor=config.stack_factor,
         )
 
-    def _chunk_and_pad_audio(self, audio_values: torch.Tensor) -> Dict[str, Any]:
+    def _chunk_and_pad_audio(
+        self,
+        audio_values: torch.Tensor,
+        audio_lens: torch.Tensor,
+        include_audio_num_chunks: bool = False,
+    ) -> Dict[str, Any]:
         """
-        Processes the audio tensor by chunking it according to the audio_context_size,
+        Processes the audio batch by chunking any items in the batch according to the audio_context_size,
         padding the last chunk if needed, and returns a dictionary with updated audio data.
 
         Args:
             audio_values (torch.Tensor): A tensor of audio values (e.g., in B, D, T format).
+            audio_lens (torch.Tensor): A tensor of audio lengths.
 
         Returns:
            Dict[str, Any]: Dictionary with the following keys:
                - "audio_values": The concatenated audio tensor after chunking and padding.
-                - "audio_lens": List of lengths (as torch.Tensor) for each chunk.
-                - "audio_batch_size": A list with one integer representing the number of chunks.
+                - "audio_lens": Tensor of lengths for each chunk.
+                - "audio_is_continuation": Tensor of booleans indicating if the chunk is a continuation of the previous chunk.
+                - "audio_batch_size": A Tensor with one integer representing the number of chunks.
+
         """
-        result: Dict[str, Any] = {}
-        if self.audio_context_size and audio_values.shape[-1] > self.audio_context_size:
-            audio_chunks = list(
-                torch.split(audio_values, self.audio_context_size, dim=-1)
+        chunked_audio_values: List[torch.Tensor] = []
+        chunked_audio_lens: List[int] = []
+        is_continuation_list: List[bool] = []
+        num_chunks: List[int] = []
+        context_size = self.audio_context_size or audio_values.shape[-1]
+
+        for i in range(audio_values.shape[0]):  # iterate over the batch
+            num_chunks.append(int(np.ceil(audio_lens[i] / context_size)))
+            for offset in range(0, audio_lens[i], context_size):
+                is_continuation = offset > 0
+                chunk = audio_values[i, :, offset : offset + context_size]
+                if is_continuation and chunk.shape[-1] < context_size:
+                    # N.B. We only need to pad continuation chunks. If none of the samples require chunking, the
+                    # batch might not (need to) be padded all the way to the audio_context_size, in which case
+                    # we've already included the padding above. On the other hand, if we have any continuation
+                    # chunks we know that the batch needs to be padded to audio_context_size because that's what
+                    # we're slicing to.
+                    chunk = F.pad(chunk, (0, context_size - chunk.shape[-1]))
+                chunked_audio_values.append(chunk)
+                chunked_audio_lens.append(
+                    min(int(audio_lens[i].item()) - offset, context_size)
+                )
+                is_continuation_list.append(is_continuation)
+
+        data = {
+            "audio_values": torch.stack(chunked_audio_values, dim=0),
+            "audio_lens": torch.tensor(
+                chunked_audio_lens, dtype=torch.int64, device=audio_values.device
+            ),
+            "audio_is_continuation": torch.tensor(
+                is_continuation_list, dtype=torch.bool, device=audio_values.device
+            ),
+            "audio_batch_size": torch.tensor(
+                [len(chunked_audio_values)], device=audio_values.device
+            ),
+        }
+        if include_audio_num_chunks:
+            data["audio_num_chunks"] = torch.tensor(
+                num_chunks, dtype=torch.int64, device=audio_values.device
            )
-            valid_lengths = [chunk.shape[-1] for chunk in audio_chunks]
-            result = {
-                "audio_lens": [torch.as_tensor(length) for length in valid_lengths]
-            }
-            # Pad the last chunk to the full context length if needed.
-            last_chunk = audio_chunks[-1]
-            pad_size = self.audio_context_size - last_chunk.shape[-1]
-            if pad_size > 0:
-                audio_chunks[-1] = F.pad(last_chunk, (0, pad_size))
-        else:
-            audio_chunks = [audio_values]
-            result = {"audio_lens": [torch.as_tensor(audio_values.shape[-1])]}
-        result["audio_values"] = torch.cat(audio_chunks)
-        result["audio_batch_size"] = [result["audio_values"].shape[0]]
-        return result
+        return data
 
     def __call__(
         self,
         text: Optional[str] = None,
         audio: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        audios: Optional[
+            Union[
+                List[Union[np.ndarray, torch.Tensor]], Union[np.ndarray, torch.Tensor]
+            ]
+        ] = None,
         sampling_rate: Optional[int] = None,
         return_tensors: Optional[
             Union[str, transformers.TensorType]
         ] = transformers.TensorType.PYTORCH,
+        include_audio_num_chunks: bool = False,
        **kwargs,
    ) -> transformers.BatchFeature:
        """
        Main method to prepare for the model one text sequence and audio. This method forwards the `text`
        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the audio(s), this method forwards the `audio`, `sampling_rate` and `kwargs` arguments to
-        audio processor's [`~Wav2Vec2Processor.__call__`] if `audio` is not `None`. Please refer to the docstring
+        audio processor's [`~WhisperProcessor.__call__`] if `audio` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `List[str]`):
                The sequence to be encoded. Sequence can be a string or (pretokenized string).
            audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The audio to be prepared. Audio can be NumPy array or PyTorch tensor. In case of a
-                NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels, and T the
-                sample length of the audio.
+                The audio to be prepared. Audio can be a single-channel (1-dimensional) NumPy array or PyTorch tensor.
+            audios (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                A list or two dimensional array of audio to be prepared.
            sampling_rate (`int`, *optional*, defaults to 16000):
                Sampling rate of the input audio. We expect 16kHz audio. Don't change this value unless you know what
                you are doing.
@@ -217,66 +259,105 @@
                Returned when `audio` is not `None`.
            - **audio_token_start_idx** -- The index in the tokenized text where the audio starts. Returned when `audio` is not `None`.
        """
-        # TODO: Add support for multiple audio and text inputs.
-        data: Dict[str, Any] = {}
-        audio_embed_frames = 0
-        if audio is not None and len(audio) > 0:
-            audio_len = audio.shape[-1]
-            # It's guaranteed that the number of frames is less than or equal to this amount.
-            # For Whisper this is exact AFAICT, but for Wav2Vec2 it's an upper bound.
-            # Currently, StackAudioFrames makes sure an over-estimation won't cause issues by padding the audio embeddings.
-            nb_encoder_frames = int(round(audio_len / self.encoder_ds_factor + 1e-4))
-            audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor))
-            data["audio_token_len"] = [audio_embed_frames]
+        # TODO: Add support for multiple text inputs.
+        if audio is not None and audios is not None:
+            raise ValueError("Only one of `audio` or `audios` should be provided.")
+        elif audio is not None:
+            audios = audio if isinstance(audio, list) or audio.ndim == 2 else [audio]
+        elif audios is None:
+            audios = []
+
+        data = {}
+        audio_is_continuation = []
+        if len(audios) > 0:
+            audios = [x.numpy() if isinstance(x, torch.Tensor) else x for x in audios]
+
+            # Pad out each audio to at least 2 hops (the minimum required by the processor).
+            hop_length = self.audio_processor.feature_extractor.hop_length
+            audios = [
+                (
+                    np.pad(x, (0, 2 * hop_length - len(x)), mode="constant")
+                    if len(x) < 2 * hop_length
+                    else x
+                )
+                for x in audios
+            ]
 
             # Main audio processing. The processor is model-specific.
-            x = self.audio_processor(
-                audio,
+            x: transformers.BatchFeature = self.audio_processor(
+                audios,
                sampling_rate=sampling_rate,
                padding="longest",
+                pad_to_multiple_of=hop_length,  # The attention mask effectively gets padded to the hop length, so pad the audio to be consistent.
+                truncation=False,
                return_attention_mask=True,
                **kwargs,
            )

-            if "input_features" in x:
-                audio_values = x.input_features
-            else:
-                audio_values = x.input_values
+            data.update(
+                self._chunk_and_pad_audio(
+                    audio_values=torch.as_tensor(
+                        x.input_features if "input_features" in x else x.input_values
+                    ),
+                    audio_lens=torch.as_tensor(x.attention_mask).sum(-1),
+                    include_audio_num_chunks=include_audio_num_chunks,
+                )
+            )
 
-            audio_values = torch.tensor(audio_values)
-            chunk_and_pad_results = self._chunk_and_pad_audio(audio_values)
-            data["audio_values"] = chunk_and_pad_results["audio_values"]
-            data["audio_lens"] = chunk_and_pad_results["audio_lens"]
-            data["audio_batch_size"] = chunk_and_pad_results["audio_batch_size"]
+            audio_is_continuation = data.pop("audio_is_continuation")
+            data["audio_token_len"] = torch.ceil(
+                data["audio_lens"] / (self.encoder_ds_factor * self.stack_factor)
+            ).to(dtype=torch.int)
 
        if text is not None:
-            assert isinstance(
-                text, str
-            ), "Text must be a string. Batch mode not supported yet."
-            if self.audio_placeholder in text:
-                if "audio_token_len" not in data:
-                    raise ValueError(
-                        f"audio must be provided when using audio placeholder ({self.audio_placeholder}) in text."
-                    )
-
-                start_idx = len(
-                    self.tokenizer.encode(
-                        text[: text.index(self.audio_placeholder)],
-                        add_special_tokens=False,
-                    )
-                )
-                data["audio_token_start_idx"] = [start_idx]
-
-                # Replace the audio placeholder with the audio token.
-                # e.g. "Transcribe\n<|audio|>" -> "Transcribe\n</s></s></s></s></s></s></s></s>"
-                # where the number of </s> is the number of audio frames.
-                text = text.replace(
-                    self.audio_placeholder,
-                    self.audio_token_replacement * audio_embed_frames,
-                )
+            if not isinstance(text, str):
+                raise ValueError("Text must be a string. Batch mode not supported yet.")
 
            # Special tokens like BOS should already have been added by the caller.
-            data.update(self.tokenizer([text], add_special_tokens=False, **kwargs))
+            tokenized_parts = self.tokenizer(
+                text.split(
+                    "<|audio|>"  # The placeholder isn't part of the vocabulary, so split the text around it.
+                ),
+                add_special_tokens=False,
+                **kwargs,
+            )
+
+            audio_token_start_idx = []
+            placeholder_index = -1
+            split_input_ids = tokenized_parts["input_ids"]
+            input_ids: List[int] = []
+
+            audio_token_replacement_token_id = self.vocab[self.audio_token_replacement]
+
+            for i, token_len in enumerate(data.get("audio_token_len", [])):
+                if not audio_is_continuation[i]:
+                    placeholder_index += 1
+                    if placeholder_index >= len(split_input_ids):
+                        raise ValueError(
+                            f"Text contains too few audio placeholders. (Expected {len(audios)} placeholders)"
+                        )
+
+                    input_ids.extend(split_input_ids[placeholder_index])
+
+                audio_token_start_idx.append(len(input_ids))
+
+                input_ids.extend([audio_token_replacement_token_id] * token_len)
+
+            # Include any tokens after the last audio.
+            placeholder_index += 1
+            if placeholder_index != len(split_input_ids) - 1:
+                raise ValueError(
+                    f"Text contains too many audio placeholders. (Expected {len(audios)} placeholders)"
+                )
+            input_ids.extend(split_input_ids[placeholder_index])
+
+            if "audio_token_len" in data:
+                data["audio_token_start_idx"] = torch.as_tensor(audio_token_start_idx)
+
+            data["input_ids"] = [input_ids]
+            data["attention_mask"] = [[1] * len(input_ids)]
+
+            # Ensure that there are no audio placeholders after the last audio.
 
        return transformers.BatchFeature(data=data, tensor_type=return_tensors)