File size: 6,961 Bytes
12a0dd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Copyright 2024 The HuggingFace Inc. team.
# SPDX-License-Identifier: Apache-2.0

"""

Image/Text processor class for SigLIP.

"""

from typing import List, Optional, Union

from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from transformers.utils import TensorType


class SiglipProcessor(ProcessorMixin):
    r"""
    Bundles a Siglip image processor and a Siglip tokenizer behind a single processor object.

    Text is handled by the wrapped tokenizer and images by the wrapped image processor; see
    [`~SiglipProcessor.__call__`] for encoding and [`~SiglipProcessor.decode`] /
    [`~SiglipProcessor.batch_decode`] for turning token ids back into text.

    Args:
        image_processor ([`SiglipImageProcessor`]):
            The image processor. Required.
        tokenizer ([`SiglipTokenizer`]):
            The tokenizer. Required.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "SiglipImageProcessor"
    tokenizer_class = "SiglipTokenizer"

    def __init__(self, image_processor, tokenizer):
        # All of the wiring is delegated to `ProcessorMixin`.
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        images: ImageInput = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: int = None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ) -> BatchFeature:
        """
        Prepare one or several sequence(s) and/or image(s) for the model.

        When `text` is not `None` it is passed to the tokenizer together with `return_tensors`, `padding`,
        `truncation` and `max_length`. When `images` is not `None` it is passed to the image processor together
        with `return_tensors`. Please refer to the docstrings of SiglipTokenizer's [`~SiglipTokenizer.__call__`] and
        SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] for more information.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Padding strategy forwarded to the tokenizer:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            truncation (`bool`, `str` or [`~utils.TruncationStrategy`], *optional*):
                Truncation setting forwarded to the tokenizer; activates truncation to cut input sequences longer
                than `max_length` to `max_length`.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `TensorType.PYTORCH`):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.

            When `text` is given, the object returned is the tokenizer's own output (with `pixel_values` added to it
            if `images` is given as well); only the images-only path builds a new [`BatchFeature`].

        Raises:
            ValueError: If both `text` and `images` are `None`.
        """
        has_text = text is not None
        has_images = images is not None

        if not (has_text or has_images):
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        # Images only: wrap the image processor output in a new BatchFeature.
        if not has_text:
            image_inputs = self.image_processor(images, return_tensors=return_tensors)
            return BatchFeature(data=dict(**image_inputs), tensor_type=return_tensors)

        # Text is present: tokenize first, then optionally attach the pixel values to the same object.
        text_inputs = self.tokenizer(
            text,
            return_tensors=return_tensors,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
        )
        if has_images:
            image_inputs = self.image_processor(images, return_tensors=return_tensors)
            text_inputs["pixel_values"] = image_inputs.pixel_values
        return text_inputs

    def decode(self, *args, **kwargs):
        """
        Forward every argument unchanged to the wrapped tokenizer's `decode` method (see
        [`~PreTrainedTokenizer.decode`]) and return its result.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """
        Forward every argument unchanged to the wrapped tokenizer's `batch_decode` method (see
        [`~PreTrainedTokenizer.batch_decode`]) and return its result.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Siglip, T5->Siglip
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))