Zhiding committed
Commit 544f248
1 Parent(s): f465412
all_results.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.18843612657266906,
4
+ "train_runtime": 10275.4061,
5
+ "train_samples": 9129380,
6
+ "train_samples_per_second": 888.469,
7
+ "train_steps_per_second": 1.132
8
+ }
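
The training summary above is internally consistent: 888.469 samples/s × 10,275.4 s ≈ 9,129,380 samples, i.e. one full pass over the training set for the single reported epoch. A minimal sketch for re-checking that locally (the relative path is an assumption about where the file sits):

```python
import json

# Load the Trainer summary shown above (path is an assumption).
with open("all_results.json") as f:
    results = json.load(f)

# Throughput * wall-clock runtime should roughly reproduce train_samples.
expected_samples = results["train_samples_per_second"] * results["train_runtime"]
print(f"reported: {results['train_samples']}, throughput x runtime: {expected_samples:.0f}")
```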
config.json ADDED
@@ -0,0 +1 @@
1
+ {"_commit_hash": null, "_name_or_path": "./work_dirs/commercial_eagle_128gpus_bs1024_stage1_ptv1_siglip_llama3_2_1B", "architectures": ["InternVLChatModel"], "auto_map": {"AutoConfig": "configuration_internvl_chat.InternVLChatConfig", "AutoModel": "modeling_internvl_chat.InternVLChatModel"}, "downsample_ratio": 0.5, "dynamic_image_size": true, "force_image_size": 448, "llm_config": {"_name_or_path": "./pretrained/Llama-3_2-1B-Instruct", "add_cross_attention": false, "architectures": ["LlamaForCausalLM"], "attention_bias": false, "attention_dropout": 0.0, "attn_implementation": "flash_attention_2", "auto_map": {"AutoConfig": "configuration_llama.LlamaConfig", "AutoModel": "modeling_llama.LlamaModel", "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM"}, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": 128000, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": [128001, 128008, 128009], "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "head_dim": 64, "hidden_act": "silu", "hidden_size": 2048, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "initializer_range": 0.02, "intermediate_size": 8192, "is_decoder": false, "is_encoder_decoder": false, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "length_penalty": 1.0, "max_length": 20, "max_position_embeddings": 131072, "min_length": 0, "mlp_bias": false, "model_type": "llama", "my_rope_scaling": {"factor": 32.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3"}, "no_repeat_ngram_size": 0, "num_attention_heads": 32, "num_beam_groups": 1, "num_beams": 1, "num_hidden_layers": 16, "num_key_value_heads": 8, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "pretraining_tp": 1, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "rms_norm_eps": 1e-05, "rope_scaling": {"factor": 32.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3", "type": "llama3"}, "rope_theta": 500000.0, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torch_dtype": "bfloat16", "torchscript": false, "transformers_version": "4.37.2", "typical_p": 1.0, "use_bfloat16": false, "use_cache": false, "vocab_size": 128267}, "loss_version": "efficient_v2_cp_head", "max_dynamic_patch": 12, "min_dynamic_patch": 1, "mlp_checkpoint": false, "model_type": "internvl_chat", "pad2square": false, "pre_feature_reduction": false, "ps_version": "v2", "select_layer": -1, "template": "llama3-chat", "torch_dtype": "bfloat16", "transformers_version": null, "use_backbone_lora": 0, "use_llm_lora": 0, "use_thumbnail": true, "vision_config": {"_name_or_path": "", "add_cross_attention": false, "architectures": null, "attention_dropout": 0.0, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, 
"do_sample": false, "drop_path_rate": 0.1, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "gelu_pytorch_tanh", "hidden_size": 1152, "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, "image_size": 448, "intermediate_size": 4304, "is_decoder": false, "is_encoder_decoder": false, "label2id": {"LABEL_0": 0, "LABEL_1": 1}, "layer_norm_eps": 1e-06, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "siglip_vision_model", "no_repeat_ngram_size": 0, "num_attention_heads": 16, "num_beam_groups": 1, "num_beams": 1, "num_channels": 3, "num_hidden_layers": 27, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "patch_size": 14, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torch_dtype": null, "torchscript": false, "transformers_version": "4.37.2", "typical_p": 1.0, "use_bfloat16": false}}
configuration_eagle_chat.py ADDED
@@ -0,0 +1,104 @@
1
+ # --------------------------------------------------------
2
+ # Eagle2
3
+ # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The Apache License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import copy
8
+
9
+ from transformers import AutoConfig, LlamaConfig
10
+ from transformers.configuration_utils import PretrainedConfig
11
+ from transformers.utils import logging
12
+ from .configuration_siglip import SiglipVisionConfig
13
+ from .configuration_qwen2 import Qwen2Config
14
+ from .configuration_multi_backbone_channel_concatentation_model import MultiBackboneChannelConcatenationVisionModelConfig
15
+ logger = logging.get_logger(__name__)
16
+
17
+
18
+ class Eagle2ChatConfig(PretrainedConfig):
19
+ model_type = 'eagle_chat'
20
+ is_composition = True
21
+
22
+ def __init__(
23
+ self,
24
+ vision_config=None,
25
+ llm_config=None,
26
+ use_backbone_lora=0,
27
+ use_llm_lora=0,
28
+ select_layer=-1,
29
+ force_image_size=None,
30
+ downsample_ratio=0.5,
31
+ template=None,
32
+ dynamic_image_size=False,
33
+ use_thumbnail=False,
34
+ min_dynamic_patch=1,
35
+ max_dynamic_patch=6,
36
+ mlp_checkpoint=True,
37
+ pre_feature_reduction=False,
38
+ keep_aspect_ratio=False,
39
+ **kwargs):
40
+ super().__init__(**kwargs)
41
+
42
+ if vision_config is None:
43
+ vision_config = {}
44
+ logger.info('vision_config is None. Initializing Vision Encoders with default values.')
45
+
46
+ if llm_config is None:
47
+ llm_config = {}
48
+ logger.info('llm_config is None. Initializing the LLM config with default values')
49
+
50
+ if vision_config['model_type'] == 'siglip_vision_model':
51
+ self.vision_config = SiglipVisionConfig(**vision_config)
52
+ elif vision_config['model_type'].startswith("MOB"):
53
+ self.vision_config = MultiBackboneChannelConcatenationVisionModelConfig(**vision_config)
54
+ else:
55
+ raise ValueError('Unsupported model_type: {}'.format(vision_config['model_type']))
56
+
57
+ if llm_config['architectures'][0] == 'LlamaForCausalLM':
58
+ self.llm_config = LlamaConfig(**llm_config)
59
+ elif llm_config['architectures'][0] == 'Qwen2ForCausalLM':
60
+ self.llm_config = Qwen2Config(**llm_config)
61
+ else:
62
+ raise ValueError('Unsupported architecture: {}'.format(llm_config['architectures'][0]))
63
+ self.use_backbone_lora = use_backbone_lora
64
+ self.use_llm_lora = use_llm_lora
65
+ self.select_layer = select_layer
66
+ self.force_image_size = force_image_size
67
+ self.downsample_ratio = downsample_ratio
68
+ self.template = template
69
+ self.dynamic_image_size = dynamic_image_size
70
+ self.use_thumbnail = use_thumbnail
71
+ self.min_dynamic_patch = min_dynamic_patch
72
+ self.max_dynamic_patch = max_dynamic_patch
73
+ self.mlp_checkpoint = mlp_checkpoint
74
+ self.pre_feature_reduction = pre_feature_reduction
75
+ self.keep_aspect_ratio = keep_aspect_ratio
76
+ logger.info(f'keep_aspect_ratio: {self.keep_aspect_ratio}')
77
+ logger.info(f'vision_select_layer: {self.select_layer}')
78
+ logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
79
+ logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
80
+
81
+ def to_dict(self):
82
+ """
83
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
84
+
85
+ Returns:
86
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
87
+ """
88
+ output = copy.deepcopy(self.__dict__)
89
+ output['vision_config'] = self.vision_config.to_dict()
90
+ output['llm_config'] = self.llm_config.to_dict()
91
+ output['model_type'] = self.__class__.model_type
92
+ output['use_backbone_lora'] = self.use_backbone_lora
93
+ output['use_llm_lora'] = self.use_llm_lora
94
+ output['select_layer'] = self.select_layer
95
+ output['force_image_size'] = self.force_image_size
96
+ output['downsample_ratio'] = self.downsample_ratio
97
+ output['template'] = self.template
98
+ output['dynamic_image_size'] = self.dynamic_image_size
99
+ output['use_thumbnail'] = self.use_thumbnail
100
+ output['min_dynamic_patch'] = self.min_dynamic_patch
101
+ output['max_dynamic_patch'] = self.max_dynamic_patch
102
+ output['keep_aspect_ratio'] = self.keep_aspect_ratio
103
+
104
+ return output
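
A hedged sketch of how `Eagle2ChatConfig` is meant to be constructed from plain dictionaries, based only on the constructor above: `vision_config` must carry a `model_type` that is either `siglip_vision_model` or starts with `MOB`, and `llm_config` must name `LlamaForCausalLM` or `Qwen2ForCausalLM` in `architectures`. The import path is a placeholder (the module uses relative imports, so in practice it is loaded as part of the checkpoint's remote-code package), and the sub-config values are illustrative rather than copied from a released checkpoint:

```python
from eagle_repo.configuration_eagle_chat import Eagle2ChatConfig  # placeholder package path

# Minimal illustrative sub-configs; real checkpoints ship the full dicts inside config.json.
vision_config = {"model_type": "siglip_vision_model", "image_size": 448, "patch_size": 14}
llm_config = {"architectures": ["LlamaForCausalLM"], "hidden_size": 2048, "vocab_size": 128267}

config = Eagle2ChatConfig(
    vision_config=vision_config,
    llm_config=llm_config,
    template="llama3-chat",
    dynamic_image_size=True,
    use_thumbnail=True,
    max_dynamic_patch=12,
)
print(config.to_dict()["vision_config"]["model_type"])  # siglip_vision_model
```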
configuration_multi_backbone_channel_concatentation_model.py ADDED
@@ -0,0 +1,86 @@
1
+ # --------------------------------------------------------
2
+ # Eagle2
3
+ # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The Apache License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import os
8
+ from typing import Union
9
+
10
+ from transformers.configuration_utils import PretrainedConfig
11
+ from transformers.utils import logging
12
+ from .configuration_siglip import SiglipVisionConfig
13
+ logger = logging.get_logger(__name__)
14
+
15
+
16
+ class MultiBackboneChannelConcatenationVisionModelConfig(PretrainedConfig):
17
+ r"""
18
+ This is the configuration class to store the configuration of a [`MultiBackboneChannelConcatenationVisionModelConfig`]. It is used to
19
+ instantiate a vision encoder according to the specified arguments, defining the model architecture.
20
+
21
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
22
+ documentation from [`PretrainedConfig`] for more information.
23
+
24
+ Args:
25
+ vision_path (str): Path to the vision model or its configuration.
26
+ mm_vision_select_layer (int, optional): The layer to select from the vision model
27
+ for multi-modal processing. Defaults to -2.
28
+ grid_size (int, optional): The size of the grid for vision processing. Defaults to 32.
29
+ **kwargs: Additional keyword arguments to be passed to the parent PretrainedConfig.
30
+
31
+ """
32
+
33
+ model_type = 'MOB'
34
+
35
+ def __init__(
36
+ self,
37
+ vision_path,
38
+ mm_vision_select_layer=-2,
39
+ grid_size=32,
40
+ input_image_size=1024,
41
+ hidden_size='lazy_calculation',
42
+ image_size=1024,
43
+ freeze_backbones=None,
44
+ moe_version_type=None,
45
+ delay_load=False,
46
+ convnext_img_size=1024,
47
+ vision_tower_siglip_path=None,
48
+ vision_tower_convnext_path='convnext_xxlarge.clip_laion2b_soup',
49
+ normalize_type='siglip',
50
+ **kwargs,
51
+ ):
52
+ super().__init__(**kwargs)
53
+
54
+ self.normalize_type = normalize_type
55
+ self.vision_path = vision_path
56
+ self.mm_vision_select_layer = mm_vision_select_layer
57
+ self.grid_size = grid_size
58
+ self.input_image_size = input_image_size
59
+ self.image_size = image_size
60
+ self.hidden_size = hidden_size
61
+ self.freeze_backbones = freeze_backbones
62
+ self.moe_version_type = moe_version_type
63
+ self.delay_load = delay_load
64
+ self.convnext_img_size = convnext_img_size
65
+ # other args, to make it compatible with eagle-next
66
+ self.vision_tower_siglip_path = vision_tower_siglip_path
67
+ self.vision_tower_convnext_path = vision_tower_convnext_path
68
+ self.vision_tower = self.vision_path[4:] # remove `MOB:` prefix
69
+
70
+ # asserts
71
+ assert image_size == input_image_size, f"input_image_size ({input_image_size}) != image_size ({image_size})"
72
+
73
+ @classmethod
74
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
75
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
76
+
77
+ if 'vision_config' in config_dict:
78
+ config_dict = config_dict['vision_config']
79
+
80
+ if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
81
+ logger.warning(
82
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
83
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
84
+ )
85
+
86
+ return cls.from_dict(config_dict, **kwargs)
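
The `model_type = 'MOB'` branch above is selected by `Eagle2ChatConfig` whenever `vision_config['model_type']` starts with `MOB`, and `vision_path` is expected to carry a `MOB:` prefix that `__init__` strips into `vision_tower`. A small sketch of that behaviour; the suffix after `MOB:` is a made-up placeholder (its real format is not documented here), and the import path again assumes the repo is importable as a package:

```python
from eagle_repo.configuration_multi_backbone_channel_concatentation_model import (  # placeholder package path
    MultiBackboneChannelConcatenationVisionModelConfig,
)

cfg = MultiBackboneChannelConcatenationVisionModelConfig(
    vision_path="MOB:siglip-and-convnext",  # placeholder suffix after the required "MOB:" prefix
    input_image_size=1024,
    image_size=1024,  # must equal input_image_size (enforced by the assert in __init__)
)
print(cfg.vision_tower)  # "siglip-and-convnext" -- the four-character "MOB:" prefix is removed
```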
configuration_qwen2.py ADDED
@@ -0,0 +1,149 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Qwen2 model configuration"""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig
18
+ from transformers.utils import logging
19
+
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+ QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
24
+ "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json",
25
+ }
26
+
27
+
28
+ class Qwen2Config(PretrainedConfig):
29
+ r"""
30
+ This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
31
+ Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
32
+ with the defaults will yield a similar configuration to that of
33
+ Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
34
+
35
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
36
+ documentation from [`PretrainedConfig`] for more information.
37
+
38
+
39
+ Args:
40
+ vocab_size (`int`, *optional*, defaults to 151936):
41
+ Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
42
+ `inputs_ids` passed when calling [`Qwen2Model`]
43
+ hidden_size (`int`, *optional*, defaults to 4096):
44
+ Dimension of the hidden representations.
45
+ intermediate_size (`int`, *optional*, defaults to 22016):
46
+ Dimension of the MLP representations.
47
+ num_hidden_layers (`int`, *optional*, defaults to 32):
48
+ Number of hidden layers in the Transformer encoder.
49
+ num_attention_heads (`int`, *optional*, defaults to 32):
50
+ Number of attention heads for each attention layer in the Transformer encoder.
51
+ num_key_value_heads (`int`, *optional*, defaults to 32):
52
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
53
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
54
+ `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
55
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
56
+ by meanpooling all the original heads within that group. For more details checkout [this
57
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
58
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
59
+ The non-linear activation function (function or string) in the decoder.
60
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
61
+ The maximum sequence length that this model might ever be used with.
62
+ initializer_range (`float`, *optional*, defaults to 0.02):
63
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
64
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
65
+ The epsilon used by the rms normalization layers.
66
+ use_cache (`bool`, *optional*, defaults to `True`):
67
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
68
+ relevant if `config.is_decoder=True`.
69
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
70
+ Whether the model's input and output word embeddings should be tied.
71
+ rope_theta (`float`, *optional*, defaults to 10000.0):
72
+ The base period of the RoPE embeddings.
73
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
74
+ Whether to use sliding window attention.
75
+ sliding_window (`int`, *optional*, defaults to 4096):
76
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
77
+ max_window_layers (`int`, *optional*, defaults to 28):
78
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
79
+ attention_dropout (`float`, *optional*, defaults to 0.0):
80
+ The dropout ratio for the attention probabilities.
81
+
82
+ ```python
83
+ >>> from transformers import Qwen2Model, Qwen2Config
84
+
85
+ >>> # Initializing a Qwen2 style configuration
86
+ >>> configuration = Qwen2Config()
87
+
88
+ >>> # Initializing a model from the Qwen2-7B style configuration
89
+ >>> model = Qwen2Model(configuration)
90
+
91
+ >>> # Accessing the model configuration
92
+ >>> configuration = model.config
93
+ ```"""
94
+
95
+ model_type = "qwen2"
96
+ keys_to_ignore_at_inference = ["past_key_values"]
97
+
98
+ def __init__(
99
+ self,
100
+ vocab_size=151936,
101
+ hidden_size=4096,
102
+ intermediate_size=22016,
103
+ num_hidden_layers=32,
104
+ num_attention_heads=32,
105
+ num_key_value_heads=32,
106
+ hidden_act="silu",
107
+ max_position_embeddings=32768,
108
+ initializer_range=0.02,
109
+ rms_norm_eps=1e-6,
110
+ use_cache=True,
111
+ tie_word_embeddings=False,
112
+ rope_theta=10000.0,
113
+ use_sliding_window=False,
114
+ sliding_window=4096,
115
+ max_window_layers=28,
116
+ attention_dropout=0.0,
117
+ attn_implementation='flash_attention_2',
118
+ **kwargs,
119
+ ):
120
+ self.vocab_size = vocab_size
121
+ self.max_position_embeddings = max_position_embeddings
122
+ self.hidden_size = hidden_size
123
+ self.intermediate_size = intermediate_size
124
+ self.num_hidden_layers = num_hidden_layers
125
+ self.num_attention_heads = num_attention_heads
126
+ self.use_sliding_window = use_sliding_window
127
+ self.sliding_window = sliding_window
128
+ self.max_window_layers = max_window_layers
129
+
130
+ self.attn_implementation = attn_implementation
131
+ if self.attn_implementation is None:
132
+ self.attn_implementation = "flash_attention_2"
133
+
134
+ # for backward compatibility
135
+ if num_key_value_heads is None:
136
+ num_key_value_heads = num_attention_heads
137
+
138
+ self.num_key_value_heads = num_key_value_heads
139
+ self.hidden_act = hidden_act
140
+ self.initializer_range = initializer_range
141
+ self.rms_norm_eps = rms_norm_eps
142
+ self.use_cache = use_cache
143
+ self.rope_theta = rope_theta
144
+ self.attention_dropout = attention_dropout
145
+
146
+ super().__init__(
147
+ tie_word_embeddings=tie_word_embeddings,
148
+ **kwargs,
149
+ )
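
Two details of this vendored `Qwen2Config` are worth noting: it exposes `attn_implementation` as a constructor argument and falls back to `'flash_attention_2'` when `None` is passed, and `num_key_value_heads` selects MHA/GQA/MQA exactly as the docstring describes. A brief sketch (assumes this file is on `sys.path`; it has no relative imports):

```python
from configuration_qwen2 import Qwen2Config

# num_key_value_heads < num_attention_heads gives grouped-query attention (GQA);
# equal values give MHA, and 1 gives MQA, as described in the docstring above.
cfg = Qwen2Config(
    hidden_size=2048,
    num_hidden_layers=24,
    num_attention_heads=16,
    num_key_value_heads=2,
    attn_implementation=None,  # this copy falls back to "flash_attention_2"
)
print(cfg.num_attention_heads // cfg.num_key_value_heads)  # 8 query heads share each KV head
print(cfg.attn_implementation)                             # flash_attention_2
```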
configuration_siglip.py ADDED
@@ -0,0 +1,302 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Siglip model configuration"""
16
+
17
+ import os
18
+ from typing import Union
19
+
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.utils import logging
22
+
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+ SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
27
+ "google/siglip-base-patch16-224": "https://huggingface.co/google/siglip-base-patch16-224/resolve/main/config.json",
28
+ }
29
+
30
+
31
+ class SiglipTextConfig(PretrainedConfig):
32
+ r"""
33
+ This is the configuration class to store the configuration of a [`SiglipTextModel`]. It is used to instantiate a
34
+ Siglip text encoder according to the specified arguments, defining the model architecture. Instantiating a
35
+ configuration with the defaults will yield a similar configuration to that of the text encoder of the Siglip
36
+ [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
37
+
38
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
39
+ documentation from [`PretrainedConfig`] for more information.
40
+
41
+ Args:
42
+ vocab_size (`int`, *optional*, defaults to 32000):
43
+ Vocabulary size of the Siglip text model. Defines the number of different tokens that can be represented by
44
+ the `inputs_ids` passed when calling [`SiglipModel`].
45
+ hidden_size (`int`, *optional*, defaults to 768):
46
+ Dimensionality of the encoder layers and the pooler layer.
47
+ intermediate_size (`int`, *optional*, defaults to 3072):
48
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
49
+ num_hidden_layers (`int`, *optional*, defaults to 12):
50
+ Number of hidden layers in the Transformer encoder.
51
+ num_attention_heads (`int`, *optional*, defaults to 12):
52
+ Number of attention heads for each attention layer in the Transformer encoder.
53
+ max_position_embeddings (`int`, *optional*, defaults to 64):
54
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
55
+ just in case (e.g., 512 or 1024 or 2048).
56
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
57
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
58
+ `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
59
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
60
+ The epsilon used by the layer normalization layers.
61
+ attention_dropout (`float`, *optional*, defaults to 0.0):
62
+ The dropout ratio for the attention probabilities.
63
+ pad_token_id (`int`, *optional*, defaults to 1):
64
+ The id of the padding token in the vocabulary.
65
+ bos_token_id (`int`, *optional*, defaults to 49406):
66
+ The id of the beginning-of-sequence token in the vocabulary.
67
+ eos_token_id (`int`, *optional*, defaults to 49407):
68
+ The id of the end-of-sequence token in the vocabulary.
69
+
70
+ Example:
71
+
72
+ ```python
73
+ >>> from transformers import SiglipTextConfig, SiglipTextModel
74
+
75
+ >>> # Initializing a SiglipTextConfig with google/siglip-base-patch16-224 style configuration
76
+ >>> configuration = SiglipTextConfig()
77
+
78
+ >>> # Initializing a SiglipTextModel (with random weights) from the google/siglip-base-patch16-224 style configuration
79
+ >>> model = SiglipTextModel(configuration)
80
+
81
+ >>> # Accessing the model configuration
82
+ >>> configuration = model.config
83
+ ```"""
84
+
85
+ model_type = "siglip_text_model"
86
+
87
+ def __init__(
88
+ self,
89
+ vocab_size=32000,
90
+ hidden_size=768,
91
+ intermediate_size=3072,
92
+ num_hidden_layers=12,
93
+ num_attention_heads=12,
94
+ max_position_embeddings=64,
95
+ hidden_act="gelu_pytorch_tanh",
96
+ layer_norm_eps=1e-6,
97
+ attention_dropout=0.0,
98
+ # This differs from `CLIPTokenizer`'s default and from openai/siglip
99
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
100
+ pad_token_id=1,
101
+ bos_token_id=49406,
102
+ eos_token_id=49407,
103
+ **kwargs,
104
+ ):
105
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
106
+
107
+ self.vocab_size = vocab_size
108
+ self.hidden_size = hidden_size
109
+ self.intermediate_size = intermediate_size
110
+ self.num_hidden_layers = num_hidden_layers
111
+ self.num_attention_heads = num_attention_heads
112
+ self.max_position_embeddings = max_position_embeddings
113
+ self.layer_norm_eps = layer_norm_eps
114
+ self.hidden_act = hidden_act
115
+ self.attention_dropout = attention_dropout
116
+
117
+ @classmethod
118
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
119
+ cls._set_token_in_kwargs(kwargs)
120
+
121
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
122
+
123
+ # get the text config dict if we are loading from SiglipConfig
124
+ if config_dict.get("model_type") == "siglip":
125
+ config_dict = config_dict["text_config"]
126
+
127
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
128
+ logger.warning(
129
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
130
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
131
+ )
132
+
133
+ return cls.from_dict(config_dict, **kwargs)
134
+
135
+
136
+ class SiglipVisionConfig(PretrainedConfig):
137
+ r"""
138
+ This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
139
+ Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
140
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
141
+ [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
142
+
143
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
144
+ documentation from [`PretrainedConfig`] for more information.
145
+
146
+ Args:
147
+ hidden_size (`int`, *optional*, defaults to 768):
148
+ Dimensionality of the encoder layers and the pooler layer.
149
+ intermediate_size (`int`, *optional*, defaults to 3072):
150
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
151
+ num_hidden_layers (`int`, *optional*, defaults to 12):
152
+ Number of hidden layers in the Transformer encoder.
153
+ num_attention_heads (`int`, *optional*, defaults to 12):
154
+ Number of attention heads for each attention layer in the Transformer encoder.
155
+ num_channels (`int`, *optional*, defaults to 3):
156
+ Number of channels in the input images.
157
+ image_size (`int`, *optional*, defaults to 224):
158
+ The size (resolution) of each image.
159
+ patch_size (`int`, *optional*, defaults to 16):
160
+ The size (resolution) of each patch.
161
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
162
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
163
+ `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
164
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
165
+ The epsilon used by the layer normalization layers.
166
+ attention_dropout (`float`, *optional*, defaults to 0.0):
167
+ The dropout ratio for the attention probabilities.
168
+
169
+ Example:
170
+
171
+ ```python
172
+ >>> from transformers import SiglipVisionConfig, SiglipVisionModel
173
+
174
+ >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
175
+ >>> configuration = SiglipVisionConfig()
176
+
177
+ >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
178
+ >>> model = SiglipVisionModel(configuration)
179
+
180
+ >>> # Accessing the model configuration
181
+ >>> configuration = model.config
182
+ ```"""
183
+
184
+ model_type = "siglip_vision_model"
185
+
186
+ def __init__(
187
+ self,
188
+ hidden_size=768,
189
+ intermediate_size=3072,
190
+ num_hidden_layers=12,
191
+ num_attention_heads=12,
192
+ num_channels=3,
193
+ image_size=224,
194
+ patch_size=16,
195
+ hidden_act="gelu_pytorch_tanh",
196
+ layer_norm_eps=1e-6,
197
+ attention_dropout=0.0,
198
+ **kwargs,
199
+ ):
200
+ super().__init__(**kwargs)
201
+
202
+ self.hidden_size = hidden_size
203
+ self.intermediate_size = intermediate_size
204
+ self.num_hidden_layers = num_hidden_layers
205
+ self.num_attention_heads = num_attention_heads
206
+ self.num_channels = num_channels
207
+ self.patch_size = patch_size
208
+ self.image_size = image_size
209
+ self.attention_dropout = attention_dropout
210
+ self.layer_norm_eps = layer_norm_eps
211
+ self.hidden_act = hidden_act
212
+
213
+ @classmethod
214
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
215
+ cls._set_token_in_kwargs(kwargs)
216
+
217
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
218
+
219
+ # get the vision config dict if we are loading from SiglipConfig
220
+ if config_dict.get("model_type") == "siglip":
221
+ config_dict = config_dict["vision_config"]
222
+
223
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
224
+ logger.warning(
225
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
226
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
227
+ )
228
+
229
+ return cls.from_dict(config_dict, **kwargs)
230
+
231
+
232
+ class SiglipConfig(PretrainedConfig):
233
+ r"""
234
+ [`SiglipConfig`] is the configuration class to store the configuration of a [`SiglipModel`]. It is used to
235
+ instantiate a Siglip model according to the specified arguments, defining the text model and vision model configs.
236
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the Siglip
237
+ [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
238
+
239
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
240
+ documentation from [`PretrainedConfig`] for more information.
241
+
242
+ Args:
243
+ text_config (`dict`, *optional*):
244
+ Dictionary of configuration options used to initialize [`SiglipTextConfig`].
245
+ vision_config (`dict`, *optional*):
246
+ Dictionary of configuration options used to initialize [`SiglipVisionConfig`].
247
+ kwargs (*optional*):
248
+ Dictionary of keyword arguments.
249
+
250
+ Example:
251
+
252
+ ```python
253
+ >>> from transformers import SiglipConfig, SiglipModel
254
+
255
+ >>> # Initializing a SiglipConfig with google/siglip-base-patch16-224 style configuration
256
+ >>> configuration = SiglipConfig()
257
+
258
+ >>> # Initializing a SiglipModel (with random weights) from the google/siglip-base-patch16-224 style configuration
259
+ >>> model = SiglipModel(configuration)
260
+
261
+ >>> # Accessing the model configuration
262
+ >>> configuration = model.config
263
+
264
+ >>> # We can also initialize a SiglipConfig from a SiglipTextConfig and a SiglipVisionConfig
265
+ >>> from transformers import SiglipTextConfig, SiglipVisionConfig
266
+
267
+ >>> # Initializing a SiglipText and SiglipVision configuration
268
+ >>> config_text = SiglipTextConfig()
269
+ >>> config_vision = SiglipVisionConfig()
270
+
271
+ >>> config = SiglipConfig.from_text_vision_configs(config_text, config_vision)
272
+ ```"""
273
+
274
+ model_type = "siglip"
275
+
276
+ def __init__(self, text_config=None, vision_config=None, **kwargs):
277
+ super().__init__(**kwargs)
278
+
279
+ if text_config is None:
280
+ text_config = {}
281
+ logger.info("`text_config` is `None`. Initializing the `SiglipTextConfig` with default values.")
282
+
283
+ if vision_config is None:
284
+ vision_config = {}
285
+ logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.")
286
+
287
+ self.text_config = SiglipTextConfig(**text_config)
288
+ self.vision_config = SiglipVisionConfig(**vision_config)
289
+
290
+ self.initializer_factor = 1.0
291
+
292
+ @classmethod
293
+ def from_text_vision_configs(cls, text_config: SiglipTextConfig, vision_config: SiglipVisionConfig, **kwargs):
294
+ r"""
295
+ Instantiate a [`SiglipConfig`] (or a derived class) from siglip text model configuration and siglip vision
296
+ model configuration.
297
+
298
+ Returns:
299
+ [`SiglipConfig`]: An instance of a configuration object
300
+ """
301
+
302
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
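
For reference, the `vision_config` block of `config.json` earlier in this commit overrides most of the `SiglipVisionConfig` defaults above. A sketch that reproduces those values explicitly (assumes this file is on `sys.path`; it has no relative imports):

```python
from configuration_siglip import SiglipVisionConfig

# Values copied from the "vision_config" block of config.json in this commit.
vision_cfg = SiglipVisionConfig(
    hidden_size=1152,
    intermediate_size=4304,
    num_hidden_layers=27,
    num_attention_heads=16,
    image_size=448,
    patch_size=14,
)
# 448 / 14 = 32 patches per side, i.e. 1024 patch tokens per tile before any downsampling.
print((vision_cfg.image_size // vision_cfg.patch_size) ** 2)  # 1024
```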
conversation.py ADDED
@@ -0,0 +1,434 @@
1
+ """
2
+ Conversation prompt templates.
3
+
4
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
+ If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+ """
7
+
8
+ import dataclasses
9
+ from enum import IntEnum, auto
10
+ from typing import Any, Dict, List, Tuple, Union
11
+
12
+
13
+ class SeparatorStyle(IntEnum):
14
+ """Separator styles."""
15
+
16
+ ADD_COLON_SINGLE = auto()
17
+ ADD_COLON_TWO = auto()
18
+ ADD_COLON_SPACE_SINGLE = auto()
19
+ NO_COLON_SINGLE = auto()
20
+ NO_COLON_TWO = auto()
21
+ ADD_NEW_LINE_SINGLE = auto()
22
+ LLAMA2 = auto()
23
+ CHATGLM = auto()
24
+ CHATML = auto()
25
+ CHATINTERN = auto()
26
+ DOLLY = auto()
27
+ RWKV = auto()
28
+ PHOENIX = auto()
29
+ ROBIN = auto()
30
+ FALCON_CHAT = auto()
31
+ CHATGLM3 = auto()
32
+ INTERNVL_ZH = auto()
33
+ MPT = auto()
34
+ LLAMA3 = auto()
35
+
36
+
37
+ @dataclasses.dataclass
38
+ class Conversation:
39
+ """A class that manages prompt templates and keeps all conversation history."""
40
+
41
+ # The name of this template
42
+ name: str
43
+ # The template of the system prompt
44
+ system_template: str = '{system_message}'
45
+ # The system message
46
+ system_message: str = ''
47
+ # The names of two roles
48
+ roles: Tuple[str] = ('USER', 'ASSISTANT')
49
+ # All messages. Each item is (role, message).
50
+ messages: List[List[str]] = ()
51
+ # The number of few shot examples
52
+ offset: int = 0
53
+ # The separator style and configurations
54
+ sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
55
+ sep: str = '\n'
56
+ sep2: str = None
57
+ # Stop criteria (the default one is EOS token)
58
+ stop_str: Union[str, List[str]] = None
59
+ # Stops generation if meeting any token in this list
60
+ stop_token_ids: List[int] = None
61
+
62
+ def get_prompt(self) -> str:
63
+ """Get the prompt for generation."""
64
+ system_prompt = self.system_template.format(system_message=self.system_message)
65
+ if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
66
+ ret = system_prompt + self.sep
67
+ for role, message in self.messages:
68
+ if message:
69
+ ret += role + ': ' + message + self.sep
70
+ else:
71
+ ret += role + ':'
72
+ return ret
73
+ elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
74
+ seps = [self.sep, self.sep2]
75
+ ret = system_prompt + seps[0]
76
+ for i, (role, message) in enumerate(self.messages):
77
+ if message:
78
+ ret += role + ': ' + message + seps[i % 2]
79
+ else:
80
+ ret += role + ':'
81
+ return ret
82
+ elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
83
+ ret = system_prompt + self.sep
84
+ for role, message in self.messages:
85
+ if message:
86
+ ret += role + ': ' + message + self.sep
87
+ else:
88
+ ret += role + ': ' # must end with a space
89
+ return ret
90
+ elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
91
+ ret = '' if system_prompt == '' else system_prompt + self.sep
92
+ for role, message in self.messages:
93
+ if message:
94
+ ret += role + '\n' + message + self.sep
95
+ else:
96
+ ret += role + '\n'
97
+ return ret
98
+ elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
99
+ ret = system_prompt
100
+ for role, message in self.messages:
101
+ if message:
102
+ ret += role + message + self.sep
103
+ else:
104
+ ret += role
105
+ return ret
106
+ elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
107
+ seps = [self.sep, self.sep2]
108
+ ret = system_prompt
109
+ for i, (role, message) in enumerate(self.messages):
110
+ if message:
111
+ ret += role + message + seps[i % 2]
112
+ else:
113
+ ret += role
114
+ return ret
115
+ elif self.sep_style == SeparatorStyle.RWKV:
116
+ ret = system_prompt
117
+ for i, (role, message) in enumerate(self.messages):
118
+ if message:
119
+ ret += (
120
+ role
121
+ + ': '
122
+ + message.replace('\r\n', '\n').replace('\n\n', '\n')
123
+ )
124
+ ret += '\n\n'
125
+ else:
126
+ ret += role + ':'
127
+ return ret
128
+ elif self.sep_style == SeparatorStyle.LLAMA2:
129
+ seps = [self.sep, self.sep2]
130
+ if self.system_message:
131
+ ret = system_prompt
132
+ else:
133
+ ret = '[INST] '
134
+ for i, (role, message) in enumerate(self.messages):
135
+ tag = self.roles[i % 2]
136
+ if message:
137
+ if i == 0:
138
+ ret += message + ' '
139
+ else:
140
+ ret += tag + ' ' + message + seps[i % 2]
141
+ else:
142
+ ret += tag
143
+ return ret
144
+ elif self.sep_style == SeparatorStyle.CHATGLM:
145
+ # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
146
+ # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
147
+ round_add_n = 1 if self.name == 'chatglm2' else 0
148
+ if system_prompt:
149
+ ret = system_prompt + self.sep
150
+ else:
151
+ ret = ''
152
+
153
+ for i, (role, message) in enumerate(self.messages):
154
+ if i % 2 == 0:
155
+ ret += f'[Round {i//2 + round_add_n}]{self.sep}'
156
+
157
+ if message:
158
+ ret += f'{role}:{message}{self.sep}'
159
+ else:
160
+ ret += f'{role}:'
161
+ return ret
162
+ elif self.sep_style == SeparatorStyle.CHATML:
163
+ ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
164
+ for role, message in self.messages:
165
+ if message:
166
+ ret += role + '\n' + message + self.sep + '\n'
167
+ else:
168
+ ret += role + '\n'
169
+ return ret
170
+ elif self.sep_style == SeparatorStyle.CHATGLM3:
171
+ ret = ''
172
+ if self.system_message:
173
+ ret += system_prompt
174
+ for role, message in self.messages:
175
+ if message:
176
+ ret += role + '\n' + ' ' + message
177
+ else:
178
+ ret += role
179
+ return ret
180
+ elif self.sep_style == SeparatorStyle.CHATINTERN:
181
+ # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
182
+ seps = [self.sep, self.sep2]
183
+ ret = system_prompt
184
+ for i, (role, message) in enumerate(self.messages):
185
+ # if i % 2 == 0:
186
+ # ret += "<s>"
187
+ if message:
188
+ ret += role + ':' + message + seps[i % 2] + '\n'
189
+ else:
190
+ ret += role + ':'
191
+ return ret
192
+ elif self.sep_style == SeparatorStyle.DOLLY:
193
+ seps = [self.sep, self.sep2]
194
+ ret = system_prompt
195
+ for i, (role, message) in enumerate(self.messages):
196
+ if message:
197
+ ret += role + ':\n' + message + seps[i % 2]
198
+ if i % 2 == 1:
199
+ ret += '\n\n'
200
+ else:
201
+ ret += role + ':\n'
202
+ return ret
203
+ elif self.sep_style == SeparatorStyle.PHOENIX:
204
+ ret = system_prompt
205
+ for role, message in self.messages:
206
+ if message:
207
+ ret += role + ': ' + '<s>' + message + '</s>'
208
+ else:
209
+ ret += role + ': ' + '<s>'
210
+ return ret
211
+ elif self.sep_style == SeparatorStyle.ROBIN:
212
+ ret = system_prompt + self.sep
213
+ for role, message in self.messages:
214
+ if message:
215
+ ret += role + ':\n' + message + self.sep
216
+ else:
217
+ ret += role + ':\n'
218
+ return ret
219
+ elif self.sep_style == SeparatorStyle.FALCON_CHAT:
220
+ ret = ''
221
+ if self.system_message:
222
+ ret += system_prompt + self.sep
223
+ for role, message in self.messages:
224
+ if message:
225
+ ret += role + ': ' + message + self.sep
226
+ else:
227
+ ret += role + ':'
228
+
229
+ return ret
230
+ elif self.sep_style == SeparatorStyle.INTERNVL_ZH:
231
+ seps = [self.sep, self.sep2]
232
+ ret = self.system_message + seps[0]
233
+ for i, (role, message) in enumerate(self.messages):
234
+ if message:
235
+ ret += role + ': ' + message + seps[i % 2]
236
+ else:
237
+ ret += role + ':'
238
+ return ret
239
+ elif self.sep_style == SeparatorStyle.MPT:
240
+ ret = system_prompt + self.sep
241
+ for role, message in self.messages:
242
+ if message:
243
+ if type(message) is tuple:
244
+ message, _, _ = message
245
+ ret += role + message + self.sep
246
+ else:
247
+ ret += role
248
+ return ret
249
+ elif self.sep_style == SeparatorStyle.LLAMA3:
250
+ ret = system_prompt + self.sep
251
+ for role, message in self.messages:
252
+ if message:
253
+ if type(message) is tuple:
254
+ message, _, _ = message
255
+ ret += role + message + self.sep
256
+ else:
257
+ ret += role
258
+ return ret
259
+ else:
260
+ raise ValueError(f'Invalid style: {self.sep_style}')
261
+
262
+ def set_system_message(self, system_message: str):
263
+ """Set the system message."""
264
+ self.system_message = system_message
265
+
266
+ def append_message(self, role: str, message: str):
267
+ """Append a new message."""
268
+ self.messages.append([role, message])
269
+
270
+ def update_last_message(self, message: str):
271
+ """Update the last output.
272
+
273
+ The last message is typically set to be None when constructing the prompt,
274
+ so we need to update it in-place after getting the response from a model.
275
+ """
276
+ self.messages[-1][1] = message
277
+
278
+ def to_gradio_chatbot(self):
279
+ """Convert the conversation to gradio chatbot format."""
280
+ ret = []
281
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
282
+ if i % 2 == 0:
283
+ ret.append([msg, None])
284
+ else:
285
+ ret[-1][-1] = msg
286
+ return ret
287
+
288
+ def to_openai_api_messages(self):
289
+ """Convert the conversation to OpenAI chat completion format."""
290
+ ret = [{'role': 'system', 'content': self.system_message}]
291
+
292
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
293
+ if i % 2 == 0:
294
+ ret.append({'role': 'user', 'content': msg})
295
+ else:
296
+ if msg is not None:
297
+ ret.append({'role': 'assistant', 'content': msg})
298
+ return ret
299
+
300
+ def copy(self):
301
+ return Conversation(
302
+ name=self.name,
303
+ system_template=self.system_template,
304
+ system_message=self.system_message,
305
+ roles=self.roles,
306
+ messages=[[x, y] for x, y in self.messages],
307
+ offset=self.offset,
308
+ sep_style=self.sep_style,
309
+ sep=self.sep,
310
+ sep2=self.sep2,
311
+ stop_str=self.stop_str,
312
+ stop_token_ids=self.stop_token_ids,
313
+ )
314
+
315
+ def dict(self):
316
+ return {
317
+ 'template_name': self.name,
318
+ 'system_message': self.system_message,
319
+ 'roles': self.roles,
320
+ 'messages': self.messages,
321
+ 'offset': self.offset,
322
+ }
323
+
324
+
325
+ # A global registry for all conversation templates
326
+ conv_templates: Dict[str, Conversation] = {}
327
+
328
+
329
+ def register_conv_template(template: Conversation, override: bool = False):
330
+ """Register a new conversation template."""
331
+ if not override:
332
+ assert (
333
+ template.name not in conv_templates
334
+ ), f'{template.name} has been registered.'
335
+
336
+ conv_templates[template.name] = template
337
+
338
+
339
+ def get_conv_template(name: str) -> Conversation:
340
+ """Get a conversation template."""
341
+ return conv_templates[name].copy()
342
+
343
+
344
+ # Note that for inference, the Hermes-2 and internlm2-chat templates are equivalent.
345
+ register_conv_template(
346
+ Conversation(
347
+ name='Hermes-2',
348
+ system_template='<|im_start|>system\n{system_message}',
349
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
350
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。',
351
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
352
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
353
+ sep_style=SeparatorStyle.MPT,
354
+ sep='<|im_end|>',
355
+ stop_token_ids=[
356
+ 2,
357
+ 6,
358
+ 7,
359
+ 8,
360
+ ],
361
+ stop_str='<|endoftext|>',
362
+ )
363
+ )
364
+
365
+
366
+ register_conv_template(
367
+ Conversation(
368
+ name='internlm2-chat',
369
+ system_template='<|im_start|>system\n{system_message}',
370
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
371
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。',
372
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
373
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
374
+ sep_style=SeparatorStyle.MPT,
375
+ sep='<|im_end|>',
376
+ stop_token_ids=[
377
+ 2,
378
+ 92543,
379
+ 92542
380
+ ]
381
+ )
382
+ )
383
+
384
+
385
+ register_conv_template(
386
+ Conversation(
387
+ name='phi3-chat',
388
+ system_template='<|system|>\n{system_message}',
389
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
390
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。',
391
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
392
+ roles=('<|user|>\n', '<|assistant|>\n'),
393
+ sep_style=SeparatorStyle.MPT,
394
+ sep='<|end|>',
395
+ stop_token_ids=[
396
+ 2,
397
+ 32000,
398
+ 32007
399
+ ]
400
+ )
401
+ )
402
+ register_conv_template(
403
+ Conversation(
404
+ name='llama3-chat',
405
+ system_template='<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_message}',
406
+ system_message='You are an AI assistant whose name is Eagle-Next.',
407
+ roles=('<|start_header_id|>user<|end_header_id|>\n\n', '<|start_header_id|>assistant<|end_header_id|>\n\n'),
408
+ sep_style=SeparatorStyle.LLAMA3,
409
+ sep='<|eot_id|>',
410
+ stop_token_ids=[
411
+ 128259,
412
+ 128001
413
+ ]
414
+ )
415
+ )
416
+
417
+ # Qwen-chat default template
418
+ # source: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/qwen_generation_utils.py#L130
419
+ register_conv_template(
420
+ Conversation(
421
+ name='qwen2-chat',
422
+ system_template='<|im_start|>system\n{system_message}',
423
+ system_message='You are a helpful assistant.',
424
+ roles=('<|im_start|>user', '<|im_start|>assistant'),
425
+ sep_style=SeparatorStyle.CHATML,
426
+ sep='<|im_end|>',
427
+ stop_token_ids=[
428
+ 151643,
429
+ 151644,
430
+ 151645,
431
+ ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
432
+ stop_str='<|endoftext|>',
433
+ )
434
+ )
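
A short usage sketch for the registry above, using the `llama3-chat` template that `config.json` selects via `"template": "llama3-chat"` (assumes `conversation.py` is on `sys.path`; the user message is a placeholder):

```python
from conversation import get_conv_template

conv = get_conv_template("llama3-chat")  # returns a copy, so the global registry stays untouched
conv.append_message(conv.roles[0], "Describe this image.")  # user turn (placeholder text)
conv.append_message(conv.roles[1], None)                    # empty slot for the assistant reply

prompt = conv.get_prompt()
# The prompt starts with the Llama-3 system header and ends with the assistant header,
# ready for generation; stop on conv.sep ("<|eot_id|>") or conv.stop_token_ids.
print(prompt)
```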
convnext.py ADDED
@@ -0,0 +1,572 @@
1
+ """ ConvNeXt
2
+
3
+ Papers:
4
+ * `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf
5
+ @Article{liu2022convnet,
6
+ author = {Zhuang Liu and Hanzi Mao and Chao-Yuan Wu and Christoph Feichtenhofer and Trevor Darrell and Saining Xie},
7
+ title = {A ConvNet for the 2020s},
8
+ journal = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
9
+ year = {2022},
10
+ }
11
+
12
+ * `ConvNeXt-V2 - Co-designing and Scaling ConvNets with Masked Autoencoders` - https://arxiv.org/abs/2301.00808
13
+ @article{Woo2023ConvNeXtV2,
14
+ title={ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders},
15
+ author={Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon and Saining Xie},
16
+ year={2023},
17
+ journal={arXiv preprint arXiv:2301.00808},
18
+ }
19
+
20
+ Original code and weights from:
21
+ * https://github.com/facebookresearch/ConvNeXt, original copyright below
22
+ * https://github.com/facebookresearch/ConvNeXt-V2, original copyright below
23
+
24
+ Model defs atto, femto, pico, nano and _ols / _hnf variants are timm originals.
25
+
26
+ Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman
27
+ """
28
+ # ConvNeXt
29
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
30
+ # All rights reserved.
31
+ # This source code is licensed under the MIT license
32
+
33
+ # ConvNeXt-V2
34
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
35
+ # All rights reserved.
36
+ # This source code is licensed under the license found in the
37
+ # LICENSE file in the root directory of this source tree (Attribution-NonCommercial 4.0 International (CC BY-NC 4.0))
38
+ # No code was used directly from ConvNeXt-V2, however the weights are CC BY-NC 4.0 so beware if using commercially.
39
+
40
+ from collections import OrderedDict
41
+ from functools import partial
42
+ from typing import Callable, Optional, Tuple, Union
43
+
44
+ import torch
45
+ import torch.nn as nn
46
+
47
+ from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
48
+ from timm.layers import trunc_normal_, AvgPool2dSame, DropPath, Mlp, GlobalResponseNormMlp, \
49
+ LayerNorm2d, LayerNorm, create_conv2d, get_act_layer, make_divisible, to_ntuple
50
+ from timm.layers import NormMlpClassifierHead, ClassifierHead
51
+ from timm.models._builder import build_model_with_cfg
52
+ from timm.models._manipulate import named_apply, checkpoint_seq
53
+ from timm.models._registry import generate_default_cfgs, register_model, register_model_deprecations
54
+
55
+ __all__ = ['ConvNeXt'] # model_registry will add each entrypoint fn to this
56
+
57
+
58
+ class Downsample(nn.Module):
59
+
60
+ def __init__(self, in_chs, out_chs, stride=1, dilation=1):
61
+ super().__init__()
62
+ avg_stride = stride if dilation == 1 else 1
63
+ if stride > 1 or dilation > 1:
64
+ avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
65
+ self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
66
+ else:
67
+ self.pool = nn.Identity()
68
+
69
+ if in_chs != out_chs:
70
+ self.conv = create_conv2d(in_chs, out_chs, 1, stride=1)
71
+ else:
72
+ self.conv = nn.Identity()
73
+
74
+ def forward(self, x):
75
+ x = self.pool(x)
76
+ x = self.conv(x)
77
+ return x
78
+
79
+
80
+ class ConvNeXtBlock(nn.Module):
81
+ """ ConvNeXt Block
82
+ There are two equivalent implementations:
83
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
84
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
85
+
86
+ Unlike the official impl, this one allows choice of 1 or 2, 1x1 conv can be faster with appropriate
87
+ choice of LayerNorm impl, however as model size increases the tradeoffs appear to change and nn.Linear
88
+ is a better choice. This was observed with PyTorch 1.10 on 3090 GPU, it could change over time & w/ different HW.
89
+ """
90
+
91
+ def __init__(
92
+ self,
93
+ in_chs: int,
94
+ out_chs: Optional[int] = None,
95
+ kernel_size: int = 7,
96
+ stride: int = 1,
97
+ dilation: Union[int, Tuple[int, int]] = (1, 1),
98
+ mlp_ratio: float = 4,
99
+ conv_mlp: bool = False,
100
+ conv_bias: bool = True,
101
+ use_grn: bool = False,
102
+ ls_init_value: Optional[float] = 1e-6,
103
+ act_layer: Union[str, Callable] = 'gelu',
104
+ norm_layer: Optional[Callable] = None,
105
+ drop_path: float = 0.,
106
+ ):
107
+ """
108
+
109
+ Args:
110
+ in_chs: Block input channels.
111
+ out_chs: Block output channels (same as in_chs if None).
112
+ kernel_size: Depthwise convolution kernel size.
113
+ stride: Stride of depthwise convolution.
114
+ dilation: Tuple specifying input and output dilation of block.
115
+ mlp_ratio: MLP expansion ratio.
116
+ conv_mlp: Use 1x1 convolutions for MLP and a NCHW compatible norm layer if True.
117
+ conv_bias: Apply bias for all convolution (linear) layers.
118
+ use_grn: Use GlobalResponseNorm in MLP (from ConvNeXt-V2)
119
+ ls_init_value: Layer-scale init values, layer-scale applied if not None.
120
+ act_layer: Activation layer.
121
+ norm_layer: Normalization layer (defaults to LN if not specified).
122
+ drop_path: Stochastic depth probability.
123
+ """
124
+ super().__init__()
125
+ out_chs = out_chs or in_chs
126
+ dilation = to_ntuple(2)(dilation)
127
+ act_layer = get_act_layer(act_layer)
128
+ if not norm_layer:
129
+ norm_layer = LayerNorm2d if conv_mlp else LayerNorm
130
+ mlp_layer = partial(GlobalResponseNormMlp if use_grn else Mlp, use_conv=conv_mlp)
131
+ self.use_conv_mlp = conv_mlp
132
+ self.conv_dw = create_conv2d(
133
+ in_chs,
134
+ out_chs,
135
+ kernel_size=kernel_size,
136
+ stride=stride,
137
+ dilation=dilation[0],
138
+ depthwise=True,
139
+ bias=conv_bias,
140
+ )
141
+ self.norm = norm_layer(out_chs)
142
+ self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer)
143
+ self.weight = nn.Parameter(ls_init_value * torch.ones(out_chs)) if ls_init_value is not None else None
144
+ if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
145
+ self.shortcut = Downsample(in_chs, out_chs, stride=stride, dilation=dilation[0])
146
+ else:
147
+ self.shortcut = nn.Identity()
148
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
149
+
150
+ def forward(self, x):
151
+ shortcut = x
152
+ x = self.conv_dw(x)
153
+ if self.use_conv_mlp:
154
+ x = self.norm(x)
155
+ x = self.mlp(x)
156
+ else:
157
+ x = x.permute(0, 2, 3, 1)
158
+ x = self.norm(x)
159
+ x = self.mlp(x)
160
+ x = x.permute(0, 3, 1, 2)
161
+ if self.weight is not None:
162
+ x = x.mul(self.weight.reshape(1, -1, 1, 1))
163
+
164
+ x = self.drop_path(x) + self.shortcut(shortcut)
165
+ return x
166
+
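+ # Illustrative sketch (added note, not in the original file): the two MLP layouts described in
+ # the ConvNeXtBlock docstring produce identically shaped outputs, e.g.
+ #   blk_conv = ConvNeXtBlock(64, conv_mlp=True)    # NCHW norm + 1x1 convs
+ #   blk_lin = ConvNeXtBlock(64, conv_mlp=False)    # channels-last LayerNorm + nn.Linear
+ #   assert blk_conv(torch.randn(1, 64, 56, 56)).shape == blk_lin(torch.randn(1, 64, 56, 56)).shape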
167
+
168
+ class ConvNeXtStage(nn.Module):
169
+
170
+ def __init__(
171
+ self,
172
+ in_chs,
173
+ out_chs,
174
+ kernel_size=7,
175
+ stride=2,
176
+ depth=2,
177
+ dilation=(1, 1),
178
+ drop_path_rates=None,
179
+ ls_init_value=1.0,
180
+ conv_mlp=False,
181
+ conv_bias=True,
182
+ use_grn=False,
183
+ act_layer='gelu',
184
+ norm_layer=None,
185
+ norm_layer_cl=None
186
+ ):
187
+ super().__init__()
188
+ self.grad_checkpointing = False
189
+
190
+ if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]:
191
+ ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1
192
+ pad = 'same' if dilation[1] > 1 else 0 # same padding needed if dilation used
193
+ self.downsample = nn.Sequential(
194
+ norm_layer(in_chs),
195
+ create_conv2d(
196
+ in_chs,
197
+ out_chs,
198
+ kernel_size=ds_ks,
199
+ stride=stride,
200
+ dilation=dilation[0],
201
+ padding=pad,
202
+ bias=conv_bias,
203
+ ),
204
+ )
205
+ in_chs = out_chs
206
+ else:
207
+ self.downsample = nn.Identity()
208
+
209
+ drop_path_rates = drop_path_rates or [0.] * depth
210
+ stage_blocks = []
211
+ for i in range(depth):
212
+ stage_blocks.append(ConvNeXtBlock(
213
+ in_chs=in_chs,
214
+ out_chs=out_chs,
215
+ kernel_size=kernel_size,
216
+ dilation=dilation[1],
217
+ drop_path=drop_path_rates[i],
218
+ ls_init_value=ls_init_value,
219
+ conv_mlp=conv_mlp,
220
+ conv_bias=conv_bias,
221
+ use_grn=use_grn,
222
+ act_layer=act_layer,
223
+ norm_layer=norm_layer if conv_mlp else norm_layer_cl,
224
+ ))
225
+ in_chs = out_chs
226
+ self.blocks = nn.Sequential(*stage_blocks)
227
+
228
+ def forward(self, x):
229
+ x = self.downsample(x)
230
+ if self.grad_checkpointing and not torch.jit.is_scripting():
231
+ x = checkpoint_seq(self.blocks, x)
232
+ else:
233
+ x = self.blocks(x)
234
+ return x
235
+
236
+
237
+ class ConvNeXt(nn.Module):
238
+ r""" ConvNeXt
239
+ A PyTorch impl of : `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf
240
+ """
241
+
242
+ def __init__(
243
+ self,
244
+ in_chans: int = 3,
245
+ num_classes: int = 1000,
246
+ global_pool: str = 'avg',
247
+ output_stride: int = 32,
248
+ depths: Tuple[int, ...] = (3, 3, 9, 3),
249
+ dims: Tuple[int, ...] = (96, 192, 384, 768),
250
+ kernel_sizes: Union[int, Tuple[int, ...]] = 7,
251
+ ls_init_value: Optional[float] = 1e-6,
252
+ stem_type: str = 'patch',
253
+ patch_size: int = 4,
254
+ head_init_scale: float = 1.,
255
+ head_norm_first: bool = False,
256
+ head_hidden_size: Optional[int] = None,
257
+ conv_mlp: bool = False,
258
+ conv_bias: bool = True,
259
+ use_grn: bool = False,
260
+ act_layer: Union[str, Callable] = 'gelu',
261
+ norm_layer: Optional[Union[str, Callable]] = None,
262
+ norm_eps: Optional[float] = None,
263
+ drop_rate: float = 0.,
264
+ drop_path_rate: float = 0.,
265
+ ):
266
+ """
267
+ Args:
268
+ in_chans: Number of input image channels.
269
+ num_classes: Number of classes for classification head.
270
+ global_pool: Global pooling type.
271
+ output_stride: Output stride of network, one of (8, 16, 32).
272
+ depths: Number of blocks at each stage.
273
+ dims: Feature dimension at each stage.
274
+ kernel_sizes: Depthwise convolution kernel-sizes for each stage.
275
+ ls_init_value: Init value for Layer Scale, disabled if None.
276
+ stem_type: Type of stem.
277
+ patch_size: Stem patch size for patch stem.
278
+ head_init_scale: Init scaling value for classifier weights and biases.
279
+ head_norm_first: Apply normalization before global pool + head.
280
+ head_hidden_size: Size of MLP hidden layer in head if not None and head_norm_first == False.
281
+ conv_mlp: Use 1x1 conv in MLP, improves speed for small networks w/ chan last.
282
+ conv_bias: Use bias layers w/ all convolutions.
283
+ use_grn: Use Global Response Norm (ConvNeXt-V2) in MLP.
284
+ act_layer: Activation layer type.
285
+ norm_layer: Normalization layer type.
286
+ drop_rate: Head pre-classifier dropout rate.
287
+ drop_path_rate: Stochastic depth drop rate.
288
+ """
289
+ super().__init__()
290
+ assert output_stride in (8, 16, 32)
291
+ kernel_sizes = to_ntuple(4)(kernel_sizes)
292
+ if norm_layer is None:
293
+ norm_layer = LayerNorm2d
294
+ norm_layer_cl = norm_layer if conv_mlp else LayerNorm
295
+ if norm_eps is not None:
296
+ norm_layer = partial(norm_layer, eps=norm_eps)
297
+ norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
298
+ else:
299
+ assert conv_mlp,\
300
+ 'If a norm_layer is specified, conv MLP must be used so all norm expect rank-4, channels-first input'
301
+ norm_layer_cl = norm_layer
302
+ if norm_eps is not None:
303
+ norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
304
+
305
+ self.num_classes = num_classes
306
+ self.drop_rate = drop_rate
307
+ self.feature_info = []
308
+
309
+ assert stem_type in ('patch', 'overlap', 'overlap_tiered')
310
+ if stem_type == 'patch':
311
+ # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4
312
+ self.stem = nn.Sequential(
313
+ nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias),
314
+ norm_layer(dims[0]),
315
+ )
316
+ stem_stride = patch_size
317
+ else:
318
+ mid_chs = make_divisible(dims[0] // 2) if 'tiered' in stem_type else dims[0]
319
+ self.stem = nn.Sequential(
320
+ nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias),
321
+ nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias),
322
+ norm_layer(dims[0]),
323
+ )
324
+ stem_stride = 4
325
+
326
+ self.stages = nn.Sequential()
327
+ dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
328
+ stages = []
329
+ prev_chs = dims[0]
330
+ curr_stride = stem_stride
331
+ dilation = 1
332
+ # 4 feature resolution stages, each consisting of multiple residual blocks
333
+ for i in range(4):
334
+ stride = 2 if curr_stride == 2 or i > 0 else 1
335
+ if curr_stride >= output_stride and stride > 1:
336
+ dilation *= stride
337
+ stride = 1
338
+ curr_stride *= stride
339
+ first_dilation = 1 if dilation in (1, 2) else 2
340
+ out_chs = dims[i]
341
+ stages.append(ConvNeXtStage(
342
+ prev_chs,
343
+ out_chs,
344
+ kernel_size=kernel_sizes[i],
345
+ stride=stride,
346
+ dilation=(first_dilation, dilation),
347
+ depth=depths[i],
348
+ drop_path_rates=dp_rates[i],
349
+ ls_init_value=ls_init_value,
350
+ conv_mlp=conv_mlp,
351
+ conv_bias=conv_bias,
352
+ use_grn=use_grn,
353
+ act_layer=act_layer,
354
+ norm_layer=norm_layer,
355
+ norm_layer_cl=norm_layer_cl,
356
+ ))
357
+ prev_chs = out_chs
358
+ # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
359
+ self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{i}')]
360
+ self.stages = nn.Sequential(*stages)
361
+ self.num_features = prev_chs
362
+
363
+ # if head_norm_first == true, norm -> global pool -> fc ordering, like most other nets
364
+ # otherwise pool -> norm -> fc, the default ConvNeXt ordering (pretrained FB weights)
365
+ if head_norm_first:
366
+ assert not head_hidden_size
367
+ self.norm_pre = norm_layer(self.num_features)
368
+ self.head = ClassifierHead(
369
+ self.num_features,
370
+ num_classes,
371
+ pool_type=global_pool,
372
+ drop_rate=self.drop_rate,
373
+ )
374
+ else:
375
+ self.norm_pre = nn.Identity()
376
+ self.head = NormMlpClassifierHead(
377
+ self.num_features,
378
+ num_classes,
379
+ hidden_size=head_hidden_size,
380
+ pool_type=global_pool,
381
+ drop_rate=self.drop_rate,
382
+ norm_layer=norm_layer,
383
+ act_layer='gelu',
384
+ )
385
+ named_apply(partial(_init_weights, head_init_scale=head_init_scale), self)
386
+
387
+ @torch.jit.ignore
388
+ def group_matcher(self, coarse=False):
389
+ return dict(
390
+ stem=r'^stem',
391
+ blocks=r'^stages\.(\d+)' if coarse else [
392
+ (r'^stages\.(\d+)\.downsample', (0,)), # blocks
393
+ (r'^stages\.(\d+)\.blocks\.(\d+)', None),
394
+ (r'^norm_pre', (99999,))
395
+ ]
396
+ )
397
+
398
+ @torch.jit.ignore
399
+ def set_grad_checkpointing(self, enable=True):
400
+ for s in self.stages:
401
+ s.grad_checkpointing = enable
402
+
403
+ @torch.jit.ignore
404
+ def get_classifier(self):
405
+ return self.head.fc
406
+
407
+ def reset_classifier(self, num_classes=0, global_pool=None):
408
+ self.head.reset(num_classes, global_pool)
409
+
410
+ def forward_features(self, x):
411
+ x = self.stem(x)
412
+ x = self.stages(x)
413
+ x = self.norm_pre(x)
414
+ return x
415
+
416
+ def forward_head(self, x, pre_logits: bool = False):
417
+ return self.head(x, pre_logits=True) if pre_logits else self.head(x)
418
+
419
+ def forward(self, x):
420
+ x = self.forward_features(x)
421
+ x = self.forward_head(x)
422
+ return x
423
+
424
+
425
+ def _init_weights(module, name=None, head_init_scale=1.0):
426
+ if isinstance(module, nn.Conv2d):
427
+ trunc_normal_(module.weight, std=.02)
428
+ if module.bias is not None:
429
+ nn.init.zeros_(module.bias)
430
+ elif isinstance(module, nn.Linear):
431
+ trunc_normal_(module.weight, std=.02)
432
+ nn.init.zeros_(module.bias)
433
+ if name and 'head.' in name:
434
+ module.weight.data.mul_(head_init_scale)
435
+ module.bias.data.mul_(head_init_scale)
436
+
437
+
438
+ def checkpoint_filter_fn(state_dict, model):
439
+ """ Remap FB checkpoints -> timm """
440
+ if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict:
441
+ out_dict = {k.replace('gamma', 'weight'): v for k, v in state_dict.items()}
443
+ return out_dict # non-FB checkpoint
444
+ if 'model' in state_dict:
445
+ state_dict = state_dict['model']
446
+
447
+ out_dict = {}
448
+ if 'visual.trunk.stem.0.weight' in state_dict:
449
+ out_dict = {k.replace('visual.trunk.', '').replace('gamma', 'weight'): v for k, v in state_dict.items() if
450
+ k.startswith('visual.trunk.')}
451
+
452
+ if 'visual.head.proj.weight' in state_dict:
453
+ out_dict['head.fc.weight'] = state_dict['visual.head.proj.weight']
454
+ out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.proj.weight'].shape[0])
455
+ elif 'visual.head.mlp.fc1.weight' in state_dict:
456
+ out_dict['head.pre_logits.fc.weight'] = state_dict['visual.head.mlp.fc1.weight']
457
+ out_dict['head.pre_logits.fc.bias'] = state_dict['visual.head.mlp.fc1.bias']
458
+ out_dict['head.fc.weight'] = state_dict['visual.head.mlp.fc2.weight']
459
+ out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.mlp.fc2.weight'].shape[0])
460
+ return out_dict
461
+
462
+ import re
463
+ for k, v in state_dict.items():
464
+ k = k.replace('downsample_layers.0.', 'stem.')
465
+ k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
466
+ k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
467
+ k = k.replace('dwconv', 'conv_dw')
468
+ k = k.replace('pwconv', 'mlp.fc')
469
+ if 'grn' in k:
470
+ k = k.replace('grn.beta', 'mlp.grn.bias')
471
+ k = k.replace('grn.gamma', 'mlp.grn.weight')
472
+ v = v.reshape(v.shape[-1])
473
+ k = k.replace('head.', 'head.fc.')
474
+ if k.startswith('norm.'):
475
+ k = k.replace('norm', 'head.norm')
476
+ if v.ndim == 2 and 'head' not in k:
477
+ model_shape = model.state_dict()[k].shape
478
+ v = v.reshape(model_shape)
479
+ k = k.replace('gamma', 'weight')
480
+ out_dict[k] = v
481
+
482
+ return out_dict
483
+
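+ # Example of the key remapping performed by checkpoint_filter_fn above (added note): an FB-style
+ # key 'downsample_layers.0.0.weight' becomes 'stem.0.weight', and 'stages.0.1.dwconv.weight'
+ # becomes 'stages.0.blocks.1.conv_dw.weight'.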
484
+
485
+ def _create_convnext(variant, pretrained=False, **kwargs):
486
+ if kwargs.get('pretrained_cfg', '') == 'fcmae':
487
+ # NOTE fcmae pretrained weights have no classifier or final norm-layer (`head.norm`)
488
+ # This is workaround loading with num_classes=0 w/o removing norm-layer.
489
+ kwargs.setdefault('pretrained_strict', False)
490
+
491
+ model = build_model_with_cfg(
492
+ ConvNeXt, variant, pretrained,
493
+ pretrained_filter_fn=checkpoint_filter_fn,
494
+ feature_cfg=dict(out_indices=(0, 1, 2, 3), flatten_sequential=True),
495
+ **kwargs)
496
+ return model
497
+
498
+
499
+ def _cfg(url='', **kwargs):
500
+ return {
501
+ 'url': url,
502
+ 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
503
+ 'crop_pct': 0.875, 'interpolation': 'bicubic',
504
+ 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
505
+ 'first_conv': 'stem.0', 'classifier': 'head.fc',
506
+ **kwargs
507
+ }
508
+
509
+
510
+ def _cfgv2(url='', **kwargs):
511
+ return {
512
+ 'url': url,
513
+ 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
514
+ 'crop_pct': 0.875, 'interpolation': 'bicubic',
515
+ 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
516
+ 'first_conv': 'stem.0', 'classifier': 'head.fc',
517
+ 'license': 'cc-by-nc-4.0', 'paper_ids': 'arXiv:2301.00808',
518
+ 'paper_name': 'ConvNeXt-V2: Co-designing and Scaling ConvNets with Masked Autoencoders',
519
+ 'origin_url': 'https://github.com/facebookresearch/ConvNeXt-V2',
520
+ **kwargs
521
+ }
522
+
523
+
524
+ default_cfgs = generate_default_cfgs({
525
+ 'convnext_xxlarge.clip_laion2b_soup_ft_in1k': _cfg(
526
+ hf_hub_id='timm/',
527
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
528
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
529
+
530
+ 'convnext_xxlarge.clip_laion2b_soup_ft_in12k': _cfg(
531
+ hf_hub_id='timm/',
532
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821,
533
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
534
+ 'convnext_xxlarge.clip_laion2b_soup': _cfg(
535
+ hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup',
536
+ hf_hub_filename='open_clip_pytorch_model.bin',
537
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
538
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
539
+ 'convnext_xxlarge.clip_laion2b_rewind': _cfg(
540
+ hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind',
541
+ hf_hub_filename='open_clip_pytorch_model.bin',
542
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
543
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
544
+ })
545
+
546
+
547
+
548
+ @register_model
549
+ def convnext_xxlarge(pretrained=False, **kwargs) -> ConvNeXt:
550
+ model_args = dict(depths=[3, 4, 30, 3], dims=[384, 768, 1536, 3072], norm_eps=kwargs.pop('norm_eps', 1e-5))
551
+ model = _create_convnext('convnext_xxlarge', pretrained=pretrained, **dict(model_args, **kwargs))
552
+ return model
553
+
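+ # Usage sketch (added note, not in the original file), relying only on the registry code above:
+ #   model = convnext_xxlarge(pretrained=False)
+ #   feats = model.forward_features(torch.randn(1, 3, 256, 256))   # (1, 3072, 8, 8)
+ #   logits = model(torch.randn(1, 3, 256, 256))                   # (1, 1000)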
554
+
555
+
556
+ # register_model_deprecations(__name__, {
557
+ # 'convnext_tiny_in22ft1k': 'convnext_tiny.fb_in22k_ft_in1k',
558
+ # 'convnext_small_in22ft1k': 'convnext_small.fb_in22k_ft_in1k',
559
+ # 'convnext_base_in22ft1k': 'convnext_base.fb_in22k_ft_in1k',
560
+ # 'convnext_large_in22ft1k': 'convnext_large.fb_in22k_ft_in1k',
561
+ # 'convnext_xlarge_in22ft1k': 'convnext_xlarge.fb_in22k_ft_in1k',
562
+ # 'convnext_tiny_384_in22ft1k': 'convnext_tiny.fb_in22k_ft_in1k_384',
563
+ # 'convnext_small_384_in22ft1k': 'convnext_small.fb_in22k_ft_in1k_384',
564
+ # 'convnext_base_384_in22ft1k': 'convnext_base.fb_in22k_ft_in1k_384',
565
+ # 'convnext_large_384_in22ft1k': 'convnext_large.fb_in22k_ft_in1k_384',
566
+ # 'convnext_xlarge_384_in22ft1k': 'convnext_xlarge.fb_in22k_ft_in1k_384',
567
+ # 'convnext_tiny_in22k': 'convnext_tiny.fb_in22k',
568
+ # 'convnext_small_in22k': 'convnext_small.fb_in22k',
569
+ # 'convnext_base_in22k': 'convnext_base.fb_in22k',
570
+ # 'convnext_large_in22k': 'convnext_large.fb_in22k',
571
+ # 'convnext_xlarge_in22k': 'convnext_xlarge.fb_in22k',
572
+ # })
convnext_encoder.py ADDED
@@ -0,0 +1,301 @@
1
+ import torch, os
2
+ import torch.nn as nn
3
+ from timm import create_model
4
+ from transformers import CLIPImageProcessor
5
+ from .convnext import convnext_xxlarge
6
+ from torch.utils.checkpoint import checkpoint
7
+ import torch
8
+ from torchvision import transforms as T
9
+ from PIL import Image
10
+
11
+
12
+
13
+ cfg = {
14
+ "crop_size": 256,
15
+ "do_center_crop": True,
16
+ "do_normalize": True,
17
+ "do_resize": True,
18
+ "feature_extractor_type": "CLIPFeatureExtractor",
19
+ "image_mean": [
20
+ 0.48145466,
21
+ 0.4578275,
22
+ 0.40821073
23
+ ],
24
+ "image_std": [
25
+ 0.26862954,
26
+ 0.26130258,
27
+ 0.27577711
28
+ ],
29
+ "resample": 3,
30
+ "size": 256
31
+ }
32
+
33
+
34
+
35
+ MEAN_SLIP = [0.5, 0.5, 0.5]
36
+ STD_SLIP = [0.5, 0.5, 0.5]
37
+
38
+ MEAN_CLIP = [0.48145466, 0.4578275, 0.40821073]
39
+ STD_CLIP = [0.26862954, 0.26130258, 0.27577711]
40
+
41
+
42
+ a = [s_slip / s_clip for s_slip, s_clip in zip(STD_SLIP, STD_CLIP)]
43
+ b = [(m_slip - m_clip) / s_clip for m_slip, m_clip, s_clip in zip(MEAN_SLIP, MEAN_CLIP, STD_CLIP)]
44
+
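+ # Added note on the coefficients above: if x_norm = (x - MEAN_SLIP) / STD_SLIP is a
+ # SigLIP-normalized tensor, re-expressing it under CLIP statistics gives
+ # (x - MEAN_CLIP) / STD_CLIP = x_norm * (STD_SLIP / STD_CLIP) + (MEAN_SLIP - MEAN_CLIP) / STD_CLIP,
+ # i.e. exactly x_norm * a + b as applied by SlipToClipTransform below.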
45
+
46
+ class SlipToClipTransform:
47
+ def __init__(self, a, b):
48
+ self.a = torch.tensor(a).view(-1, 1, 1)
49
+ self.b = torch.tensor(b).view(-1, 1, 1)
50
+
51
+ def __call__(self, x_slip):
52
+ return x_slip * self.a.to(x_slip.device) + self.b.to(x_slip.device)
53
+ slip_to_clip = SlipToClipTransform(a, b)
54
+
55
+ class ConvNextVisionTower(nn.Module):
56
+ def __init__(self, vision_tower, args, delay_load=False, normalize_type=None):
57
+ super().__init__()
58
+
59
+ self.is_loaded = False
60
+ self.freeze_vision=args.freeze_vision
61
+ self.input_image_size=args.input_image_size
62
+ self.vision_tower_name = vision_tower
63
+ self.name = 'convnext'
64
+ self.select_layer = args.mm_vision_select_layer
65
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
66
+ self.pre_norm = normalize_type
67
+
68
+ print('pre_norm: ', self.pre_norm)
69
+ self.delay_load = delay_load
70
+ self.load_model()
71
+
72
+ def load_model(self):
73
+ if 'xxlarge' in self.vision_tower_name:
74
+ if self.delay_load:
75
+ self.vision_tower = convnext_xxlarge(pretrained=False)
76
+ else:
77
+ self.vision_tower = convnext_xxlarge(self.vision_tower_name)
78
+ setattr(self.vision_tower, 'hidden_size', 3072)
79
+ elif os.path.exists(self.vision_tower_name):
80
+ self.vision_tower = torch.load(self.vision_tower_name)
81
+ else:
82
+ raise NotImplementedError(f'Unsupported vision tower: {self.vision_tower_name}')
83
+
84
+
85
+ self.vision_tower = self.vision_tower.to(torch.bfloat16)
86
+
87
+ if self.freeze_vision:
88
+ self.vision_tower.requires_grad_(False)
89
+
90
+ # if self.vision_tower.grad_checkpointing:
91
+ for s in self.vision_tower.stages:
92
+ s.grad_checkpointing = True
93
+
94
+ self.is_loaded = True
95
+
96
+ def feature_select(self, image_forward_outs):
97
+
98
+ if self.select_layer>100:
99
+ image_features = image_forward_outs[-4:]
100
+ else:
101
+ image_features = image_forward_outs[-1]
102
+ return image_features
103
+
104
+ def forward_features(self, x):
105
+ x = self.vision_tower.stem(x)
106
+ image_forward_out=[]
107
+ for blk in self.vision_tower.stages:
108
+ x = blk(x)
109
+ b, c, h, w = x.shape
110
+ image_forward_out.append(x.view(b, c, -1).transpose(1, 2))
111
+ return image_forward_out
112
+
113
+ def forward(self, images):
114
+ if self.freeze_vision:
115
+ with torch.no_grad():
116
+ image_features = self._forward_images(images)
117
+ else:
118
+ image_features = self._forward_images(images)
119
+
120
+ return image_features
121
+
122
+ def _forward_images(self, images):
123
+
124
+ if type(images) is list:
125
+ image_features = []
126
+ for image in images:
127
+ if self.pre_norm == 'siglip':
128
+ dtype = image.dtype
129
+ image = slip_to_clip(image.to(torch.float32)).to(dtype)
130
+ image_forward_out = self.forward_features(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
131
+ image_feature = self.feature_select(image_forward_out)
132
+ image_features.append(image_feature)
133
+ else:
134
+ if self.pre_norm == 'siglip':
135
+ dtype = images.dtype
136
+ images = slip_to_clip(images.to(torch.float32)).to(dtype)
137
+ image_forward_outs = self.forward_features(images.to(device=self.device, dtype=self.dtype))
138
+ image_features = self.feature_select(image_forward_outs)
139
+
140
+ return image_features
141
+
142
+ @property
143
+ def dummy_feature(self):
144
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
145
+
146
+ @property
147
+ def dtype(self):
148
+ return next(self.vision_tower.parameters()).dtype
149
+
150
+ @property
151
+ def device(self):
152
+ return next(self.vision_tower.parameters()).device
153
+
154
+ @property
155
+ def config(self):
156
+ raise NotImplementedError
158
+
159
+ @property
160
+ def num_attention_heads(self):
161
+ # as constant
162
+ return 16
163
+ @property
164
+ def num_layers(self):
165
+ # as constant
166
+ return 4
167
+ @property
168
+ def hidden_size(self):
169
+ return self.vision_tower.hidden_size
170
+
171
+ @property
172
+ def num_patches(self):
173
+ return (self.input_image_size // self.patch_embed.patch_size[0]) ** 2
174
+
175
+
176
+ class ConvNextFPNVisionTower(nn.Module):
177
+ def __init__(self,
178
+ vision_tower,
179
+ args,
180
+ fpn_target_level=1,
181
+ fpn_layer_idx=[1,2,3],
182
+ fpn_input_dim=[768,1536,3072],
183
+ delay_load=False):
184
+
185
+ super().__init__()
186
+
187
+ self.is_loaded = False
188
+ self.vision_tower_name = vision_tower.replace('-fpn', 'fpn')
189
+ self.freeze_vision = getattr(args, "frozen_backbone", True)
190
+ # self.input_image_size = getattr(args, "vision_tower_input_size", 1024)
191
+ self.input_image_size = 1024 # hardcode
192
+ self.select_layer = args.mm_vision_select_layer # no effect
193
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
194
+
195
+ self.need_fpn = True
196
+ self.fpn_layer_idx = fpn_layer_idx # [1, 2, 3] # x8, x16, x32
197
+ self.fpn_input_dim = [768, 1536, 3072]
198
+ self.delay_load = delay_load
199
+ self.load_model()
200
+
201
+ def load_model(self):
202
+ if self.is_loaded:
203
+ return
204
+
205
+ self.image_processor = CLIPImageProcessor(**cfg)
206
+ if 'xxlarge' in self.vision_tower_name:
207
+ self.vision_tower = convnext_xxlarge(self.vision_tower_name)
208
+ setattr(self.vision_tower, 'hidden_size', self.fpn_input_dim)
209
+ # setattr(self.vision_tower, 'hidden_size', 3072)
210
+ else:
211
+ self.vision_tower = convnext_large_mlp(self.vision_tower_name)  # NOTE: convnext_large_mlp is not imported in this file
212
+ setattr(self.vision_tower, 'hidden_size', 1536)
213
+ if self.freeze_vision:
214
+ self.vision_tower.requires_grad_(False)
215
+
216
+ # if self.vision_tower.grad_checkpointing:
217
+ for s in self.vision_tower.stages:
218
+ s.grad_checkpointing = True
219
+
220
+ if self.input_image_size is not None:
221
+ self.image_processor.size=self.input_image_size
222
+ self.image_processor.crop_size={
223
+ 'height':self.input_image_size,
224
+ 'width': self.input_image_size
225
+ }
226
+
227
+ self.is_loaded = True
228
+
229
+ @torch.no_grad()
230
+ def forward_features(self, x):
231
+ x = self.vision_tower.stem(x)
232
+ image_forward_out=[]
233
+ for blk in self.vision_tower.stages:
234
+ x = blk(x)
235
+ image_forward_out.append(x)
236
+ return image_forward_out
237
+
238
+ @torch.no_grad()
239
+ def forward(self, images):
240
+ if type(images) is list:
241
+ image_features = []
242
+ for image in images:
243
+ image_feature = self.forward_features(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
244
+ image_features.append(image_feature)
245
+ else:
246
+ image_features = self.forward_features(images.to(device=self.device, dtype=self.dtype))
247
+ image_features = [image_features[idx] for idx in self.fpn_layer_idx]
248
+
249
+ return image_features
250
+
251
+ @property
252
+ def dummy_feature(self):
253
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
254
+
255
+ @property
256
+ def dtype(self):
257
+ return next(self.vision_tower.parameters()).dtype
258
+
259
+ @property
260
+ def device(self):
261
+ return next(self.vision_tower.parameters()).device
262
+
263
+ @property
264
+ def config(self):
265
+ raise NotImplementedError
267
+
268
+ @property
269
+ def num_attention_heads(self):
270
+ # as constant
271
+ return 16
272
+ @property
273
+ def num_layers(self):
274
+ # as constant
275
+ return 4
276
+ @property
277
+ def hidden_size(self):
278
+ return self.vision_tower.hidden_size
279
+
280
+ @property
281
+ def num_patches(self):
282
+ return (cfg['image_size'] // self.patch_embed.patch_size[0]) ** 2
283
+
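+ # Added note: for a 1024x1024 input, ConvNextFPNVisionTower.forward() returns the stage outputs
+ # at fpn_layer_idx=[1, 2, 3], i.e. x8/x16/x32 feature maps of shape (B, 768, 128, 128),
+ # (B, 1536, 64, 64) and (B, 3072, 32, 32).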
284
+ if __name__ == '__main__':
285
+ # NOTE: normalize_clip / normalize_siglip / denormalize_siglip were undefined in the
+ # original snippet; they are reconstructed here from the MEAN_/STD_ constants above.
+ normalize_clip = T.Normalize(mean=MEAN_CLIP, std=STD_CLIP)
+ normalize_siglip = T.Normalize(mean=MEAN_SLIP, std=STD_SLIP)
+ denormalize_siglip = T.Normalize(
+ mean=[-m / s for m, s in zip(MEAN_SLIP, STD_SLIP)],
+ std=[1 / s for s in STD_SLIP])
+ COMBINED_STD = [s_slip / s_clip for s_slip, s_clip in zip(STD_SLIP, STD_CLIP)]
+ COMBINED_MEAN = [(m_slip - m_clip) / s_clip for m_slip, m_clip, s_clip in zip(MEAN_SLIP, MEAN_CLIP, STD_CLIP)]
+
+ # define the combined normalization transform
+ combined_normalize = T.Normalize(mean=COMBINED_MEAN, std=COMBINED_STD)
+ x = torch.randn(1, 3, 256, 256).cuda()
+ a = normalize_clip(x).to(torch.bfloat16)
+ b = normalize_siglip(x).to(torch.bfloat16)
+ c = denormalize_siglip(b.to(torch.float32))
+ c2 = normalize_clip(c).to(torch.bfloat16)
295
+ c3 = combined_normalize(b)
296
+ print((c-x).abs().max())
297
+ print((c2-a).abs().max())
298
+ print((c3-a).abs().max())
299
+ from IPython import embed
300
+ embed()
301
+ exit()
demo.py ADDED
@@ -0,0 +1,421 @@
1
+
2
+ """
3
+ A model worker executes the model.
4
+ """
5
+ from transformers import AutoModel, AutoTokenizer, TextIteratorStreamer, AutoConfig
6
+ import argparse
7
+ import base64
8
+ import json
9
+ import os
10
+ import decord
11
+ import threading
12
+ import time
13
+ from io import BytesIO
14
+ from threading import Thread
15
+ import math
16
+ import requests
17
+ import torch
18
+ import torchvision.transforms as T
19
+ from PIL import Image
20
+ from torchvision.transforms.functional import InterpolationMode
21
+
22
+ import numpy as np
23
+
24
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
25
+ IMAGENET_STD = (0.229, 0.224, 0.225)
26
+
27
+ SIGLIP_MEAN = (0.5, 0.5, 0.5)
28
+ SIGLIP_STD = (0.5, 0.5, 0.5)
29
+
30
+
31
+ def get_seq_frames(total_num_frames, desired_num_frames=-1, stride=-1):
32
+ """
33
+ Calculate the indices of frames to extract from a video.
34
+
35
+ Parameters:
36
+ total_num_frames (int): Total number of frames in the video.
37
+ desired_num_frames (int): Desired number of frames to extract.
38
+
39
+ Returns:
40
+ list: List of indices of frames to extract.
41
+ """
42
+
43
+ assert (desired_num_frames > 0 or stride > 0) and not (desired_num_frames > 0 and stride > 0)
44
+
45
+ if stride > 0:
46
+ return list(range(0, total_num_frames, stride))
47
+
48
+ # Calculate the size of each segment from which a frame will be extracted
49
+ seg_size = float(total_num_frames - 1) / desired_num_frames
50
+
51
+ seq = []
52
+ for i in range(desired_num_frames):
53
+ # Calculate the start and end indices of each segment
54
+ start = int(np.round(seg_size * i))
55
+ end = int(np.round(seg_size * (i + 1)))
56
+
57
+ # Append the middle index of the segment to the list
58
+ seq.append((start + end) // 2)
59
+
60
+ return seq
61
+
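+ # Example (added note): get_seq_frames(total_num_frames=100, stride=30) -> [0, 30, 60, 90];
+ # with desired_num_frames=k instead, the middle frame of each of k equal segments is returned.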
62
+ def build_video_prompt(meta_list, num_frames, time_position=False):
63
+ # if time_position is True, the frame_timestamp is used.
64
+ # 1. pass time_position, 2. use env TIME_POSITION
65
+ time_position = os.environ.get("TIME_POSITION", time_position)
66
+ prefix = f"This is a video:\n"
67
+ for i in range(num_frames):
68
+ if time_position:
69
+ frame_txt = f"Frame {i+1} sampled at {meta_list[i]:.2f} seconds: <image>\n"
70
+ else:
71
+ frame_txt = f"Frame {i+1}: <image>\n"
72
+ prefix += frame_txt
73
+ return prefix
74
+
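+ # Example (added note, with the TIME_POSITION env var unset): build_video_prompt([0.0, 2.0], 2, time_position=True)
+ # returns "This is a video:\nFrame 1 sampled at 0.00 seconds: <image>\nFrame 2 sampled at 2.00 seconds: <image>\n".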
75
+ def load_video(video_path, num_frames=64, frame_cache_root=None):
76
+ if isinstance(video_path, str):
77
+ video = decord.VideoReader(video_path)
78
+ elif isinstance(video_path, dict):
79
+ assert False, 'we do not support video given as a dict ("video_path") as input'
80
+ fps = video.get_avg_fps()
81
+ sampled_frames = get_seq_frames(len(video), num_frames)
82
+ sampled_timestamps = [i / fps for i in sampled_frames]
83
+ frames = video.get_batch(sampled_frames).asnumpy()
84
+ images = [Image.fromarray(frame) for frame in frames]
85
+
86
+ return images, build_video_prompt(sampled_timestamps, len(images), time_position=True)
87
+
88
+ def load_image(image):
89
+ if isinstance(image, str) and os.path.exists(image):
90
+ return Image.open(image)
91
+ elif isinstance(image, dict):
92
+ if 'disk_path' in image:
93
+ return Image.open(image['disk_path'])
94
+ elif 'base64' in image:
95
+ return Image.open(BytesIO(base64.b64decode(image['base64'])))
96
+ elif 'url' in image:
97
+ response = requests.get(image['url'])
98
+ return Image.open(BytesIO(response.content))
99
+ elif 'bytes' in image:
100
+ return Image.open(BytesIO(image['bytes']))
101
+ else:
102
+ raise ValueError(f'Invalid image: {image}')
103
+ else:
104
+ raise ValueError(f'Invalid image: {image}')
105
+
106
+ def build_transform(input_size, norm_type='imagenet'):
107
+ if norm_type == 'imagenet':
108
+ MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
109
+ elif norm_type == 'siglip':
110
+ MEAN, STD = SIGLIP_MEAN, SIGLIP_STD
111
+
112
+ transform = T.Compose([
113
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
114
+ T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
115
+ T.ToTensor(),
116
+ T.Normalize(mean=MEAN, std=STD)
117
+ ])
118
+ return transform
119
+
120
+
121
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
122
+ """
123
+ The previous version mainly focused on the aspect ratio.
124
+ We also consider area ratio here.
125
+ """
126
+ best_factor = float('-inf')
127
+ best_ratio = (1, 1)
128
+ area = width * height
129
+ for ratio in target_ratios:
130
+ target_aspect_ratio = ratio[0] / ratio[1]
131
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
132
+ area_ratio = (ratio[0]*ratio[1]*image_size*image_size)/ area
133
+ """
134
+ A new area greater than 60% of the original image area is considered enough.
135
+ """
136
+ factor_based_on_area_n_ratio = min((ratio[0]*ratio[1]*image_size*image_size)/ area, 0.6)* \
137
+ min(target_aspect_ratio/aspect_ratio, aspect_ratio/target_aspect_ratio)
138
+
139
+ if factor_based_on_area_n_ratio > best_factor:
140
+ best_factor = factor_based_on_area_n_ratio
141
+ best_ratio = ratio
142
+
143
+ return best_ratio
144
+
145
+
146
+ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
147
+ orig_width, orig_height = image.size
148
+ aspect_ratio = orig_width / orig_height
149
+
150
+ # calculate the existing image aspect ratio
151
+ target_ratios = set(
152
+ (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
153
+ i * j <= max_num and i * j >= min_num)
154
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
155
+
156
+ # find the closest aspect ratio to the target
157
+ target_aspect_ratio = find_closest_aspect_ratio(
158
+ aspect_ratio, target_ratios, orig_width, orig_height, image_size)
159
+
160
+ # calculate the target width and height
161
+ target_width = image_size * target_aspect_ratio[0]
162
+ target_height = image_size * target_aspect_ratio[1]
163
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
164
+
165
+ # resize the image
166
+ resized_img = image.resize((target_width, target_height))
167
+ processed_images = []
168
+ for i in range(blocks):
169
+ box = (
170
+ (i % (target_width // image_size)) * image_size,
171
+ (i // (target_width // image_size)) * image_size,
172
+ ((i % (target_width // image_size)) + 1) * image_size,
173
+ ((i // (target_width // image_size)) + 1) * image_size
174
+ )
175
+ # split the image
176
+ split_img = resized_img.crop(box)
177
+ processed_images.append(split_img)
178
+ assert len(processed_images) == blocks
179
+ if use_thumbnail and len(processed_images) != 1:
180
+ thumbnail_img = image.resize((image_size, image_size))
181
+ processed_images.append(thumbnail_img)
182
+ return processed_images
183
+
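+ # Example (added note, assuming the default image_size=448): a 896x448 input matches the (2, 1)
+ # target ratio, is resized to 896x448, and is split into two 448x448 tiles; with use_thumbnail=True
+ # a 448x448 thumbnail of the full image is appended as a third tile.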
184
+ def split_model(model_path, device):
185
+
186
+ device_map = {}
187
+ world_size = torch.cuda.device_count()
188
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
189
+ num_layers = config.llm_config.num_hidden_layers
190
+
191
+ num_layers_per_gpu_ = math.floor(num_layers / (world_size - 1))
192
+ num_layers_per_gpu = [num_layers_per_gpu_] * world_size
193
+ num_layers_per_gpu[device] = num_layers - num_layers_per_gpu_ * (world_size-1)
194
+ layer_cnt = 0
195
+ for i, num_layer in enumerate(num_layers_per_gpu):
196
+ for j in range(num_layer):
197
+ device_map[f'language_model.model.layers.{layer_cnt}'] = i
198
+ layer_cnt += 1
199
+ device_map['vision_model'] = device
200
+ device_map['mlp1'] = device
201
+ device_map['language_model.model.tok_embeddings'] = device
202
+ device_map['language_model.model.embed_tokens'] = device
203
+ device_map['language_model.output'] = device
204
+ device_map['language_model.model.norm'] = device
205
+ device_map['language_model.lm_head'] = device
206
+ device_map['language_model.model.rotary_emb'] = device
207
+ device_map[f'language_model.model.layers.{num_layers - 1}'] = device
208
+ return device_map
209
+
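+ # Example (added note): with 4 GPUs, a 32-layer LLM and device=0, floor(32 / 3) = 10 decoder
+ # layers are assigned to each of GPUs 1-3, while GPU 0 keeps the remaining 2 layers together with
+ # the vision tower, embeddings and head/norm; the final decoder layer is additionally pinned to GPU 0.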
210
+ class ModelWorker:
211
+ def __init__(self, model_path, model_name,
212
+ load_8bit, device):
213
+
214
+ if model_path.endswith('/'):
215
+ model_path = model_path[:-1]
216
+ if model_name is None:
217
+ model_paths = model_path.split('/')
218
+ if model_paths[-1].startswith('checkpoint-'):
219
+ self.model_name = model_paths[-2] + '_' + model_paths[-1]
220
+ else:
221
+ self.model_name = model_paths[-1]
222
+ else:
223
+ self.model_name = model_name
224
+
225
+ print(f'Loading the model {self.model_name}')
226
+
227
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
228
+ tokens_to_keep = ['<box>', '</box>', '<ref>', '</ref>']
229
+ tokenizer.additional_special_tokens = [item for item in tokenizer.additional_special_tokens if item not in tokens_to_keep]
230
+ self.tokenizer = tokenizer
231
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
232
+ model_type = config.vision_config.model_type
233
+ self.device = torch.cuda.current_device()
234
+ if model_type == 'siglip_vision_model':
235
+ self.norm_type = 'siglip'
236
+ elif model_type == 'MOB':
237
+ self.norm_type = 'siglip'
238
+ else:
239
+ self.norm_type = 'imagenet'
240
+
241
+ if any(x in model_path.lower() for x in ['34b']):
242
+ device_map = split_model(model_path, self.device)
243
+ else:
244
+ device_map = None
245
+
246
+ if device_map is not None:
247
+ self.model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16,
248
+ low_cpu_mem_usage=True,
249
+ device_map=device_map,
250
+ trust_remote_code=True,
251
+ load_in_8bit=load_8bit).eval()
252
+ else:
253
+ self.model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16,
254
+ trust_remote_code=True,
255
+ load_in_8bit=load_8bit).eval()
256
+ if not load_8bit and device_map is None:
257
+ self.model = self.model.to(device)
258
+ self.load_8bit = load_8bit
259
+
260
+ self.model_path = model_path
261
+ self.image_size = self.model.config.force_image_size
262
+ self.context_len = tokenizer.model_max_length
263
+ self.per_tile_len = 256
264
+
265
+ def reload_model(self):
266
+ del self.model
267
+ torch.cuda.empty_cache()
268
+ if self.device == 'auto':
269
+ os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
270
+ # This can make distributed deployment work properly
271
+ self.model = AutoModel.from_pretrained(
272
+ self.model_path,
273
+ load_in_8bit=self.load_8bit,
274
+ torch_dtype=torch.bfloat16,
275
+ device_map=self.device_map,
276
+ trust_remote_code=True).eval()
277
+ else:
278
+ self.model = AutoModel.from_pretrained(
279
+ self.model_path,
280
+ load_in_8bit=self.load_8bit,
281
+ torch_dtype=torch.bfloat16,
282
+ trust_remote_code=True).eval()
283
+ if not self.load_8bit and not self.device == 'auto':
284
+ self.model = self.model.cuda()
285
+
286
+ @torch.inference_mode()
287
+ def generate(self, params):
288
+ system_message = params['prompt'][0]['content']
289
+ send_messages = params['prompt'][1:]
290
+ max_input_tiles = params['max_input_tiles']
291
+ temperature = params['temperature']
292
+ top_p = params['top_p']
293
+ max_new_tokens = params['max_new_tokens']
294
+ repetition_penalty = params['repetition_penalty']
295
+ video_frame_num = params.get('video_frame_num', 64)
296
+ do_sample = True if temperature > 0.0 else False
297
+
298
+ global_image_cnt = 0
299
+ history, pil_images, max_input_tile_list = [], [], []
300
+ for message in send_messages:
301
+ if message['role'] == 'user':
302
+ prefix = ''
303
+ if 'image' in message:
304
+ for image_data in message['image']:
305
+ pil_images.append(load_image(image_data))
306
+ prefix = prefix + f'<image {global_image_cnt + 1}><image>\n'
307
+ global_image_cnt += 1
308
+ max_input_tile_list.append(max_input_tiles)
309
+ if 'video' in message:
310
+ for video_data in message['video']:
311
+ video_frames, tmp_prefix = load_video(video_data, num_frames=video_frame_num)
312
+ pil_images.extend(video_frames)
313
+ prefix = prefix + tmp_prefix
314
+ global_image_cnt += len(video_frames)
315
+ max_input_tile_list.extend([1] * len(video_frames))
316
+ content = prefix + message['content']
317
+ history.append([content, ])
318
+ else:
319
+ history[-1].append(message['content'])
320
+ question, history = history[-1][0], history[:-1]
321
+
322
+ if global_image_cnt == 1:
323
+ question = question.replace('<image 1><image>\n', '<image>\n')
324
+ history = [[item[0].replace('<image 1><image>\n', '<image>\n'), item[1]] for item in history]
325
+
326
+
327
+ try:
328
+ assert len(max_input_tile_list) == len(pil_images), 'The number of max_input_tile_list and pil_images should be the same.'
329
+ except Exception as e:
330
+ from IPython import embed; embed()
331
+ exit()
332
+ print(f'Error: {e}')
333
+ print(f'max_input_tile_list: {max_input_tile_list}, pil_images: {pil_images}')
334
+ # raise e
335
+
336
+ old_system_message = self.model.system_message
337
+ self.model.system_message = system_message
338
+
339
+ transform = build_transform(input_size=self.image_size, norm_type=self.norm_type)
340
+ if len(pil_images) > 0:
341
+ max_input_tiles_limited_by_context = params['max_input_tiles']
342
+ while True:
343
+ image_tiles = []
344
+ for current_max_input_tiles, pil_image in zip(max_input_tile_list, pil_images):
345
+ if self.model.config.dynamic_image_size:
346
+ tiles = dynamic_preprocess(
347
+ pil_image, image_size=self.image_size, max_num=min(current_max_input_tiles, max_input_tiles_limited_by_context),
348
+ use_thumbnail=self.model.config.use_thumbnail)
349
+ else:
350
+ tiles = [pil_image]
351
+ image_tiles += tiles
352
+ if (len(image_tiles) * self.per_tile_len < self.context_len):
353
+ break
354
+ else:
355
+ max_input_tiles_limited_by_context -= 2
356
+
357
+ if max_input_tiles_limited_by_context < 1:
358
+ break
359
+
360
+ pixel_values = [transform(item) for item in image_tiles]
361
+ pixel_values = torch.stack(pixel_values).to(self.model.device, dtype=torch.bfloat16)
362
+
363
+ else:
364
+ pixel_values = None
365
+
366
+ generation_config = dict(
367
+ num_beams=1,
368
+ max_new_tokens=max_new_tokens,
369
+ do_sample=do_sample,
370
+ temperature=temperature,
371
+ repetition_penalty=repetition_penalty,
372
+ max_length=self.context_len,
373
+ top_p=top_p,
374
+ )
375
+
376
+ response = self.model.chat(
377
+ tokenizer=self.tokenizer,
378
+ pixel_values=pixel_values,
379
+ question=question,
380
+ history=history,
381
+ return_history=False,
382
+ generation_config=generation_config,
383
+ )
384
+ self.model.system_message = old_system_message
385
+ return {'text': response, 'error_code': 0}
386
+
387
+
388
+
389
+
390
+
391
+ if __name__ == '__main__':
392
+ parser = argparse.ArgumentParser()
393
+ parser.add_argument('--model-path', type=str, default='nvidia/Eagle2-1B')
394
+ parser.add_argument('--model-name', type=str, default='Eagle2-1B')
395
+ parser.add_argument('--device', type=str, default='cuda')
396
+ parser.add_argument('--load-8bit', action='store_true')
397
+ args = parser.parse_args()
398
+ print(f'args: {args}')
399
+
400
+ worker = ModelWorker(
401
+ args.model_path,
402
+ args.model_name,
403
+ args.load_8bit,
404
+ args.device)
405
+ prompt = [
406
+ {'role': 'system', 'content': 'You are a helpful assistant.'},
407
+ {'role': 'user', 'content': 'Describe this image in details.',
408
+ 'image':[
409
+ {'url': 'https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/[email protected]'}
410
+ ]
411
+ }
412
+ ]
413
+ params = {
414
+ 'prompt': prompt,
415
+ 'max_input_tiles': 24,
416
+ 'temperature': 0.7,
417
+ 'top_p': 1.0,
418
+ 'max_new_tokens': 4096,
419
+ 'repetition_penalty': 1.0,
420
+ }
421
+ print(worker.generate(params))
done.txt ADDED
@@ -0,0 +1 @@
1
+ done: Mon Feb 10 05:10:50 2025
flash_attention.py ADDED
@@ -0,0 +1,76 @@
1
+ # https://github.com/Dao-AILab/flash-attention/blob/v0.2.8/flash_attn/flash_attention.py
2
+ import torch
3
+ import torch.nn as nn
4
+ from einops import rearrange
5
+
6
+ try: # v1
7
+ from flash_attn.flash_attn_interface import \
8
+ flash_attn_unpadded_qkvpacked_func
9
+ except ImportError: # v2
10
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
11
+
12
+ from flash_attn.bert_padding import pad_input, unpad_input
13
+
14
+
15
+ class FlashAttention(nn.Module):
16
+ """Implement the scaled dot product attention with softmax.
17
+ Arguments
18
+ ---------
19
+ softmax_scale: The temperature to use for the softmax attention.
20
+ (default: 1/sqrt(d_keys) where d_keys is computed at
21
+ runtime)
22
+ attention_dropout: The dropout rate to apply to the attention
23
+ (default: 0.0)
24
+ """
25
+
26
+ def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
27
+ super().__init__()
28
+ self.softmax_scale = softmax_scale
29
+ self.dropout_p = attention_dropout
30
+
31
+ def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
32
+ max_s=None, need_weights=False):
33
+ """Implements the multihead softmax attention.
34
+ Arguments
35
+ ---------
36
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
37
+ if unpadded: (nnz, 3, h, d)
38
+ key_padding_mask: a bool tensor of shape (B, S)
39
+ """
40
+ assert not need_weights
41
+ assert qkv.dtype in [torch.float16, torch.bfloat16]
42
+ assert qkv.is_cuda
43
+
44
+ if cu_seqlens is None:
45
+ batch_size = qkv.shape[0]
46
+ seqlen = qkv.shape[1]
47
+ if key_padding_mask is None:
48
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
49
+ max_s = seqlen
50
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
51
+ device=qkv.device)
52
+ output = flash_attn_unpadded_qkvpacked_func(
53
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
54
+ softmax_scale=self.softmax_scale, causal=causal
55
+ )
56
+ output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
57
+ else:
58
+ nheads = qkv.shape[-2]
59
+ x = rearrange(qkv, 'b s three h d -> b s (three h d)')
60
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
61
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
62
+ output_unpad = flash_attn_unpadded_qkvpacked_func(
63
+ x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
64
+ softmax_scale=self.softmax_scale, causal=causal
65
+ )
66
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
67
+ indices, batch_size, seqlen),
68
+ 'b s (h d) -> b s h d', h=nheads)
69
+ else:
70
+ assert max_s is not None
71
+ output = flash_attn_unpadded_qkvpacked_func(
72
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
73
+ softmax_scale=self.softmax_scale, causal=causal
74
+ )
75
+
76
+ return output, None
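+ # Usage sketch (added note; requires a CUDA device, fp16/bf16 inputs, and flash-attn installed):
+ #   attn = FlashAttention()
+ #   qkv = torch.randn(2, 128, 3, 16, 64, device='cuda', dtype=torch.float16)  # (B, S, 3, H, D)
+ #   out, _ = attn(qkv, causal=True)  # out has shape (2, 128, 16, 64)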
generation_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.37.2"
4
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4981e9966dbd147a165a44c1858040deeec3259f2b2dff6adce37100bc634fe5
3
+ size 3881557408
modeling_eagle_chat.py ADDED
@@ -0,0 +1,457 @@
1
+ # --------------------------------------------------------
2
+ # Eagle2
3
+ # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The Apache License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import warnings
8
+ from typing import Any, List, Optional, Tuple, Union
9
+
10
+ import torch.utils.checkpoint
11
+ import transformers
12
+ from torch import nn
13
+ from torch.nn import CrossEntropyLoss
14
+ from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
15
+ LlamaTokenizer)
16
+ from transformers.modeling_outputs import CausalLMOutputWithPast
17
+ from transformers.modeling_utils import PreTrainedModel
18
+ from transformers.utils import ModelOutput, logging
19
+ from peft import LoraConfig, get_peft_model
20
+ from .configuration_eagle_chat import Eagle2ChatConfig
21
+ from .conversation import get_conv_template
22
+ from .modeling_siglip import SiglipVisionModel
23
+ from .modeling_qwen2 import Qwen2ForCausalLM
24
+ from .flash_attention import *
25
+ from .multi_backbone_channel_concatentation_model import MultiBackboneChannelConcatenationVisionModel
26
+ from .multi_backbone_channel_concatenation_encoder import MultiBackboneChannelConcatenationVisionTower
27
+ from .configuration_multi_backbone_channel_concatentation_model import MultiBackboneChannelConcatenationVisionModelConfig
28
+ from .siglip_vision_tower import SiglipVisionTower
29
+ from .convnext_encoder import ConvNextVisionTower
30
+ from .convnext import ConvNeXt
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+
35
+ def version_cmp(v1, v2, op='eq'):
36
+ import operator
37
+
38
+ from packaging import version
39
+ op_func = getattr(operator, op)
40
+ return op_func(version.parse(v1), version.parse(v2))
41
+
42
+
43
+ class Eagle2ChatModel(PreTrainedModel):
44
+ config_class = Eagle2ChatConfig
45
+ main_input_name = 'pixel_values'
46
+ _no_split_modules = ['LlamaDecoderLayer']
47
+
48
+ def __init__(self, config: Eagle2ChatConfig, vision_model=None, language_model=None):
49
+ super().__init__(config)
50
+
51
+ assert version_cmp(transformers.__version__, '4.37.2', 'ge')
52
+ assert version_cmp(transformers.__version__, '4.39.2', 'le')
53
+ image_size = config.force_image_size or config.vision_config.image_size
54
+ if hasattr(config.vision_config, 'grid_size'):
55
+ grid_size = config.vision_config.grid_size
56
+ self.patch_size = 14
57
+ self.num_image_token = int((grid_size * config.downsample_ratio) ** 2)
58
+ else:
59
+ patch_size = config.vision_config.patch_size
60
+ self.patch_size = patch_size
61
+ self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
62
+
63
+ self.select_layer = config.select_layer
64
+ self.template = config.template
65
+
66
+ self.downsample_ratio = config.downsample_ratio
67
+
68
+ logger.info(f'num_image_token: {self.num_image_token}')
69
+ if vision_model is not None:
70
+ self.vision_model = vision_model
71
+ else:
72
+ if config.vision_config.model_type == 'siglip_vision_model':
73
+ self.vision_model = SiglipVisionModel(config.vision_config)
74
+ elif config.vision_config.model_type.startswith("MOB"):
75
+ self.vision_model = MultiBackboneChannelConcatenationVisionModel(config.vision_config, config)
76
+
77
+ if language_model is not None:
78
+ self.language_model = language_model
79
+ else:
80
+ if config.llm_config.architectures[0] == 'LlamaForCausalLM':
81
+ self.language_model = LlamaForCausalLM(config.llm_config)
82
+ elif config.llm_config.architectures[0] == 'Qwen2ForCausalLM':
83
+ self.language_model = Qwen2ForCausalLM(config.llm_config)
84
+ else:
85
+ raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')
86
+
87
+ vit_hidden_size = config.vision_config.hidden_size
88
+ if vit_hidden_size == 'lazy_calculation':
89
+ # a hack for Mixture of Backbones
90
+ vit_hidden_size = self.vision_model.hidden_size
91
+ print("The lazy calculated hidden_size: {} .. ".format(vit_hidden_size))
92
+ llm_hidden_size = config.llm_config.hidden_size
93
+ self.moe_version_type = getattr(config.vision_config, 'moe_version_type', None)
94
+
95
+ if self.moe_version_type in ['seq_concat', 'feat_concat']:
96
+ raise NotImplementedError
97
+ elif self.moe_version_type == 'convnext_512_siglip_448':
98
+ convnext_hidden_size = vit_hidden_size['convnext']
99
+ siglip_hidden_size = vit_hidden_size['siglip']
100
+ feature_concat_hidden_size = convnext_hidden_size + siglip_hidden_size * int(1 / self.downsample_ratio) ** 2
101
+ self.mlp1 = nn.Sequential(
102
+ nn.LayerNorm(feature_concat_hidden_size),
103
+ nn.Linear(feature_concat_hidden_size, llm_hidden_size),
104
+ nn.GELU(),
105
+ nn.Linear(llm_hidden_size, llm_hidden_size)
106
+ )
107
+ else:
108
+ self.mlp1 = nn.Sequential(
109
+ nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
110
+ nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
111
+ nn.GELU(),
112
+ nn.Linear(llm_hidden_size, llm_hidden_size)
113
+ )
114
+ self.img_context_token_id = None
115
+ self.conv_template = get_conv_template(self.template)
116
+ self.system_message = self.conv_template.system_message
117
+
118
+ if config.use_backbone_lora:
119
+ self.wrap_backbone_lora(r=config.use_backbone_lora, lora_alpha=2 * config.use_backbone_lora)
120
+
121
+ if config.use_llm_lora:
122
+ self.wrap_llm_lora(r=config.use_llm_lora, lora_alpha=2 * config.use_llm_lora)
123
+
124
+ def wrap_backbone_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
125
+ lora_config = LoraConfig(
126
+ r=r,
127
+ target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2'],
128
+ lora_alpha=lora_alpha,
129
+ lora_dropout=lora_dropout,
130
+ )
131
+ self.vision_model = get_peft_model(self.vision_model, lora_config)
132
+ self.vision_model.print_trainable_parameters()
133
+
134
+ def wrap_llm_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
135
+ lora_config = LoraConfig(
136
+ r=r,
137
+ target_modules=['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj', 'self_attn.o_proj',
138
+ 'mlp.gate_proj', 'mlp.down_proj', 'mlp.up_proj'],
139
+ lora_alpha=lora_alpha,
140
+ lora_dropout=lora_dropout,
141
+ task_type='CAUSAL_LM'
142
+ )
143
+ self.language_model = get_peft_model(self.language_model, lora_config)
144
+ self.language_model.enable_input_require_grads()
145
+ self.language_model.print_trainable_parameters()
146
+
147
+
148
+ def forward(
149
+ self,
150
+ pixel_values: torch.FloatTensor,
151
+ input_ids: torch.LongTensor = None,
152
+ attention_mask: Optional[torch.Tensor] = None,
153
+ position_ids: Optional[torch.LongTensor] = None,
154
+ image_flags: Optional[torch.LongTensor] = None,
155
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
156
+ labels: Optional[torch.LongTensor] = None,
157
+ use_cache: Optional[bool] = None,
158
+ output_attentions: Optional[bool] = None,
159
+ output_hidden_states: Optional[bool] = None,
160
+ return_dict: Optional[bool] = None,
161
+ num_patches_list: Optional[List[torch.Tensor]] = None,
162
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
163
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
164
+
165
+ image_flags = image_flags.squeeze(-1)
166
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
167
+
168
+
169
+ if self.moe_version_type in ['seq_concat', 'feat_concat'] and not isinstance(pixel_values, dict):
170
+ raise NotImplementedError
171
+ vit_embeds = self.extract_feature(pixel_values)
172
+
173
+ if not isinstance(image_flags, list):
174
+ image_flags = image_flags.squeeze(-1)
175
+ vit_embeds = vit_embeds[image_flags == 1]
176
+ if isinstance(pixel_values, dict):
177
+ # for MOE
178
+ vit_batch_size = sum(pixel_values['num_patches'])
179
+ else:
180
+ vit_batch_size = pixel_values.shape[0]
181
+
182
+ B, N, C = input_embeds.shape
183
+ input_embeds = input_embeds.reshape(B * N, C)
184
+
185
+ if torch.distributed.get_rank() == 0:
186
+ print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')
187
+
188
+ input_ids = input_ids.reshape(B * N)
189
+ selected = (input_ids == self.img_context_token_id)
190
+ try:
191
+ input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, C)
192
+ except Exception as e:
193
+ vit_embeds = vit_embeds.reshape(-1, C)
194
+ print(f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, '
195
+ f'vit_embeds.shape={vit_embeds.shape}')
196
+ n_token = selected.sum()
197
+ input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token]
198
+
199
+ input_embeds = input_embeds.reshape(B, N, C)
200
+
201
+ outputs = self.language_model(
202
+ inputs_embeds=input_embeds,
203
+ attention_mask=attention_mask,
204
+ position_ids=position_ids,
205
+ past_key_values=past_key_values,
206
+ use_cache=use_cache,
207
+ output_attentions=output_attentions,
208
+ output_hidden_states=output_hidden_states,
209
+ return_dict=return_dict,
210
+ )
211
+ logits = outputs.logits
212
+
213
+ loss = None
214
+ if labels is not None:
215
+ # Shift so that tokens < n predict n
216
+ shift_logits = logits[..., :-1, :].contiguous()
217
+ shift_labels = labels[..., 1:].contiguous()
218
+ # Flatten the tokens
219
+ loss_fct = CrossEntropyLoss()
220
+ shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
221
+ shift_labels = shift_labels.view(-1)
222
+ # Enable model parallelism
223
+ shift_labels = shift_labels.to(shift_logits.device)
224
+ loss = loss_fct(shift_logits, shift_labels)
225
+
226
+ if not return_dict:
227
+ output = (logits,) + outputs[1:]
228
+ return (loss,) + output if loss is not None else output
229
+
230
+ return CausalLMOutputWithPast(
231
+ loss=loss,
232
+ logits=logits,
233
+ past_key_values=outputs.past_key_values,
234
+ hidden_states=outputs.hidden_states,
235
+ attentions=outputs.attentions,
236
+ )
237
+
238
+ def pixel_shuffle(self, x, scale_factor=0.5):
239
+ n, w, h, c = x.size()
240
+ # N, W, H, C --> N, W, H * scale, C // scale
241
+ x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
242
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
243
+ x = x.permute(0, 2, 1, 3).contiguous()
244
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
245
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
246
+ int(c / (scale_factor * scale_factor)))
247
+ x = x.permute(0, 2, 1, 3).contiguous()
248
+ return x
249
+
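As a standalone illustration of the shape arithmetic above (tensor sizes are illustrative): with `scale_factor=0.5`, a [B, 32, 32, 1024] grid of ViT tokens becomes [B, 16, 16, 4096], trading 1024 tokens for 256 tokens with 4x wider channels.

    import torch

    def demo_pixel_shuffle(x, scale_factor=0.5):
        # same steps as the method above: fold each 2x2 spatial block into the channel dim
        n, w, h, c = x.size()
        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(n, int(h * scale_factor), int(w * scale_factor), int(c / (scale_factor ** 2)))
        return x.permute(0, 2, 1, 3).contiguous()

    tokens = torch.randn(2, 32, 32, 1024)   # B=2, 32x32 patch tokens, C=1024
    out = demo_pixel_shuffle(tokens)
    print(out.shape)                        # torch.Size([2, 16, 16, 4096])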
250
+ def extract_feature(self, pixel_values):
251
+
252
+ """
253
+ """
254
+
255
+ if self.select_layer == -1:
256
+ vit_embeds = self.vision_model(
257
+ pixel_values=pixel_values,
258
+ output_hidden_states=False,
259
+ return_dict=True).last_hidden_state # torch.Size([B, 1025, 1024])
260
+
261
+ else:
262
+ vit_embeds = self.vision_model(
263
+ pixel_values=pixel_values,
264
+ output_hidden_states=True,
265
+ return_dict=True).hidden_states[self.select_layer]
266
+ if type(self.vision_model) == SiglipVisionModel:
267
+ pass
268
+ elif type(self.vision_model) == MultiBackboneChannelConcatenationVisionModel:
269
+ pass
270
+ else:
271
+ vit_embeds = vit_embeds[:, 1:, :] # torch.Size([B, 1024, 1024])
272
+
273
+ if self.training and self.neftune_alpha is not None:
274
+ vit_embeds = self.noised_embed(vit_embeds, self.neftune_alpha)
275
+
276
+ if self.moe_version_type in ['feat_concat', 'seq_concat']:
277
+ raise NotImplementedError
278
+ elif self.moe_version_type == 'convnext_512_siglip_448':
279
+ siglip_embeds = vit_embeds['siglip']
280
+ convnext_embeds = vit_embeds['convnext']
281
+ h = w = int(siglip_embeds.shape[1] ** 0.5)
282
+ siglip_embeds = siglip_embeds.reshape(siglip_embeds.shape[0], h, w, -1)
283
+ siglip_embeds = self.pixel_shuffle(siglip_embeds, scale_factor=self.downsample_ratio)
284
+ siglip_embeds = siglip_embeds.reshape(siglip_embeds.shape[0], -1, siglip_embeds.shape[-1])
285
+ vit_embeds = self.mlp1(torch.cat([siglip_embeds, convnext_embeds], dim=-1))
286
+ else:
287
+ h = w = int(vit_embeds.shape[1] ** 0.5)
288
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
289
+
290
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio) # torch.Size([B, 1024, 1024]) -> torch.Size([B, 16, 16, 4096])
291
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1]) # torch.Size([B, 16, 16, 4096]) -> torch.Size([B, 256, 4096])
292
+ vit_embeds = self.mlp1(vit_embeds)#.to(pixel_values.device)
293
+
294
+ return vit_embeds
295
+
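Concretely, the shape comments above give the per-tile token budget: 1024 patch tokens from the vision tower become 1024 * 0.5 * 0.5 = 256 tokens after `pixel_shuffle` with `downsample_ratio=0.5`, and `mlp1` then maps the 4x-widened channels into the LLM hidden size; this 256 is the per-tile count that `chat`/`batch_chat` multiply by the number of tiles when expanding the `<IMG_CONTEXT>` placeholder.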
296
+ def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
297
+ history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
298
+ IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
299
+ if history is not None or return_history:
300
+ print('Now multi-turn chat is not supported in batch_chat.')
301
+ raise NotImplementedError
302
+
303
+ if image_counts is not None:
304
+ num_patches_list = image_counts
305
+ print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
306
+
307
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
308
+ self.img_context_token_id = img_context_token_id
309
+
310
+ if verbose and pixel_values is not None:
311
+ image_bs = pixel_values.shape[0]
312
+ print(f'dynamic ViT batch size: {image_bs}')
313
+
314
+ queries = []
315
+ for idx, num_patches in enumerate(num_patches_list):
316
+ question = questions[idx]
317
+ if pixel_values is not None and '<image>' not in question:
318
+ question = '<image>\n' + question
319
+ template = get_conv_template(self.template)
320
+ template.append_message(template.roles[0], question)
321
+ template.append_message(template.roles[1], None)
322
+ query = template.get_prompt()
323
+
324
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
325
+ query = query.replace('<image>', image_tokens, 1)
326
+ queries.append(query)
327
+
328
+ tokenizer.padding_side = 'left'
329
+ model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
330
+ input_ids = model_inputs['input_ids'].cuda()
331
+ attention_mask = model_inputs['attention_mask'].cuda()
332
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
333
+ generation_config['eos_token_id'] = eos_token_id
334
+ generation_output = self.generate(
335
+ pixel_values=pixel_values,
336
+ input_ids=input_ids,
337
+ attention_mask=attention_mask,
338
+ **generation_config
339
+ )
340
+ responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
341
+ responses = [response.split(template.sep)[0].strip() for response in responses]
342
+ return responses
343
+
344
+ def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
345
+ num_patches_list=None, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
346
+ verbose=False, llm_only=False):
347
+
348
+ if history is None and pixel_values is not None and '<image>' not in question:
349
+ question = '<image>\n' + question
350
+
351
+ if num_patches_list is None:
352
+ num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
353
+ assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
354
+
355
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
356
+ self.img_context_token_id = img_context_token_id
357
+
358
+ template = get_conv_template(self.template)
359
+ template.system_message = self.system_message
360
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
361
+
362
+ history = [] if history is None else history
363
+ for (old_question, old_answer) in history:
364
+ template.append_message(template.roles[0], old_question)
365
+ template.append_message(template.roles[1], old_answer)
366
+ template.append_message(template.roles[0], question)
367
+ template.append_message(template.roles[1], None)
368
+ query = template.get_prompt()
369
+
370
+ if verbose and pixel_values is not None:
371
+ image_bs = pixel_values.shape[0]
372
+ print(f'dynamic ViT batch size: {image_bs}')
373
+
374
+ for num_patches in num_patches_list:
375
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
376
+ if llm_only:
377
+ query = query.replace('<image>', '', 1)
378
+ else:
379
+ query = query.replace('<image>', image_tokens, 1)
380
+
381
+ model_inputs = tokenizer(query, return_tensors='pt')
382
+ input_ids = model_inputs['input_ids'].cuda()
383
+ attention_mask = model_inputs['attention_mask'].cuda()
384
+ generation_config['eos_token_id'] = eos_token_id
385
+ if self.moe_version_type is not None and self.moe_version_type != 'all_tiling' and self.moe_version_type != 'convnext_512_siglip_448':
386
+ pixel_values = {
387
+ 'pixel_values': pixel_values,
388
+ 'num_patches': num_patches_list # num patch of each image.
389
+ }
390
+ generation_output = self.generate(
391
+ pixel_values=pixel_values,
392
+ input_ids=input_ids,
393
+ attention_mask=attention_mask,
394
+ **generation_config
395
+ )
396
+ response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
397
+ response = response.split(template.sep)[0].strip()
398
+ history.append((question, response))
399
+ if return_history:
400
+ return response, history
401
+ else:
402
+ query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
403
+ query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
404
+ if verbose:
405
+ print(query_to_print, response)
406
+ return response
407
+
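A minimal usage sketch for `chat` (hedged: the checkpoint path is hypothetical, and `load_image` stands in for whatever tiling/normalization preprocessing produces the (num_patches, 3, 448, 448) tensor expected here; it is not defined in this file):

    import torch
    from transformers import AutoModel, AutoTokenizer

    path = "path/to/this/checkpoint"                        # hypothetical local path
    model = AutoModel.from_pretrained(path, torch_dtype=torch.bfloat16,
                                      trust_remote_code=True).cuda().eval()
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)

    # load_image is a hypothetical stand-in for the repo's image preprocessing pipeline
    pixel_values = load_image("example.jpg").to(torch.bfloat16).cuda()   # (num_patches, 3, 448, 448)
    generation_config = dict(max_new_tokens=256, do_sample=False)

    response, history = model.chat(tokenizer, pixel_values, "Describe the image.",
                                   generation_config, return_history=True)
    follow_up = model.chat(tokenizer, pixel_values, "What stands out the most?",
                           generation_config, history=history)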
408
+ @torch.no_grad()
409
+ def generate(
410
+ self,
411
+ pixel_values: Optional[torch.FloatTensor] = None,
412
+ input_ids: Optional[torch.FloatTensor] = None,
413
+ attention_mask: Optional[torch.LongTensor] = None,
414
+ visual_features: Optional[torch.FloatTensor] = None,
415
+ generation_config: Optional[GenerationConfig] = None,
416
+ output_hidden_states: Optional[bool] = None,
417
+ return_dict: Optional[bool] = None,
418
+ **generate_kwargs,
419
+ ) -> torch.LongTensor:
420
+
421
+ assert self.img_context_token_id is not None
422
+ if pixel_values is not None:
423
+ if visual_features is not None:
424
+ vit_embeds = visual_features
425
+ else:
426
+ vit_embeds = self.extract_feature(pixel_values)
427
+
428
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
429
+ B, N, C = input_embeds.shape
430
+ input_embeds = input_embeds.reshape(B * N, C)
431
+
432
+ input_ids = input_ids.reshape(B * N)
433
+ selected = (input_ids == self.img_context_token_id)
434
+ assert selected.sum() != 0
435
+ input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
436
+
437
+ input_embeds = input_embeds.reshape(B, N, C)
438
+ else:
439
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
440
+
441
+ outputs = self.language_model.generate(
442
+ inputs_embeds=input_embeds,
443
+ attention_mask=attention_mask,
444
+ generation_config=generation_config,
445
+ output_hidden_states=output_hidden_states,
446
+ return_dict=return_dict,
447
+ use_cache=True,
448
+ **generate_kwargs,
449
+ )
450
+
451
+ return outputs
452
+
453
+ def get_input_embeddings(self):
454
+ return self.language_model.get_input_embeddings()
455
+
456
+ def get_output_embeddings(self):
457
+ return self.language_model.get_output_embeddings()
modeling_qwen2.py ADDED
@@ -0,0 +1,1744 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ PyTorch Qwen2 model."""
21
+ import inspect
22
+ import math
23
+ import warnings
24
+ from typing import List, Optional, Tuple, Union
25
+
26
+ import torch
27
+ import torch.nn.functional as F
28
+ import torch.utils.checkpoint
29
+ from torch import nn
30
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
31
+
32
+ from transformers.activations import ACT2FN
33
+ from transformers.cache_utils import Cache, DynamicCache
34
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
35
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
36
+ from transformers.modeling_utils import PreTrainedModel
37
+ from transformers.utils import (
38
+ add_start_docstrings,
39
+ add_start_docstrings_to_model_forward,
40
+ is_flash_attn_2_available,
41
+ is_flash_attn_greater_or_equal_2_10,
42
+ logging,
43
+ replace_return_docstrings,
44
+ )
45
+ from .configuration_qwen2 import Qwen2Config
46
+
47
+
48
+ if is_flash_attn_2_available():
49
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
50
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
51
+
52
+ _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
53
+
54
+
55
+ logger = logging.get_logger(__name__)
56
+
57
+
58
+ _CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta"
59
+ _CONFIG_FOR_DOC = "Qwen2Config"
60
+
61
+ QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [
62
+ "Qwen/Qwen2-7B-beta",
63
+ # See all Qwen2 models at https://huggingface.co/models?filter=qwen2
64
+ ]
65
+
66
+
67
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
68
+ def _get_unpad_data(attention_mask):
69
+ seqlens_in_batch = (attention_mask>0).sum(dim=-1, dtype=torch.int32)
70
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
71
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
72
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
73
+ return (
74
+ indices,
75
+ cu_seqlens,
76
+ max_seqlen_in_batch,
77
+ )
78
+
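A small self-contained illustration (made-up mask) of what `_get_unpad_data` produces for flash attention's varlen API:

    import torch
    import torch.nn.functional as F

    attention_mask = torch.tensor([[1, 1, 1, 0],
                                   [1, 1, 0, 0]])                      # lengths 3 and 2, right-padded
    seqlens = (attention_mask > 0).sum(dim=-1, dtype=torch.int32)      # tensor([3, 2])
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
    print(indices.tolist())      # [0, 1, 2, 4, 5]  positions of real tokens in the flattened batch
    print(cu_seqlens.tolist())   # [0, 3, 5]        cumulative boundaries passed to flash_attn_varlen_func
    print(seqlens.max().item())  # 3                max_seqlen_in_batch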
79
+ def _get_unpad_data_packing(attention_mask, sub_sample_lengths):
80
+ seqlens_in_batch = []
81
+ for i, per_sub_sample_lengths in enumerate(sub_sample_lengths):
82
+ if (attention_mask[i]==0).sum() == per_sub_sample_lengths[-1]:
83
+ per_sub_sample_lengths = per_sub_sample_lengths[:-1]
84
+ seqlens_in_batch.extend(per_sub_sample_lengths)
85
+ seqlens_in_batch = torch.tensor(seqlens_in_batch, device=attention_mask.device, dtype=torch.int32)
86
+
87
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
88
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
89
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
90
+ return (
91
+ indices,
92
+ cu_seqlens,
93
+ max_seqlen_in_batch,
94
+ )
95
+
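The packing variant differs only in where the sequence boundaries come from: `sub_sample_lengths` lists, per batch row, the lengths of the samples packed into that row (with a possible trailing padding length that is dropped). An illustrative call, assuming the function above is importable:

    import torch

    # one packed row of 8 tokens holding two samples of 5 and 3 tokens, no padding
    attention_mask = torch.ones(1, 8, dtype=torch.int32)
    sub_sample_lengths = [[5, 3]]
    indices, cu_seqlens, max_len = _get_unpad_data_packing(attention_mask, sub_sample_lengths)
    print(cu_seqlens.tolist())   # [0, 5, 8] -> flash attention treats the row as two independent sequences
    print(max_len)               # 5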
96
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2
97
+ class Qwen2RMSNorm(nn.Module):
98
+ def __init__(self, hidden_size, eps=1e-6):
99
+ """
100
+ Qwen2RMSNorm is equivalent to T5LayerNorm
101
+ """
102
+ super().__init__()
103
+ self.weight = nn.Parameter(torch.ones(hidden_size))
104
+ self.variance_epsilon = eps
105
+
106
+ def forward(self, hidden_states):
107
+ input_dtype = hidden_states.dtype
108
+ hidden_states = hidden_states.to(torch.float32)
109
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
110
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
111
+ return self.weight * hidden_states.to(input_dtype)
112
+
113
+
114
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2
115
+ class Qwen2RotaryEmbedding(nn.Module):
116
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
117
+ super().__init__()
118
+
119
+ self.dim = dim
120
+ self.max_position_embeddings = max_position_embeddings
121
+ self.base = base
122
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
123
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
124
+
125
+ # Build here to make `torch.jit.trace` work.
126
+ self._set_cos_sin_cache(
127
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
128
+ )
129
+
130
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
131
+ self.max_seq_len_cached = seq_len
132
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
133
+
134
+ freqs = torch.outer(t, self.inv_freq)
135
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
136
+ emb = torch.cat((freqs, freqs), dim=-1)
137
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
138
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
139
+
140
+ def forward(self, x, seq_len=None):
141
+ # x: [bs, num_attention_heads, seq_len, head_size]
142
+ if seq_len > self.max_seq_len_cached:
143
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
144
+
145
+ return (
146
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
147
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
148
+ )
149
+
150
+
151
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
152
+ def rotate_half(x):
153
+ """Rotates half the hidden dims of the input."""
154
+ x1 = x[..., : x.shape[-1] // 2]
155
+ x2 = x[..., x.shape[-1] // 2 :]
156
+ return torch.cat((-x2, x1), dim=-1)
157
+
158
+
159
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
160
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
161
+ """Applies Rotary Position Embedding to the query and key tensors.
162
+
163
+ Args:
164
+ q (`torch.Tensor`): The query tensor.
165
+ k (`torch.Tensor`): The key tensor.
166
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
167
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
168
+ position_ids (`torch.Tensor`):
169
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
170
+ used to pass offsetted position ids when working with a KV-cache.
171
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
172
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
173
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
174
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
175
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
176
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
177
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
178
+ Returns:
179
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
180
+ """
181
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
182
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
183
+ q_embed = (q * cos) + (rotate_half(q) * sin)
184
+ k_embed = (k * cos) + (rotate_half(k) * sin)
185
+ return q_embed, k_embed
186
+
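A short shape walk-through of the broadcasting described in the docstring above, using the classes and functions defined in this file (tensor values are illustrative):

    import torch

    bsz, n_heads, seq_len, head_dim = 1, 2, 4, 8
    q = torch.randn(bsz, n_heads, seq_len, head_dim)
    k = torch.randn(bsz, 1, seq_len, head_dim)            # fewer KV heads; broadcasting still applies
    position_ids = torch.arange(seq_len).unsqueeze(0)     # (1, 4)

    rope = Qwen2RotaryEmbedding(head_dim, max_position_embeddings=32)
    cos, sin = rope(q, seq_len=seq_len)                   # each (4, 8)
    q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
    print(q_rot.shape, k_rot.shape)                       # torch.Size([1, 2, 4, 8]) torch.Size([1, 1, 4, 8])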
187
+
188
+ # Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2
189
+ class Qwen2MLP(nn.Module):
190
+ def __init__(self, config):
191
+ super().__init__()
192
+ self.config = config
193
+ self.hidden_size = config.hidden_size
194
+ self.intermediate_size = config.intermediate_size
195
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
196
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
197
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
198
+ self.act_fn = ACT2FN[config.hidden_act]
199
+
200
+ def forward(self, x):
201
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
202
+
203
+
204
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
205
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
206
+ """
207
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
208
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
209
+ """
210
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
211
+ if n_rep == 1:
212
+ return hidden_states
213
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
214
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
215
+
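As the docstring states, this is the expand/reshape form of torch.repeat_interleave along the head dimension; a quick equivalence check with illustrative shapes:

    import torch

    kv = torch.randn(2, 4, 16, 64)     # (batch, num_key_value_heads, seq_len, head_dim)
    assert torch.equal(repeat_kv(kv, 3), torch.repeat_interleave(kv, repeats=3, dim=1))
    print(repeat_kv(kv, 3).shape)      # torch.Size([2, 12, 16, 64])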
216
+
217
+ class Qwen2Attention(nn.Module):
218
+ """
219
+ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
220
+ and "Generating Long Sequences with Sparse Transformers".
221
+ """
222
+
223
+ def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
224
+ super().__init__()
225
+ self.config = config
226
+ self.layer_idx = layer_idx
227
+ if layer_idx is None:
228
+ logger.warning_once(
229
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
230
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
231
+ "when creating this class."
232
+ )
233
+
234
+ self.hidden_size = config.hidden_size
235
+ self.num_heads = config.num_attention_heads
236
+ self.head_dim = self.hidden_size // self.num_heads
237
+ self.num_key_value_heads = config.num_key_value_heads
238
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
239
+ self.max_position_embeddings = config.max_position_embeddings
240
+ self.rope_theta = config.rope_theta
241
+ self.is_causal = True
242
+ self.attention_dropout = config.attention_dropout
243
+
244
+ if (self.head_dim * self.num_heads) != self.hidden_size:
245
+ raise ValueError(
246
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
247
+ f" and `num_heads`: {self.num_heads})."
248
+ )
249
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
250
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
251
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
252
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
253
+
254
+ self.rotary_emb = Qwen2RotaryEmbedding(
255
+ self.head_dim,
256
+ max_position_embeddings=self.max_position_embeddings,
257
+ base=self.rope_theta,
258
+ )
259
+
260
+ def forward(
261
+ self,
262
+ hidden_states: torch.Tensor,
263
+ attention_mask: Optional[torch.Tensor] = None,
264
+ position_ids: Optional[torch.LongTensor] = None,
265
+ past_key_value: Optional[Cache] = None,
266
+ output_attentions: bool = False,
267
+ use_cache: bool = False,
268
+ **kwargs,
269
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
270
+ if "padding_mask" in kwargs:
271
+ warnings.warn(
272
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
273
+ )
274
+ bsz, q_len, _ = hidden_states.size()
275
+
276
+ query_states = self.q_proj(hidden_states)
277
+ key_states = self.k_proj(hidden_states)
278
+ value_states = self.v_proj(hidden_states)
279
+
280
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
281
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
282
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
283
+
284
+ kv_seq_len = key_states.shape[-2]
285
+ if past_key_value is not None:
286
+ if self.layer_idx is None:
287
+ raise ValueError(
288
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
289
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
290
+ "with a layer index."
291
+ )
292
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
293
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
294
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
295
+
296
+ if past_key_value is not None:
297
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
298
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
299
+
300
+ # repeat k/v heads if n_kv_heads < n_heads
301
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
302
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
303
+
304
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
305
+
306
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
307
+ raise ValueError(
308
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
309
+ f" {attn_weights.size()}"
310
+ )
311
+
312
+ if attention_mask is not None:
313
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
314
+ raise ValueError(
315
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
316
+ )
317
+
318
+ attn_weights = attn_weights + attention_mask
319
+
320
+ # upcast attention to fp32
321
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
322
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
323
+ attn_output = torch.matmul(attn_weights, value_states)
324
+
325
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
326
+ raise ValueError(
327
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
328
+ f" {attn_output.size()}"
329
+ )
330
+
331
+ attn_output = attn_output.transpose(1, 2).contiguous()
332
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
333
+
334
+ attn_output = self.o_proj(attn_output)
335
+
336
+ if not output_attentions:
337
+ attn_weights = None
338
+
339
+ return attn_output, attn_weights, past_key_value
340
+
341
+
342
+ class Qwen2FlashAttention2(Qwen2Attention):
343
+ """
344
+ Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention`
345
+ as the weights of the module stays untouched. The only required change would be on the forward pass
346
+ where it needs to correctly call the public API of flash attention and deal with padding tokens
347
+ in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
348
+ config.max_window_layers layers.
349
+ """
350
+
351
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
352
+ def __init__(self, *args, **kwargs):
353
+ super().__init__(*args, **kwargs)
354
+
355
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
356
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
357
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
358
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
359
+
360
+ def forward(
361
+ self,
362
+ hidden_states: torch.Tensor,
363
+ attention_mask: Optional[torch.Tensor] = None,
364
+ position_ids: Optional[torch.LongTensor] = None,
365
+ past_key_value: Optional[Cache] = None,
366
+ output_attentions: bool = False,
367
+ use_cache: bool = False,
368
+ **kwargs,
369
+ ):
370
+ if "padding_mask" in kwargs:
371
+ warnings.warn(
372
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
373
+ )
374
+
375
+ # overwrite attention_mask with padding_mask
376
+ attention_mask = kwargs.pop("padding_mask")
377
+ bsz, q_len, _ = hidden_states.size()
378
+
379
+ query_states = self.q_proj(hidden_states)
380
+ key_states = self.k_proj(hidden_states)
381
+ value_states = self.v_proj(hidden_states)
382
+
383
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
384
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
385
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
386
+
387
+ kv_seq_len = key_states.shape[-2]
388
+ if past_key_value is not None:
389
+ if self.layer_idx is None:
390
+ raise ValueError(
391
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
392
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
393
+ "with a layer index."
394
+ )
395
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
396
+
397
+ # Because the input can be padded, the absolute sequence length depends on the max position id.
398
+ rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
399
+ cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
400
+
401
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
402
+
403
+ use_sliding_windows = (
404
+ _flash_supports_window_size
405
+ and getattr(self.config, "sliding_window", None) is not None
406
+ and kv_seq_len > self.config.sliding_window
407
+ and self.config.use_sliding_window
408
+ )
409
+
410
+ if not _flash_supports_window_size:
411
+ logger.warning_once(
412
+ "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
413
+ " make sure to upgrade flash-attn library."
414
+ )
415
+
416
+ if past_key_value is not None:
417
+ # Activate slicing cache only if the config has a value `sliding_windows` attribute
418
+ cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
419
+ if (
420
+ getattr(self.config, "sliding_window", None) is not None
421
+ and kv_seq_len > self.config.sliding_window
422
+ and cache_has_contents
423
+ ):
424
+ slicing_tokens = 1 - self.config.sliding_window
425
+
426
+ past_key = past_key_value[self.layer_idx][0]
427
+ past_value = past_key_value[self.layer_idx][1]
428
+
429
+ past_key = past_key[:, :, slicing_tokens:, :].contiguous()
430
+ past_value = past_value[:, :, slicing_tokens:, :].contiguous()
431
+
432
+ if past_key.shape[-2] != self.config.sliding_window - 1:
433
+ raise ValueError(
434
+ f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
435
+ f" {past_key.shape}"
436
+ )
437
+
438
+ if attention_mask is not None:
439
+ attention_mask = attention_mask[:, slicing_tokens:]
440
+ attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
441
+
442
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
443
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
444
+
445
+ # repeat k/v heads if n_kv_heads < n_heads
446
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
447
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
448
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
449
+
450
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
451
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
452
+ # cast them back in float16 just to be sure everything works as expected.
453
+ input_dtype = query_states.dtype
454
+ if input_dtype == torch.float32:
455
+ if torch.is_autocast_enabled():
456
+ target_dtype = torch.get_autocast_gpu_dtype()
457
+ # Handle the case where the model is quantized
458
+ elif hasattr(self.config, "_pre_quantization_dtype"):
459
+ target_dtype = self.config._pre_quantization_dtype
460
+ else:
461
+ target_dtype = self.q_proj.weight.dtype
462
+
463
+ logger.warning_once(
464
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
465
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
466
+ f" {target_dtype}."
467
+ )
468
+
469
+ query_states = query_states.to(target_dtype)
470
+ key_states = key_states.to(target_dtype)
471
+ value_states = value_states.to(target_dtype)
472
+
473
+ # Reshape to the expected shape for Flash Attention
474
+ query_states = query_states.transpose(1, 2)
475
+ key_states = key_states.transpose(1, 2)
476
+ value_states = value_states.transpose(1, 2)
477
+
478
+ attn_output = self._flash_attention_forward(
479
+ query_states,
480
+ key_states,
481
+ value_states,
482
+ attention_mask,
483
+ q_len,
484
+ dropout=dropout_rate,
485
+ use_sliding_windows=use_sliding_windows,
486
+ )
487
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
488
+ attn_output = self.o_proj(attn_output)
489
+
490
+ if not output_attentions:
491
+ attn_weights = None
492
+
493
+ return attn_output, attn_weights, past_key_value
494
+
495
+ def _flash_attention_forward(
496
+ self,
497
+ query_states,
498
+ key_states,
499
+ value_states,
500
+ attention_mask,
501
+ query_length,
502
+ dropout=0.0,
503
+ softmax_scale=None,
504
+ use_sliding_windows=False,
505
+ ):
506
+ """
507
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
508
+ first unpad the input, then compute the attention scores, and pad the final attention scores back.
509
+
510
+ Args:
511
+ query_states (`torch.Tensor`):
512
+ Input query states to be passed to Flash Attention API
513
+ key_states (`torch.Tensor`):
514
+ Input key states to be passed to Flash Attention API
515
+ value_states (`torch.Tensor`):
516
+ Input value states to be passed to Flash Attention API
517
+ attention_mask (`torch.Tensor`):
518
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
519
+ position of padding tokens and 1 for the position of non-padding tokens.
520
+ dropout (`int`, *optional*):
521
+ Attention dropout
522
+ softmax_scale (`float`, *optional*):
523
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
524
+ use_sliding_windows (`bool`, *optional*):
525
+ Whether to activate sliding window attention.
526
+ """
527
+ if not self._flash_attn_uses_top_left_mask:
528
+ causal = self.is_causal
529
+ else:
530
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
531
+ causal = self.is_causal and query_length != 1
532
+
533
+ # Decide whether to use SWA or not by layer index.
534
+ if use_sliding_windows and self.layer_idx >= self.config.max_window_layers:
535
+ use_sliding_windows = False
536
+
537
+ # Contains at least one padding token in the sequence
538
+ if attention_mask is not None:
539
+ batch_size = query_states.shape[0]
540
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
541
+ query_states, key_states, value_states, attention_mask, query_length
542
+ )
543
+
544
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
545
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
546
+
547
+ if not use_sliding_windows:
548
+ attn_output_unpad = flash_attn_varlen_func(
549
+ query_states,
550
+ key_states,
551
+ value_states,
552
+ cu_seqlens_q=cu_seqlens_q,
553
+ cu_seqlens_k=cu_seqlens_k,
554
+ max_seqlen_q=max_seqlen_in_batch_q,
555
+ max_seqlen_k=max_seqlen_in_batch_k,
556
+ dropout_p=dropout,
557
+ softmax_scale=softmax_scale,
558
+ causal=causal,
559
+ )
560
+ else:
561
+ attn_output_unpad = flash_attn_varlen_func(
562
+ query_states,
563
+ key_states,
564
+ value_states,
565
+ cu_seqlens_q=cu_seqlens_q,
566
+ cu_seqlens_k=cu_seqlens_k,
567
+ max_seqlen_q=max_seqlen_in_batch_q,
568
+ max_seqlen_k=max_seqlen_in_batch_k,
569
+ dropout_p=dropout,
570
+ softmax_scale=softmax_scale,
571
+ causal=causal,
572
+ window_size=(self.config.sliding_window, self.config.sliding_window),
573
+ )
574
+
575
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
576
+ else:
577
+ if not use_sliding_windows:
578
+ attn_output = flash_attn_func(
579
+ query_states,
580
+ key_states,
581
+ value_states,
582
+ dropout,
583
+ softmax_scale=softmax_scale,
584
+ causal=causal,
585
+ )
586
+ else:
587
+ attn_output = flash_attn_func(
588
+ query_states,
589
+ key_states,
590
+ value_states,
591
+ dropout,
592
+ softmax_scale=softmax_scale,
593
+ causal=causal,
594
+ window_size=(self.config.sliding_window, self.config.sliding_window),
595
+ )
596
+
597
+ return attn_output
598
+
599
+ # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
600
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
601
+ batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
602
+
603
+ # On the first iteration we need to properly re-create the padding mask
604
+ # by slicing it on the proper place
605
+ if kv_seq_len != attention_mask.shape[-1]:
606
+ attention_mask_num_tokens = attention_mask.shape[-1]
607
+ attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
608
+
609
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
610
+
611
+ key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
612
+ value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
613
+
614
+ if query_length == kv_seq_len:
615
+ query_layer = index_first_axis(
616
+ query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
617
+ )
618
+ cu_seqlens_q = cu_seqlens_k
619
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
620
+ indices_q = indices_k
621
+ elif query_length == 1:
622
+ max_seqlen_in_batch_q = 1
623
+ cu_seqlens_q = torch.arange(
624
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
625
+ ) # There is a memcpy here, that is very bad.
626
+ indices_q = cu_seqlens_q[:-1]
627
+ query_layer = query_layer.squeeze(1)
628
+ else:
629
+ # The -q_len: slice assumes left padding.
630
+ attention_mask = attention_mask[:, -query_length:]
631
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
632
+
633
+ return (
634
+ query_layer,
635
+ key_layer,
636
+ value_layer,
637
+ indices_q,
638
+ (cu_seqlens_q, cu_seqlens_k),
639
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
640
+ )
641
+
+
+ class Qwen2FlashAttention2_packing(Qwen2Attention):
642
+ """
643
+ Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention`
644
+ as the weights of the module stays untouched. The only required change would be on the forward pass
645
+ where it needs to correctly call the public API of flash attention and deal with padding tokens
646
+ in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
647
+ config.max_window_layers layers.
648
+ """
649
+
650
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
651
+ def __init__(self, *args, **kwargs):
652
+ super().__init__(*args, **kwargs)
653
+
654
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
655
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
656
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
657
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
658
+
659
+ def forward(
660
+ self,
661
+ hidden_states: torch.Tensor,
662
+ attention_mask: Optional[torch.Tensor] = None,
663
+ position_ids: Optional[torch.LongTensor] = None,
664
+ past_key_value: Optional[Cache] = None,
665
+ output_attentions: bool = False,
666
+ use_cache: bool = False,
667
+ sub_sample_lengths = None,
668
+ **kwargs,
669
+ ):
670
+ if "padding_mask" in kwargs:
671
+ warnings.warn(
672
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
673
+ )
674
+
675
+ # overwrite attention_mask with padding_mask
676
+ attention_mask = kwargs.pop("padding_mask")
677
+ bsz, q_len, _ = hidden_states.size()
678
+
679
+ query_states = self.q_proj(hidden_states)
680
+ key_states = self.k_proj(hidden_states)
681
+ value_states = self.v_proj(hidden_states)
682
+
683
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
684
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
685
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
686
+
687
+ kv_seq_len = key_states.shape[-2]
688
+ if past_key_value is not None:
689
+ if self.layer_idx is None:
690
+ raise ValueError(
691
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
692
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
693
+ "with a layer index."
694
+ )
695
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
696
+
697
+ # Because the input can be padded, the absolute sequence length depends on the max position id.
698
+ rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
699
+ cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
700
+
701
+ if sub_sample_lengths is not None:
702
+ packing_position_ids = []
703
+ for b in range(bsz):
704
+ each_sum_sample_lengths = sub_sample_lengths[b]
705
+ packing_position_ids.append(torch.cat([torch.arange(each) for each in each_sum_sample_lengths]))
706
+ packing_position_ids = torch.stack(packing_position_ids)
707
+ packing_position_ids = packing_position_ids.to(query_states.device)
708
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, packing_position_ids)
709
+ else:
710
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
711
+
712
+ use_sliding_windows = (
713
+ _flash_supports_window_size
714
+ and getattr(self.config, "sliding_window", None) is not None
715
+ and kv_seq_len > self.config.sliding_window
716
+ and self.config.use_sliding_window
717
+ )
718
+
719
+ if not _flash_supports_window_size:
720
+ logger.warning_once(
721
+ "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
722
+ " make sure to upgrade flash-attn library."
723
+ )
724
+
725
+ if past_key_value is not None:
726
+ # Activate slicing cache only if the config has a value `sliding_windows` attribute
727
+ cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
728
+ if (
729
+ getattr(self.config, "sliding_window", None) is not None
730
+ and kv_seq_len > self.config.sliding_window
731
+ and cache_has_contents
732
+ ):
733
+ slicing_tokens = 1 - self.config.sliding_window
734
+
735
+ past_key = past_key_value[self.layer_idx][0]
736
+ past_value = past_key_value[self.layer_idx][1]
737
+
738
+ past_key = past_key[:, :, slicing_tokens:, :].contiguous()
739
+ past_value = past_value[:, :, slicing_tokens:, :].contiguous()
740
+
741
+ if past_key.shape[-2] != self.config.sliding_window - 1:
742
+ raise ValueError(
743
+ f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
744
+ f" {past_key.shape}"
745
+ )
746
+
747
+ if attention_mask is not None:
748
+ attention_mask = attention_mask[:, slicing_tokens:]
749
+ attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
750
+
751
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
752
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
753
+
754
+ # repeat k/v heads if n_kv_heads < n_heads
755
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
756
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
757
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
758
+
759
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
760
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
761
+ # cast them back in float16 just to be sure everything works as expected.
762
+ input_dtype = query_states.dtype
763
+ if input_dtype == torch.float32:
764
+ if torch.is_autocast_enabled():
765
+ target_dtype = torch.get_autocast_gpu_dtype()
766
+ # Handle the case where the model is quantized
767
+ elif hasattr(self.config, "_pre_quantization_dtype"):
768
+ target_dtype = self.config._pre_quantization_dtype
769
+ else:
770
+ target_dtype = self.q_proj.weight.dtype
771
+
772
+ logger.warning_once(
773
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
774
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
775
+ f" {target_dtype}."
776
+ )
777
+
778
+ query_states = query_states.to(target_dtype)
779
+ key_states = key_states.to(target_dtype)
780
+ value_states = value_states.to(target_dtype)
781
+
782
+ # Reshape to the expected shape for Flash Attention
783
+ query_states = query_states.transpose(1, 2)
784
+ key_states = key_states.transpose(1, 2)
785
+ value_states = value_states.transpose(1, 2)
786
+
787
+ attn_output = self._flash_attention_forward(
788
+ query_states,
789
+ key_states,
790
+ value_states,
791
+ attention_mask,
792
+ q_len,
793
+ dropout=dropout_rate,
794
+ use_sliding_windows=use_sliding_windows,
795
+ sub_sample_lengths=sub_sample_lengths
796
+ )
797
+
798
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
799
+ attn_output = self.o_proj(attn_output)
800
+
801
+ if not output_attentions:
802
+ attn_weights = None
803
+
804
+ return attn_output, attn_weights, past_key_value
805
+
806
+ def _flash_attention_forward(
807
+ self,
808
+ query_states,
809
+ key_states,
810
+ value_states,
811
+ attention_mask,
812
+ query_length,
813
+ dropout=0.0,
814
+ softmax_scale=None,
815
+ use_sliding_windows=False,
816
+ sub_sample_lengths=None,
817
+ ):
818
+ """
819
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
820
+ first unpad the input, then compute the attention scores, and pad the final attention scores back.
821
+
822
+ Args:
823
+ query_states (`torch.Tensor`):
824
+ Input query states to be passed to Flash Attention API
825
+ key_states (`torch.Tensor`):
826
+ Input key states to be passed to Flash Attention API
827
+ value_states (`torch.Tensor`):
828
+ Input value states to be passed to Flash Attention API
829
+ attention_mask (`torch.Tensor`):
830
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
831
+ position of padding tokens and 1 for the position of non-padding tokens.
832
+ dropout (`int`, *optional*):
833
+ Attention dropout
834
+ softmax_scale (`float`, *optional*):
835
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
836
+ use_sliding_windows (`bool`, *optional*):
837
+ Whether to activate sliding window attention.
838
+ """
839
+ if not self._flash_attn_uses_top_left_mask:
840
+ causal = self.is_causal
841
+ else:
842
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
843
+ causal = self.is_causal and query_length != 1
844
+
845
+ # Decide whether to use SWA or not by layer index.
846
+ if use_sliding_windows and self.layer_idx >= self.config.max_window_layers:
847
+ use_sliding_windows = False
848
+
849
+ # Contains at least one padding token in the sequence
850
+
851
+ if attention_mask is not None:
852
+ batch_size = query_states.shape[0]
853
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input_packing(
854
+ query_states, key_states, value_states, attention_mask, query_length, sub_sample_lengths
855
+ )
856
+
857
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
858
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
859
+
860
+ if not use_sliding_windows:
861
+ attn_output_unpad = flash_attn_varlen_func(
862
+ query_states,
863
+ key_states,
864
+ value_states,
865
+ cu_seqlens_q=cu_seqlens_q,
866
+ cu_seqlens_k=cu_seqlens_k,
867
+ max_seqlen_q=max_seqlen_in_batch_q,
868
+ max_seqlen_k=max_seqlen_in_batch_k,
869
+ dropout_p=dropout,
870
+ softmax_scale=softmax_scale,
871
+ causal=causal,
872
+ )
873
+ else:
874
+ attn_output_unpad = flash_attn_varlen_func(
875
+ query_states,
876
+ key_states,
877
+ value_states,
878
+ cu_seqlens_q=cu_seqlens_q,
879
+ cu_seqlens_k=cu_seqlens_k,
880
+ max_seqlen_q=max_seqlen_in_batch_q,
881
+ max_seqlen_k=max_seqlen_in_batch_k,
882
+ dropout_p=dropout,
883
+ softmax_scale=softmax_scale,
884
+ causal=causal,
885
+ window_size=(self.config.sliding_window, self.config.sliding_window),
886
+ )
887
+
888
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
889
+ else:
890
+ if not use_sliding_windows:
891
+ attn_output = flash_attn_func(
892
+ query_states,
893
+ key_states,
894
+ value_states,
895
+ dropout,
896
+ softmax_scale=softmax_scale,
897
+ causal=causal,
898
+ )
899
+ else:
900
+ attn_output = flash_attn_func(
901
+ query_states,
902
+ key_states,
903
+ value_states,
904
+ dropout,
905
+ softmax_scale=softmax_scale,
906
+ causal=causal,
907
+ window_size=(self.config.sliding_window, self.config.sliding_window),
908
+ )
909
+
910
+ return attn_output
911
+
912
+ # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
913
+ def _unpad_input_packing(self, query_layer, key_layer, value_layer, attention_mask, query_length, sub_sample_lengths):
914
+ batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
915
+
916
+ # On the first iteration we need to properly re-create the padding mask
917
+ # by slicing it on the proper place
918
+ if kv_seq_len != attention_mask.shape[-1]:
919
+ attention_mask_num_tokens = attention_mask.shape[-1]
920
+ attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
921
+
922
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data_packing(attention_mask, sub_sample_lengths)
923
+
924
+ key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
925
+ value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
926
+
927
+ if query_length == kv_seq_len:
928
+ query_layer = index_first_axis(
929
+ query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
930
+ )
931
+ cu_seqlens_q = cu_seqlens_k
932
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
933
+ indices_q = indices_k
934
+ elif query_length == 1:
935
+ max_seqlen_in_batch_q = 1
936
+ cu_seqlens_q = torch.arange(
937
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
938
+ ) # There is a memcpy here, that is very bad.
939
+ indices_q = cu_seqlens_q[:-1]
940
+ query_layer = query_layer.squeeze(1)
941
+ else:
942
+ # The -q_len: slice assumes left padding.
943
+ attention_mask = attention_mask[:, -query_length:]
944
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
945
+
946
+ return (
947
+ query_layer,
948
+ key_layer,
949
+ value_layer,
950
+ indices_q,
951
+ (cu_seqlens_q, cu_seqlens_k),
952
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
953
+ )
954
+
955
+
956
+ # Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2
957
+ class Qwen2SdpaAttention(Qwen2Attention):
958
+ """
959
+ Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
960
+ `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
961
+ SDPA API.
962
+ """
963
+
964
+ # Adapted from Qwen2Attention.forward
965
+ def forward(
966
+ self,
967
+ hidden_states: torch.Tensor,
968
+ attention_mask: Optional[torch.Tensor] = None,
969
+ position_ids: Optional[torch.LongTensor] = None,
970
+ past_key_value: Optional[Cache] = None,
971
+ output_attentions: bool = False,
972
+ use_cache: bool = False,
973
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
974
+ if output_attentions:
975
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
976
+ logger.warning_once(
977
+ "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
978
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
979
+ )
980
+ return super().forward(
981
+ hidden_states=hidden_states,
982
+ attention_mask=attention_mask,
983
+ position_ids=position_ids,
984
+ past_key_value=past_key_value,
985
+ output_attentions=output_attentions,
986
+ use_cache=use_cache,
987
+ )
988
+
989
+ bsz, q_len, _ = hidden_states.size()
990
+
991
+ query_states = self.q_proj(hidden_states)
992
+ key_states = self.k_proj(hidden_states)
993
+ value_states = self.v_proj(hidden_states)
994
+
995
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
996
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
997
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
998
+
999
+ kv_seq_len = key_states.shape[-2]
1000
+ if past_key_value is not None:
1001
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
1002
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
1003
+
1004
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
1005
+
1006
+ if past_key_value is not None:
1007
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
1008
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
1009
+
1010
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
1011
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
1012
+
1013
+ if attention_mask is not None:
1014
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
1015
+ raise ValueError(
1016
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
1017
+ )
1018
+
1019
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
1020
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
1021
+ if query_states.device.type == "cuda" and attention_mask is not None:
1022
+ query_states = query_states.contiguous()
1023
+ key_states = key_states.contiguous()
1024
+ value_states = value_states.contiguous()
1025
+
1026
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
1027
+ query_states,
1028
+ key_states,
1029
+ value_states,
1030
+ attn_mask=attention_mask,
1031
+ dropout_p=self.attention_dropout if self.training else 0.0,
1032
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
1033
+ is_causal=self.is_causal and attention_mask is None and q_len > 1,
1034
+ )
1035
+
1036
+ attn_output = attn_output.transpose(1, 2).contiguous()
1037
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
1038
+
1039
+ attn_output = self.o_proj(attn_output)
1040
+
1041
+ return attn_output, None, past_key_value
1042
+
1043
+
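Editor's note: as a quick reference for the SDPA path above, a self-contained sketch with hypothetical shapes. It mirrors the call in `Qwen2SdpaAttention.forward`, where `is_causal` is only enabled when no explicit mask is passed and `q_len > 1`.

```python
import torch
import torch.nn.functional as F

# Hypothetical shapes: (batch, num_heads, seq_len, head_dim).
q = torch.randn(1, 32, 10, 64)
k = torch.randn(1, 32, 10, 64)
v = torch.randn(1, 32, 10, 64)

# Causal attention without an explicit mask, matching the branch above.
out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
out = out.transpose(1, 2).reshape(1, 10, 32 * 64)  # back to (batch, seq_len, hidden) for o_proj
```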
1044
+ QWEN2_ATTENTION_CLASSES = {
1045
+ "eager": Qwen2Attention,
1046
+ "flash_attention_2": Qwen2FlashAttention2,
1047
+ "sdpa": Qwen2SdpaAttention,
1048
+ "flash_attention_2_packing": Qwen2FlashAttention2_packing,
1049
+ }
1050
+
1051
+
1052
+ class Qwen2DecoderLayer(nn.Module):
1053
+ def __init__(self, config: Qwen2Config, layer_idx: int):
1054
+ super().__init__()
1055
+ self.hidden_size = config.hidden_size
1056
+
1057
+ if config.use_sliding_window and config.attn_implementation != "flash_attention_2":
1058
+ logger.warning_once(
1059
+ f"Sliding Window Attention is enabled but not implemented for `{config.attn_implementation}`; "
1060
+ "unexpected results may be encountered."
1061
+ )
1062
+
1063
+ self.self_attn = QWEN2_ATTENTION_CLASSES[config.attn_implementation](config, layer_idx)
1064
+
1065
+ self.mlp = Qwen2MLP(config)
1066
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1067
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1068
+
1069
+ def forward(
1070
+ self,
1071
+ hidden_states: torch.Tensor,
1072
+ attention_mask: Optional[torch.Tensor] = None,
1073
+ position_ids: Optional[torch.LongTensor] = None,
1074
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
1075
+ sub_sample_lengths=None,
1076
+ output_attentions: Optional[bool] = False,
1077
+ use_cache: Optional[bool] = False,
1078
+ **kwargs,
1079
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
1080
+ if "padding_mask" in kwargs:
1081
+ warnings.warn(
1082
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. "
1083
+ "Please make sure use `attention_mask` instead.`"
1084
+ )
1085
+ """
1086
+ Args:
1087
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
1088
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
1089
+ `(batch, sequence_length)` where padding elements are indicated by 0.
1090
+ output_attentions (`bool`, *optional*):
1091
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1092
+ returned tensors for more detail.
1093
+ use_cache (`bool`, *optional*):
1094
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
1095
+ (see `past_key_values`).
1096
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
1097
+ """
1098
+
1099
+ residual = hidden_states
1100
+
1101
+ hidden_states = self.input_layernorm(hidden_states)
1102
+
1103
+ # Self Attention
1104
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
1105
+ hidden_states=hidden_states,
1106
+ attention_mask=attention_mask,
1107
+ position_ids=position_ids,
1108
+ past_key_value=past_key_value,
1109
+ output_attentions=output_attentions,
1110
+ use_cache=use_cache,
1111
+ sub_sample_lengths=sub_sample_lengths,
1112
+ )
1113
+ hidden_states = residual + hidden_states
1114
+
1115
+ # Fully Connected
1116
+ residual = hidden_states
1117
+ hidden_states = self.post_attention_layernorm(hidden_states)
1118
+ hidden_states = self.mlp(hidden_states)
1119
+ hidden_states = residual + hidden_states
1120
+
1121
+ outputs = (hidden_states,)
1122
+
1123
+ if output_attentions:
1124
+ outputs += (self_attn_weights,)
1125
+
1126
+ if use_cache:
1127
+ outputs += (present_key_value,)
1128
+
1129
+ return outputs
1130
+
1131
+
1132
+ QWEN2_START_DOCSTRING = r"""
1133
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1134
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
1135
+ etc.)
1136
+
1137
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
1138
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
1139
+ and behavior.
1140
+
1141
+ Parameters:
1142
+ config ([`Qwen2Config`]):
1143
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
1144
+ load the weights associated with the model, only the configuration. Check out the
1145
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1146
+ """
1147
+
1148
+
1149
+ @add_start_docstrings(
1150
+ "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
1151
+ QWEN2_START_DOCSTRING,
1152
+ )
1153
+ class Qwen2PreTrainedModel(PreTrainedModel):
1154
+ config_class = Qwen2Config
1155
+ base_model_prefix = "model"
1156
+ supports_gradient_checkpointing = True
1157
+ _no_split_modules = ["Qwen2DecoderLayer"]
1158
+ _skip_keys_device_placement = "past_key_values"
1159
+ _supports_flash_attn_2 = True
1160
+ _supports_sdpa = True
1161
+ _supports_cache_class = True
1162
+
1163
+ def _init_weights(self, module):
1164
+ std = self.config.initializer_range
1165
+ if isinstance(module, nn.Linear):
1166
+ module.weight.data.normal_(mean=0.0, std=std)
1167
+ if module.bias is not None:
1168
+ module.bias.data.zero_()
1169
+ elif isinstance(module, nn.Embedding):
1170
+ module.weight.data.normal_(mean=0.0, std=std)
1171
+ if module.padding_idx is not None:
1172
+ module.weight.data[module.padding_idx].zero_()
1173
+
1174
+
1175
+ QWEN2_INPUTS_DOCSTRING = r"""
1176
+ Args:
1177
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1178
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1179
+ it.
1180
+
1181
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1182
+ [`PreTrainedTokenizer.__call__`] for details.
1183
+
1184
+ [What are input IDs?](../glossary#input-ids)
1185
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1186
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1187
+
1188
+ - 1 for tokens that are **not masked**,
1189
+ - 0 for tokens that are **masked**.
1190
+
1191
+ [What are attention masks?](../glossary#attention-mask)
1192
+
1193
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1194
+ [`PreTrainedTokenizer.__call__`] for details.
1195
+
1196
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
1197
+ `past_key_values`).
1198
+
1199
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1200
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1201
+ information on the default strategy.
1202
+
1203
+ - 1 indicates the head is **not masked**,
1204
+ - 0 indicates the head is **masked**.
1205
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1206
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1207
+ config.n_positions - 1]`.
1208
+
1209
+ [What are position IDs?](../glossary#position-ids)
1210
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
1211
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1212
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
1213
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
1214
+
1215
+ Two formats are allowed:
1216
+ - a [`~cache_utils.Cache`] instance;
1217
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1218
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
1219
+ cache format.
1220
+
1221
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
1222
+ legacy cache format will be returned.
1223
+
1224
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
1225
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
1226
+ of shape `(batch_size, sequence_length)`.
1227
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1228
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1229
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1230
+ model's internal embedding lookup matrix.
1231
+ use_cache (`bool`, *optional*):
1232
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1233
+ `past_key_values`).
1234
+ output_attentions (`bool`, *optional*):
1235
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1236
+ tensors for more detail.
1237
+ output_hidden_states (`bool`, *optional*):
1238
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1239
+ more detail.
1240
+ return_dict (`bool`, *optional*):
1241
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1242
+ """
1243
+
1244
+
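Editor's note: the two `past_key_values` formats described in the docstring above can be converted with the cache utilities this file already relies on; a short illustration with hypothetical shapes:

```python
import torch
from transformers.cache_utils import DynamicCache

# Hypothetical legacy cache: 1 layer, batch=1, 8 KV heads, 4 cached tokens, head_dim=64.
legacy = ((torch.zeros(1, 8, 4, 64), torch.zeros(1, 8, 4, 64)),)

cache = DynamicCache.from_legacy_cache(legacy)  # Cache instance, as built in Qwen2Model.forward
print(cache.get_seq_length())                   # 4
legacy_again = cache.to_legacy_cache()          # back to the tuple-of-tuples format
```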
1245
+ @add_start_docstrings(
1246
+ "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
1247
+ QWEN2_START_DOCSTRING,
1248
+ )
1249
+ class Qwen2Model(Qwen2PreTrainedModel):
1250
+ """
1251
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
1252
+
1253
+ Args:
1254
+ config: Qwen2Config
1255
+ """
1256
+
1257
+ def __init__(self, config: Qwen2Config):
1258
+ super().__init__(config)
1259
+ self.padding_idx = config.pad_token_id
1260
+ self.vocab_size = config.vocab_size
1261
+
1262
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1263
+ self.layers = nn.ModuleList(
1264
+ [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1265
+ )
1266
+ self.attn_implementation = config.attn_implementation
1267
+ self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1268
+
1269
+ self.gradient_checkpointing = False
1270
+ # Initialize weights and apply final processing
1271
+ self.post_init()
1272
+
1273
+ def get_input_embeddings(self):
1274
+ return self.embed_tokens
1275
+
1276
+ def set_input_embeddings(self, value):
1277
+ self.embed_tokens = value
1278
+
1279
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
1280
+ def forward(
1281
+ self,
1282
+ input_ids: torch.LongTensor = None,
1283
+ attention_mask: Optional[torch.Tensor] = None,
1284
+ position_ids: Optional[torch.LongTensor] = None,
1285
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1286
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1287
+ use_cache: Optional[bool] = None,
1288
+ output_attentions: Optional[bool] = None,
1289
+ output_hidden_states: Optional[bool] = None,
1290
+ return_dict: Optional[bool] = None,
1291
+ sub_sample_lengths=None,
1292
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
1293
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1294
+ output_hidden_states = (
1295
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1296
+ )
1297
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1298
+
1299
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1300
+
1301
+ # retrieve input_ids and inputs_embeds
1302
+ if input_ids is not None and inputs_embeds is not None:
1303
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
1304
+ elif input_ids is not None:
1305
+ batch_size, seq_length = input_ids.shape
1306
+ elif inputs_embeds is not None:
1307
+ batch_size, seq_length, _ = inputs_embeds.shape
1308
+ else:
1309
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
1310
+
1311
+ if self.gradient_checkpointing and self.training:
1312
+ if use_cache:
1313
+ logger.warning_once(
1314
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
1315
+ )
1316
+ use_cache = False
1317
+
1318
+ past_key_values_length = 0
1319
+
1320
+ if use_cache:
1321
+ use_legacy_cache = not isinstance(past_key_values, Cache)
1322
+ if use_legacy_cache:
1323
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1324
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
1325
+
1326
+ if position_ids is None:
1327
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1328
+ position_ids = torch.arange(
1329
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
1330
+ )
1331
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
1332
+ else:
1333
+ position_ids = position_ids.view(-1, seq_length).long()
1334
+
1335
+ if inputs_embeds is None:
1336
+ inputs_embeds = self.embed_tokens(input_ids)
1337
+
1338
+ if attention_mask is not None and self.attn_implementation == "flash_attention_2" and use_cache:
1339
+ is_padding_right = attention_mask[:, -1].sum().item() != batch_size
1340
+ if is_padding_right:
1341
+ raise ValueError(
1342
+ "You are attempting to perform batched generation with padding_side='right'"
1343
+ " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to "
1344
+ " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
1345
+ )
1346
+
1347
+ if self.attn_implementation == "flash_attention_2" or self.config.attn_implementation =='flash_attention_2_packing':
1348
+ # 2d mask is passed through the layers
1349
+ if attention_mask is not None:
1350
+ if attention_mask.dtype == torch.long:
1351
+ # keep an integer mask as-is; non-integer masks without any padding are dropped below
1352
+ pass
1353
+ else:
1354
+ attention_mask = attention_mask if (0 in attention_mask) else None
1355
+
1356
+ elif self.attn_implementation == "sdpa" and not output_attentions:
1357
+ # output_attentions=True can not be supported when using SDPA, and we fall back on
1358
+ # the manual implementation that requires a 4D causal mask in all cases.
1359
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
1360
+ attention_mask,
1361
+ (batch_size, seq_length),
1362
+ inputs_embeds,
1363
+ past_key_values_length,
1364
+ )
1365
+ else:
1366
+ # 4d mask is passed through the layers
1367
+ attention_mask = _prepare_4d_causal_attention_mask(
1368
+ attention_mask,
1369
+ (batch_size, seq_length),
1370
+ inputs_embeds,
1371
+ past_key_values_length,
1372
+ sliding_window=self.config.sliding_window,
1373
+ )
1374
+
1375
+ hidden_states = inputs_embeds
1376
+
1377
+ # decoder layers
1378
+ all_hidden_states = () if output_hidden_states else None
1379
+ all_self_attns = () if output_attentions else None
1380
+ next_decoder_cache = None
1381
+
1382
+ for decoder_layer in self.layers:
1383
+ if output_hidden_states:
1384
+ all_hidden_states += (hidden_states,)
1385
+ if self.gradient_checkpointing and self.training:
1386
+ layer_outputs = self._gradient_checkpointing_func(
1387
+ decoder_layer.__call__,
1388
+ hidden_states,
1389
+ attention_mask,
1390
+ position_ids,
1391
+ past_key_values,
1392
+ sub_sample_lengths,
1393
+ output_attentions,
1394
+ use_cache,
1395
+ )
1396
+ else:
1397
+ layer_outputs = decoder_layer(
1398
+ hidden_states,
1399
+ attention_mask=attention_mask,
1400
+ position_ids=position_ids,
1401
+ past_key_value=past_key_values,
1402
+ sub_sample_lengths=sub_sample_lengths,
1403
+ output_attentions=output_attentions,
1404
+ use_cache=use_cache,
1405
+ )
1406
+
1407
+ hidden_states = layer_outputs[0]
1408
+
1409
+ if use_cache:
1410
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1411
+
1412
+ if output_attentions:
1413
+ all_self_attns += (layer_outputs[1],)
1414
+
1415
+ hidden_states = self.norm(hidden_states)
1416
+
1417
+ # add hidden states from the last decoder layer
1418
+ if output_hidden_states:
1419
+ all_hidden_states += (hidden_states,)
1420
+
1421
+ next_cache = None
1422
+ if use_cache:
1423
+ next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
1424
+
1425
+ if not return_dict:
1426
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1427
+ return BaseModelOutputWithPast(
1428
+ last_hidden_state=hidden_states,
1429
+ past_key_values=next_cache,
1430
+ hidden_states=all_hidden_states,
1431
+ attentions=all_self_attns,
1432
+ )
1433
+
1434
+
1435
+ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
1436
+ _tied_weights_keys = ["lm_head.weight"]
1437
+
1438
+ def __init__(self, config):
1439
+ super().__init__(config)
1440
+ self.model = Qwen2Model(config)
1441
+ self.vocab_size = config.vocab_size
1442
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1443
+
1444
+ # Initialize weights and apply final processing
1445
+ self.post_init()
1446
+ self.support_packing = True
1447
+
1448
+ def get_input_embeddings(self):
1449
+ return self.model.embed_tokens
1450
+
1451
+ def set_input_embeddings(self, value):
1452
+ self.model.embed_tokens = value
1453
+
1454
+ def get_output_embeddings(self):
1455
+ return self.lm_head
1456
+
1457
+ def set_output_embeddings(self, new_embeddings):
1458
+ self.lm_head = new_embeddings
1459
+
1460
+ def set_decoder(self, decoder):
1461
+ self.model = decoder
1462
+
1463
+ def get_decoder(self):
1464
+ return self.model
1465
+
1466
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
1467
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1468
+ def forward(
1469
+ self,
1470
+ input_ids: torch.LongTensor = None,
1471
+ attention_mask: Optional[torch.Tensor] = None,
1472
+ position_ids: Optional[torch.LongTensor] = None,
1473
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1474
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1475
+ labels: Optional[torch.LongTensor] = None,
1476
+ use_cache: Optional[bool] = None,
1477
+ output_attentions: Optional[bool] = None,
1478
+ output_hidden_states: Optional[bool] = None,
1479
+ return_dict: Optional[bool] = None,
1480
+ sub_sample_lengths=None,
1481
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1482
+ r"""
1483
+ Args:
1484
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1485
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1486
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1487
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1488
+
1489
+ Returns:
1490
+
1491
+ Example:
1492
+
1493
+ ```python
1494
+ >>> from transformers import AutoTokenizer, Qwen2ForCausalLM
1495
+
1496
+ >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1497
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1498
+
1499
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1500
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1501
+
1502
+ >>> # Generate
1503
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1504
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1505
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1506
+ ```"""
1507
+
1508
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1509
+ output_hidden_states = (
1510
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1511
+ )
1512
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1513
+
1514
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1515
+ outputs = self.model(
1516
+ input_ids=input_ids,
1517
+ attention_mask=attention_mask,
1518
+ position_ids=position_ids,
1519
+ past_key_values=past_key_values,
1520
+ inputs_embeds=inputs_embeds,
1521
+ use_cache=use_cache,
1522
+ output_attentions=output_attentions,
1523
+ output_hidden_states=output_hidden_states,
1524
+ return_dict=return_dict,
1525
+ sub_sample_lengths=sub_sample_lengths
1526
+ )
1527
+
1528
+ hidden_states = outputs[0]
1529
+ logits = self.lm_head(hidden_states)
1530
+ logits = logits.float()
1531
+
1532
+ loss = None
1533
+ if labels is not None:
1534
+ # Shift so that tokens < n predict n
1535
+ shift_logits = logits[..., :-1, :].contiguous()
1536
+ shift_labels = labels[..., 1:].contiguous()
1537
+ # Flatten the tokens
1538
+ loss_fct = CrossEntropyLoss()
1539
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1540
+ shift_labels = shift_labels.view(-1)
1541
+ # Enable model parallelism
1542
+ shift_labels = shift_labels.to(shift_logits.device)
1543
+ loss = loss_fct(shift_logits, shift_labels)
1544
+
1545
+ if not return_dict:
1546
+ output = (logits,) + outputs[1:]
1547
+ return (loss,) + output if loss is not None else output
1548
+
1549
+ return CausalLMOutputWithPast(
1550
+ loss=loss,
1551
+ logits=logits,
1552
+ past_key_values=outputs.past_key_values,
1553
+ hidden_states=outputs.hidden_states,
1554
+ attentions=outputs.attentions,
1555
+ )
1556
+
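Editor's note: a tiny worked example (hypothetical ids and vocabulary size) of the one-token label shift performed in the loss above:

```python
import torch

labels = torch.tensor([[-100, 17, 42, 9]])   # hypothetical ids; -100 masks the prompt position
logits = torch.randn(1, 4, 128)              # hypothetical (batch, seq_len, vocab_size=128)

shift_logits = logits[..., :-1, :]           # predictions at positions 0..2
shift_labels = labels[..., 1:]               # targets 17, 42, 9: token < n predicts n
loss = torch.nn.CrossEntropyLoss()(shift_logits.reshape(-1, 128), shift_labels.reshape(-1))
# -100 entries are skipped automatically (CrossEntropyLoss's default ignore_index).
```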
1557
+ def prepare_inputs_for_generation(
1558
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1559
+ ):
1560
+ # Omit tokens covered by past_key_values
1561
+ if past_key_values is not None:
1562
+ if isinstance(past_key_values, Cache):
1563
+ cache_length = past_key_values.get_seq_length()
1564
+ past_length = past_key_values.seen_tokens
1565
+ max_cache_length = past_key_values.get_max_length()
1566
+ else:
1567
+ cache_length = past_length = past_key_values[0][0].shape[2]
1568
+ max_cache_length = None
1569
+
1570
+ # Keep only the unprocessed tokens:
1571
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1572
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
1573
+ # input)
1574
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1575
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
1576
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1577
+ # input_ids based on the past_length.
1578
+ elif past_length < input_ids.shape[1]:
1579
+ input_ids = input_ids[:, past_length:]
1580
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
1581
+
1582
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
1583
+ if (
1584
+ max_cache_length is not None
1585
+ and attention_mask is not None
1586
+ and cache_length + input_ids.shape[1] > max_cache_length
1587
+ ):
1588
+ attention_mask = attention_mask[:, -max_cache_length:]
1589
+
1590
+ position_ids = kwargs.get("position_ids", None)
1591
+ if attention_mask is not None and position_ids is None:
1592
+ # create position_ids on the fly for batch generation
1593
+ position_ids = attention_mask.long().cumsum(-1) - 1
1594
+ position_ids.masked_fill_(attention_mask == 0, 1)
1595
+ if past_key_values:
1596
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1597
+
1598
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1599
+ if inputs_embeds is not None and past_key_values is None:
1600
+ model_inputs = {"inputs_embeds": inputs_embeds}
1601
+ else:
1602
+ model_inputs = {"input_ids": input_ids}
1603
+
1604
+ model_inputs.update(
1605
+ {
1606
+ "position_ids": position_ids,
1607
+ "past_key_values": past_key_values,
1608
+ "use_cache": kwargs.get("use_cache"),
1609
+ "attention_mask": attention_mask,
1610
+ }
1611
+ )
1612
+ return model_inputs
1613
+
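Editor's note: a small worked example (hypothetical left-padded mask) of the on-the-fly position-id construction used in `prepare_inputs_for_generation` above:

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])       # hypothetical left-padded row
position_ids = attention_mask.long().cumsum(-1) - 1    # tensor([[-1, -1,  0,  1,  2]])
position_ids.masked_fill_(attention_mask == 0, 1)      # tensor([[ 1,  1,  0,  1,  2]])
# Real tokens get positions 0, 1, 2; padded slots receive a harmless dummy position of 1.
```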
1614
+ @staticmethod
1615
+ def _reorder_cache(past_key_values, beam_idx):
1616
+ reordered_past = ()
1617
+ for layer_past in past_key_values:
1618
+ reordered_past += (
1619
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1620
+ )
1621
+ return reordered_past
1622
+
1623
+
1624
+ @add_start_docstrings(
1625
+ """
1626
+ The Qwen2 Model transformer with a sequence classification head on top (linear layer).
1627
+
1628
+ [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1629
+ (e.g. GPT-2) do.
1630
+
1631
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1632
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1633
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1634
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1635
+ each row of the batch).
1636
+ """,
1637
+ QWEN2_START_DOCSTRING,
1638
+ )
1639
+ class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
1640
+ def __init__(self, config):
1641
+ super().__init__(config)
1642
+ self.num_labels = config.num_labels
1643
+ self.model = Qwen2Model(config)
1644
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1645
+
1646
+ # Initialize weights and apply final processing
1647
+ self.post_init()
1648
+
1649
+ def get_input_embeddings(self):
1650
+ return self.model.embed_tokens
1651
+
1652
+ def set_input_embeddings(self, value):
1653
+ self.model.embed_tokens = value
1654
+
1655
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
1656
+ def forward(
1657
+ self,
1658
+ input_ids: torch.LongTensor = None,
1659
+ attention_mask: Optional[torch.Tensor] = None,
1660
+ position_ids: Optional[torch.LongTensor] = None,
1661
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1662
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1663
+ labels: Optional[torch.LongTensor] = None,
1664
+ use_cache: Optional[bool] = None,
1665
+ output_attentions: Optional[bool] = None,
1666
+ output_hidden_states: Optional[bool] = None,
1667
+ return_dict: Optional[bool] = None,
1668
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1669
+ r"""
1670
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1671
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1672
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1673
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1674
+ """
1675
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1676
+
1677
+ transformer_outputs = self.model(
1678
+ input_ids,
1679
+ attention_mask=attention_mask,
1680
+ position_ids=position_ids,
1681
+ past_key_values=past_key_values,
1682
+ inputs_embeds=inputs_embeds,
1683
+ use_cache=use_cache,
1684
+ output_attentions=output_attentions,
1685
+ output_hidden_states=output_hidden_states,
1686
+ return_dict=return_dict,
1687
+ )
1688
+ hidden_states = transformer_outputs[0]
1689
+ logits = self.score(hidden_states)
1690
+
1691
+ if input_ids is not None:
1692
+ batch_size = input_ids.shape[0]
1693
+ else:
1694
+ batch_size = inputs_embeds.shape[0]
1695
+
1696
+ if self.config.pad_token_id is None and batch_size != 1:
1697
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1698
+ if self.config.pad_token_id is None:
1699
+ sequence_lengths = -1
1700
+ else:
1701
+ if input_ids is not None:
1702
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1703
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1704
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1705
+ sequence_lengths = sequence_lengths.to(logits.device)
1706
+ else:
1707
+ sequence_lengths = -1
1708
+
1709
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1710
+
1711
+ loss = None
1712
+ if labels is not None:
1713
+ labels = labels.to(logits.device)
1714
+ if self.config.problem_type is None:
1715
+ if self.num_labels == 1:
1716
+ self.config.problem_type = "regression"
1717
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1718
+ self.config.problem_type = "single_label_classification"
1719
+ else:
1720
+ self.config.problem_type = "multi_label_classification"
1721
+
1722
+ if self.config.problem_type == "regression":
1723
+ loss_fct = MSELoss()
1724
+ if self.num_labels == 1:
1725
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1726
+ else:
1727
+ loss = loss_fct(pooled_logits, labels)
1728
+ elif self.config.problem_type == "single_label_classification":
1729
+ loss_fct = CrossEntropyLoss()
1730
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1731
+ elif self.config.problem_type == "multi_label_classification":
1732
+ loss_fct = BCEWithLogitsLoss()
1733
+ loss = loss_fct(pooled_logits, labels)
1734
+ if not return_dict:
1735
+ output = (pooled_logits,) + transformer_outputs[1:]
1736
+ return ((loss,) + output) if loss is not None else output
1737
+
1738
+ return SequenceClassifierOutputWithPast(
1739
+ loss=loss,
1740
+ logits=pooled_logits,
1741
+ past_key_values=transformer_outputs.past_key_values,
1742
+ hidden_states=transformer_outputs.hidden_states,
1743
+ attentions=transformer_outputs.attentions,
1744
+ )
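Editor's note: a worked example (hypothetical ids, `pad_token_id=0`) of the last-non-pad-token lookup used by `Qwen2ForSequenceClassification` above: the argmax-of-equality finds the first pad position, subtracting one gives the last real token, and the modulo keeps the index valid when a row has no padding at all.

```python
import torch

pad_token_id = 0                                    # hypothetical
input_ids = torch.tensor([[11, 12, 13,  0,  0],     # padded row
                          [21, 22, 23, 24, 25]])    # row with no padding
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1  # tensor([ 2, -1])
sequence_lengths = sequence_lengths % input_ids.shape[-1]                  # tensor([2, 4])
# Row 0 pools position 2 (token 13); row 1 wraps -1 to 4 and pools its final token 25.
```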
modeling_siglip.py ADDED
@@ -0,0 +1,1241 @@
1
+ # --------------------------------------------------------
2
+ # Eagle2
3
+ # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Support flash-attention in SigLIP
6
+ # --------------------------------------------------------
7
+
8
+
9
+ # coding=utf-8
10
+ # Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
11
+ #
12
+ # Licensed under the Apache License, Version 2.0 (the "License");
13
+ # you may not use this file except in compliance with the License.
14
+ # You may obtain a copy of the License at
15
+ #
16
+ # http://www.apache.org/licenses/LICENSE-2.0
17
+ #
18
+ # Unless required by applicable law or agreed to in writing, software
19
+ # distributed under the License is distributed on an "AS IS" BASIS,
20
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21
+ # See the License for the specific language governing permissions and
22
+ # limitations under the License.
23
+ """ PyTorch Siglip model."""
24
+
25
+
26
+ import math
27
+ import warnings
28
+ from dataclasses import dataclass
29
+ from typing import Any, Optional, Tuple, Union
30
+ from einops import rearrange
31
+ import numpy as np
32
+ import torch
33
+ import torch.utils.checkpoint
34
+ from torch import nn
35
+ from torch.nn.init import _calculate_fan_in_and_fan_out
36
+
37
+ from transformers.activations import ACT2FN
38
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
39
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
40
+ from transformers.modeling_utils import PreTrainedModel
41
+ from transformers.utils import (
42
+ ModelOutput,
43
+ add_start_docstrings,
44
+ add_start_docstrings_to_model_forward,
45
+ logging,
46
+ replace_return_docstrings,
47
+ )
48
+ from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
49
+
50
+ try:
51
+ from .flash_attention import FlashAttention
52
+ has_flash_attn = True
53
+ except Exception:
54
+ print('FlashAttention is not installed.')
55
+ has_flash_attn = False
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+ _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
60
+
61
+ SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
62
+ "google/siglip-base-patch16-224",
63
+ # See all SigLIP models at https://huggingface.co/models?filter=siglip
64
+ ]
65
+
66
+
67
+ def _trunc_normal_(tensor, mean, std, a, b):
68
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
69
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
70
+ def norm_cdf(x):
71
+ # Computes standard normal cumulative distribution function
72
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
73
+
74
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
75
+ warnings.warn(
76
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
77
+ "The distribution of values may be incorrect.",
78
+ stacklevel=2,
79
+ )
80
+
81
+ # Values are generated by using a truncated uniform distribution and
82
+ # then using the inverse CDF for the normal distribution.
83
+ # Get upper and lower cdf values
84
+ l = norm_cdf((a - mean) / std)
85
+ u = norm_cdf((b - mean) / std)
86
+
87
+ # Uniformly fill tensor with values from [l, u], then translate to
88
+ # [2l-1, 2u-1].
89
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
90
+
91
+ # Use inverse cdf transform for normal distribution to get truncated
92
+ # standard normal
93
+ tensor.erfinv_()
94
+
95
+ # Transform to proper mean, std
96
+ tensor.mul_(std * math.sqrt(2.0))
97
+ tensor.add_(mean)
98
+
99
+ # Clamp to ensure it's in the proper range
100
+ tensor.clamp_(min=a, max=b)
101
+
102
+
103
+ def trunc_normal_tf_(
104
+ tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
105
+ ) -> torch.Tensor:
106
+ """Fills the input Tensor with values drawn from a truncated
107
+ normal distribution. The values are effectively drawn from the
108
+ normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
109
+ with values outside :math:`[a, b]` redrawn until they are within
110
+ the bounds. The method used for generating the random values works
111
+ best when :math:`a \\leq \\text{mean} \\leq b`.
112
+
113
+ NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
114
+ bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
115
+ and the result is subsequently scaled and shifted by the mean and std args.
116
+
117
+ Args:
118
+ tensor: an n-dimensional `torch.Tensor`
119
+ mean: the mean of the normal distribution
120
+ std: the standard deviation of the normal distribution
121
+ a: the minimum cutoff value
122
+ b: the maximum cutoff value
123
+ """
124
+ with torch.no_grad():
125
+ _trunc_normal_(tensor, 0, 1.0, a, b)
126
+ tensor.mul_(std).add_(mean)
127
+
128
+
129
+ def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
130
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
131
+ if mode == "fan_in":
132
+ denom = fan_in
133
+ elif mode == "fan_out":
134
+ denom = fan_out
135
+ elif mode == "fan_avg":
136
+ denom = (fan_in + fan_out) / 2
137
+
138
+ variance = scale / denom
139
+
140
+ if distribution == "truncated_normal":
141
+ # constant is stddev of standard normal truncated to (-2, 2)
142
+ trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
143
+ elif distribution == "normal":
144
+ with torch.no_grad():
145
+ tensor.normal_(std=math.sqrt(variance))
146
+ elif distribution == "uniform":
147
+ bound = math.sqrt(3 * variance)
148
+ with torch.no_grad():
149
+ tensor.uniform_(-bound, bound)
150
+ else:
151
+ raise ValueError(f"invalid distribution {distribution}")
152
+
153
+
154
+ def lecun_normal_(tensor):
155
+ variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
156
+
157
+
158
+ def default_flax_embed_init(tensor):
159
+ variance_scaling_(tensor, mode="fan_in", distribution="normal")
160
+
161
+
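Editor's note: a brief usage sketch of the initializers defined above (layer sizes hypothetical; it assumes `variance_scaling_` from this file is in scope). With `mode="fan_in"` and a truncated normal, this is the same call path `lecun_normal_` applies to plain linear and conv layers in `_init_weights` further down.

```python
import math
import torch
import torch.nn as nn

linear = nn.Linear(1024, 4096)   # hypothetical layer; weight fan_in is 1024
variance_scaling_(linear.weight, scale=1.0, mode="fan_in", distribution="truncated_normal")
expected_std = math.sqrt(1.0 / 1024)   # fan_in scaling: std = sqrt(scale / fan_in)
print(round(linear.weight.std().item(), 4), round(expected_std, 4))  # both close to 0.0313
```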
162
+ @dataclass
163
+ # Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
164
+ class SiglipVisionModelOutput(ModelOutput):
165
+ """
166
+ Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
167
+
168
+ Args:
169
+ image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
170
+ The image embeddings obtained by applying the projection layer to the pooler_output.
171
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
172
+ Sequence of hidden-states at the output of the last layer of the model.
173
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
174
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
175
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
176
+
177
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
178
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
179
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
180
+ sequence_length)`.
181
+
182
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
183
+ heads.
184
+ """
185
+
186
+ image_embeds: Optional[torch.FloatTensor] = None
187
+ last_hidden_state: torch.FloatTensor = None
188
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
189
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
190
+
191
+
192
+ @dataclass
193
+ # Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Siglip
194
+ class SiglipTextModelOutput(ModelOutput):
195
+ """
196
+ Base class for text model's outputs that also contains a pooling of the last hidden states.
197
+
198
+ Args:
199
+ text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
200
+ The text embeddings obtained by applying the projection layer to the pooler_output.
201
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
202
+ Sequence of hidden-states at the output of the last layer of the model.
203
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
204
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
205
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
206
+
207
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
208
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
209
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
210
+ sequence_length)`.
211
+
212
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
213
+ heads.
214
+ """
215
+
216
+ text_embeds: Optional[torch.FloatTensor] = None
217
+ last_hidden_state: torch.FloatTensor = None
218
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
219
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
220
+
221
+
222
+ @dataclass
223
+ # Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Siglip
224
+ class SiglipOutput(ModelOutput):
225
+ """
226
+ Args:
227
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
228
+ Contrastive loss for image-text similarity.
229
+ logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
230
+ The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
231
+ similarity scores.
232
+ logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
233
+ The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
234
+ similarity scores.
235
+ text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
236
+ The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`].
237
+ image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
238
+ The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`].
239
+ text_model_output(`BaseModelOutputWithPooling`):
240
+ The output of the [`SiglipTextModel`].
241
+ vision_model_output(`BaseModelOutputWithPooling`):
242
+ The output of the [`SiglipVisionModel`].
243
+ """
244
+
245
+ loss: Optional[torch.FloatTensor] = None
246
+ logits_per_image: torch.FloatTensor = None
247
+ logits_per_text: torch.FloatTensor = None
248
+ text_embeds: torch.FloatTensor = None
249
+ image_embeds: torch.FloatTensor = None
250
+ text_model_output: BaseModelOutputWithPooling = None
251
+ vision_model_output: BaseModelOutputWithPooling = None
252
+
253
+ def to_tuple(self) -> Tuple[Any]:
254
+ return tuple(
255
+ self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
256
+ for k in self.keys()
257
+ )
258
+
259
+
260
+ class SiglipVisionEmbeddings(nn.Module):
261
+ def __init__(self, config: SiglipVisionConfig):
262
+ super().__init__()
263
+ self.config = config
264
+ self.embed_dim = config.hidden_size
265
+ self.image_size = config.image_size
266
+ self.patch_size = config.patch_size
267
+
268
+ self.patch_embedding = nn.Conv2d(
269
+ in_channels=config.num_channels,
270
+ out_channels=self.embed_dim,
271
+ kernel_size=self.patch_size,
272
+ stride=self.patch_size,
273
+ padding="valid",
274
+ )
275
+
276
+ self.num_patches = (self.image_size // self.patch_size) ** 2
277
+ self.num_positions = self.num_patches
278
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
279
+ self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
280
+
281
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
282
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
283
+ embeddings = patch_embeds.flatten(2).transpose(1, 2)
284
+
285
+ embeddings = embeddings + self.position_embedding(self.position_ids)
286
+ return embeddings
287
+
288
+
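Editor's note: to make the shapes in `SiglipVisionEmbeddings.forward` concrete, a short sketch of the patch-embedding arithmetic for the checkpoint referenced in `_CHECKPOINT_FOR_DOC`; the vision hidden size used below is an assumption for illustration.

```python
import torch

# Assumed settings in the style of google/siglip-base-patch16-224:
# image_size=224, patch_size=16, hidden_size=768 (the last value is an assumption here).
pixel_values = torch.randn(2, 3, 224, 224)
patch_embedding = torch.nn.Conv2d(3, 768, kernel_size=16, stride=16, padding="valid")
patch_embeds = patch_embedding(pixel_values)           # (2, 768, 14, 14)
embeddings = patch_embeds.flatten(2).transpose(1, 2)   # (2, 196, 768); 196 = (224 // 16) ** 2
```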
289
+ # Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Siglip
290
+ class SiglipTextEmbeddings(nn.Module):
291
+ def __init__(self, config: SiglipTextConfig):
292
+ super().__init__()
293
+ embed_dim = config.hidden_size
294
+
295
+ self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
296
+ self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
297
+
298
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
299
+ self.register_buffer(
300
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
301
+ )
302
+
303
+ def forward(
304
+ self,
305
+ input_ids: Optional[torch.LongTensor] = None,
306
+ position_ids: Optional[torch.LongTensor] = None,
307
+ inputs_embeds: Optional[torch.FloatTensor] = None,
308
+ ) -> torch.Tensor:
309
+ seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
310
+
311
+ if position_ids is None:
312
+ position_ids = self.position_ids[:, :seq_length]
313
+
314
+ if inputs_embeds is None:
315
+ inputs_embeds = self.token_embedding(input_ids)
316
+
317
+ position_embeddings = self.position_embedding(position_ids)
318
+ embeddings = inputs_embeds + position_embeddings
319
+
320
+ return embeddings
321
+
322
+
323
+ class SiglipAttention(nn.Module):
324
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
325
+
326
+ # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
327
+ def __init__(self, config):
328
+ super().__init__()
329
+ self.config = config
330
+ self.embed_dim = config.hidden_size
331
+ self.num_heads = config.num_attention_heads
332
+ self.head_dim = self.embed_dim // self.num_heads
333
+ if self.head_dim * self.num_heads != self.embed_dim:
334
+ raise ValueError(
335
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
336
+ f" {self.num_heads})."
337
+ )
338
+ self.scale = self.head_dim**-0.5
339
+ self.dropout = config.attention_dropout
340
+
341
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
342
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
343
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
344
+ # self.use_flash_attn = config.use_flash_attn and has_flash_attn
345
+ self.use_flash_attn = has_flash_attn
346
+ if self.use_flash_attn:
347
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
348
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
349
+
350
+ def _flash_attn(self,
351
+ hidden_states: torch.Tensor,
352
+ attention_mask: Optional[torch.Tensor] = None,
353
+ output_attentions: Optional[bool] = False,
354
+ key_padding_mask=None,
355
+ need_weights=False
356
+ ):
357
+
358
+ batch_size, q_len, _ = hidden_states.size()
359
+
360
+ query_states = self.q_proj(hidden_states)
361
+ key_states = self.k_proj(hidden_states)
362
+ value_states = self.v_proj(hidden_states)
363
+
364
+ query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim)
365
+ key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim)
366
+ value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim)
367
+
368
+ qkv = torch.stack([query_states, key_states, value_states], dim=2)
369
+ context, attn_weights = self.inner_attn(
370
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
371
+ )
372
+ attn_output = self.out_proj(rearrange(context, 'b s h d -> b s (h d)'))
373
+
374
+ return attn_output, attn_weights
375
+
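Editor's note: a shape-only sketch (hypothetical sizes) of the packed-QKV path above. The three projections are stacked along a new `dim=2` so the `FlashAttention` wrapper receives a single `(batch, seq, 3, heads, head_dim)` tensor, and the returned context is flattened back to `(batch, seq, heads * head_dim)` before `out_proj`.

```python
import torch
from einops import rearrange

b, s, h, d = 2, 196, 12, 64                        # hypothetical batch, seq, heads, head_dim
q = k = v = torch.randn(b, s, h, d)
qkv = torch.stack([q, k, v], dim=2)                # (2, 196, 3, 12, 64), the packed layout
context = torch.randn(b, s, h, d)                  # stand-in for the fused kernel's output
out = rearrange(context, "b s h d -> b s (h d)")   # (2, 196, 768), ready for out_proj
```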
376
+ def forward(
377
+ self,
378
+ hidden_states: torch.Tensor,
379
+ attention_mask: Optional[torch.Tensor] = None,
380
+ output_attentions: Optional[bool] = False,
381
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
382
+ """Input shape: Batch x Time x Channel"""
383
+ if self.use_flash_attn:
384
+ return self._flash_attn(hidden_states)
385
+ else:
386
+ return self._vanilla_attn(hidden_states, attention_mask, output_attentions)
387
+
388
+ def _vanilla_attn(self, hidden_states, attention_mask=None, output_attentions=False):
389
+ batch_size, q_len, _ = hidden_states.size()
390
+
391
+ query_states = self.q_proj(hidden_states)
392
+ key_states = self.k_proj(hidden_states)
393
+ value_states = self.v_proj(hidden_states)
394
+
395
+ query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
396
+ key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
397
+ value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
398
+
399
+ k_v_seq_len = key_states.shape[-2]
400
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
401
+
402
+ if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
403
+ raise ValueError(
404
+ f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
405
+ f" {attn_weights.size()}"
406
+ )
407
+
408
+ if attention_mask is not None:
409
+ if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
410
+ raise ValueError(
411
+ f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
412
+ )
413
+ attn_weights = attn_weights + attention_mask
414
+
415
+ # upcast attention to fp32
416
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
417
+ attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
418
+ attn_output = torch.matmul(attn_weights, value_states)
419
+
420
+ if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
421
+ raise ValueError(
422
+ f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
423
+ f" {attn_output.size()}"
424
+ )
425
+
426
+ attn_output = attn_output.transpose(1, 2).contiguous()
427
+ attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
428
+
429
+ attn_output = self.out_proj(attn_output)
430
+
431
+ return attn_output, attn_weights
432
+
433
+
434
+ # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
435
+ class SiglipMLP(nn.Module):
436
+ def __init__(self, config):
437
+ super().__init__()
438
+ self.config = config
439
+ self.activation_fn = ACT2FN[config.hidden_act]
440
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
441
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
442
+
443
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
444
+ hidden_states = self.fc1(hidden_states)
445
+ hidden_states = self.activation_fn(hidden_states)
446
+ hidden_states = self.fc2(hidden_states)
447
+ return hidden_states
448
+
449
+
450
+ # Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
451
+ class SiglipEncoderLayer(nn.Module):
452
+ def __init__(self, config: SiglipConfig):
453
+ super().__init__()
454
+ self.embed_dim = config.hidden_size
455
+ self.self_attn = SiglipAttention(config)
456
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
457
+ self.mlp = SiglipMLP(config)
458
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
459
+
460
+ # Ignore copy
461
+ def forward(
462
+ self,
463
+ hidden_states: torch.Tensor,
464
+ attention_mask: torch.Tensor,
465
+ output_attentions: Optional[bool] = False,
466
+ ) -> Tuple[torch.FloatTensor]:
467
+ """
468
+ Args:
469
+ hidden_states (`torch.FloatTensor`):
470
+ Input to the layer of shape `(batch, seq_len, embed_dim)`.
471
+ attention_mask (`torch.FloatTensor`):
472
+ Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
473
+ output_attentions (`bool`, *optional*, defaults to `False`):
474
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
475
+ returned tensors for more detail.
476
+ """
477
+ residual = hidden_states
478
+
479
+ hidden_states = self.layer_norm1(hidden_states)
480
+ hidden_states, attn_weights = self.self_attn(
481
+ hidden_states=hidden_states,
482
+ attention_mask=attention_mask,
483
+ output_attentions=output_attentions,
484
+ )
485
+ hidden_states = residual + hidden_states
486
+
487
+ residual = hidden_states
488
+ hidden_states = self.layer_norm2(hidden_states)
489
+ hidden_states = self.mlp(hidden_states)
490
+ hidden_states = residual + hidden_states
491
+
492
+ outputs = (hidden_states,)
493
+
494
+ if output_attentions:
495
+ outputs += (attn_weights,)
496
+
497
+ return outputs
498
+
499
+
500
+ class SiglipPreTrainedModel(PreTrainedModel):
501
+ """
502
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
503
+ models.
504
+ """
505
+
506
+ config_class = SiglipConfig
507
+ base_model_prefix = "siglip"
508
+ supports_gradient_checkpointing = True
509
+
510
+ def _init_weights(self, module):
511
+ """Initialize the weights"""
512
+ if isinstance(module, SiglipVisionEmbeddings):
513
+ width = (
514
+ self.config.vision_config.hidden_size
515
+ if isinstance(self.config, SiglipConfig)
516
+ else self.config.hidden_size
517
+ )
518
+ nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
519
+ elif isinstance(module, nn.Embedding):
520
+ default_flax_embed_init(module.weight)
521
+ elif isinstance(module, SiglipAttention):
522
+ nn.init.xavier_uniform_(module.q_proj.weight)
523
+ nn.init.xavier_uniform_(module.k_proj.weight)
524
+ nn.init.xavier_uniform_(module.v_proj.weight)
525
+ nn.init.xavier_uniform_(module.out_proj.weight)
526
+ nn.init.zeros_(module.q_proj.bias)
527
+ nn.init.zeros_(module.k_proj.bias)
528
+ nn.init.zeros_(module.v_proj.bias)
529
+ nn.init.zeros_(module.out_proj.bias)
530
+ elif isinstance(module, SiglipMLP):
531
+ nn.init.xavier_uniform_(module.fc1.weight)
532
+ nn.init.xavier_uniform_(module.fc2.weight)
533
+ nn.init.normal_(module.fc1.bias, std=1e-6)
534
+ nn.init.normal_(module.fc2.bias, std=1e-6)
535
+ elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
536
+ nn.init.xavier_uniform_(module.probe.data)
537
+ nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
538
+ nn.init.zeros_(module.attention.in_proj_bias.data)
539
+ elif isinstance(module, SiglipModel):
540
+ logit_scale_init = torch.log(torch.tensor(1.0))
541
+ module.logit_scale.data.fill_(logit_scale_init)
542
+ module.logit_bias.data.zero_()
543
+ elif isinstance(module, (nn.Linear, nn.Conv2d)):
544
+ lecun_normal_(module.weight)
545
+ if module.bias is not None:
546
+ nn.init.zeros_(module.bias)
547
+ elif isinstance(module, nn.LayerNorm):
548
+ module.bias.data.zero_()
549
+ module.weight.data.fill_(1.0)
550
+
551
+
552
+ SIGLIP_START_DOCSTRING = r"""
553
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
554
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
555
+ etc.)
556
+
557
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
558
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
559
+ and behavior.
560
+
561
+ Parameters:
562
+ config ([`SiglipConfig`]): Model configuration class with all the parameters of the model.
563
+ Initializing with a config file does not load the weights associated with the model, only the
564
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
565
+ """
566
+
567
+ SIGLIP_TEXT_INPUTS_DOCSTRING = r"""
568
+ Args:
569
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
570
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
571
+ it.
572
+
573
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
574
+ [`PreTrainedTokenizer.__call__`] for details.
575
+
576
+ [What are input IDs?](../glossary#input-ids)
577
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
578
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
579
+
580
+ - 1 for tokens that are **not masked**,
581
+ - 0 for tokens that are **masked**.
582
+
583
+ [What are attention masks?](../glossary#attention-mask)
584
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
585
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
586
+ config.max_position_embeddings - 1]`.
587
+
588
+ [What are position IDs?](../glossary#position-ids)
589
+ output_attentions (`bool`, *optional*):
590
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
591
+ tensors for more detail.
592
+ output_hidden_states (`bool`, *optional*):
593
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
594
+ more detail.
595
+ return_dict (`bool`, *optional*):
596
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
597
+ """
598
+
599
+ SIGLIP_VISION_INPUTS_DOCSTRING = r"""
600
+ Args:
601
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
602
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
603
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
604
+ output_attentions (`bool`, *optional*):
605
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
606
+ tensors for more detail.
607
+ output_hidden_states (`bool`, *optional*):
608
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
609
+ more detail.
610
+ return_dict (`bool`, *optional*):
611
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
612
+ """
613
+
614
+ SIGLIP_INPUTS_DOCSTRING = r"""
615
+ Args:
616
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
617
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
618
+ it.
619
+
620
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
621
+ [`PreTrainedTokenizer.__call__`] for details.
622
+
623
+ [What are input IDs?](../glossary#input-ids)
624
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
625
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
626
+
627
+ - 1 for tokens that are **not masked**,
628
+ - 0 for tokens that are **masked**.
629
+
630
+ [What are attention masks?](../glossary#attention-mask)
631
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
632
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
633
+ config.max_position_embeddings - 1]`.
634
+
635
+ [What are position IDs?](../glossary#position-ids)
636
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
637
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
638
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
639
+ return_loss (`bool`, *optional*):
640
+ Whether or not to return the contrastive loss.
641
+ output_attentions (`bool`, *optional*):
642
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
643
+ tensors for more detail.
644
+ output_hidden_states (`bool`, *optional*):
645
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
646
+ more detail.
647
+ return_dict (`bool`, *optional*):
648
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
649
+ """
650
+
651
+
652
+ # Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
653
+ class SiglipEncoder(nn.Module):
654
+ """
655
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
656
+ [`SiglipEncoderLayer`].
657
+
658
+ Args:
659
+ config: SiglipConfig
660
+ """
661
+
662
+ def __init__(self, config: SiglipConfig):
663
+ super().__init__()
664
+ self.config = config
665
+ self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
666
+ self.gradient_checkpointing = False
667
+
668
+ # Ignore copy
669
+ def forward(
670
+ self,
671
+ inputs_embeds,
672
+ attention_mask: Optional[torch.Tensor] = None,
673
+ output_attentions: Optional[bool] = None,
674
+ output_hidden_states: Optional[bool] = None,
675
+ return_dict: Optional[bool] = None,
676
+ ) -> Union[Tuple, BaseModelOutput]:
677
+ r"""
678
+ Args:
679
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
680
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
681
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
682
+ than the model's internal embedding lookup matrix.
683
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
684
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
685
+
686
+ - 1 for tokens that are **not masked**,
687
+ - 0 for tokens that are **masked**.
688
+
689
+ [What are attention masks?](../glossary#attention-mask)
690
+ output_attentions (`bool`, *optional*):
691
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
692
+ returned tensors for more detail.
693
+ output_hidden_states (`bool`, *optional*):
694
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
695
+ for more detail.
696
+ return_dict (`bool`, *optional*):
697
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
698
+ """
699
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
700
+ output_hidden_states = (
701
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
702
+ )
703
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
704
+
705
+ encoder_states = () if output_hidden_states else None
706
+ all_attentions = () if output_attentions else None
707
+
708
+ hidden_states = inputs_embeds
709
+ for encoder_layer in self.layers:
710
+ if output_hidden_states:
711
+ encoder_states = encoder_states + (hidden_states,)
712
+ if self.gradient_checkpointing and self.training:
713
+ layer_outputs = self._gradient_checkpointing_func(
714
+ encoder_layer.__call__,
715
+ hidden_states,
716
+ attention_mask,
717
+ output_attentions,
718
+ )
719
+ else:
720
+ layer_outputs = encoder_layer(
721
+ hidden_states,
722
+ attention_mask,
723
+ output_attentions=output_attentions,
724
+ )
725
+
726
+ hidden_states = layer_outputs[0]
727
+
728
+ if output_attentions:
729
+ all_attentions = all_attentions + (layer_outputs[1],)
730
+
731
+ if output_hidden_states:
732
+ encoder_states = encoder_states + (hidden_states,)
733
+
734
+ if not return_dict:
735
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
736
+ return BaseModelOutput(
737
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
738
+ )
739
+
740
+
741
+ class SiglipTextTransformer(nn.Module):
742
+ def __init__(self, config: SiglipTextConfig):
743
+ super().__init__()
744
+ self.config = config
745
+ embed_dim = config.hidden_size
746
+ self.embeddings = SiglipTextEmbeddings(config)
747
+ self.encoder = SiglipEncoder(config)
748
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
749
+
750
+ self.head = nn.Linear(embed_dim, embed_dim)
751
+
752
+ @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
753
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)
754
+ def forward(
755
+ self,
756
+ input_ids: Optional[torch.Tensor] = None,
757
+ attention_mask: Optional[torch.Tensor] = None,
758
+ position_ids: Optional[torch.Tensor] = None,
759
+ output_attentions: Optional[bool] = None,
760
+ output_hidden_states: Optional[bool] = None,
761
+ return_dict: Optional[bool] = None,
762
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
763
+ r"""
764
+ Returns:
765
+
766
+ """
767
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
768
+ output_hidden_states = (
769
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
770
+ )
771
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
772
+
773
+ if input_ids is None:
774
+ raise ValueError("You have to specify input_ids")
775
+
776
+ input_shape = input_ids.size()
777
+ input_ids = input_ids.view(-1, input_shape[-1])
778
+
779
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
780
+
781
+ # note: SigLIP's text model does not use a causal mask, unlike the original CLIP model.
782
+ # expand attention_mask
783
+ if attention_mask is not None:
784
+ # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
785
+ attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
786
+
787
+ encoder_outputs = self.encoder(
788
+ inputs_embeds=hidden_states,
789
+ attention_mask=attention_mask,
790
+ output_attentions=output_attentions,
791
+ output_hidden_states=output_hidden_states,
792
+ return_dict=return_dict,
793
+ )
794
+
795
+ last_hidden_state = encoder_outputs[0]
796
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
797
+
798
+ # Assuming "sticky" EOS tokenization, last token is always EOS.
799
+ pooled_output = last_hidden_state[:, -1, :]
800
+ pooled_output = self.head(pooled_output)
801
+
802
+ if not return_dict:
803
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
804
+
805
+ return BaseModelOutputWithPooling(
806
+ last_hidden_state=last_hidden_state,
807
+ pooler_output=pooled_output,
808
+ hidden_states=encoder_outputs.hidden_states,
809
+ attentions=encoder_outputs.attentions,
810
+ )
811
+
812
+
813
+ @add_start_docstrings(
814
+ """The text model from SigLIP without any head or projection on top.""",
815
+ SIGLIP_START_DOCSTRING,
816
+ )
817
+ class SiglipTextModel(SiglipPreTrainedModel):
818
+ config_class = SiglipTextConfig
819
+
820
+ _no_split_modules = ["SiglipTextEmbeddings", "SiglipEncoderLayer"]
821
+
822
+ def __init__(self, config: SiglipTextConfig):
823
+ super().__init__(config)
824
+ self.text_model = SiglipTextTransformer(config)
825
+ # Initialize weights and apply final processing
826
+ self.post_init()
827
+
828
+ def get_input_embeddings(self) -> nn.Module:
829
+ return self.text_model.embeddings.token_embedding
830
+
831
+ def set_input_embeddings(self, value):
832
+ self.text_model.embeddings.token_embedding = value
833
+
834
+ @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
835
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)
836
+ def forward(
837
+ self,
838
+ input_ids: Optional[torch.Tensor] = None,
839
+ attention_mask: Optional[torch.Tensor] = None,
840
+ position_ids: Optional[torch.Tensor] = None,
841
+ output_attentions: Optional[bool] = None,
842
+ output_hidden_states: Optional[bool] = None,
843
+ return_dict: Optional[bool] = None,
844
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
845
+ r"""
846
+ Returns:
847
+
848
+ Examples:
849
+
850
+ ```python
851
+ >>> from transformers import AutoTokenizer, SiglipTextModel
852
+
853
+ >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
854
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
855
+
856
+ >>> # important: make sure to set padding="max_length" as that's how the model was trained
857
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
858
+
859
+ >>> outputs = model(**inputs)
860
+ >>> last_hidden_state = outputs.last_hidden_state
861
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
862
+ ```"""
863
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
864
+
865
+ return self.text_model(
866
+ input_ids=input_ids,
867
+ attention_mask=attention_mask,
868
+ position_ids=position_ids,
869
+ output_attentions=output_attentions,
870
+ output_hidden_states=output_hidden_states,
871
+ return_dict=return_dict,
872
+ )
873
+
874
+
875
+ class SiglipVisionTransformer(nn.Module):
876
+ def __init__(self, config: SiglipVisionConfig):
877
+ super().__init__()
878
+ self.config = config
879
+ embed_dim = config.hidden_size
880
+
881
+ self.embeddings = SiglipVisionEmbeddings(config)
882
+ self.encoder = SiglipEncoder(config)
883
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
884
+ self.head = SiglipMultiheadAttentionPoolingHead(config)
885
+
886
+ @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
887
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
888
+ def forward(
889
+ self,
890
+ pixel_values,
891
+ output_attentions: Optional[bool] = None,
892
+ output_hidden_states: Optional[bool] = None,
893
+ return_dict: Optional[bool] = None,
894
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
895
+ r"""
896
+ Returns:
897
+
898
+ """
899
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
900
+ output_hidden_states = (
901
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
902
+ )
903
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
904
+
905
+ hidden_states = self.embeddings(pixel_values)
906
+
907
+ encoder_outputs = self.encoder(
908
+ inputs_embeds=hidden_states,
909
+ output_attentions=output_attentions,
910
+ output_hidden_states=output_hidden_states,
911
+ return_dict=return_dict,
912
+ )
913
+
914
+ last_hidden_state = encoder_outputs[0]
915
+ last_hidden_state = self.post_layernorm(last_hidden_state)
916
+
917
+ pooled_output = self.head(last_hidden_state)
918
+
919
+ if not return_dict:
920
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
921
+
922
+ return BaseModelOutputWithPooling(
923
+ last_hidden_state=last_hidden_state,
924
+ pooler_output=pooled_output,
925
+ hidden_states=encoder_outputs.hidden_states,
926
+ attentions=encoder_outputs.attentions,
927
+ )
928
+
929
+
930
+ class SiglipMultiheadAttentionPoolingHead(nn.Module):
931
+ """Multihead Attention Pooling."""
932
+
933
+ def __init__(self, config: SiglipVisionConfig):
934
+ super().__init__()
935
+
936
+ self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
937
+ self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
938
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
939
+ self.mlp = SiglipMLP(config)
940
+
941
+ def forward(self, hidden_state):
942
+ batch_size = hidden_state.shape[0]
943
+ probe = self.probe.repeat(batch_size, 1, 1)
944
+
945
+ hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
946
+
947
+ residual = hidden_state
948
+ hidden_state = self.layernorm(hidden_state)
949
+ hidden_state = residual + self.mlp(hidden_state)
950
+
951
+ return hidden_state[:, 0]
952
+
953
+
954
+ @add_start_docstrings(
955
+ """The vision model from SigLIP without any head or projection on top.""",
956
+ SIGLIP_START_DOCSTRING,
957
+ )
958
+ class SiglipVisionModel(SiglipPreTrainedModel):
959
+ config_class = SiglipVisionConfig
960
+ main_input_name = "pixel_values"
961
+ _no_split_modules = [
962
+ "SiglipEncoderLayer",
963
+ "SiglipVisionEmbeddings",
964
+ "SiglipMultiheadAttentionPoolingHead",
965
+ ]
966
+
967
+ def __init__(self, config: SiglipVisionConfig):
968
+ super().__init__(config)
969
+
970
+ self.vision_model = SiglipVisionTransformer(config)
971
+
972
+ # Initialize weights and apply final processing
973
+ self.post_init()
974
+
975
+ def get_input_embeddings(self) -> nn.Module:
976
+ return self.vision_model.embeddings.patch_embedding
977
+
978
+ @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
979
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
980
+ def forward(
981
+ self,
982
+ pixel_values,
983
+ output_attentions: Optional[bool] = None,
984
+ output_hidden_states: Optional[bool] = None,
985
+ return_dict: Optional[bool] = None,
986
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
987
+ r"""
988
+ Returns:
989
+
990
+ Examples:
991
+
992
+ ```python
993
+ >>> from PIL import Image
994
+ >>> import requests
995
+ >>> from transformers import AutoProcessor, SiglipVisionModel
996
+
997
+ >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
998
+ >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
999
+
1000
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1001
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1002
+
1003
+ >>> inputs = processor(images=image, return_tensors="pt")
1004
+
1005
+ >>> outputs = model(**inputs)
1006
+ >>> last_hidden_state = outputs.last_hidden_state
1007
+ >>> pooled_output = outputs.pooler_output # pooled features
1008
+ ```"""
1009
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1010
+
1011
+ return self.vision_model(
1012
+ pixel_values=pixel_values,
1013
+ output_attentions=output_attentions,
1014
+ output_hidden_states=output_hidden_states,
1015
+ return_dict=return_dict,
1016
+ )
1017
+
1018
+
1019
+ @add_start_docstrings(SIGLIP_START_DOCSTRING)
1020
+ class SiglipModel(SiglipPreTrainedModel):
1021
+ config_class = SiglipConfig
1022
+
1023
+ def __init__(self, config: SiglipConfig):
1024
+ super().__init__(config)
1025
+
1026
+ if not isinstance(config.text_config, SiglipTextConfig):
1027
+ raise ValueError(
1028
+ "config.text_config is expected to be of type SiglipTextConfig but is of type"
1029
+ f" {type(config.text_config)}."
1030
+ )
1031
+
1032
+ if not isinstance(config.vision_config, SiglipVisionConfig):
1033
+ raise ValueError(
1034
+ "config.vision_config is expected to be of type SiglipVisionConfig but is of type"
1035
+ f" {type(config.vision_config)}."
1036
+ )
1037
+
1038
+ text_config = config.text_config
1039
+ vision_config = config.vision_config
1040
+
1041
+ self.text_model = SiglipTextTransformer(text_config)
1042
+ self.vision_model = SiglipVisionTransformer(vision_config)
1043
+
1044
+ self.logit_scale = nn.Parameter(torch.randn(1))
1045
+ self.logit_bias = nn.Parameter(torch.randn(1))
1046
+
1047
+ # Initialize weights and apply final processing
1048
+ self.post_init()
1049
+
1050
+ @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
1051
+ def get_text_features(
1052
+ self,
1053
+ input_ids: Optional[torch.Tensor] = None,
1054
+ attention_mask: Optional[torch.Tensor] = None,
1055
+ position_ids: Optional[torch.Tensor] = None,
1056
+ output_attentions: Optional[bool] = None,
1057
+ output_hidden_states: Optional[bool] = None,
1058
+ return_dict: Optional[bool] = None,
1059
+ ) -> torch.FloatTensor:
1060
+ r"""
1061
+ Returns:
1062
+ text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
1063
+ applying the projection layer to the pooled output of [`SiglipTextModel`].
1064
+
1065
+ Examples:
1066
+
1067
+ ```python
1068
+ >>> from transformers import AutoTokenizer, AutoModel
1069
+ >>> import torch
1070
+
1071
+ >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
1072
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
1073
+
1074
+ >>> # important: make sure to set padding="max_length" as that's how the model was trained
1075
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
1076
+ >>> with torch.no_grad():
1077
+ ... text_features = model.get_text_features(**inputs)
1078
+ ```"""
1079
+ # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
1080
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1081
+ output_hidden_states = (
1082
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1083
+ )
1084
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1085
+
1086
+ text_outputs = self.text_model(
1087
+ input_ids=input_ids,
1088
+ attention_mask=attention_mask,
1089
+ position_ids=position_ids,
1090
+ output_attentions=output_attentions,
1091
+ output_hidden_states=output_hidden_states,
1092
+ return_dict=return_dict,
1093
+ )
1094
+
1095
+ pooled_output = text_outputs[1]
1096
+
1097
+ return pooled_output
1098
+
1099
+ @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
1100
+ def get_image_features(
1101
+ self,
1102
+ pixel_values: Optional[torch.FloatTensor] = None,
1103
+ output_attentions: Optional[bool] = None,
1104
+ output_hidden_states: Optional[bool] = None,
1105
+ return_dict: Optional[bool] = None,
1106
+ ) -> torch.FloatTensor:
1107
+ r"""
1108
+ Returns:
1109
+ image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
1110
+ applying the projection layer to the pooled output of [`SiglipVisionModel`].
1111
+
1112
+ Examples:
1113
+
1114
+ ```python
1115
+ >>> from PIL import Image
1116
+ >>> import requests
1117
+ >>> from transformers import AutoProcessor, AutoModel
1118
+ >>> import torch
1119
+
1120
+ >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
1121
+ >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1122
+
1123
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1124
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1125
+
1126
+ >>> inputs = processor(images=image, return_tensors="pt")
1127
+
1128
+ >>> with torch.no_grad():
1129
+ ... image_features = model.get_image_features(**inputs)
1130
+ ```"""
1131
+ # Use SiglipModel's config for some fields (if specified) instead of those of vision & text components.
1132
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1133
+ output_hidden_states = (
1134
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1135
+ )
1136
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1137
+
1138
+ vision_outputs = self.vision_model(
1139
+ pixel_values=pixel_values,
1140
+ output_attentions=output_attentions,
1141
+ output_hidden_states=output_hidden_states,
1142
+ return_dict=return_dict,
1143
+ )
1144
+
1145
+ pooled_output = vision_outputs[1]
1146
+
1147
+ return pooled_output
1148
+
1149
+ @add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING)
1150
+ @replace_return_docstrings(output_type=SiglipOutput, config_class=SiglipConfig)
1151
+ def forward(
1152
+ self,
1153
+ input_ids: Optional[torch.LongTensor] = None,
1154
+ pixel_values: Optional[torch.FloatTensor] = None,
1155
+ attention_mask: Optional[torch.Tensor] = None,
1156
+ position_ids: Optional[torch.LongTensor] = None,
1157
+ return_loss: Optional[bool] = None,
1158
+ output_attentions: Optional[bool] = None,
1159
+ output_hidden_states: Optional[bool] = None,
1160
+ return_dict: Optional[bool] = None,
1161
+ ) -> Union[Tuple, SiglipOutput]:
1162
+ r"""
1163
+ Returns:
1164
+
1165
+ Examples:
1166
+
1167
+ ```python
1168
+ >>> from PIL import Image
1169
+ >>> import requests
1170
+ >>> from transformers import AutoProcessor, AutoModel
1171
+ >>> import torch
1172
+
1173
+ >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
1174
+ >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1175
+
1176
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1177
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1178
+
1179
+ >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
1180
+ >>> # important: we pass `padding=max_length` since the model was trained with this
1181
+ >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
1182
+
1183
+ >>> with torch.no_grad():
1184
+ ... outputs = model(**inputs)
1185
+
1186
+ >>> logits_per_image = outputs.logits_per_image
1187
+ >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
1188
+ >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
1189
+ 31.9% that image 0 is 'a photo of 2 cats'
1190
+ ```"""
1191
+ # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
1192
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1193
+ output_hidden_states = (
1194
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1195
+ )
1196
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1197
+
1198
+ vision_outputs = self.vision_model(
1199
+ pixel_values=pixel_values,
1200
+ output_attentions=output_attentions,
1201
+ output_hidden_states=output_hidden_states,
1202
+ return_dict=return_dict,
1203
+ )
1204
+
1205
+ text_outputs = self.text_model(
1206
+ input_ids=input_ids,
1207
+ attention_mask=attention_mask,
1208
+ position_ids=position_ids,
1209
+ output_attentions=output_attentions,
1210
+ output_hidden_states=output_hidden_states,
1211
+ return_dict=return_dict,
1212
+ )
1213
+
1214
+ image_embeds = vision_outputs[1]
1215
+ text_embeds = text_outputs[1]
1216
+
1217
+ # normalized features
1218
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
1219
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
1220
+
1221
+ # cosine similarity as logits
1222
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * self.logit_scale.exp() + self.logit_bias
1223
+ logits_per_image = logits_per_text.t()
1224
+
1225
+ loss = None
1226
+ if return_loss:
1227
+ raise NotImplementedError("SigLIP loss to be implemented")
1228
+
1229
+ if not return_dict:
1230
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
1231
+ return ((loss,) + output) if loss is not None else output
1232
+
1233
+ return SiglipOutput(
1234
+ loss=loss,
1235
+ logits_per_image=logits_per_image,
1236
+ logits_per_text=logits_per_text,
1237
+ text_embeds=text_embeds,
1238
+ image_embeds=image_embeds,
1239
+ text_model_output=text_outputs,
1240
+ vision_model_output=vision_outputs,
1241
+ )
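Note that `SiglipModel.forward` above only computes the pairwise logits (`text_embeds @ image_embeds.t() * logit_scale.exp() + logit_bias`) and raises `NotImplementedError` when `return_loss=True`. As a point of reference, the sketch below shows one way the pairwise sigmoid objective from the SigLIP paper could be computed from `logits_per_text`; it is not part of this checkpoint's code, and the helper name is ours.

```python
import torch
import torch.nn.functional as F

def siglip_pairwise_sigmoid_loss(logits_per_text: torch.Tensor) -> torch.Tensor:
    """Sigmoid loss over an (n_texts, n_images) logit matrix.

    Matching text/image pairs (the diagonal) get label +1, every other pair -1.
    """
    n_texts, n_images = logits_per_text.shape
    labels = 2.0 * torch.eye(n_texts, n_images, device=logits_per_text.device) - 1.0
    # -log sigmoid(label * logit), summed over all pairs and averaged per text
    return -F.logsigmoid(labels * logits_per_text).sum() / n_texts
```

With `return_dict=True`, the `outputs.logits_per_text` produced by the forward pass above could be fed straight into such a helper.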
monitor.txt ADDED
The diff for this file is too large to render. See raw diff
 
multi_backbone_channel_concatenation_encoder.py ADDED
@@ -0,0 +1,266 @@
1
+ # --------------------------------------------------------
2
+ # Eagle2
3
+ # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The Apache License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import torch, os
8
+ import torch.nn as nn
9
+ from torch.utils.checkpoint import checkpoint
10
+
11
+ from .siglip_vision_tower import SiglipVisionTower
12
+
13
+ import torch.nn.functional as F
14
+ from torch.nn.init import trunc_normal_
15
+ from copy import deepcopy
16
+ import random
17
+ import math
18
+
19
+ class MultiBackboneChannelConcatenationVisionTower(nn.Module):
20
+ def __init__(self,
21
+ vision_tower,
22
+ args,
23
+ grid_size=32,
24
+ convnext_img_size=1024,
25
+ normalize_type=None, raw_config=None):
26
+
27
+ super().__init__()
28
+
29
+ self.is_loaded = False
30
+ self.grid_size = grid_size
31
+ self.num_tokens = self.grid_size ** 2
32
+ self.normalize_type = args.normalize_type
33
+ self.moe_version_type = args.moe_version_type
34
+ self.raw_config = raw_config
35
+ print("moe_version_type: ", self.moe_version_type)
36
+ assert self.moe_version_type in [None, 'all_tiling', 'seq_concat', 'feat_concat', 'convnext_512_siglip_448'], f"Unknown self.moe_version_type: {self.moe_version_type}"
37
+
38
+ vision_tower_name_list = vision_tower.split(";")
39
+ self.input_image_size = 1024
40
+ self.convnext_img_size = convnext_img_size
41
+ self.load_vision_towers(vision_tower_name_list, args)
42
+
43
+
44
+ def load_vision_towers(self, vision_tower_name_list, args):
45
+ self.vision_towers = nn.ModuleList()
46
+
47
+ freeze_backbone_list = args.freeze_backbones # note this is a str
48
+ if freeze_backbone_list is not None and len(freeze_backbone_list) > 0:
49
+ print("The frozen backbones: ", freeze_backbone_list)
50
+ else:
51
+ # make it a blank str
52
+ freeze_backbone_list = ""
53
+
54
+ for name in vision_tower_name_list:
55
+
56
+ ## ConvNeXt
57
+ if name == 'convnext-1024':
58
+ convnext_args = deepcopy(args)
59
+
60
+ convnext_args.freeze_vision = False
61
+ if 'convnext-1024' in freeze_backbone_list:
62
+ convnext_args.freeze_vision = True
63
+
64
+ from .convnext_encoder import ConvNextVisionTower
65
+ convnext_args.input_image_size = self.convnext_img_size
66
+ convnext_vision_tower = args.vision_tower_convnext_path
67
+ convnext_vision_tower = ConvNextVisionTower(convnext_vision_tower,
68
+ convnext_args, delay_load=args.delay_load, normalize_type=self.normalize_type)
69
+ convnext_vision_tower.load_model()
70
+ self.vision_towers.append(convnext_vision_tower)
71
+
72
+ ## PaliSigLIP
73
+ elif name == 'palisiglip':
74
+ palisiglip_args = deepcopy(args)
75
+ palisiglip_args.input_image_size = 448
76
+
77
+ palisiglip_args.freeze_vision = False
78
+ if 'palisiglip' in freeze_backbone_list:
79
+ palisiglip_args.freeze_vision = True
80
+
81
+ palisiglip_vision_tower = SiglipVisionTower(args.vision_tower_siglip_path, palisiglip_args, delay_load=args.delay_load, raw_config=self.raw_config)
82
+
83
+ palisiglip_vision_tower.load_model()
84
+ self.vision_towers.append(palisiglip_vision_tower)
85
+
86
+ # Set the image processor
87
+ self.image_processor = None
88
+ self.is_loaded = True
89
+
90
+ def load_model(self):
91
+ assert self.is_loaded, "All the vision encoders should be loaded during initialization!"
92
+
93
+ def forward(self, x):
94
+ # x is a Tensor if moe_version_type is None or 'all_tiling'
95
+ # otherwise x is a dict with 'pixel_values' and 'num_patches'
96
+ if self.moe_version_type in [None, 'all_tiling']:
97
+ # The default pipeline
98
+ features = []
99
+ image_input_size = x.shape[2]
100
+ assert x.shape[2] == x.shape[3], f"Image should be a square but size ({x.shape[2]} x {x.shape[3]})"
101
+ for vision_tower in self.vision_towers:
102
+
103
+ if vision_tower.input_image_size != image_input_size:
104
+ resized_x = F.interpolate(x.float(),
105
+ size=(vision_tower.input_image_size, vision_tower.input_image_size),
106
+ mode='bilinear',
107
+ align_corners=True).to(dtype=x.dtype)
108
+ else:
109
+ resized_x = x
110
+
111
+ feature = vision_tower(resized_x)
112
+
113
+ if len(feature.shape) == 3: # b, n, c
114
+ b, n, c = feature.shape
115
+ if n == self.num_tokens:
116
+ features.append(feature)
117
+ continue
118
+ w = h = int(n**0.5)
119
+ feature = feature.transpose(1,2).reshape(b, c, h, w)
120
+ else:
121
+ b, c, h, w = feature.shape
122
+
123
+ if w != self.grid_size:
124
+ feature = F.interpolate(feature.float(), size=(self.grid_size, self.grid_size), mode='bilinear', align_corners=True).to(dtype=x.dtype)
125
+ features.append(feature.flatten(2,3).transpose(1,2))
126
+
127
+ features = torch.cat(features, dim=-1)
128
+ elif self.moe_version_type == 'convnext_512_siglip_448':
129
+ features = {}
130
+ image_input_size = x.shape[2]
131
+ assert x.shape[2] == x.shape[3], f"Image should be a square but size ({x.shape[2]} x {x.shape[3]})"
132
+ for vision_tower in self.vision_towers:
133
+
134
+ if vision_tower.input_image_size != image_input_size:
135
+ resized_x = F.interpolate(x.float(),
136
+ size=(vision_tower.input_image_size, vision_tower.input_image_size),
137
+ mode='bilinear',
138
+ align_corners=True).to(dtype=x.dtype)
139
+ else:
140
+ resized_x = x
141
+
142
+ feature = vision_tower(resized_x)
143
+
144
+ # if len(feature.shape) == 3: # b, n, c
145
+ # b, n, c = feature.shape
146
+ # if n == self.num_tokens:
147
+ # features.append(feature)
148
+ # continue
149
+ # w = h = int(n**0.5)
150
+ # feature = feature.transpose(1,2).reshape(b, c, h, w)
151
+ # else:
152
+ # b, c, h, w = feature.shape
153
+ features[vision_tower.name] = feature
154
+
155
+ else:
156
+ assert isinstance(x, dict), "x is expected to be a dict but got {}".format(type(x))
157
+ pixel_values = x['pixel_values']
158
+ num_patches = x['num_patches'] # number of image patch (padding) tokens per sample in the text
159
+
160
+ # calculate the real number of image patches
161
+ if self.moe_version_type == 'seq_concat':
162
+ image_in_num_patches = [i-1 for i in num_patches]
163
+ else:
164
+ image_in_num_patches = [i for i in num_patches]
165
+
166
+
167
+ assert sum(image_in_num_patches) == pixel_values.size(0), "sum(image_in_num_patches) ({}) != pixel_values.size(0) ({})".format(sum(image_in_num_patches), pixel_values.size(0))
168
+
169
+ # find the thumbnail image id
170
+ thumbnail_image_id = torch.cumsum(torch.tensor(image_in_num_patches).to(pixel_values.device), 0) - 1
171
+ image_no_tiling = pixel_values[thumbnail_image_id]
172
+
173
+ # By default, the 1st vision_tower processes the tiled patches (x), the others the untiled thumbnails (image_no_tiling)
174
+ features = []
175
+ for layer_id, vision_tower in enumerate(self.vision_towers):
176
+ if layer_id == 0:
177
+ x = pixel_values
178
+ else:
179
+ x = image_no_tiling
180
+
181
+ if vision_tower.input_image_size != self.input_image_size:
182
+ resized_x = F.interpolate(x.float(),
183
+ size=(vision_tower.input_image_size, vision_tower.input_image_size),
184
+ mode='bilinear',
185
+ align_corners=True).to(dtype=x.dtype)
186
+ else:
187
+ resized_x = x
188
+
189
+ feature = vision_tower(resized_x)
190
+ if len(feature.shape) == 3: # b, n, c
191
+ b, n, c = feature.shape
192
+ if n == self.num_tokens:
193
+ features.append(feature)
194
+ continue
195
+
196
+ w = h = int(n**0.5)
197
+ feature = feature.transpose(1,2).reshape(b, c, h, w)
198
+ else:
199
+ b, c, h, w = feature.shape
200
+
201
+ if w != self.grid_size:
202
+ feature = F.interpolate(feature.float(), size=(self.grid_size, self.grid_size), mode='bilinear', align_corners=True).to(dtype=x.dtype)
203
+ features.append(feature.flatten(2,3).transpose(1,2))
204
+
205
+ clip_embeds = features[0]
206
+ if len(features) <= 1:
207
+ no_tiling_embeds = None
208
+ else:
209
+ no_tiling_embeds = torch.cat(features[1:], dim=-1)
210
+
211
+ if self.moe_version_type == 'feat_concat':
212
+ # concat thumbnail image features together
213
+ clip_thumbnail_embeds = clip_embeds[thumbnail_image_id]
214
+ if no_tiling_embeds is not None:
215
+ no_tiling_embeds = torch.cat([clip_thumbnail_embeds, no_tiling_embeds], dim=-1)
216
+ else:
217
+ no_tiling_embeds = clip_thumbnail_embeds
218
+
219
+ # extra patch features
220
+ clip_embeds_mask = ~torch.isin(torch.arange(clip_embeds.shape[0]).to(clip_embeds.device), thumbnail_image_id)
221
+ clip_embeds = clip_embeds[clip_embeds_mask]
222
+
223
+
224
+ features = {
225
+ 'clip_embeds': clip_embeds,
226
+ 'no_tiling_embeds': no_tiling_embeds,
227
+ 'num_patches': num_patches
228
+ }
229
+
230
+ # features is a Tensor in the default pipeline, otherwise a dict
231
+
232
+ return features
233
+
234
+ @property
235
+ def dummy_feature(self):
236
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
237
+
238
+ @property
239
+ def dtype(self):
240
+ return next(self.clip_vision_tower.parameters()).dtype
241
+
242
+ @property
243
+ def device(self):
244
+ return next(self.clip_vision_tower.parameters()).device
245
+
246
+ @property
247
+ def config(self):
248
+ raise NotImplementedError
249
+
250
+
251
+ @property
252
+ def hidden_size(self):
253
+ if self.moe_version_type == 'convnext_512_siglip_448':
254
+ res = {}
255
+ for vision_tower in self.vision_towers:
256
+ res[vision_tower.name] = vision_tower.hidden_size
257
+ return res
258
+ else:
259
+ return sum([_.hidden_size for _ in self.vision_towers])
260
+
261
+ @property
262
+ def num_patches(self):
263
+ return self.num_tokens
264
+
265
+
266
+
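To make the default (`moe_version_type in [None, 'all_tiling']`) branch of `MultiBackboneChannelConcatenationVisionTower.forward` easier to follow: every tower's feature map is resampled to the shared `grid_size x grid_size` grid, flattened into `grid_size**2` tokens, and concatenated along the channel axis. The standalone sketch below reproduces just that fusion step with dummy feature maps; the channel widths are illustrative and not taken from this repository.

```python
import torch
import torch.nn.functional as F

def concat_tower_features(feature_maps, grid_size=32):
    """Resample each (b, c_i, h_i, w_i) feature map to a common grid and
    concatenate along the channel axis, returning (b, grid_size**2, sum(c_i))."""
    tokens = []
    for feat in feature_maps:
        if feat.shape[-1] != grid_size:
            # the repo version additionally casts back to the input dtype
            feat = F.interpolate(feat.float(), size=(grid_size, grid_size),
                                 mode="bilinear", align_corners=True)
        tokens.append(feat.flatten(2, 3).transpose(1, 2))  # (b, grid_size**2, c_i)
    return torch.cat(tokens, dim=-1)

# toy example with two hypothetical backbones producing different channel widths
convnext_feat = torch.randn(2, 1536, 32, 32)
siglip_feat = torch.randn(2, 1152, 28, 28)
fused = concat_tower_features([convnext_feat, siglip_feat])
print(fused.shape)  # torch.Size([2, 1024, 2688])
```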
multi_backbone_channel_concatentation_model.py ADDED
@@ -0,0 +1,96 @@
1
+ # --------------------------------------------------------
2
+ # Eagle2
3
+ # Copyright (c) 2025 NVIDIA
4
+ # Licensed under The Apache License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+
8
+ import torch.nn as nn
9
+
10
+ from transformers.modeling_outputs import BaseModelOutputWithPooling
11
+ from typing import Optional, Tuple, Union
12
+
13
+ from .multi_backbone_channel_concatenation_encoder import MultiBackboneChannelConcatenationVisionTower
14
+ from .configuration_multi_backbone_channel_concatentation_model import MultiBackboneChannelConcatenationVisionModelConfig
15
+
16
+
17
+ class MultiBackboneChannelConcatenationVisionModel(nn.Module):
18
+
19
+ """
20
+ A vision model wrapper that concatenates channels from multiple backbones.
21
+
22
+ Args:
23
+ config (MultiBackboneChannelConcatenationVisionModelConfig): The configuration for the model.
24
+
25
+ Attributes:
26
+ vision_model (MultiBackboneChannelConcatenationVisionTower): The vision tower that performs the channel concatenation.
27
+
28
+ Notes:
29
+ **The class is not inherited from the PreTrainedModel in transformers**
30
+
31
+ """
32
+
33
+ config_class = MultiBackboneChannelConcatenationVisionModelConfig
34
+ main_input_name = "pixel_values"
35
+
36
+ def __init__(self, config: MultiBackboneChannelConcatenationVisionModelConfig, raw_config):
37
+ super().__init__()
38
+
39
+ self.vision_model = MultiBackboneChannelConcatenationVisionTower(
40
+ vision_tower=config.vision_tower,
41
+ args=config,
42
+ grid_size=config.grid_size,
43
+ convnext_img_size=config.convnext_img_size,
44
+ normalize_type=config.normalize_type,
45
+ raw_config=raw_config
46
+ )
47
+
48
+
49
+ def get_input_embeddings(self):
50
+ # You might need to adjust this depending on how you want to handle input embeddings
51
+ return self.vision_model.vision_towers[0].get_input_embeddings()
52
+
53
+ def forward(
54
+ self,
55
+ pixel_values,
56
+ return_dict: Optional[bool] = True,
57
+ output_hidden_states: Optional[bool] = False,
58
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
59
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
60
+
61
+ assert return_dict is True, "We only support return_dict"
62
+ assert output_hidden_states is False, "We do not support output_hidden_states"
63
+
64
+ features = self.vision_model(pixel_values)
65
+
66
+ # We only supports features as model outputs
67
+ return BaseModelOutputWithPooling(
68
+ last_hidden_state=features,
69
+ pooler_output=None,
70
+ hidden_states=None,
71
+ attentions=None,
72
+ )
73
+
74
+ @property
75
+ def dummy_feature(self):
76
+ return self.vision_model.dummy_feature
77
+
78
+ @property
79
+ def dtype(self):
80
+ return self.vision_model.dtype
81
+
82
+ @property
83
+ def device(self):
84
+ return self.vision_model.device
85
+
86
+ @property
87
+ def config(self):
88
+ return self.vision_model.config
89
+
90
+ @property
91
+ def hidden_size(self):
92
+ return self.vision_model.hidden_size
93
+
94
+ @property
95
+ def num_patches(self):
96
+ return self.vision_model.num_patches
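`MultiBackboneChannelConcatenationVisionModel` is a plain `nn.Module` wrapper: it runs the concatenation tower and exposes the fused tokens as `BaseModelOutputWithPooling.last_hidden_state`, leaving the pooled/hidden/attention slots empty. The hedged, standalone snippet below illustrates that wrapping pattern with a dummy feature tensor (the shape is illustrative).

```python
import torch
from transformers.modeling_outputs import BaseModelOutputWithPooling

# dummy stand-in for the tower's concatenated (batch, num_tokens, channels) output
features = torch.randn(2, 1024, 2688)
outputs = BaseModelOutputWithPooling(
    last_hidden_state=features,  # consumed downstream as the vision embeddings
    pooler_output=None,
    hidden_states=None,
    attentions=None,
)
print(outputs.last_hidden_state.shape)  # torch.Size([2, 1024, 2688])
```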
siglip_vision_tower.py ADDED
@@ -0,0 +1,93 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.checkpoint import checkpoint
4
+
5
+ from .modeling_siglip import SiglipVisionModel
6
+ from .configuration_siglip import SiglipVisionConfig
7
+
8
+ import math
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from typing import List, Optional
12
+ import os
13
+
14
+ class SiglipVisionTower(nn.Module):
15
+ # We use the same wrapper as the default clip encoder.
16
+ # See `clip_encoder.py` in the same folder
17
+ def __init__(self, vision_tower, args, delay_load=False, raw_config=None):
18
+ super().__init__()
19
+
20
+ self.is_loaded = False
21
+ self.freeze_vision=args.freeze_vision
22
+ self.input_image_size=args.input_image_size
23
+ self.vision_tower_name = vision_tower
24
+ self.select_layer = args.mm_vision_select_layer
25
+ self.name = 'siglip'
26
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
27
+ self.delay_load = delay_load
28
+ self.raw_config = raw_config
29
+ if not delay_load:
30
+ self.load_model()
31
+ else:
32
+ if os.path.isfile(self.vision_tower_name):
33
+ self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name, local_files_only=True)
34
+ else:
35
+ self.cfg_only = SiglipVisionConfig(**self.raw_config.vision_config.siglip_vision_config)
36
+
37
+
38
+ def load_model(self):
39
+ if self.is_loaded:
40
+ print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
41
+ return
42
+
43
+ # self.image_processor = SiglipImageProcessor(size=1024)
44
+ # self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, local_files_only=True, torch_dtype=torch.bfloat16)
45
+ if self.delay_load:
46
+ # cfg = SiglipVisionConfig.from_pretrained(self.vision_tower_name, local_files_only=True)
47
+ self.vision_tower = SiglipVisionModel(self.cfg_only)
48
+ else:
49
+ self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, local_files_only=True)
50
+
51
+ if self.freeze_vision:
52
+ self.vision_tower.requires_grad_(False)
53
+
54
+ self.vision_tower.vision_model.encoder.gradient_checkpointing = True
55
+ self.is_loaded = True
56
+
57
+ def forward(self, images):
58
+ return self.vision_tower(
59
+ pixel_values=images,
60
+ output_hidden_states=False,
61
+ return_dict=True).last_hidden_state
62
+
63
+
64
+ @property
65
+ def dummy_feature(self):
66
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
67
+
68
+ @property
69
+ def dtype(self):
70
+ return self.vision_tower.dtype
71
+
72
+ @property
73
+ def device(self):
74
+ return self.vision_tower.device
75
+
76
+ @property
77
+ def config(self):
78
+ if self.is_loaded:
79
+ return self.vision_tower.config
80
+ else:
81
+ return self.cfg_only
82
+
83
+ @property
84
+ def hidden_size(self):
85
+ return self.config.hidden_size
86
+
87
+ @property
88
+ def num_patches_per_side(self):
89
+ return self.config.image_size // self.config.patch_size
90
+
91
+ @property
92
+ def num_patches(self):
93
+ return (self.config.image_size // self.config.patch_size) ** 2
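`SiglipVisionTower` reads a handful of attributes off `args` (`freeze_vision`, `input_image_size`, `mm_vision_select_layer`, `mm_vision_select_feature`) and, unless `delay_load=True`, loads the SigLIP weights immediately in `__init__`. A hedged construction sketch follows; the namespace fields mirror what the class reads, the checkpoint path is a placeholder for a locally downloaded SigLIP checkpoint, and the import assumes this repository's files are on the Python path.

```python
import torch
from types import SimpleNamespace
from siglip_vision_tower import SiglipVisionTower  # this repo's module

args = SimpleNamespace(
    freeze_vision=True,               # sets requires_grad_(False) on the loaded tower
    input_image_size=448,
    mm_vision_select_layer=-1,
    mm_vision_select_feature="patch",
)

# eager path: SiglipVisionModel.from_pretrained(...) runs inside __init__
tower = SiglipVisionTower("/path/to/local/siglip-checkpoint", args, delay_load=False)

pixel_values = torch.randn(1, 3, 448, 448)
patch_tokens = tower(pixel_values)    # (1, num_patches, hidden_size) last_hidden_state
```

With `delay_load=True`, only the config is resolved at construction time (from a local config file or the supplied `raw_config`) and the weights are built later in `load_model()`.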
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin_of_text|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|eot_id|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|finetune_right_pad_id|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
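The special-token map above pins the Llama-3 style control tokens used by this checkpoint. A minimal hedged check after downloading the repository locally (the path is a placeholder):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/path/to/this/checkpoint")
print(tokenizer.bos_token)  # <|begin_of_text|>
print(tokenizer.eos_token)  # <|eot_id|>
print(tokenizer.pad_token)  # <|finetune_right_pad_id|>
```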
tokenization_qwen2.py ADDED
@@ -0,0 +1,345 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for Qwen2."""
16
+
17
+ import json
18
+ import os
19
+ import unicodedata
20
+ from functools import lru_cache
21
+ from typing import Optional, Tuple
22
+
23
+ import regex as re
24
+
25
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
26
+ from transformers.utils import logging
27
+
28
+
29
+ logger = logging.get_logger(__name__)
30
+
31
+ VOCAB_FILES_NAMES = {
32
+ "vocab_file": "vocab.json",
33
+ "merges_file": "merges.txt",
34
+ }
35
+
36
+ PRETRAINED_VOCAB_FILES_MAP = {
37
+ "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"},
38
+ "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"},
39
+ }
40
+
41
+ MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
42
+
43
+ PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
44
+
45
+
46
+ @lru_cache()
47
+ # Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
48
+ def bytes_to_unicode():
49
+ """
50
+ Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
51
+ characters the bpe code barfs on.
52
+
53
+ The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
54
+ if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
55
+ decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
56
+ tables between utf-8 bytes and unicode strings.
57
+ """
58
+ bs = (
59
+ list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
60
+ )
61
+ cs = bs[:]
62
+ n = 0
63
+ for b in range(2**8):
64
+ if b not in bs:
65
+ bs.append(b)
66
+ cs.append(2**8 + n)
67
+ n += 1
68
+ cs = [chr(n) for n in cs]
69
+ return dict(zip(bs, cs))
70
+
71
+
72
+ # Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
73
+ def get_pairs(word):
74
+ """
75
+ Return set of symbol pairs in a word.
76
+
77
+ Word is represented as tuple of symbols (symbols being variable-length strings).
78
+ """
79
+ pairs = set()
80
+ prev_char = word[0]
81
+ for char in word[1:]:
82
+ pairs.add((prev_char, char))
83
+ prev_char = char
84
+ return pairs
85
+
86
+
87
+ class Qwen2Tokenizer(PreTrainedTokenizer):
88
+ """
89
+ Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.
90
+
91
+ Same with GPT2Tokenzier, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
92
+ be encoded differently whether it is at the beginning of the sentence (without space) or not:
93
+
94
+ ```python
95
+ >>> from transformers import Qwen2Tokenizer
96
+
97
+ >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
98
+ >>> tokenizer("Hello world")["input_ids"]
99
+ [9707, 1879]
100
+
101
+ >>> tokenizer(" Hello world")["input_ids"]
102
+ [21927, 1879]
103
+ ```
104
+ This is expected.
105
+
106
+ You should not use GPT2Tokenizer instead, because of the different pretokenization rules.
107
+
108
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
109
+ this superclass for more information regarding those methods.
110
+
111
+ Args:
112
+ vocab_file (`str`):
113
+ Path to the vocabulary file.
114
+ merges_file (`str`):
115
+ Path to the merges file.
116
+ errors (`str`, *optional*, defaults to `"replace"`):
117
+ Paradigm to follow when decoding bytes to UTF-8. See
118
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
119
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
120
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
121
+ token instead.
122
+ bos_token (`str`, *optional*):
123
+ The beginning of sequence token. Not applicable for this tokenizer.
124
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
125
+ The end of sequence token.
126
+ pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
127
+ The token used for padding, for example when batching sequences of different lengths.
128
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
129
+ Whether or not the model should cleanup the spaces that were added when splitting the input text during the
130
+ tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
131
+ split_special_tokens (`bool`, *optional*, defaults to `False`):
132
+ Whether or not the special tokens should be split during the tokenization process. The default behavior is
133
+ to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") =
134
+ ['<|endoftext|>`]. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will be give `['<',
135
+ '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
136
+ """
137
+
138
+ vocab_files_names = VOCAB_FILES_NAMES
139
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
140
+ max_model_input_sizes = MAX_MODEL_INPUT_SIZES
141
+ model_input_names = ["input_ids", "attention_mask"]
142
+
143
+ def __init__(
144
+ self,
145
+ vocab_file,
146
+ merges_file,
147
+ errors="replace",
148
+ unk_token="<|endoftext|>",
149
+ bos_token=None,
150
+ eos_token="<|endoftext|>",
151
+ pad_token="<|endoftext|>",
152
+ clean_up_tokenization_spaces=False,
153
+ split_special_tokens=False,
154
+ **kwargs,
155
+ ):
156
+ # Qwen vocab does not contain control tokens; added tokens need to be special
157
+ bos_token = (
158
+ AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
159
+ if isinstance(bos_token, str)
160
+ else bos_token
161
+ )
162
+ eos_token = (
163
+ AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
164
+ if isinstance(eos_token, str)
165
+ else eos_token
166
+ )
167
+ unk_token = (
168
+ AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
169
+ if isinstance(unk_token, str)
170
+ else unk_token
171
+ )
172
+ pad_token = (
173
+ AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
174
+ if isinstance(pad_token, str)
175
+ else pad_token
176
+ )
177
+
178
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
179
+ self.encoder = json.load(vocab_handle)
180
+ self.decoder = {v: k for k, v in self.encoder.items()}
181
+ self.errors = errors # how to handle errors in decoding
182
+ self.byte_encoder = bytes_to_unicode()
183
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
184
+ bpe_merges = []
185
+ with open(merges_file, encoding="utf-8") as merges_handle:
186
+ for line in merges_handle:
187
+ line = line.strip()
188
+ if not line or line.startswith("#"):
189
+ continue
190
+ bpe_merges.append(tuple(line.split()))
191
+ self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
192
+ # NOTE: the cache can grow without bound and will get really large for long running processes
193
+ # (esp. for texts in languages that do not use spaces between words, e.g. Chinese); technically
194
+ # not a memory leak but appears as one.
195
+ # GPT2Tokenizer has the same problem, so let's be consistent.
196
+ self.cache = {}
197
+
198
+ self.pat = re.compile(PRETOKENIZE_REGEX)
199
+
200
+ if kwargs.get("add_prefix_space", False):
201
+ logger.warning_once(
202
+ f"{self.__class__.__name} does not support `add_prefix_space`, setting it to True has no effect."
203
+ )
204
+
205
+ super().__init__(
206
+ errors=errors,
207
+ bos_token=bos_token,
208
+ eos_token=eos_token,
209
+ pad_token=pad_token,
210
+ unk_token=unk_token,
211
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
212
+ split_special_tokens=split_special_tokens,
213
+ **kwargs,
214
+ )
215
+
216
+ @property
217
+ def vocab_size(self) -> int:
218
+ return len(self.encoder)
219
+
220
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
221
+ def get_vocab(self):
222
+ return dict(self.encoder, **self.added_tokens_encoder)
223
+
224
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
225
+ def bpe(self, token):
226
+ if token in self.cache:
227
+ return self.cache[token]
228
+ word = tuple(token)
229
+ pairs = get_pairs(word)
230
+
231
+ if not pairs:
232
+ return token
233
+
234
+ while True:
235
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
236
+ if bigram not in self.bpe_ranks:
237
+ break
238
+ first, second = bigram
239
+ new_word = []
240
+ i = 0
241
+ while i < len(word):
242
+ try:
243
+ j = word.index(first, i)
244
+ except ValueError:
245
+ new_word.extend(word[i:])
246
+ break
247
+ else:
248
+ new_word.extend(word[i:j])
249
+ i = j
250
+
251
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
252
+ new_word.append(first + second)
253
+ i += 2
254
+ else:
255
+ new_word.append(word[i])
256
+ i += 1
257
+ new_word = tuple(new_word)
258
+ word = new_word
259
+ if len(word) == 1:
260
+ break
261
+ else:
262
+ pairs = get_pairs(word)
263
+ word = " ".join(word)
264
+ self.cache[token] = word
265
+ return word
266
+
267
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
268
+ def _tokenize(self, text):
269
+ """Tokenize a string."""
270
+ bpe_tokens = []
271
+ for token in re.findall(self.pat, text):
272
+ token = "".join(
273
+ self.byte_encoder[b] for b in token.encode("utf-8")
274
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
275
+ bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
276
+ return bpe_tokens
277
+
278
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
279
+ def _convert_token_to_id(self, token):
280
+ """Converts a token (str) in an id using the vocab."""
281
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
282
+
283
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
284
+ def _convert_id_to_token(self, index):
285
+ """Converts an index (integer) in a token (str) using the vocab."""
286
+ return self.decoder.get(index)
287
+
288
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
289
+ def convert_tokens_to_string(self, tokens):
290
+ """Converts a sequence of tokens (string) in a single string."""
291
+ text = "".join(tokens)
292
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
293
+ return text
294
+
295
+ def decode(
296
+ self,
297
+ token_ids,
298
+ skip_special_tokens: bool = False,
299
+ clean_up_tokenization_spaces: Optional[bool] = False,
300
+ spaces_between_special_tokens: bool = False,
301
+ **kwargs,
302
+ ) -> str:
303
+ # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
304
+ # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer
305
+ return super().decode(
306
+ token_ids,
307
+ skip_special_tokens=skip_special_tokens,
308
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
309
+ spaces_between_special_tokens=spaces_between_special_tokens,
310
+ **kwargs,
311
+ )
312
+
313
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
314
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
315
+ if not os.path.isdir(save_directory):
316
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
317
+ return
318
+ vocab_file = os.path.join(
319
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
320
+ )
321
+ merge_file = os.path.join(
322
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
323
+ )
324
+
325
+ with open(vocab_file, "w", encoding="utf-8") as f:
326
+ f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
327
+
328
+ index = 0
329
+ with open(merge_file, "w", encoding="utf-8") as writer:
330
+ writer.write("#version: 0.2\n")
331
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
332
+ if index != token_index:
333
+ logger.warning(
334
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
335
+ " Please check that the tokenizer is not corrupted!"
336
+ )
337
+ index = token_index
338
+ writer.write(" ".join(bpe_tokens) + "\n")
339
+ index += 1
340
+
341
+ return vocab_file, merge_file
342
+
343
+ def prepare_for_tokenization(self, text, **kwargs):
344
+ text = unicodedata.normalize("NFC", text)
345
+ return (text, kwargs)
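A minimal usage sketch for the slow tokenizer defined above. The `vocab.json` / `merges.txt` file names come from VOCAB_FILES_NAMES earlier in the file; the local paths, the sample sentence, and the direct import are illustrative assumptions rather than part of this commit.

```python
# Hedged sketch: build the slow Qwen2Tokenizer straight from local BPE files.
# The paths below are assumptions for illustration only.
from tokenization_qwen2 import Qwen2Tokenizer  # assumes this file is importable

tokenizer = Qwen2Tokenizer(vocab_file="vocab.json", merges_file="merges.txt")

# __init__ loads the vocab and merge ranks; _tokenize() then maps bytes to
# unicode characters and applies the cached bpe() merges per pre-token.
ids = tokenizer("Hello world")["input_ids"]
print(ids)
print(tokenizer.decode(ids))  # convert_tokens_to_string() reverses the byte mapping
```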
tokenization_qwen2_fast.py ADDED
@@ -0,0 +1,143 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for Qwen2."""
16
+
17
+ from typing import Optional, Tuple
18
+
19
+ from transformers.tokenization_utils import AddedToken
20
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
21
+ from transformers.utils import logging
22
+ from .tokenization_qwen2 import Qwen2Tokenizer
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+ VOCAB_FILES_NAMES = {
28
+ "vocab_file": "vocab.json",
29
+ "merges_file": "merges.txt",
30
+ "tokenizer_file": "tokenizer.json",
31
+ }
32
+
33
+ PRETRAINED_VOCAB_FILES_MAP = {
34
+ "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"},
35
+ "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"},
36
+ "tokenizer_file": {
37
+ "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/tokenizer.json"
38
+ },
39
+ }
40
+
41
+ MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
42
+
43
+
44
+ class Qwen2TokenizerFast(PreTrainedTokenizerFast):
45
+ """
46
+ Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
47
+ Byte-Pair-Encoding.
48
+
49
+ As with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens, so a word will
50
+ be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:
51
+
52
+ ```python
53
+ >>> from transformers import Qwen2TokenizerFast
54
+
55
+ >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
56
+ >>> tokenizer("Hello world")["input_ids"]
57
+ [9707, 1879]
58
+
59
+ >>> tokenizer(" Hello world")["input_ids"]
60
+ [21927, 1879]
61
+ ```
62
+ This is expected.
63
+
64
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
65
+ refer to this superclass for more information regarding those methods.
66
+
67
+ Args:
68
+ vocab_file (`str`, *optional*):
69
+ Path to the vocabulary file.
70
+ merges_file (`str`, *optional*):
71
+ Path to the merges file.
72
+ tokenizer_file (`str`, *optional*):
73
+ Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
74
+ contains everything needed to load the tokenizer.
75
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
76
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
77
+ token instead. Not applicable to this tokenizer.
78
+ bos_token (`str`, *optional*):
79
+ The beginning of sequence token. Not applicable for this tokenizer.
80
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
81
+ The end of sequence token.
82
+ pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
83
+ The token used for padding, for example when batching sequences of different lengths.
84
+ """
85
+
86
+ vocab_files_names = VOCAB_FILES_NAMES
87
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
88
+ max_model_input_sizes = MAX_MODEL_INPUT_SIZES
89
+ model_input_names = ["input_ids", "attention_mask"]
90
+ slow_tokenizer_class = Qwen2Tokenizer
91
+
92
+ def __init__(
93
+ self,
94
+ vocab_file=None,
95
+ merges_file=None,
96
+ tokenizer_file=None,
97
+ unk_token="<|endoftext|>",
98
+ bos_token=None,
99
+ eos_token="<|endoftext|>",
100
+ pad_token="<|endoftext|>",
101
+ **kwargs,
102
+ ):
103
+ # We need to at least pass vocab_file and merges_file to the base class
104
+ # in case a slow tokenizer needs to be initialized; the others can be
105
+ # configured through files.
106
+ # Following GPT2TokenizerFast, we also add unk_token, bos_token, and eos_token.
107
+
108
+ bos_token = (
109
+ AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
110
+ if isinstance(bos_token, str)
111
+ else bos_token
112
+ )
113
+ eos_token = (
114
+ AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
115
+ if isinstance(eos_token, str)
116
+ else eos_token
117
+ )
118
+ unk_token = (
119
+ AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
120
+ if isinstance(unk_token, str)
121
+ else unk_token
122
+ )
123
+ pad_token = (
124
+ AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
125
+ if isinstance(pad_token, str)
126
+ else pad_token
127
+ )
128
+
129
+ super().__init__(
130
+ vocab_file,
131
+ merges_file,
132
+ tokenizer_file=tokenizer_file,
133
+ unk_token=unk_token,
134
+ bos_token=bos_token,
135
+ eos_token=eos_token,
136
+ pad_token=pad_token,
137
+ **kwargs,
138
+ )
139
+
140
+ # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
141
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
142
+ files = self._tokenizer.model.save(save_directory, name=filename_prefix)
143
+ return tuple(files)
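A matching sketch for the fast wrapper above. It assumes the tokenizer.json added in this commit is available locally; the output directory name and sample text are illustrative assumptions.

```python
# Hedged sketch: load the fast tokenizer from tokenizer.json alone, then export
# vocab.json / merges.txt that the slow Qwen2Tokenizer can consume.
import os

from tokenization_qwen2_fast import Qwen2TokenizerFast  # assumes this file is importable

fast = Qwen2TokenizerFast(tokenizer_file="tokenizer.json")
print(fast("Hello world")["input_ids"])

os.makedirs("exported_tokenizer", exist_ok=True)  # assumed output directory
# save_vocabulary() above delegates to the backing `tokenizers` BPE model,
# which writes the vocab and merges files into the target directory.
print(fast.save_vocabulary("exported_tokenizer"))
```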
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
@@ -0,0 +1,2152 @@
1
+ {
2
+ "add_eos_token": false,
3
+ "added_tokens_decoder": {
4
+ "128000": {
5
+ "content": "<|begin_of_text|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "128001": {
13
+ "content": "<|end_of_text|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "128002": {
21
+ "content": "<|reserved_special_token_0|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "128003": {
29
+ "content": "<|reserved_special_token_1|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "128004": {
37
+ "content": "<|finetune_right_pad_id|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "128005": {
45
+ "content": "<|reserved_special_token_2|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "128006": {
53
+ "content": "<|start_header_id|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "128007": {
61
+ "content": "<|end_header_id|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "128008": {
69
+ "content": "<|eom_id|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "128009": {
77
+ "content": "<|eot_id|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "128010": {
85
+ "content": "<|python_tag|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "128011": {
93
+ "content": "<|reserved_special_token_3|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "128012": {
101
+ "content": "<|reserved_special_token_4|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "128013": {
109
+ "content": "<|reserved_special_token_5|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "128014": {
117
+ "content": "<|reserved_special_token_6|>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "128015": {
125
+ "content": "<|reserved_special_token_7|>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "128016": {
133
+ "content": "<|reserved_special_token_8|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "128017": {
141
+ "content": "<|reserved_special_token_9|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "128018": {
149
+ "content": "<|reserved_special_token_10|>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "128019": {
157
+ "content": "<|reserved_special_token_11|>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "128020": {
165
+ "content": "<|reserved_special_token_12|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "128021": {
173
+ "content": "<|reserved_special_token_13|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "128022": {
181
+ "content": "<|reserved_special_token_14|>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "128023": {
189
+ "content": "<|reserved_special_token_15|>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "128024": {
197
+ "content": "<|reserved_special_token_16|>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "128025": {
205
+ "content": "<|reserved_special_token_17|>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "128026": {
213
+ "content": "<|reserved_special_token_18|>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "128027": {
221
+ "content": "<|reserved_special_token_19|>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "128028": {
229
+ "content": "<|reserved_special_token_20|>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "128029": {
237
+ "content": "<|reserved_special_token_21|>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "128030": {
245
+ "content": "<|reserved_special_token_22|>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "128031": {
253
+ "content": "<|reserved_special_token_23|>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "128032": {
261
+ "content": "<|reserved_special_token_24|>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "128033": {
269
+ "content": "<|reserved_special_token_25|>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "128034": {
277
+ "content": "<|reserved_special_token_26|>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "128035": {
285
+ "content": "<|reserved_special_token_27|>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "128036": {
293
+ "content": "<|reserved_special_token_28|>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "128037": {
301
+ "content": "<|reserved_special_token_29|>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "128038": {
309
+ "content": "<|reserved_special_token_30|>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "128039": {
317
+ "content": "<|reserved_special_token_31|>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "128040": {
325
+ "content": "<|reserved_special_token_32|>",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "128041": {
333
+ "content": "<|reserved_special_token_33|>",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "128042": {
341
+ "content": "<|reserved_special_token_34|>",
342
+ "lstrip": false,
343
+ "normalized": false,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "128043": {
349
+ "content": "<|reserved_special_token_35|>",
350
+ "lstrip": false,
351
+ "normalized": false,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "128044": {
357
+ "content": "<|reserved_special_token_36|>",
358
+ "lstrip": false,
359
+ "normalized": false,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "128045": {
365
+ "content": "<|reserved_special_token_37|>",
366
+ "lstrip": false,
367
+ "normalized": false,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "128046": {
373
+ "content": "<|reserved_special_token_38|>",
374
+ "lstrip": false,
375
+ "normalized": false,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "128047": {
381
+ "content": "<|reserved_special_token_39|>",
382
+ "lstrip": false,
383
+ "normalized": false,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "128048": {
389
+ "content": "<|reserved_special_token_40|>",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "128049": {
397
+ "content": "<|reserved_special_token_41|>",
398
+ "lstrip": false,
399
+ "normalized": false,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "128050": {
405
+ "content": "<|reserved_special_token_42|>",
406
+ "lstrip": false,
407
+ "normalized": false,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "128051": {
413
+ "content": "<|reserved_special_token_43|>",
414
+ "lstrip": false,
415
+ "normalized": false,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "128052": {
421
+ "content": "<|reserved_special_token_44|>",
422
+ "lstrip": false,
423
+ "normalized": false,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "128053": {
429
+ "content": "<|reserved_special_token_45|>",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "128054": {
437
+ "content": "<|reserved_special_token_46|>",
438
+ "lstrip": false,
439
+ "normalized": false,
440
+ "rstrip": false,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "128055": {
445
+ "content": "<|reserved_special_token_47|>",
446
+ "lstrip": false,
447
+ "normalized": false,
448
+ "rstrip": false,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "128056": {
453
+ "content": "<|reserved_special_token_48|>",
454
+ "lstrip": false,
455
+ "normalized": false,
456
+ "rstrip": false,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "128057": {
461
+ "content": "<|reserved_special_token_49|>",
462
+ "lstrip": false,
463
+ "normalized": false,
464
+ "rstrip": false,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "128058": {
469
+ "content": "<|reserved_special_token_50|>",
470
+ "lstrip": false,
471
+ "normalized": false,
472
+ "rstrip": false,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "128059": {
477
+ "content": "<|reserved_special_token_51|>",
478
+ "lstrip": false,
479
+ "normalized": false,
480
+ "rstrip": false,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "128060": {
485
+ "content": "<|reserved_special_token_52|>",
486
+ "lstrip": false,
487
+ "normalized": false,
488
+ "rstrip": false,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "128061": {
493
+ "content": "<|reserved_special_token_53|>",
494
+ "lstrip": false,
495
+ "normalized": false,
496
+ "rstrip": false,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "128062": {
501
+ "content": "<|reserved_special_token_54|>",
502
+ "lstrip": false,
503
+ "normalized": false,
504
+ "rstrip": false,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "128063": {
509
+ "content": "<|reserved_special_token_55|>",
510
+ "lstrip": false,
511
+ "normalized": false,
512
+ "rstrip": false,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "128064": {
517
+ "content": "<|reserved_special_token_56|>",
518
+ "lstrip": false,
519
+ "normalized": false,
520
+ "rstrip": false,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "128065": {
525
+ "content": "<|reserved_special_token_57|>",
526
+ "lstrip": false,
527
+ "normalized": false,
528
+ "rstrip": false,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "128066": {
533
+ "content": "<|reserved_special_token_58|>",
534
+ "lstrip": false,
535
+ "normalized": false,
536
+ "rstrip": false,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "128067": {
541
+ "content": "<|reserved_special_token_59|>",
542
+ "lstrip": false,
543
+ "normalized": false,
544
+ "rstrip": false,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "128068": {
549
+ "content": "<|reserved_special_token_60|>",
550
+ "lstrip": false,
551
+ "normalized": false,
552
+ "rstrip": false,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "128069": {
557
+ "content": "<|reserved_special_token_61|>",
558
+ "lstrip": false,
559
+ "normalized": false,
560
+ "rstrip": false,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "128070": {
565
+ "content": "<|reserved_special_token_62|>",
566
+ "lstrip": false,
567
+ "normalized": false,
568
+ "rstrip": false,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "128071": {
573
+ "content": "<|reserved_special_token_63|>",
574
+ "lstrip": false,
575
+ "normalized": false,
576
+ "rstrip": false,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "128072": {
581
+ "content": "<|reserved_special_token_64|>",
582
+ "lstrip": false,
583
+ "normalized": false,
584
+ "rstrip": false,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "128073": {
589
+ "content": "<|reserved_special_token_65|>",
590
+ "lstrip": false,
591
+ "normalized": false,
592
+ "rstrip": false,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "128074": {
597
+ "content": "<|reserved_special_token_66|>",
598
+ "lstrip": false,
599
+ "normalized": false,
600
+ "rstrip": false,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "128075": {
605
+ "content": "<|reserved_special_token_67|>",
606
+ "lstrip": false,
607
+ "normalized": false,
608
+ "rstrip": false,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "128076": {
613
+ "content": "<|reserved_special_token_68|>",
614
+ "lstrip": false,
615
+ "normalized": false,
616
+ "rstrip": false,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "128077": {
621
+ "content": "<|reserved_special_token_69|>",
622
+ "lstrip": false,
623
+ "normalized": false,
624
+ "rstrip": false,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "128078": {
629
+ "content": "<|reserved_special_token_70|>",
630
+ "lstrip": false,
631
+ "normalized": false,
632
+ "rstrip": false,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "128079": {
637
+ "content": "<|reserved_special_token_71|>",
638
+ "lstrip": false,
639
+ "normalized": false,
640
+ "rstrip": false,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "128080": {
645
+ "content": "<|reserved_special_token_72|>",
646
+ "lstrip": false,
647
+ "normalized": false,
648
+ "rstrip": false,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "128081": {
653
+ "content": "<|reserved_special_token_73|>",
654
+ "lstrip": false,
655
+ "normalized": false,
656
+ "rstrip": false,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "128082": {
661
+ "content": "<|reserved_special_token_74|>",
662
+ "lstrip": false,
663
+ "normalized": false,
664
+ "rstrip": false,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "128083": {
669
+ "content": "<|reserved_special_token_75|>",
670
+ "lstrip": false,
671
+ "normalized": false,
672
+ "rstrip": false,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "128084": {
677
+ "content": "<|reserved_special_token_76|>",
678
+ "lstrip": false,
679
+ "normalized": false,
680
+ "rstrip": false,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "128085": {
685
+ "content": "<|reserved_special_token_77|>",
686
+ "lstrip": false,
687
+ "normalized": false,
688
+ "rstrip": false,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "128086": {
693
+ "content": "<|reserved_special_token_78|>",
694
+ "lstrip": false,
695
+ "normalized": false,
696
+ "rstrip": false,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "128087": {
701
+ "content": "<|reserved_special_token_79|>",
702
+ "lstrip": false,
703
+ "normalized": false,
704
+ "rstrip": false,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "128088": {
709
+ "content": "<|reserved_special_token_80|>",
710
+ "lstrip": false,
711
+ "normalized": false,
712
+ "rstrip": false,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "128089": {
717
+ "content": "<|reserved_special_token_81|>",
718
+ "lstrip": false,
719
+ "normalized": false,
720
+ "rstrip": false,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "128090": {
725
+ "content": "<|reserved_special_token_82|>",
726
+ "lstrip": false,
727
+ "normalized": false,
728
+ "rstrip": false,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "128091": {
733
+ "content": "<|reserved_special_token_83|>",
734
+ "lstrip": false,
735
+ "normalized": false,
736
+ "rstrip": false,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "128092": {
741
+ "content": "<|reserved_special_token_84|>",
742
+ "lstrip": false,
743
+ "normalized": false,
744
+ "rstrip": false,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "128093": {
749
+ "content": "<|reserved_special_token_85|>",
750
+ "lstrip": false,
751
+ "normalized": false,
752
+ "rstrip": false,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "128094": {
757
+ "content": "<|reserved_special_token_86|>",
758
+ "lstrip": false,
759
+ "normalized": false,
760
+ "rstrip": false,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "128095": {
765
+ "content": "<|reserved_special_token_87|>",
766
+ "lstrip": false,
767
+ "normalized": false,
768
+ "rstrip": false,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "128096": {
773
+ "content": "<|reserved_special_token_88|>",
774
+ "lstrip": false,
775
+ "normalized": false,
776
+ "rstrip": false,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "128097": {
781
+ "content": "<|reserved_special_token_89|>",
782
+ "lstrip": false,
783
+ "normalized": false,
784
+ "rstrip": false,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "128098": {
789
+ "content": "<|reserved_special_token_90|>",
790
+ "lstrip": false,
791
+ "normalized": false,
792
+ "rstrip": false,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "128099": {
797
+ "content": "<|reserved_special_token_91|>",
798
+ "lstrip": false,
799
+ "normalized": false,
800
+ "rstrip": false,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "128100": {
805
+ "content": "<|reserved_special_token_92|>",
806
+ "lstrip": false,
807
+ "normalized": false,
808
+ "rstrip": false,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "128101": {
813
+ "content": "<|reserved_special_token_93|>",
814
+ "lstrip": false,
815
+ "normalized": false,
816
+ "rstrip": false,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "128102": {
821
+ "content": "<|reserved_special_token_94|>",
822
+ "lstrip": false,
823
+ "normalized": false,
824
+ "rstrip": false,
825
+ "single_word": false,
826
+ "special": true
827
+ },
828
+ "128103": {
829
+ "content": "<|reserved_special_token_95|>",
830
+ "lstrip": false,
831
+ "normalized": false,
832
+ "rstrip": false,
833
+ "single_word": false,
834
+ "special": true
835
+ },
836
+ "128104": {
837
+ "content": "<|reserved_special_token_96|>",
838
+ "lstrip": false,
839
+ "normalized": false,
840
+ "rstrip": false,
841
+ "single_word": false,
842
+ "special": true
843
+ },
844
+ "128105": {
845
+ "content": "<|reserved_special_token_97|>",
846
+ "lstrip": false,
847
+ "normalized": false,
848
+ "rstrip": false,
849
+ "single_word": false,
850
+ "special": true
851
+ },
852
+ "128106": {
853
+ "content": "<|reserved_special_token_98|>",
854
+ "lstrip": false,
855
+ "normalized": false,
856
+ "rstrip": false,
857
+ "single_word": false,
858
+ "special": true
859
+ },
860
+ "128107": {
861
+ "content": "<|reserved_special_token_99|>",
862
+ "lstrip": false,
863
+ "normalized": false,
864
+ "rstrip": false,
865
+ "single_word": false,
866
+ "special": true
867
+ },
868
+ "128108": {
869
+ "content": "<|reserved_special_token_100|>",
870
+ "lstrip": false,
871
+ "normalized": false,
872
+ "rstrip": false,
873
+ "single_word": false,
874
+ "special": true
875
+ },
876
+ "128109": {
877
+ "content": "<|reserved_special_token_101|>",
878
+ "lstrip": false,
879
+ "normalized": false,
880
+ "rstrip": false,
881
+ "single_word": false,
882
+ "special": true
883
+ },
884
+ "128110": {
885
+ "content": "<|reserved_special_token_102|>",
886
+ "lstrip": false,
887
+ "normalized": false,
888
+ "rstrip": false,
889
+ "single_word": false,
890
+ "special": true
891
+ },
892
+ "128111": {
893
+ "content": "<|reserved_special_token_103|>",
894
+ "lstrip": false,
895
+ "normalized": false,
896
+ "rstrip": false,
897
+ "single_word": false,
898
+ "special": true
899
+ },
900
+ "128112": {
901
+ "content": "<|reserved_special_token_104|>",
902
+ "lstrip": false,
903
+ "normalized": false,
904
+ "rstrip": false,
905
+ "single_word": false,
906
+ "special": true
907
+ },
908
+ "128113": {
909
+ "content": "<|reserved_special_token_105|>",
910
+ "lstrip": false,
911
+ "normalized": false,
912
+ "rstrip": false,
913
+ "single_word": false,
914
+ "special": true
915
+ },
916
+ "128114": {
917
+ "content": "<|reserved_special_token_106|>",
918
+ "lstrip": false,
919
+ "normalized": false,
920
+ "rstrip": false,
921
+ "single_word": false,
922
+ "special": true
923
+ },
924
+ "128115": {
925
+ "content": "<|reserved_special_token_107|>",
926
+ "lstrip": false,
927
+ "normalized": false,
928
+ "rstrip": false,
929
+ "single_word": false,
930
+ "special": true
931
+ },
932
+ "128116": {
933
+ "content": "<|reserved_special_token_108|>",
934
+ "lstrip": false,
935
+ "normalized": false,
936
+ "rstrip": false,
937
+ "single_word": false,
938
+ "special": true
939
+ },
940
+ "128117": {
941
+ "content": "<|reserved_special_token_109|>",
942
+ "lstrip": false,
943
+ "normalized": false,
944
+ "rstrip": false,
945
+ "single_word": false,
946
+ "special": true
947
+ },
948
+ "128118": {
949
+ "content": "<|reserved_special_token_110|>",
950
+ "lstrip": false,
951
+ "normalized": false,
952
+ "rstrip": false,
953
+ "single_word": false,
954
+ "special": true
955
+ },
956
+ "128119": {
957
+ "content": "<|reserved_special_token_111|>",
958
+ "lstrip": false,
959
+ "normalized": false,
960
+ "rstrip": false,
961
+ "single_word": false,
962
+ "special": true
963
+ },
964
+ "128120": {
965
+ "content": "<|reserved_special_token_112|>",
966
+ "lstrip": false,
967
+ "normalized": false,
968
+ "rstrip": false,
969
+ "single_word": false,
970
+ "special": true
971
+ },
972
+ "128121": {
973
+ "content": "<|reserved_special_token_113|>",
974
+ "lstrip": false,
975
+ "normalized": false,
976
+ "rstrip": false,
977
+ "single_word": false,
978
+ "special": true
979
+ },
980
+ "128122": {
981
+ "content": "<|reserved_special_token_114|>",
982
+ "lstrip": false,
983
+ "normalized": false,
984
+ "rstrip": false,
985
+ "single_word": false,
986
+ "special": true
987
+ },
988
+ "128123": {
989
+ "content": "<|reserved_special_token_115|>",
990
+ "lstrip": false,
991
+ "normalized": false,
992
+ "rstrip": false,
993
+ "single_word": false,
994
+ "special": true
995
+ },
996
+ "128124": {
997
+ "content": "<|reserved_special_token_116|>",
998
+ "lstrip": false,
999
+ "normalized": false,
1000
+ "rstrip": false,
1001
+ "single_word": false,
1002
+ "special": true
1003
+ },
1004
+ "128125": {
1005
+ "content": "<|reserved_special_token_117|>",
1006
+ "lstrip": false,
1007
+ "normalized": false,
1008
+ "rstrip": false,
1009
+ "single_word": false,
1010
+ "special": true
1011
+ },
1012
+ "128126": {
1013
+ "content": "<|reserved_special_token_118|>",
1014
+ "lstrip": false,
1015
+ "normalized": false,
1016
+ "rstrip": false,
1017
+ "single_word": false,
1018
+ "special": true
1019
+ },
1020
+ "128127": {
1021
+ "content": "<|reserved_special_token_119|>",
1022
+ "lstrip": false,
1023
+ "normalized": false,
1024
+ "rstrip": false,
1025
+ "single_word": false,
1026
+ "special": true
1027
+ },
1028
+ "128128": {
1029
+ "content": "<|reserved_special_token_120|>",
1030
+ "lstrip": false,
1031
+ "normalized": false,
1032
+ "rstrip": false,
1033
+ "single_word": false,
1034
+ "special": true
1035
+ },
1036
+ "128129": {
1037
+ "content": "<|reserved_special_token_121|>",
1038
+ "lstrip": false,
1039
+ "normalized": false,
1040
+ "rstrip": false,
1041
+ "single_word": false,
1042
+ "special": true
1043
+ },
1044
+ "128130": {
1045
+ "content": "<|reserved_special_token_122|>",
1046
+ "lstrip": false,
1047
+ "normalized": false,
1048
+ "rstrip": false,
1049
+ "single_word": false,
1050
+ "special": true
1051
+ },
1052
+ "128131": {
1053
+ "content": "<|reserved_special_token_123|>",
1054
+ "lstrip": false,
1055
+ "normalized": false,
1056
+ "rstrip": false,
1057
+ "single_word": false,
1058
+ "special": true
1059
+ },
1060
+ "128132": {
1061
+ "content": "<|reserved_special_token_124|>",
1062
+ "lstrip": false,
1063
+ "normalized": false,
1064
+ "rstrip": false,
1065
+ "single_word": false,
1066
+ "special": true
1067
+ },
1068
+ "128133": {
1069
+ "content": "<|reserved_special_token_125|>",
1070
+ "lstrip": false,
1071
+ "normalized": false,
1072
+ "rstrip": false,
1073
+ "single_word": false,
1074
+ "special": true
1075
+ },
1076
+ "128134": {
1077
+ "content": "<|reserved_special_token_126|>",
1078
+ "lstrip": false,
1079
+ "normalized": false,
1080
+ "rstrip": false,
1081
+ "single_word": false,
1082
+ "special": true
1083
+ },
1084
+ "128135": {
1085
+ "content": "<|reserved_special_token_127|>",
1086
+ "lstrip": false,
1087
+ "normalized": false,
1088
+ "rstrip": false,
1089
+ "single_word": false,
1090
+ "special": true
1091
+ },
1092
+ "128136": {
1093
+ "content": "<|reserved_special_token_128|>",
1094
+ "lstrip": false,
1095
+ "normalized": false,
1096
+ "rstrip": false,
1097
+ "single_word": false,
1098
+ "special": true
1099
+ },
1100
+ "128137": {
1101
+ "content": "<|reserved_special_token_129|>",
1102
+ "lstrip": false,
1103
+ "normalized": false,
1104
+ "rstrip": false,
1105
+ "single_word": false,
1106
+ "special": true
1107
+ },
1108
+ "128138": {
1109
+ "content": "<|reserved_special_token_130|>",
1110
+ "lstrip": false,
1111
+ "normalized": false,
1112
+ "rstrip": false,
1113
+ "single_word": false,
1114
+ "special": true
1115
+ },
1116
+ "128139": {
1117
+ "content": "<|reserved_special_token_131|>",
1118
+ "lstrip": false,
1119
+ "normalized": false,
1120
+ "rstrip": false,
1121
+ "single_word": false,
1122
+ "special": true
1123
+ },
1124
+ "128140": {
1125
+ "content": "<|reserved_special_token_132|>",
1126
+ "lstrip": false,
1127
+ "normalized": false,
1128
+ "rstrip": false,
1129
+ "single_word": false,
1130
+ "special": true
1131
+ },
1132
+ "128141": {
1133
+ "content": "<|reserved_special_token_133|>",
1134
+ "lstrip": false,
1135
+ "normalized": false,
1136
+ "rstrip": false,
1137
+ "single_word": false,
1138
+ "special": true
1139
+ },
1140
+ "128142": {
1141
+ "content": "<|reserved_special_token_134|>",
1142
+ "lstrip": false,
1143
+ "normalized": false,
1144
+ "rstrip": false,
1145
+ "single_word": false,
1146
+ "special": true
1147
+ },
1148
+ "128143": {
1149
+ "content": "<|reserved_special_token_135|>",
1150
+ "lstrip": false,
1151
+ "normalized": false,
1152
+ "rstrip": false,
1153
+ "single_word": false,
1154
+ "special": true
1155
+ },
1156
+ "128144": {
1157
+ "content": "<|reserved_special_token_136|>",
1158
+ "lstrip": false,
1159
+ "normalized": false,
1160
+ "rstrip": false,
1161
+ "single_word": false,
1162
+ "special": true
1163
+ },
1164
+ "128145": {
1165
+ "content": "<|reserved_special_token_137|>",
1166
+ "lstrip": false,
1167
+ "normalized": false,
1168
+ "rstrip": false,
1169
+ "single_word": false,
1170
+ "special": true
1171
+ },
1172
+ "128146": {
1173
+ "content": "<|reserved_special_token_138|>",
1174
+ "lstrip": false,
1175
+ "normalized": false,
1176
+ "rstrip": false,
1177
+ "single_word": false,
1178
+ "special": true
1179
+ },
1180
+ "128147": {
1181
+ "content": "<|reserved_special_token_139|>",
1182
+ "lstrip": false,
1183
+ "normalized": false,
1184
+ "rstrip": false,
1185
+ "single_word": false,
1186
+ "special": true
1187
+ },
1188
+ "128148": {
1189
+ "content": "<|reserved_special_token_140|>",
1190
+ "lstrip": false,
1191
+ "normalized": false,
1192
+ "rstrip": false,
1193
+ "single_word": false,
1194
+ "special": true
1195
+ },
1196
+ "128149": {
1197
+ "content": "<|reserved_special_token_141|>",
1198
+ "lstrip": false,
1199
+ "normalized": false,
1200
+ "rstrip": false,
1201
+ "single_word": false,
1202
+ "special": true
1203
+ },
1204
+ "128150": {
1205
+ "content": "<|reserved_special_token_142|>",
1206
+ "lstrip": false,
1207
+ "normalized": false,
1208
+ "rstrip": false,
1209
+ "single_word": false,
1210
+ "special": true
1211
+ },
1212
+ "128151": {
1213
+ "content": "<|reserved_special_token_143|>",
1214
+ "lstrip": false,
1215
+ "normalized": false,
1216
+ "rstrip": false,
1217
+ "single_word": false,
1218
+ "special": true
1219
+ },
1220
+ "128152": {
1221
+ "content": "<|reserved_special_token_144|>",
1222
+ "lstrip": false,
1223
+ "normalized": false,
1224
+ "rstrip": false,
1225
+ "single_word": false,
1226
+ "special": true
1227
+ },
1228
+ "128153": {
1229
+ "content": "<|reserved_special_token_145|>",
1230
+ "lstrip": false,
1231
+ "normalized": false,
1232
+ "rstrip": false,
1233
+ "single_word": false,
1234
+ "special": true
1235
+ },
1236
+ "128154": {
1237
+ "content": "<|reserved_special_token_146|>",
1238
+ "lstrip": false,
1239
+ "normalized": false,
1240
+ "rstrip": false,
1241
+ "single_word": false,
1242
+ "special": true
1243
+ },
1244
+ "128155": {
1245
+ "content": "<|reserved_special_token_147|>",
1246
+ "lstrip": false,
1247
+ "normalized": false,
1248
+ "rstrip": false,
1249
+ "single_word": false,
1250
+ "special": true
1251
+ },
1252
+ "128156": {
1253
+ "content": "<|reserved_special_token_148|>",
1254
+ "lstrip": false,
1255
+ "normalized": false,
1256
+ "rstrip": false,
1257
+ "single_word": false,
1258
+ "special": true
1259
+ },
1260
+ "128157": {
1261
+ "content": "<|reserved_special_token_149|>",
1262
+ "lstrip": false,
1263
+ "normalized": false,
1264
+ "rstrip": false,
1265
+ "single_word": false,
1266
+ "special": true
1267
+ },
1268
+ "128158": {
1269
+ "content": "<|reserved_special_token_150|>",
1270
+ "lstrip": false,
1271
+ "normalized": false,
1272
+ "rstrip": false,
1273
+ "single_word": false,
1274
+ "special": true
1275
+ },
1276
+ "128159": {
1277
+ "content": "<|reserved_special_token_151|>",
1278
+ "lstrip": false,
1279
+ "normalized": false,
1280
+ "rstrip": false,
1281
+ "single_word": false,
1282
+ "special": true
1283
+ },
1284
+ "128160": {
1285
+ "content": "<|reserved_special_token_152|>",
1286
+ "lstrip": false,
1287
+ "normalized": false,
1288
+ "rstrip": false,
1289
+ "single_word": false,
1290
+ "special": true
1291
+ },
1292
+ "128161": {
1293
+ "content": "<|reserved_special_token_153|>",
1294
+ "lstrip": false,
1295
+ "normalized": false,
1296
+ "rstrip": false,
1297
+ "single_word": false,
1298
+ "special": true
1299
+ },
1300
+ "128162": {
1301
+ "content": "<|reserved_special_token_154|>",
1302
+ "lstrip": false,
1303
+ "normalized": false,
1304
+ "rstrip": false,
1305
+ "single_word": false,
1306
+ "special": true
1307
+ },
1308
+ "128163": {
1309
+ "content": "<|reserved_special_token_155|>",
1310
+ "lstrip": false,
1311
+ "normalized": false,
1312
+ "rstrip": false,
1313
+ "single_word": false,
1314
+ "special": true
1315
+ },
1316
+ "128164": {
1317
+ "content": "<|reserved_special_token_156|>",
1318
+ "lstrip": false,
1319
+ "normalized": false,
1320
+ "rstrip": false,
1321
+ "single_word": false,
1322
+ "special": true
1323
+ },
1324
+ "128165": {
1325
+ "content": "<|reserved_special_token_157|>",
1326
+ "lstrip": false,
1327
+ "normalized": false,
1328
+ "rstrip": false,
1329
+ "single_word": false,
1330
+ "special": true
1331
+ },
1332
+ "128166": {
1333
+ "content": "<|reserved_special_token_158|>",
1334
+ "lstrip": false,
1335
+ "normalized": false,
1336
+ "rstrip": false,
1337
+ "single_word": false,
1338
+ "special": true
1339
+ },
1340
+ "128167": {
1341
+ "content": "<|reserved_special_token_159|>",
1342
+ "lstrip": false,
1343
+ "normalized": false,
1344
+ "rstrip": false,
1345
+ "single_word": false,
1346
+ "special": true
1347
+ },
1348
+ "128168": {
1349
+ "content": "<|reserved_special_token_160|>",
1350
+ "lstrip": false,
1351
+ "normalized": false,
1352
+ "rstrip": false,
1353
+ "single_word": false,
1354
+ "special": true
1355
+ },
1356
+ "128169": {
1357
+ "content": "<|reserved_special_token_161|>",
1358
+ "lstrip": false,
1359
+ "normalized": false,
1360
+ "rstrip": false,
1361
+ "single_word": false,
1362
+ "special": true
1363
+ },
1364
+ "128170": {
1365
+ "content": "<|reserved_special_token_162|>",
1366
+ "lstrip": false,
1367
+ "normalized": false,
1368
+ "rstrip": false,
1369
+ "single_word": false,
1370
+ "special": true
1371
+ },
1372
+ "128171": {
1373
+ "content": "<|reserved_special_token_163|>",
1374
+ "lstrip": false,
1375
+ "normalized": false,
1376
+ "rstrip": false,
1377
+ "single_word": false,
1378
+ "special": true
1379
+ },
1380
+ "128172": {
1381
+ "content": "<|reserved_special_token_164|>",
1382
+ "lstrip": false,
1383
+ "normalized": false,
1384
+ "rstrip": false,
1385
+ "single_word": false,
1386
+ "special": true
1387
+ },
1388
+ "128173": {
1389
+ "content": "<|reserved_special_token_165|>",
1390
+ "lstrip": false,
1391
+ "normalized": false,
1392
+ "rstrip": false,
1393
+ "single_word": false,
1394
+ "special": true
1395
+ },
1396
+ "128174": {
1397
+ "content": "<|reserved_special_token_166|>",
1398
+ "lstrip": false,
1399
+ "normalized": false,
1400
+ "rstrip": false,
1401
+ "single_word": false,
1402
+ "special": true
1403
+ },
1404
+ "128175": {
1405
+ "content": "<|reserved_special_token_167|>",
1406
+ "lstrip": false,
1407
+ "normalized": false,
1408
+ "rstrip": false,
1409
+ "single_word": false,
1410
+ "special": true
1411
+ },
1412
+ "128176": {
1413
+ "content": "<|reserved_special_token_168|>",
1414
+ "lstrip": false,
1415
+ "normalized": false,
1416
+ "rstrip": false,
1417
+ "single_word": false,
1418
+ "special": true
1419
+ },
1420
+ "128177": {
1421
+ "content": "<|reserved_special_token_169|>",
1422
+ "lstrip": false,
1423
+ "normalized": false,
1424
+ "rstrip": false,
1425
+ "single_word": false,
1426
+ "special": true
1427
+ },
1428
+ "128178": {
1429
+ "content": "<|reserved_special_token_170|>",
1430
+ "lstrip": false,
1431
+ "normalized": false,
1432
+ "rstrip": false,
1433
+ "single_word": false,
1434
+ "special": true
1435
+ },
1436
+ "128179": {
1437
+ "content": "<|reserved_special_token_171|>",
1438
+ "lstrip": false,
1439
+ "normalized": false,
1440
+ "rstrip": false,
1441
+ "single_word": false,
1442
+ "special": true
1443
+ },
1444
+ "128180": {
1445
+ "content": "<|reserved_special_token_172|>",
1446
+ "lstrip": false,
1447
+ "normalized": false,
1448
+ "rstrip": false,
1449
+ "single_word": false,
1450
+ "special": true
1451
+ },
1452
+ "128181": {
1453
+ "content": "<|reserved_special_token_173|>",
1454
+ "lstrip": false,
1455
+ "normalized": false,
1456
+ "rstrip": false,
1457
+ "single_word": false,
1458
+ "special": true
1459
+ },
1460
+ "128182": {
1461
+ "content": "<|reserved_special_token_174|>",
1462
+ "lstrip": false,
1463
+ "normalized": false,
1464
+ "rstrip": false,
1465
+ "single_word": false,
1466
+ "special": true
1467
+ },
1468
+ "128183": {
1469
+ "content": "<|reserved_special_token_175|>",
1470
+ "lstrip": false,
1471
+ "normalized": false,
1472
+ "rstrip": false,
1473
+ "single_word": false,
1474
+ "special": true
1475
+ },
1476
+ "128184": {
1477
+ "content": "<|reserved_special_token_176|>",
1478
+ "lstrip": false,
1479
+ "normalized": false,
1480
+ "rstrip": false,
1481
+ "single_word": false,
1482
+ "special": true
1483
+ },
1484
+ "128185": {
1485
+ "content": "<|reserved_special_token_177|>",
1486
+ "lstrip": false,
1487
+ "normalized": false,
1488
+ "rstrip": false,
1489
+ "single_word": false,
1490
+ "special": true
1491
+ },
1492
+ "128186": {
1493
+ "content": "<|reserved_special_token_178|>",
1494
+ "lstrip": false,
1495
+ "normalized": false,
1496
+ "rstrip": false,
1497
+ "single_word": false,
1498
+ "special": true
1499
+ },
1500
+ "128187": {
1501
+ "content": "<|reserved_special_token_179|>",
1502
+ "lstrip": false,
1503
+ "normalized": false,
1504
+ "rstrip": false,
1505
+ "single_word": false,
1506
+ "special": true
1507
+ },
1508
+ "128188": {
1509
+ "content": "<|reserved_special_token_180|>",
1510
+ "lstrip": false,
1511
+ "normalized": false,
1512
+ "rstrip": false,
1513
+ "single_word": false,
1514
+ "special": true
1515
+ },
1516
+ "128189": {
1517
+ "content": "<|reserved_special_token_181|>",
1518
+ "lstrip": false,
1519
+ "normalized": false,
1520
+ "rstrip": false,
1521
+ "single_word": false,
1522
+ "special": true
1523
+ },
1524
+ "128190": {
1525
+ "content": "<|reserved_special_token_182|>",
1526
+ "lstrip": false,
1527
+ "normalized": false,
1528
+ "rstrip": false,
1529
+ "single_word": false,
1530
+ "special": true
1531
+ },
1532
+ "128191": {
1533
+ "content": "<|reserved_special_token_183|>",
1534
+ "lstrip": false,
1535
+ "normalized": false,
1536
+ "rstrip": false,
1537
+ "single_word": false,
1538
+ "special": true
1539
+ },
1540
+ "128192": {
1541
+ "content": "<|reserved_special_token_184|>",
1542
+ "lstrip": false,
1543
+ "normalized": false,
1544
+ "rstrip": false,
1545
+ "single_word": false,
1546
+ "special": true
1547
+ },
1548
+ "128193": {
1549
+ "content": "<|reserved_special_token_185|>",
1550
+ "lstrip": false,
1551
+ "normalized": false,
1552
+ "rstrip": false,
1553
+ "single_word": false,
1554
+ "special": true
1555
+ },
1556
+ "128194": {
1557
+ "content": "<|reserved_special_token_186|>",
1558
+ "lstrip": false,
1559
+ "normalized": false,
1560
+ "rstrip": false,
1561
+ "single_word": false,
1562
+ "special": true
1563
+ },
1564
+ "128195": {
1565
+ "content": "<|reserved_special_token_187|>",
1566
+ "lstrip": false,
1567
+ "normalized": false,
1568
+ "rstrip": false,
1569
+ "single_word": false,
1570
+ "special": true
1571
+ },
1572
+ "128196": {
1573
+ "content": "<|reserved_special_token_188|>",
1574
+ "lstrip": false,
1575
+ "normalized": false,
1576
+ "rstrip": false,
1577
+ "single_word": false,
1578
+ "special": true
1579
+ },
1580
+ "128197": {
1581
+ "content": "<|reserved_special_token_189|>",
1582
+ "lstrip": false,
1583
+ "normalized": false,
1584
+ "rstrip": false,
1585
+ "single_word": false,
1586
+ "special": true
1587
+ },
1588
+ "128198": {
1589
+ "content": "<|reserved_special_token_190|>",
1590
+ "lstrip": false,
1591
+ "normalized": false,
1592
+ "rstrip": false,
1593
+ "single_word": false,
1594
+ "special": true
1595
+ },
1596
+ "128199": {
1597
+ "content": "<|reserved_special_token_191|>",
1598
+ "lstrip": false,
1599
+ "normalized": false,
1600
+ "rstrip": false,
1601
+ "single_word": false,
1602
+ "special": true
1603
+ },
1604
+ "128200": {
1605
+ "content": "<|reserved_special_token_192|>",
1606
+ "lstrip": false,
1607
+ "normalized": false,
1608
+ "rstrip": false,
1609
+ "single_word": false,
1610
+ "special": true
1611
+ },
1612
+ "128201": {
1613
+ "content": "<|reserved_special_token_193|>",
1614
+ "lstrip": false,
1615
+ "normalized": false,
1616
+ "rstrip": false,
1617
+ "single_word": false,
1618
+ "special": true
1619
+ },
1620
+ "128202": {
1621
+ "content": "<|reserved_special_token_194|>",
1622
+ "lstrip": false,
1623
+ "normalized": false,
1624
+ "rstrip": false,
1625
+ "single_word": false,
1626
+ "special": true
1627
+ },
1628
+ "128203": {
1629
+ "content": "<|reserved_special_token_195|>",
1630
+ "lstrip": false,
1631
+ "normalized": false,
1632
+ "rstrip": false,
1633
+ "single_word": false,
1634
+ "special": true
1635
+ },
1636
+ "128204": {
1637
+ "content": "<|reserved_special_token_196|>",
1638
+ "lstrip": false,
1639
+ "normalized": false,
1640
+ "rstrip": false,
1641
+ "single_word": false,
1642
+ "special": true
1643
+ },
1644
+ "128205": {
1645
+ "content": "<|reserved_special_token_197|>",
1646
+ "lstrip": false,
1647
+ "normalized": false,
1648
+ "rstrip": false,
1649
+ "single_word": false,
1650
+ "special": true
1651
+ },
1652
+ "128206": {
1653
+ "content": "<|reserved_special_token_198|>",
1654
+ "lstrip": false,
1655
+ "normalized": false,
1656
+ "rstrip": false,
1657
+ "single_word": false,
1658
+ "special": true
1659
+ },
1660
+ "128207": {
1661
+ "content": "<|reserved_special_token_199|>",
1662
+ "lstrip": false,
1663
+ "normalized": false,
1664
+ "rstrip": false,
1665
+ "single_word": false,
1666
+ "special": true
1667
+ },
1668
+ "128208": {
1669
+ "content": "<|reserved_special_token_200|>",
1670
+ "lstrip": false,
1671
+ "normalized": false,
1672
+ "rstrip": false,
1673
+ "single_word": false,
1674
+ "special": true
1675
+ },
1676
+ "128209": {
1677
+ "content": "<|reserved_special_token_201|>",
1678
+ "lstrip": false,
1679
+ "normalized": false,
1680
+ "rstrip": false,
1681
+ "single_word": false,
1682
+ "special": true
1683
+ },
1684
+ "128210": {
1685
+ "content": "<|reserved_special_token_202|>",
1686
+ "lstrip": false,
1687
+ "normalized": false,
1688
+ "rstrip": false,
1689
+ "single_word": false,
1690
+ "special": true
1691
+ },
1692
+ "128211": {
1693
+ "content": "<|reserved_special_token_203|>",
1694
+ "lstrip": false,
1695
+ "normalized": false,
1696
+ "rstrip": false,
1697
+ "single_word": false,
1698
+ "special": true
1699
+ },
1700
+ "128212": {
1701
+ "content": "<|reserved_special_token_204|>",
1702
+ "lstrip": false,
1703
+ "normalized": false,
1704
+ "rstrip": false,
1705
+ "single_word": false,
1706
+ "special": true
1707
+ },
1708
+ "128213": {
1709
+ "content": "<|reserved_special_token_205|>",
1710
+ "lstrip": false,
1711
+ "normalized": false,
1712
+ "rstrip": false,
1713
+ "single_word": false,
1714
+ "special": true
1715
+ },
1716
+ "128214": {
1717
+ "content": "<|reserved_special_token_206|>",
1718
+ "lstrip": false,
1719
+ "normalized": false,
1720
+ "rstrip": false,
1721
+ "single_word": false,
1722
+ "special": true
1723
+ },
1724
+ "128215": {
1725
+ "content": "<|reserved_special_token_207|>",
1726
+ "lstrip": false,
1727
+ "normalized": false,
1728
+ "rstrip": false,
1729
+ "single_word": false,
1730
+ "special": true
1731
+ },
1732
+ "128216": {
1733
+ "content": "<|reserved_special_token_208|>",
1734
+ "lstrip": false,
1735
+ "normalized": false,
1736
+ "rstrip": false,
1737
+ "single_word": false,
1738
+ "special": true
1739
+ },
1740
+ "128217": {
1741
+ "content": "<|reserved_special_token_209|>",
1742
+ "lstrip": false,
1743
+ "normalized": false,
1744
+ "rstrip": false,
1745
+ "single_word": false,
1746
+ "special": true
1747
+ },
1748
+ "128218": {
1749
+ "content": "<|reserved_special_token_210|>",
1750
+ "lstrip": false,
1751
+ "normalized": false,
1752
+ "rstrip": false,
1753
+ "single_word": false,
1754
+ "special": true
1755
+ },
1756
+ "128219": {
1757
+ "content": "<|reserved_special_token_211|>",
1758
+ "lstrip": false,
1759
+ "normalized": false,
1760
+ "rstrip": false,
1761
+ "single_word": false,
1762
+ "special": true
1763
+ },
1764
+ "128220": {
1765
+ "content": "<|reserved_special_token_212|>",
1766
+ "lstrip": false,
1767
+ "normalized": false,
1768
+ "rstrip": false,
1769
+ "single_word": false,
1770
+ "special": true
1771
+ },
1772
+ "128221": {
1773
+ "content": "<|reserved_special_token_213|>",
1774
+ "lstrip": false,
1775
+ "normalized": false,
1776
+ "rstrip": false,
1777
+ "single_word": false,
1778
+ "special": true
1779
+ },
1780
+ "128222": {
1781
+ "content": "<|reserved_special_token_214|>",
1782
+ "lstrip": false,
1783
+ "normalized": false,
1784
+ "rstrip": false,
1785
+ "single_word": false,
1786
+ "special": true
1787
+ },
1788
+ "128223": {
1789
+ "content": "<|reserved_special_token_215|>",
1790
+ "lstrip": false,
1791
+ "normalized": false,
1792
+ "rstrip": false,
1793
+ "single_word": false,
1794
+ "special": true
1795
+ },
1796
+ "128224": {
1797
+ "content": "<|reserved_special_token_216|>",
1798
+ "lstrip": false,
1799
+ "normalized": false,
1800
+ "rstrip": false,
1801
+ "single_word": false,
1802
+ "special": true
1803
+ },
1804
+ "128225": {
1805
+ "content": "<|reserved_special_token_217|>",
1806
+ "lstrip": false,
1807
+ "normalized": false,
1808
+ "rstrip": false,
1809
+ "single_word": false,
1810
+ "special": true
1811
+ },
1812
+ "128226": {
1813
+ "content": "<|reserved_special_token_218|>",
1814
+ "lstrip": false,
1815
+ "normalized": false,
1816
+ "rstrip": false,
1817
+ "single_word": false,
1818
+ "special": true
1819
+ },
1820
+ "128227": {
1821
+ "content": "<|reserved_special_token_219|>",
1822
+ "lstrip": false,
1823
+ "normalized": false,
1824
+ "rstrip": false,
1825
+ "single_word": false,
1826
+ "special": true
1827
+ },
1828
+ "128228": {
1829
+ "content": "<|reserved_special_token_220|>",
1830
+ "lstrip": false,
1831
+ "normalized": false,
1832
+ "rstrip": false,
1833
+ "single_word": false,
1834
+ "special": true
1835
+ },
1836
+ "128229": {
1837
+ "content": "<|reserved_special_token_221|>",
1838
+ "lstrip": false,
1839
+ "normalized": false,
1840
+ "rstrip": false,
1841
+ "single_word": false,
1842
+ "special": true
1843
+ },
1844
+ "128230": {
1845
+ "content": "<|reserved_special_token_222|>",
1846
+ "lstrip": false,
1847
+ "normalized": false,
1848
+ "rstrip": false,
1849
+ "single_word": false,
1850
+ "special": true
1851
+ },
1852
+ "128231": {
1853
+ "content": "<|reserved_special_token_223|>",
1854
+ "lstrip": false,
1855
+ "normalized": false,
1856
+ "rstrip": false,
1857
+ "single_word": false,
1858
+ "special": true
1859
+ },
1860
+ "128232": {
1861
+ "content": "<|reserved_special_token_224|>",
1862
+ "lstrip": false,
1863
+ "normalized": false,
1864
+ "rstrip": false,
1865
+ "single_word": false,
1866
+ "special": true
1867
+ },
1868
+ "128233": {
1869
+ "content": "<|reserved_special_token_225|>",
1870
+ "lstrip": false,
1871
+ "normalized": false,
1872
+ "rstrip": false,
1873
+ "single_word": false,
1874
+ "special": true
1875
+ },
1876
+ "128234": {
1877
+ "content": "<|reserved_special_token_226|>",
1878
+ "lstrip": false,
1879
+ "normalized": false,
1880
+ "rstrip": false,
1881
+ "single_word": false,
1882
+ "special": true
1883
+ },
1884
+ "128235": {
1885
+ "content": "<|reserved_special_token_227|>",
1886
+ "lstrip": false,
1887
+ "normalized": false,
1888
+ "rstrip": false,
1889
+ "single_word": false,
1890
+ "special": true
1891
+ },
1892
+ "128236": {
1893
+ "content": "<|reserved_special_token_228|>",
1894
+ "lstrip": false,
1895
+ "normalized": false,
1896
+ "rstrip": false,
1897
+ "single_word": false,
1898
+ "special": true
1899
+ },
1900
+ "128237": {
1901
+ "content": "<|reserved_special_token_229|>",
1902
+ "lstrip": false,
1903
+ "normalized": false,
1904
+ "rstrip": false,
1905
+ "single_word": false,
1906
+ "special": true
1907
+ },
1908
+ "128238": {
1909
+ "content": "<|reserved_special_token_230|>",
1910
+ "lstrip": false,
1911
+ "normalized": false,
1912
+ "rstrip": false,
1913
+ "single_word": false,
1914
+ "special": true
1915
+ },
1916
+ "128239": {
1917
+ "content": "<|reserved_special_token_231|>",
1918
+ "lstrip": false,
1919
+ "normalized": false,
1920
+ "rstrip": false,
1921
+ "single_word": false,
1922
+ "special": true
1923
+ },
1924
+ "128240": {
1925
+ "content": "<|reserved_special_token_232|>",
1926
+ "lstrip": false,
1927
+ "normalized": false,
1928
+ "rstrip": false,
1929
+ "single_word": false,
1930
+ "special": true
1931
+ },
1932
+ "128241": {
1933
+ "content": "<|reserved_special_token_233|>",
1934
+ "lstrip": false,
1935
+ "normalized": false,
1936
+ "rstrip": false,
1937
+ "single_word": false,
1938
+ "special": true
1939
+ },
1940
+ "128242": {
1941
+ "content": "<|reserved_special_token_234|>",
1942
+ "lstrip": false,
1943
+ "normalized": false,
1944
+ "rstrip": false,
1945
+ "single_word": false,
1946
+ "special": true
1947
+ },
1948
+ "128243": {
1949
+ "content": "<|reserved_special_token_235|>",
1950
+ "lstrip": false,
1951
+ "normalized": false,
1952
+ "rstrip": false,
1953
+ "single_word": false,
1954
+ "special": true
1955
+ },
1956
+ "128244": {
1957
+ "content": "<|reserved_special_token_236|>",
1958
+ "lstrip": false,
1959
+ "normalized": false,
1960
+ "rstrip": false,
1961
+ "single_word": false,
1962
+ "special": true
1963
+ },
1964
+ "128245": {
1965
+ "content": "<|reserved_special_token_237|>",
1966
+ "lstrip": false,
1967
+ "normalized": false,
1968
+ "rstrip": false,
1969
+ "single_word": false,
1970
+ "special": true
1971
+ },
1972
+ "128246": {
1973
+ "content": "<|reserved_special_token_238|>",
1974
+ "lstrip": false,
1975
+ "normalized": false,
1976
+ "rstrip": false,
1977
+ "single_word": false,
1978
+ "special": true
1979
+ },
1980
+ "128247": {
1981
+ "content": "<|reserved_special_token_239|>",
1982
+ "lstrip": false,
1983
+ "normalized": false,
1984
+ "rstrip": false,
1985
+ "single_word": false,
1986
+ "special": true
1987
+ },
1988
+ "128248": {
1989
+ "content": "<|reserved_special_token_240|>",
1990
+ "lstrip": false,
1991
+ "normalized": false,
1992
+ "rstrip": false,
1993
+ "single_word": false,
1994
+ "special": true
1995
+ },
1996
+ "128249": {
1997
+ "content": "<|reserved_special_token_241|>",
1998
+ "lstrip": false,
1999
+ "normalized": false,
2000
+ "rstrip": false,
2001
+ "single_word": false,
2002
+ "special": true
2003
+ },
2004
+ "128250": {
2005
+ "content": "<|reserved_special_token_242|>",
2006
+ "lstrip": false,
2007
+ "normalized": false,
2008
+ "rstrip": false,
2009
+ "single_word": false,
2010
+ "special": true
2011
+ },
2012
+ "128251": {
2013
+ "content": "<|reserved_special_token_243|>",
2014
+ "lstrip": false,
2015
+ "normalized": false,
2016
+ "rstrip": false,
2017
+ "single_word": false,
2018
+ "special": true
2019
+ },
2020
+ "128252": {
2021
+ "content": "<|reserved_special_token_244|>",
2022
+ "lstrip": false,
2023
+ "normalized": false,
2024
+ "rstrip": false,
2025
+ "single_word": false,
2026
+ "special": true
2027
+ },
2028
+ "128253": {
2029
+ "content": "<|reserved_special_token_245|>",
2030
+ "lstrip": false,
2031
+ "normalized": false,
2032
+ "rstrip": false,
2033
+ "single_word": false,
2034
+ "special": true
2035
+ },
2036
+ "128254": {
2037
+ "content": "<|reserved_special_token_246|>",
2038
+ "lstrip": false,
2039
+ "normalized": false,
2040
+ "rstrip": false,
2041
+ "single_word": false,
2042
+ "special": true
2043
+ },
2044
+ "128255": {
2045
+ "content": "<|reserved_special_token_247|>",
2046
+ "lstrip": false,
2047
+ "normalized": false,
2048
+ "rstrip": false,
2049
+ "single_word": false,
2050
+ "special": true
2051
+ },
2052
+ "128256": {
2053
+ "content": "<img>",
2054
+ "lstrip": false,
2055
+ "normalized": false,
2056
+ "rstrip": false,
2057
+ "single_word": false,
2058
+ "special": true
2059
+ },
2060
+ "128257": {
2061
+ "content": "</img>",
2062
+ "lstrip": false,
2063
+ "normalized": false,
2064
+ "rstrip": false,
2065
+ "single_word": false,
2066
+ "special": true
2067
+ },
2068
+ "128258": {
2069
+ "content": "<IMG_CONTEXT>",
2070
+ "lstrip": false,
2071
+ "normalized": false,
2072
+ "rstrip": false,
2073
+ "single_word": false,
2074
+ "special": true
2075
+ },
2076
+ "128259": {
2077
+ "content": "<quad>",
2078
+ "lstrip": false,
2079
+ "normalized": false,
2080
+ "rstrip": false,
2081
+ "single_word": false,
2082
+ "special": true
2083
+ },
2084
+ "128260": {
2085
+ "content": "</quad>",
2086
+ "lstrip": false,
2087
+ "normalized": false,
2088
+ "rstrip": false,
2089
+ "single_word": false,
2090
+ "special": true
2091
+ },
2092
+ "128261": {
2093
+ "content": "<ref>",
2094
+ "lstrip": false,
2095
+ "normalized": false,
2096
+ "rstrip": false,
2097
+ "single_word": false,
2098
+ "special": true
2099
+ },
2100
+ "128262": {
2101
+ "content": "</ref>",
2102
+ "lstrip": false,
2103
+ "normalized": false,
2104
+ "rstrip": false,
2105
+ "single_word": false,
2106
+ "special": true
2107
+ },
2108
+ "128263": {
2109
+ "content": "<box>",
2110
+ "lstrip": false,
2111
+ "normalized": false,
2112
+ "rstrip": false,
2113
+ "single_word": false,
2114
+ "special": true
2115
+ },
2116
+ "128264": {
2117
+ "content": "</box>",
2118
+ "lstrip": false,
2119
+ "normalized": false,
2120
+ "rstrip": false,
2121
+ "single_word": false,
2122
+ "special": true
2123
+ },
2124
+ "128265": {
2125
+ "content": "<interval>",
2126
+ "lstrip": false,
2127
+ "normalized": false,
2128
+ "rstrip": false,
2129
+ "single_word": false,
2130
+ "special": true
2131
+ },
2132
+ "128266": {
2133
+ "content": "</interval>",
2134
+ "lstrip": false,
2135
+ "normalized": false,
2136
+ "rstrip": false,
2137
+ "single_word": false,
2138
+ "special": true
2139
+ }
2140
+ },
2141
+ "bos_token": "<|begin_of_text|>",
2142
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 16384,
+ "pad_token": "<|finetune_right_pad_id|>",
+ "tokenizer_class": "PreTrainedTokenizerFast"
+ }
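The tokenizer_config.json added above keeps the Llama 3.2 reserved special tokens and appends the multimodal markup tokens (`<img>`, `</img>`, `<IMG_CONTEXT>`, `<quad>`/`</quad>`, `<ref>`/`</ref>`, `<box>`/`</box>`, `<interval>`/`</interval>`) at ids 128256-128266, along with the Llama 3 chat template, `<|eot_id|>` as EOS, and a 16384-token `model_max_length`. The snippet below is only a minimal sketch of how such a tokenizer could be loaded and inspected with `transformers`; the repo id is a hypothetical placeholder, not part of this commit.

```python
# Minimal sketch; assumes transformers >= 4.37 and a placeholder repo id.
from transformers import AutoTokenizer

repo_id = "path/or/repo-id-of-this-checkpoint"  # hypothetical placeholder
tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=True)

# The markup tokens registered at ids 128256+ should map to single ids
# rather than being split by the BPE, because they are special tokens.
for tok in ["<img>", "</img>", "<IMG_CONTEXT>", "<box>", "</box>"]:
    print(tok, tokenizer.convert_tokens_to_ids(tok))

# The "chat_template" field above drives apply_chat_template.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Describe the image in detail."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```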
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 1.0,
+ "train_loss": 0.18843612657266906,
+ "train_runtime": 10275.4061,
+ "train_samples": 9129380,
+ "train_samples_per_second": 888.469,
+ "train_steps_per_second": 1.132
+ }
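train_results.json records the run-level aggregates from the Trainer: one epoch over 9,129,380 samples in roughly 10,275 seconds, i.e. about 888 samples per second at 1.132 optimizer steps per second. A small consistency check of those derived rates, assuming the file has been downloaded locally:

```python
import json

# Sanity-check the throughput numbers reported in train_results.json.
with open("train_results.json") as f:
    stats = json.load(f)

# 9129380 / 10275.4061 should reproduce the reported ~888.47 samples/s.
samples_per_sec = stats["train_samples"] / stats["train_runtime"]
# Ratio of the two reported rates gives the implied samples per optimizer step (~785).
samples_per_step = stats["train_samples_per_second"] / stats["train_steps_per_second"]

print(f"recomputed samples/s: {samples_per_sec:.3f} (reported {stats['train_samples_per_second']})")
print(f"implied samples/step: {samples_per_step:.1f}")
```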
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
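trainer_state.json is only referenced here because it is too large for the diff view. When fetched from the repository it should follow the standard Hugging Face Trainer state schema (`global_step`, `epoch`, and a `log_history` list of per-logging-step records); that schema is an assumption, since the file contents are not shown in this diff. A hedged sketch for pulling the loss trace out of a locally downloaded copy:

```python
import json

# Assumes the standard Trainer format: "log_history" holds per-logging-step
# dicts, and entries with a "loss" key are training-loss logs.
with open("trainer_state.json") as f:
    state = json.load(f)

loss_points = [(rec["step"], rec["loss"]) for rec in state["log_history"] if "loss" in rec]
print(f"global_step: {state['global_step']}, logged loss points: {len(loss_points)}")
print("last few:", loss_points[-3:])
```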
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:853b66556402ae9fdde7d46710aa0de2eef675e1dfdad3a54ba30df79f507594
+ size 6264
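training_args.bin is committed as a Git LFS pointer (the `version`/`oid`/`size` lines above stand in for a 6,264-byte binary). In Trainer checkpoints this file is normally a pickled `TrainingArguments` object, so it can be inspected roughly as sketched below; the repo id is a hypothetical placeholder, and unpickling should only be done for files you trust.

```python
# Sketch only: torch.load of a pickled TrainingArguments needs weights_only=False
# on recent PyTorch versions and executes pickle code, so trust the source first.
import torch
from huggingface_hub import hf_hub_download

repo_id = "path/or/repo-id-of-this-checkpoint"  # hypothetical placeholder
path = hf_hub_download(repo_id=repo_id, filename="training_args.bin")
training_args = torch.load(path, weights_only=False)
print(type(training_args).__name__)
print(training_args)
```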