prince-canuma committed

Commit 412f1f1 · verified · 1 Parent(s): 47939ab

Upload folder using huggingface_hub
README.md ADDED
@@ -0,0 +1,31 @@
+ ---
+ license: other
+ license_name: qwen
+ license_link: https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/LICENSE
+ pipeline_tag: image-text-to-text
+ library_name: transformers
+ base_model:
+ - OpenGVLab/InternVL3-1B-Instruct
+ base_model_relation: finetune
+ datasets:
+ - OpenGVLab/MMPR-v1.2
+ language:
+ - multilingual
+ tags:
+ - internvl
+ - custom_code
+ - mlx
+ ---
+
+ # mlx-community/Kimi-VL-A3B-Thinking-8bit
+ This model was converted to MLX format from [`moonshotai/Kimi-VL-A3B-Thinking`](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking) using mlx-vlm version **0.1.23**.
+ Refer to the [original model card](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking) for more details on the model.
+ ## Use with mlx
+
+ ```bash
+ pip install -U mlx-vlm
+ ```
+
+ ```bash
+ python -m mlx_vlm.generate --model mlx-community/Kimi-VL-A3B-Thinking-8bit --max-tokens 100 --temperature 0.0 --prompt "Describe this image." --image <path_to_image>
+ ```
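The same checkpoint can also be driven from Python. The sketch below follows the usage documented for recent mlx-vlm releases and is only an assumption for version 0.1.23 specifically (helper signatures have shifted between releases); `path/to/image.jpg` is a placeholder.

```python
# Minimal Python sketch for running this checkpoint with mlx-vlm.
# Assumes `pip install -U mlx-vlm`; exact signatures may differ by version.
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

model_path = "mlx-community/Kimi-VL-A3B-Thinking-8bit"
model, processor = load(model_path)
config = load_config(model_path)

image = ["path/to/image.jpg"]  # placeholder path
prompt = "Describe this image."

# Wrap the prompt in the model's chat template (see chat_template.jinja below).
formatted_prompt = apply_chat_template(processor, config, prompt, num_images=len(image))

output = generate(model, processor, formatted_prompt, image, verbose=False)
print(output)
```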
chat_template.jinja ADDED
@@ -0,0 +1,31 @@
+ {%- for message in messages -%}
+ {%- if loop.first and messages[0]['role'] != 'system' -%}
+ {{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}
+ {%- endif -%}
+ {%- if message['role'] == 'system' -%}
+ {{'<|im_system|>'}}
+ {%- endif -%}
+ {%- if message['role'] == 'user' -%}
+ {{'<|im_user|>'}}
+ {%- endif -%}
+ {%- if message['role'] == 'assistant' -%}
+ {{'<|im_assistant|>'}}
+ {%- endif -%}
+ {{- message['role'] -}}
+ {{'<|im_middle|>'}}
+ {%- if message['content'] is string -%}
+ {{- message['content'] + '<|im_end|>' -}}
+ {%- else -%}
+ {%- for content in message['content'] -%}
+ {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
+ {{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}
+ {%- else -%}
+ {{content['text']}}
+ {%- endif -%}
+ {%- endfor -%}
+ {{'<|im_end|>'}}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- if add_generation_prompt -%}
+ {{'<|im_assistant|>assistant<|im_middle|>'}}
+ {%- endif -%}
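As a quick sanity check of the layout this template produces, it can be rendered standalone with plain `jinja2` (normally `processor.apply_chat_template` does this for you). A small sketch, assuming a local checkout of this repo:

```python
# Render chat_template.jinja directly to inspect the token layout it produces.
from jinja2 import Template

with open("chat_template.jinja") as f:
    template = Template(f.read())

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

print(template.render(messages=messages, add_generation_prompt=True))
# -> <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>
#    <|im_user|>user<|im_middle|><|media_start|>image<|media_content|><|media_pad|><|media_end|>Describe this image.<|im_end|>
#    <|im_assistant|>assistant<|im_middle|>
# (printed as a single line; wrapped here for readability)
```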
config.json ADDED
@@ -0,0 +1,254 @@
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "KimiVLForConditionalGeneration"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_kimi_vl.KimiVLConfig",
9
+ "AutoModel": "modeling_kimi_vl.KimiVLForConditionalGeneration",
10
+ "AutoModelForCausalLM": "modeling_kimi_vl.KimiVLForConditionalGeneration"
11
+ },
12
+ "bad_words_ids": null,
13
+ "begin_suppress_tokens": null,
14
+ "bos_token_id": null,
15
+ "chunk_size_feed_forward": 0,
16
+ "cross_attention_hidden_size": null,
17
+ "decoder_start_token_id": null,
18
+ "diversity_penalty": 0.0,
19
+ "do_sample": false,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": null,
23
+ "exponential_decay_length_penalty": null,
24
+ "finetuning_task": null,
25
+ "forced_bos_token_id": null,
26
+ "forced_eos_token_id": null,
27
+ "id2label": {
28
+ "0": "LABEL_0",
29
+ "1": "LABEL_1"
30
+ },
31
+ "ignore_index": -100,
32
+ "is_decoder": false,
33
+ "is_encoder_decoder": false,
34
+ "label2id": {
35
+ "LABEL_0": 0,
36
+ "LABEL_1": 1
37
+ },
38
+ "length_penalty": 1.0,
39
+ "max_length": 20,
40
+ "media_placeholder_token_id": 163605,
41
+ "min_length": 0,
42
+ "model_type": "kimi_vl",
43
+ "no_repeat_ngram_size": 0,
44
+ "num_beam_groups": 1,
45
+ "num_beams": 1,
46
+ "num_return_sequences": 1,
47
+ "output_attentions": false,
48
+ "output_hidden_states": false,
49
+ "output_scores": false,
50
+ "pad_token_id": 0,
51
+ "prefix": null,
52
+ "problem_type": null,
53
+ "pruned_heads": {},
54
+ "quantization": {
55
+ "group_size": 64,
56
+ "bits": 8
57
+ },
58
+ "remove_invalid_values": false,
59
+ "repetition_penalty": 1.0,
60
+ "return_dict": true,
61
+ "return_dict_in_generate": false,
62
+ "sep_token_id": null,
63
+ "suppress_tokens": null,
64
+ "task_specific_params": null,
65
+ "temperature": 1.0,
66
+ "text_config": {
67
+ "vocab_size": 163840,
68
+ "max_position_embeddings": 131072,
69
+ "hidden_size": 2048,
70
+ "intermediate_size": 11264,
71
+ "moe_intermediate_size": 1408,
72
+ "num_hidden_layers": 27,
73
+ "num_nextn_predict_layers": 1,
74
+ "num_attention_heads": 16,
75
+ "n_shared_experts": 2,
76
+ "n_routed_experts": 64,
77
+ "ep_size": 1,
78
+ "routed_scaling_factor": 2.446,
79
+ "kv_lora_rank": 512,
80
+ "q_lora_rank": null,
81
+ "qk_rope_head_dim": 64,
82
+ "v_head_dim": 128,
83
+ "qk_nope_head_dim": 128,
84
+ "topk_method": "noaux_tc",
85
+ "n_group": 1,
86
+ "topk_group": 1,
87
+ "num_experts_per_tok": 6,
88
+ "moe_layer_freq": 1,
89
+ "first_k_dense_replace": 1,
90
+ "norm_topk_prob": true,
91
+ "scoring_func": "sigmoid",
92
+ "aux_loss_alpha": 0.001,
93
+ "seq_aux": true,
94
+ "num_key_value_heads": 16,
95
+ "hidden_act": "silu",
96
+ "initializer_range": 0.02,
97
+ "rms_norm_eps": 1e-05,
98
+ "pretraining_tp": 1,
99
+ "use_cache": true,
100
+ "rope_theta": 800000.0,
101
+ "rope_scaling": null,
102
+ "attention_bias": false,
103
+ "attention_dropout": 0.0,
104
+ "return_dict": true,
105
+ "output_hidden_states": false,
106
+ "output_attentions": false,
107
+ "torchscript": false,
108
+ "torch_dtype": "bfloat16",
109
+ "use_bfloat16": false,
110
+ "tf_legacy_loss": false,
111
+ "pruned_heads": {},
112
+ "tie_word_embeddings": false,
113
+ "chunk_size_feed_forward": 0,
114
+ "is_encoder_decoder": false,
115
+ "is_decoder": false,
116
+ "cross_attention_hidden_size": null,
117
+ "add_cross_attention": false,
118
+ "tie_encoder_decoder": false,
119
+ "max_length": 20,
120
+ "min_length": 0,
121
+ "do_sample": false,
122
+ "early_stopping": false,
123
+ "num_beams": 1,
124
+ "num_beam_groups": 1,
125
+ "diversity_penalty": 0.0,
126
+ "temperature": 1.0,
127
+ "top_k": 50,
128
+ "top_p": 1.0,
129
+ "typical_p": 1.0,
130
+ "repetition_penalty": 1.0,
131
+ "length_penalty": 1.0,
132
+ "no_repeat_ngram_size": 0,
133
+ "encoder_no_repeat_ngram_size": 0,
134
+ "bad_words_ids": null,
135
+ "num_return_sequences": 1,
136
+ "output_scores": false,
137
+ "return_dict_in_generate": false,
138
+ "forced_bos_token_id": null,
139
+ "forced_eos_token_id": null,
140
+ "remove_invalid_values": false,
141
+ "exponential_decay_length_penalty": null,
142
+ "suppress_tokens": null,
143
+ "begin_suppress_tokens": null,
144
+ "architectures": null,
145
+ "finetuning_task": null,
146
+ "id2label": {
147
+ "0": "LABEL_0",
148
+ "1": "LABEL_1"
149
+ },
150
+ "label2id": {
151
+ "LABEL_0": 0,
152
+ "LABEL_1": 1
153
+ },
154
+ "tokenizer_class": null,
155
+ "prefix": null,
156
+ "bos_token_id": 163584,
157
+ "pad_token_id": 163839,
158
+ "eos_token_id": 163585,
159
+ "sep_token_id": null,
160
+ "decoder_start_token_id": null,
161
+ "task_specific_params": null,
162
+ "problem_type": null,
163
+ "_name_or_path": "",
164
+ "_attn_implementation_autoset": false,
165
+ "model_type": "deepseek_v3"
166
+ },
167
+ "tf_legacy_loss": false,
168
+ "tie_encoder_decoder": false,
169
+ "tie_word_embeddings": false,
170
+ "tokenizer_class": null,
171
+ "top_k": 50,
172
+ "top_p": 1.0,
173
+ "torchscript": false,
174
+ "transformers_version": "4.52.0.dev0",
175
+ "typical_p": 1.0,
176
+ "use_bfloat16": false,
177
+ "vision_config": {
178
+ "return_dict": true,
179
+ "output_hidden_states": false,
180
+ "output_attentions": false,
181
+ "torchscript": false,
182
+ "torch_dtype": "bfloat16",
183
+ "use_bfloat16": false,
184
+ "tf_legacy_loss": false,
185
+ "pruned_heads": {},
186
+ "tie_word_embeddings": true,
187
+ "chunk_size_feed_forward": 0,
188
+ "is_encoder_decoder": false,
189
+ "is_decoder": false,
190
+ "cross_attention_hidden_size": null,
191
+ "add_cross_attention": false,
192
+ "tie_encoder_decoder": false,
193
+ "max_length": 20,
194
+ "min_length": 0,
195
+ "do_sample": false,
196
+ "early_stopping": false,
197
+ "num_beams": 1,
198
+ "num_beam_groups": 1,
199
+ "diversity_penalty": 0.0,
200
+ "temperature": 1.0,
201
+ "top_k": 50,
202
+ "top_p": 1.0,
203
+ "typical_p": 1.0,
204
+ "repetition_penalty": 1.0,
205
+ "length_penalty": 1.0,
206
+ "no_repeat_ngram_size": 0,
207
+ "encoder_no_repeat_ngram_size": 0,
208
+ "bad_words_ids": null,
209
+ "num_return_sequences": 1,
210
+ "output_scores": false,
211
+ "return_dict_in_generate": false,
212
+ "forced_bos_token_id": null,
213
+ "forced_eos_token_id": null,
214
+ "remove_invalid_values": false,
215
+ "exponential_decay_length_penalty": null,
216
+ "suppress_tokens": null,
217
+ "begin_suppress_tokens": null,
218
+ "architectures": null,
219
+ "finetuning_task": null,
220
+ "id2label": {
221
+ "0": "LABEL_0",
222
+ "1": "LABEL_1"
223
+ },
224
+ "label2id": {
225
+ "LABEL_0": 0,
226
+ "LABEL_1": 1
227
+ },
228
+ "tokenizer_class": null,
229
+ "prefix": null,
230
+ "bos_token_id": null,
231
+ "pad_token_id": null,
232
+ "eos_token_id": null,
233
+ "sep_token_id": null,
234
+ "decoder_start_token_id": null,
235
+ "task_specific_params": null,
236
+ "problem_type": null,
237
+ "_name_or_path": "",
238
+ "_attn_implementation_autoset": false,
239
+ "model_type": "moonvit",
240
+ "patch_size": 14,
241
+ "init_pos_emb_height": 64,
242
+ "init_pos_emb_width": 64,
243
+ "num_hidden_layers": 27,
244
+ "num_attention_heads": 16,
245
+ "hidden_size": 1152,
246
+ "intermediate_size": 4304,
247
+ "merge_kernel_size": [
248
+ 2,
249
+ 2
250
+ ],
251
+ "skip_vision": true
252
+ },
253
+ "vocab_size": 163840
254
+ }
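The interesting entries in this file are the MLX quantization block and the nested text/vision configs. A short sketch of reading them back, assuming a local download of this repository (e.g. via `huggingface_hub.snapshot_download`):

```python
import json

# Inspect the converted checkpoint's config (path assumes a local checkout).
with open("config.json") as f:
    cfg = json.load(f)

# 8-bit MLX quantization with a group size of 64, as written by mlx-vlm.
print(cfg["quantization"])  # {'group_size': 64, 'bits': 8}

text_cfg, vision_cfg = cfg["text_config"], cfg["vision_config"]
print(text_cfg["model_type"], vision_cfg["model_type"])  # deepseek_v3 moonvit

# The language tower is a DeepSeek-V3-style MoE: 64 routed + 2 shared experts,
# 6 routed experts activated per token, 27 decoder layers.
print(text_cfg["n_routed_experts"], text_cfg["n_shared_experts"],
      text_cfg["num_experts_per_tok"], text_cfg["num_hidden_layers"])
```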
configuration_kimi_vl.py ADDED
@@ -0,0 +1,284 @@
1
+ from transformers.configuration_utils import PretrainedConfig
2
+ from transformers.utils import logging
3
+ from typing import Optional, Union
4
+
5
+ logger = logging.get_logger(__name__)
6
+
7
+ DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
8
+
9
+
10
+ class DeepseekV3Config(PretrainedConfig):
11
+ r"""
12
+ This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a DeepSeek
13
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
14
+ defaults will yield a similar configuration to that of the DeepSeek-V3.
15
+
16
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
17
+ documentation from [`PretrainedConfig`] for more information.
18
+
19
+ Copied from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/configuration_deepseek.py
20
+
21
+ Args:
22
+ vocab_size (`int`, *optional*, defaults to 129280):
23
+ Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
24
+ `inputs_ids` passed when calling [`DeepseekV3Model`]
25
+ hidden_size (`int`, *optional*, defaults to 4096):
26
+ Dimension of the hidden representations.
27
+ intermediate_size (`int`, *optional*, defaults to 11008):
28
+ Dimension of the MLP representations.
29
+ moe_intermediate_size (`int`, *optional*, defaults to 1407):
30
+ Dimension of the MoE representations.
31
+ num_hidden_layers (`int`, *optional*, defaults to 32):
32
+ Number of hidden layers in the Transformer decoder.
33
+ num_nextn_predict_layers (`int`, *optional*, defaults to 1):
34
+ Number of nextn predict layers in the DeepSeekV3 Model.
35
+ num_attention_heads (`int`, *optional*, defaults to 32):
36
+ Number of attention heads for each attention layer in the Transformer decoder.
37
+ n_shared_experts (`int`, *optional*, defaults to None):
38
+ Number of shared experts, None means dense model.
39
+ n_routed_experts (`int`, *optional*, defaults to None):
40
+ Number of routed experts, None means dense model.
41
+ routed_scaling_factor (`float`, *optional*, defaults to 1.0):
42
+ Scaling factor for routed experts.
43
+ topk_method (`str`, *optional*, defaults to `greedy`):
44
+ Topk method used in routed gate.
45
+ n_group (`int`, *optional*, defaults to None):
46
+ Number of groups for routed experts.
47
+ topk_group (`int`, *optional*, defaults to None):
48
+ Number of selected groups for each token (ensuring that the selected experts are only within `topk_group` groups).
49
+ num_experts_per_tok (`int`, *optional*, defaults to None):
50
+ Number of selected experts, None means dense model.
51
+ moe_layer_freq (`int`, *optional*, defaults to 1):
52
+ The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
53
+ first_k_dense_replace (`int`, *optional*, defaults to 0):
54
+ Number of dense layers in shallow layers (embed->dense->dense->...->dense->moe->moe...->lm_head).
55
+ \--k dense layers--/
56
+ norm_topk_prob (`bool`, *optional*, defaults to False):
57
+ Whether to normalize the weights of the routed experts.
58
+ scoring_func (`str`, *optional*, defaults to 'softmax'):
59
+ Method of computing expert weights.
60
+ aux_loss_alpha (`float`, *optional*, defaults to 0.001):
61
+ Auxiliary loss weight coefficient.
62
+ seq_aux (`bool`, *optional*, defaults to `True`):
63
+ Whether to compute the auxiliary loss for each individual sample.
64
+ num_key_value_heads (`int`, *optional*):
65
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
66
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
67
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
68
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
69
+ by meanpooling all the original heads within that group. For more details checkout [this
70
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
71
+ `num_attention_heads`.
72
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
73
+ The non-linear activation function (function or string) in the decoder.
74
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
75
+ The maximum sequence length that this model might ever be used with.
76
+ initializer_range (`float`, *optional*, defaults to 0.02):
77
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
78
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
79
+ The epsilon used by the rms normalization layers.
80
+ use_cache (`bool`, *optional*, defaults to `True`):
81
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
82
+ relevant if `config.is_decoder=True`.
83
+ pad_token_id (`int`, *optional*):
84
+ Padding token id.
85
+ bos_token_id (`int`, *optional*, defaults to 1):
86
+ Beginning of stream token id.
87
+ eos_token_id (`int`, *optional*, defaults to 2):
88
+ End of stream token id.
89
+ pretraining_tp (`int`, *optional*, defaults to 1):
90
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
91
+ document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
92
+ necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
93
+ issue](https://github.com/pytorch/pytorch/issues/76232).
94
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
95
+ Whether to tie weight embeddings
96
+ rope_theta (`float`, *optional*, defaults to 10000.0):
97
+ The base period of the RoPE embeddings.
98
+ rope_scaling (`Dict`, *optional*):
99
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
100
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
101
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
102
+ `max_position_embeddings` to the expected new maximum.
103
+ attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
104
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
105
+ attention_dropout (`float`, *optional*, defaults to 0.0):
106
+ The dropout ratio for the attention probabilities.
107
+
108
+ ```python
109
+ >>> from transformers import DeepseekV3Model, DeepseekV3Config
110
+
111
+ >>> # Initializing a Deepseek-V3 style configuration
112
+ >>> configuration = DeepseekV3Config()
113
+
114
+ >>> # Accessing the model configuration
115
+ >>> configuration = model.config
116
+ ```"""
117
+
118
+ model_type = "deepseek_v3"
119
+ keys_to_ignore_at_inference = ["past_key_values"]
120
+
121
+ def __init__(
122
+ self,
123
+ vocab_size=129280,
124
+ hidden_size=7168,
125
+ intermediate_size=18432,
126
+ moe_intermediate_size=2048,
127
+ num_hidden_layers=61,
128
+ num_nextn_predict_layers=1,
129
+ num_attention_heads=128,
130
+ num_key_value_heads=128,
131
+ n_shared_experts=1,
132
+ n_routed_experts=256,
133
+ ep_size=1,
134
+ routed_scaling_factor=2.5,
135
+ kv_lora_rank=512,
136
+ q_lora_rank=1536,
137
+ qk_rope_head_dim=64,
138
+ v_head_dim=128,
139
+ qk_nope_head_dim=128,
140
+ topk_method="noaux_tc",
141
+ n_group=8,
142
+ topk_group=4,
143
+ num_experts_per_tok=8,
144
+ moe_layer_freq=1,
145
+ first_k_dense_replace=3,
146
+ norm_topk_prob=True,
147
+ scoring_func="sigmoid",
148
+ aux_loss_alpha=0.001,
149
+ seq_aux=True,
150
+ hidden_act="silu",
151
+ max_position_embeddings=4096,
152
+ initializer_range=0.02,
153
+ rms_norm_eps=1e-6,
154
+ use_cache=True,
155
+ pad_token_id=None,
156
+ bos_token_id=0,
157
+ eos_token_id=1,
158
+ pretraining_tp=1,
159
+ tie_word_embeddings=False,
160
+ rope_theta=10000.0,
161
+ rope_scaling=None,
162
+ attention_bias=False,
163
+ attention_dropout=0.0,
164
+ **kwargs,
165
+ ):
166
+ self.vocab_size = vocab_size
167
+ self.max_position_embeddings = max_position_embeddings
168
+ self.hidden_size = hidden_size
169
+ self.intermediate_size = intermediate_size
170
+ self.moe_intermediate_size = moe_intermediate_size
171
+ self.num_hidden_layers = num_hidden_layers
172
+ self.num_nextn_predict_layers = num_nextn_predict_layers
173
+ self.num_attention_heads = num_attention_heads
174
+ self.n_shared_experts = n_shared_experts
175
+ self.n_routed_experts = n_routed_experts
176
+ self.ep_size = ep_size
177
+ self.routed_scaling_factor = routed_scaling_factor
178
+ self.kv_lora_rank = kv_lora_rank
179
+ self.q_lora_rank = q_lora_rank
180
+ self.qk_rope_head_dim = qk_rope_head_dim
181
+ self.v_head_dim = v_head_dim
182
+ self.qk_nope_head_dim = qk_nope_head_dim
183
+ self.topk_method = topk_method
184
+ self.n_group = n_group
185
+ self.topk_group = topk_group
186
+ self.num_experts_per_tok = num_experts_per_tok
187
+ self.moe_layer_freq = moe_layer_freq
188
+ self.first_k_dense_replace = first_k_dense_replace
189
+ self.norm_topk_prob = norm_topk_prob
190
+ self.scoring_func = scoring_func
191
+ self.aux_loss_alpha = aux_loss_alpha
192
+ self.seq_aux = seq_aux
193
+ # for backward compatibility
194
+ if num_key_value_heads is None:
195
+ num_key_value_heads = num_attention_heads
196
+
197
+ self.num_key_value_heads = num_key_value_heads
198
+ self.hidden_act = hidden_act
199
+ self.initializer_range = initializer_range
200
+ self.rms_norm_eps = rms_norm_eps
201
+ self.pretraining_tp = pretraining_tp
202
+ self.use_cache = use_cache
203
+ self.rope_theta = rope_theta
204
+ self.rope_scaling = rope_scaling
205
+ self.attention_bias = attention_bias
206
+ self.attention_dropout = attention_dropout
207
+
208
+ super().__init__(
209
+ pad_token_id=pad_token_id,
210
+ bos_token_id=bos_token_id,
211
+ eos_token_id=eos_token_id,
212
+ tie_word_embeddings=tie_word_embeddings,
213
+ **kwargs,
214
+ )
215
+
216
+
217
+ class MoonViTConfig(PretrainedConfig):
218
+ model_type = "moonvit"
219
+
220
+ def __init__(
221
+ self,
222
+ patch_size: int = 14,
223
+ init_pos_emb_height: int = 64,
224
+ init_pos_emb_width: int = 64,
225
+ num_attention_heads: int = 16,
226
+ num_hidden_layers: int = 27,
227
+ hidden_size: int = 1152,
228
+ intermediate_size: int = 4304,
229
+ merge_kernel_size: tuple[int, int] = (2, 2),
230
+ **kwargs,
231
+ ):
232
+ super().__init__(**kwargs)
233
+ self.patch_size = patch_size
234
+ # Positional embedding config
235
+ self.init_pos_emb_height = init_pos_emb_height
236
+ self.init_pos_emb_width = init_pos_emb_width
237
+ # Transformer config
238
+ self.num_hidden_layers = num_hidden_layers
239
+ self.num_attention_heads = num_attention_heads
240
+ self.hidden_size = hidden_size
241
+ self.intermediate_size = intermediate_size
242
+ # Patch merger config
243
+ self.merge_kernel_size = merge_kernel_size
244
+
245
+
246
+ class KimiVLConfig(PretrainedConfig):
247
+ model_type = "kimi_vl"
248
+
249
+ def __init__(
250
+ self,
251
+ vision_config: Optional[Union[dict, MoonViTConfig]] = None,
252
+ text_config: Optional[Union[dict, DeepseekV3Config]] = None,
253
+ ignore_index: int = -100,
254
+ media_placeholder_token_id: int = 163605,
255
+ pad_token_id: int = 0,
256
+ **kwargs,
257
+ ):
258
+ if vision_config is None:
259
+ vision_config = MoonViTConfig()
260
+ elif isinstance(vision_config, dict):
261
+ vision_config = MoonViTConfig(**vision_config)
262
+ self.vision_config = vision_config
263
+
264
+ if text_config is None:
265
+ text_config = DeepseekV3Config()
266
+ elif isinstance(text_config, dict):
267
+ text_config = DeepseekV3Config(**text_config)
268
+ self.text_config = text_config
269
+
270
+ self.ignore_index = ignore_index
271
+ self.media_placeholder_token_id = media_placeholder_token_id
272
+
273
+ attn_implementation = kwargs.get("attn_implementation")
274
+ if attn_implementation is not None:
275
+ if attn_implementation in ["eager", "flash_attention_2"]:
276
+ self._attn_implementation = attn_implementation
277
+ self.vision_config._attn_implementation = attn_implementation
278
+ self.text_config._attn_implementation = attn_implementation
279
+ else:
280
+ raise ValueError(
281
+ f"Invalid attention implementation: {attn_implementation}"
282
+ )
283
+
284
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
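A brief sketch of how these three config classes compose (run from a local checkout so the module above is importable; the values passed here are illustrative, not the checkpoint's full config):

```python
# Sub-configs may be passed as plain dicts; KimiVLConfig promotes them to
# MoonViTConfig / DeepseekV3Config in __init__.
from configuration_kimi_vl import KimiVLConfig

config = KimiVLConfig(
    vision_config={"patch_size": 14, "hidden_size": 1152},
    text_config={"hidden_size": 2048, "num_hidden_layers": 27},
)

print(type(config.vision_config).__name__)  # MoonViTConfig
print(type(config.text_config).__name__)    # DeepseekV3Config
print(config.media_placeholder_token_id)    # 163605, the <|media_pad|> token id
```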
image_processing_kimi_vl.py ADDED
@@ -0,0 +1,126 @@
1
+ """Image processor class for KimiVL."""
2
+
3
+ import math
4
+ import numpy as np
5
+ from PIL import Image
6
+ from typing import Optional, Union
7
+
8
+ import torch
9
+ from torchvision.transforms import functional as TF
10
+ from transformers.image_utils import ImageInput, make_list_of_images, valid_images
11
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
12
+ from transformers.utils import TensorType
13
+
14
+
15
+ OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
16
+ OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
17
+
18
+
19
+ class KimiVLImageProcessor(BaseImageProcessor):
20
+ model_type = "kimi_vl"
21
+
22
+ def __init__(
23
+ self,
24
+ patch_size: int = 14,
25
+ pad_input: bool = False,
26
+ image_mean: tuple[float, float, float] = OPENAI_DATASET_MEAN,
27
+ image_std: tuple[float, float, float] = OPENAI_DATASET_STD,
28
+ in_token_limit: int = 4096,
29
+ merge_kernel_size: list[int, int] = [2, 2],
30
+ **kwargs,
31
+ ):
32
+ super().__init__(**kwargs)
33
+ self.in_token_limit = in_token_limit
34
+ self.patch_size = patch_size
35
+ self.pad_input = pad_input
36
+ self.image_mean = image_mean
37
+ self.image_std = image_std
38
+ self.merge_kernel_size = merge_kernel_size
39
+
40
+ def rescale(
41
+ self, image: Image.Image, merge_kernel_size: list[int, int] = [2, 2]
42
+ ) -> Image.Image:
43
+ w, h = image.size
44
+ patch_size = self.patch_size
45
+
46
+ if (w // patch_size) * (h // patch_size) > self.in_token_limit:
47
+ scale = math.sqrt(self.in_token_limit / ((w // patch_size) * (h // patch_size)))
48
+ new_w, new_h = int(w * scale), int(h * scale)
49
+ image = image.resize((new_w, new_h), Image.Resampling.BICUBIC)
50
+ if self.pad_input:
51
+ new_w, new_h = image.size
52
+ pad_size_h = merge_kernel_size[0] * patch_size
53
+ pad_size_w = merge_kernel_size[1] * patch_size
54
+
55
+ pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
56
+ pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w
57
+
58
+ image = TF.pad(image, (0, 0, pad_w, pad_h))
59
+ else:
60
+ new_w, new_h = image.size
61
+ new_w = new_w - new_w % patch_size
62
+ new_h = new_h - new_h % patch_size
63
+ image = TF.center_crop(image, (new_h, new_w))
64
+
65
+ w, h = image.size
66
+ if w // patch_size >= 512 or h // patch_size >= 512:
67
+ raise ValueError("Exceed pos emb")
68
+
69
+ return image
70
+
71
+ def to_tensor(self, image: Image.Image) -> torch.Tensor:
72
+ return TF.to_tensor(image.convert("RGB"))
73
+
74
+ def normalize(self, image: torch.Tensor) -> torch.Tensor:
75
+ return TF.normalize(image, self.image_mean, self.image_std)
76
+
77
+ def patchify(self, image: torch.Tensor) -> tuple[torch.Tensor, list[int, int]]:
78
+ patch_size = self.patch_size
79
+ C, H, W = image.shape
80
+ patches = image.reshape(C, H // patch_size, patch_size, W // patch_size, patch_size)
81
+ patches = patches.permute(1, 3, 0, 2, 4)
82
+ patches = patches.contiguous().view(-1, C, patch_size, patch_size)
83
+ grid_hw = (H // patch_size, W // patch_size)
84
+ return patches, grid_hw
85
+
86
+ def _preprocess(self, image: ImageInput) -> tuple[torch.Tensor, list[int, int]]:
87
+ """
88
+ Preprocess image and patchify it.
89
+
90
+ Args:
91
+ image (`ImageInput`):
92
+ Image to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
93
+
94
+ Returns:
95
+ patches: torch.Tensor
96
+ grid_hw: list[int, int]
97
+ """
98
+ image = self.rescale(image, self.merge_kernel_size)
99
+ image = self.to_tensor(image)
100
+ image = self.normalize(image)
101
+ patches, grid_hw = self.patchify(image)
102
+ return patches, grid_hw
103
+
104
+ def preprocess(
105
+ self,
106
+ images: ImageInput,
107
+ return_tensors: Optional[Union[str, TensorType]] = None,
108
+ ) -> BatchFeature:
109
+ images = make_list_of_images(images)
110
+
111
+ if not valid_images(images):
112
+ raise ValueError(
113
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
114
+ "torch.Tensor, tf.Tensor or jax.ndarray."
115
+ )
116
+
117
+ pixel_values, image_grid_hws = [], []
118
+ for image in images:
119
+ patches, image_grid_hw = self._preprocess(image)
120
+ pixel_values.append(patches)
121
+ image_grid_hws.append(image_grid_hw)
122
+ pixel_values = torch.concat(pixel_values, dim=0)
123
+ image_grid_hws = np.array(image_grid_hws)
124
+ data = {"pixel_values": pixel_values, "image_grid_hws": image_grid_hws}
125
+
126
+ return BatchFeature(data=data, tensor_type=return_tensors)
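A minimal sketch of what the processor above returns for a single image (requires the torch/torchvision dependencies it imports; the image size is chosen only to make the patch arithmetic easy to follow):

```python
# Patchify one RGB image with KimiVLImageProcessor and inspect the shapes.
from PIL import Image
from image_processing_kimi_vl import KimiVLImageProcessor

image = Image.new("RGB", (448, 336))  # synthetic 448 x 336 image
processor = KimiVLImageProcessor(patch_size=14, pad_input=True)

out = processor.preprocess(image, return_tensors="pt")

# 448 / 14 = 32 patches wide, 336 / 14 = 24 patches high -> 768 patches total.
print(out["image_grid_hws"])      # tensor([[24, 32]])
print(out["pixel_values"].shape)  # torch.Size([768, 3, 14, 14])
```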
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14e428f7c4ab5e763d70c5ada02868b6761ad922cd75fcdf850f939b729f979a
+ size 5218358149
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b4533e6c41231f0001352c121a1d4caba56f10f55eb5272dccb4c76776875c3
+ size 5364694549
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51d01d099a0249813602c15e4aec63fa22de88ac9a6d6108b35c281b07b3bc0a
+ size 5201366026
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:714089135d3023904e54c5083c2cc57bf1735e8aeb5629bd1bf21279a635274c
+ size 2043016397
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_kimi_vl.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "auto_map": {
+ "AutoImageProcessor": "image_processing_kimi_vl.KimiVLImageProcessor",
+ "AutoProcessor": "processing_kimi_vl.KimiVLProcessor"
+ },
+ "image_mean": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "image_processor_type": "KimiVLImageProcessor",
+ "image_std": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "in_token_limit": 4096,
+ "merge_kernel_size": [
+ 2,
+ 2
+ ],
+ "num_pooled_tokens": 1024,
+ "pad_input": true,
+ "patch_size": 14,
+ "processor_class": "KimiVLProcessor"
+ }
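The `auto_map` entries above are what let `transformers` resolve the custom classes shipped in this repo. A short sketch (needs network access and `trust_remote_code=True`, since the code is loaded from the repository rather than from `transformers` itself):

```python
from transformers import AutoProcessor

# Resolves KimiVLImageProcessor / KimiVLProcessor via the auto_map above.
processor = AutoProcessor.from_pretrained(
    "mlx-community/Kimi-VL-A3B-Thinking-8bit",
    trust_remote_code=True,
)
print(type(processor).__name__)  # KimiVLProcessor
```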
processing_kimi_vl.py ADDED
@@ -0,0 +1,170 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The Moonshot Team and HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # The code is based on the Qwen2VL processor (qwen2_vl/processing_qwen2_vl.py), but modified for KimiVL.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """
18
+ Processor class for KimiVL.
19
+ """
20
+
21
+ from typing import List, Union
22
+
23
+ from transformers.feature_extraction_utils import BatchFeature
24
+ from transformers.image_utils import ImageInput
25
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
26
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
27
+ from transformers.utils import logging
28
+
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ class KimiVLProcessorKwargs(ProcessingKwargs, total=False):
34
+ _defaults = {
35
+ "text_kwargs": {
36
+ "padding": False,
37
+ },
38
+ "images_kwargs": {},
39
+ }
40
+
41
+
42
+ class KimiVLProcessor(ProcessorMixin):
43
+ r"""
44
+ Constructs a KimiVL processor which wraps a KimiVL image processor and a tokenizer into a single processor.
45
+
46
+ [`KimiVLProcessor`] offers all the functionalities of [`KimiVLImageProcessor`] and [`TikTokenTokenizer`]. See the
47
+ [`~KimiVLProcessor.__call__`] and [`~KimiVLProcessor.decode`] for more information.
48
+
49
+ Args:
50
+ image_processor ([`KimiVLImageProcessor`], *optional*):
51
+ The image processor is a required input.
52
+ tokenizer ([`TikTokenTokenizer`], *optional*):
53
+ The tokenizer is a required input.
54
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
55
+ in a chat into a tokenizable string.
56
+ """
57
+
58
+ attributes = ["image_processor", "tokenizer"]
59
+ valid_kwargs = ["chat_template"]
60
+ image_processor_class = "AutoImageProcessor"
61
+ tokenizer_class = "AutoTokenizer"
62
+
63
+ def __init__(
64
+ self,
65
+ image_processor=None,
66
+ tokenizer=None,
67
+ chat_template=None,
68
+ **kwargs,
69
+ ):
70
+ self.image_token = "<|media_pad|>"
71
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
72
+
73
+ def __call__(
74
+ self,
75
+ images: ImageInput = None,
76
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
77
+ **kwargs: Unpack[KimiVLProcessorKwargs],
78
+ ) -> BatchFeature:
79
+ """
80
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
81
+ and `kwargs` arguments to TikTokenTokenizer's [`~TikTokenTokenizer.__call__`] if `text` is not `None` to encode
82
+ the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
83
+ CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
84
+ of the above two methods for more information.
85
+
86
+ Args:
87
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
88
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
89
+ tensor. Both channels-first and channels-last formats are supported.
90
+ text (`str`, `List[str]`, `List[List[str]]`):
91
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
92
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
93
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
94
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
95
+ If set, will return tensors of a particular framework. Acceptable values are:
96
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
97
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
98
+ - `'np'`: Return NumPy `np.ndarray` objects.
99
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
100
+
101
+ Returns:
102
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
103
+
104
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
105
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
106
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
107
+ `None`).
108
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
109
+ """
110
+ if images is None and text is None:
111
+ raise ValueError("You have to specify at least one of `images` or `text`.")
112
+
113
+ # check if images and text inputs are reversed for BC
114
+ images, text = _validate_images_text_input_order(images, text)
115
+
116
+ output_kwargs = self._merge_kwargs(
117
+ KimiVLProcessorKwargs,
118
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
119
+ **kwargs,
120
+ )
121
+ if images is not None:
122
+ image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
123
+ image_grid_hws = image_inputs["image_grid_hws"]
124
+ else:
125
+ image_inputs = {}
126
+ image_grid_hws = None
127
+
128
+ if isinstance(text, str):
129
+ text = [text]
130
+ elif not isinstance(text, list) and not isinstance(text[0], str):
131
+ raise ValueError("Invalid input text. Please provide a string, or a list of strings")
132
+
133
+ if image_grid_hws is not None:
134
+ merge_length = self.image_processor.merge_kernel_size[0] * self.image_processor.merge_kernel_size[1]
135
+ index = 0
136
+ for i in range(len(text)):
137
+ while self.image_token in text[i]:
138
+ text[i] = text[i].replace(
139
+ self.image_token,
140
+ "<|placeholder|>" * (image_grid_hws[index].prod() // merge_length),
141
+ 1,
142
+ )
143
+ index += 1
144
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
145
+
146
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
147
+ return BatchFeature(data={**text_inputs, **image_inputs})
148
+
149
+ def batch_decode(self, *args, **kwargs):
150
+ """
151
+ This method forwards all its arguments to TikTokenTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
152
+ refer to the docstring of this method for more information.
153
+ """
154
+ return self.tokenizer.batch_decode(*args, **kwargs)
155
+
156
+ def decode(self, *args, **kwargs):
157
+ """
158
+ This method forwards all its arguments to TikTokenTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
159
+ the docstring of this method for more information.
160
+ """
161
+ return self.tokenizer.decode(*args, **kwargs)
162
+
163
+ @property
164
+ def model_input_names(self):
165
+ tokenizer_input_names = self.tokenizer.model_input_names
166
+ image_processor_input_names = self.image_processor.model_input_names
167
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
168
+
169
+
170
+ __all__ = ["KimiVLProcessorKwargs"]
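The only non-obvious step in `__call__` is how a single `<|media_pad|>` marker in the text expands to one pad token per merged patch, i.e. `grid_h * grid_w // (merge_h * merge_w)` tokens. A standalone sketch of just that arithmetic (no model required; the grid values are illustrative):

```python
import numpy as np

image_token = "<|media_pad|>"
merge_kernel_size = (2, 2)  # matches preprocessor_config.json
merge_length = merge_kernel_size[0] * merge_kernel_size[1]

# Grid the image processor would report for a 336x448 image (24x32 patches).
image_grid_hws = np.array([[24, 32]])

text = "<|media_start|>image<|media_content|><|media_pad|><|media_end|>Describe this image."

# Mirrors the replacement loop in KimiVLProcessor.__call__ above.
index = 0
while image_token in text:
    text = text.replace(
        image_token,
        "<|placeholder|>" * (image_grid_hws[index].prod() // merge_length),
        1,
    )
    index += 1
text = text.replace("<|placeholder|>", image_token)

print(text.count(image_token))  # 24 * 32 // 4 = 192 pad tokens for this image
```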
processor_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "auto_map": {
+ "AutoProcessor": "processing_kimi_vl.KimiVLProcessor"
+ },
+ "processor_class": "KimiVLProcessor"
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "additional_special_tokens": [
+ "<|im_end|>",
+ "<|im_user|>",
+ "<|im_assistant|>",
+ "<|im_system|>",
+ "<|im_middle|>",
+ "<|media_start|>",
+ "<|media_content|>",
+ "<|media_end|>",
+ "<|media_pad|>"
+ ],
+ "bos_token": {
+ "content": "[BOS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "[EOS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tiktoken.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
+ size 2795286
tokenization_moonshot.py ADDED
@@ -0,0 +1,312 @@
1
+ import os
2
+ import tiktoken
3
+
4
+ from logging import getLogger
5
+ from pathlib import Path
6
+ from typing import (
7
+ cast,
8
+ Tuple,
9
+ Dict,
10
+ Iterator,
11
+ List,
12
+ Union,
13
+ Optional,
14
+ )
15
+ from shutil import copyfile
16
+ from tiktoken.load import load_tiktoken_bpe
17
+ from tokenizers import AddedToken
18
+ from transformers.tokenization_utils import PreTrainedTokenizer
19
+ from transformers.utils import to_py_obj
20
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
21
+
22
+
23
+ logger = getLogger(__name__)
24
+ VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
25
+ SPIECE_UNDERLINE = "▁"
26
+
27
+
28
+ class TikTokenTokenizer(PreTrainedTokenizer):
29
+ """
30
+ Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
31
+
32
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
33
+ this superclass for more information regarding those methods.
34
+
35
+ Args:
36
+ vocab_file (`str`):
37
+ The path to the Tiktoken model file.
38
+ bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
39
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
40
+ eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
41
+ The end of sequence token.
42
+ unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[UNK]"`):
43
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
44
+ token instead. The second to last item in special_tokens.
45
+ pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[PAD]"`):
46
+ The token used for padding, for example when batching sequences of different lengths.
47
+ additional_special_tokens (list of `str`, *optional*):
48
+ A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
49
+ skipped when decoding if `skip_special_tokens` is set to `True`.
50
+ """
51
+
52
+ vocab_files_names = VOCAB_FILES_NAMES
53
+
54
+ model_input_names = ["input_ids", "attention_mask"]
55
+
56
+ special_tokens: Dict[str, int]
57
+
58
+ num_reserved_special_tokens = 256
59
+
60
+ pat_str = "|".join(
61
+ [
62
+ r"""[\p{Han}]+""",
63
+ r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
64
+ r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
65
+ r"""\p{N}{1,3}""",
66
+ r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
67
+ r"""\s*[\r\n]+""",
68
+ r"""\s+(?!\S)""",
69
+ r"""\s+""",
70
+ ]
71
+ )
72
+
73
+ def __init__(
74
+ self,
75
+ vocab_file,
76
+ bos_token: Union[str, AddedToken] = "[BOS]",
77
+ eos_token: Union[str, AddedToken] = "[EOS]",
78
+ unk_token: Union[str, AddedToken] = "[UNK]",
79
+ pad_token: Union[str, AddedToken] = "[PAD]",
80
+ additional_special_tokens: Optional[List[str]] = None,
81
+ added_tokens_decoder: Optional[dict] = None,
82
+ **kwargs,
83
+ ):
84
+ assert os.path.isfile(vocab_file), vocab_file
85
+ if additional_special_tokens is None:
86
+ additional_special_tokens = [
87
+ "<|im_end|>",
88
+ "<|im_middle|>",
89
+ "<|im_user|>",
90
+ "<|im_assistant|>",
91
+ "<|im_system|>",
92
+ ]
93
+ special_tokens_mapping = {
94
+ i: added_tokens_decoder[i].content for i in added_tokens_decoder
95
+ }
96
+
97
+ special_tokens = (
98
+ [str(bos_token), str(eos_token)]
99
+ + additional_special_tokens
100
+ + [str(unk_token), str(pad_token)]
101
+ )
102
+
103
+ self.vocab_file = vocab_file
104
+ mergeable_ranks = load_tiktoken_bpe(vocab_file)
105
+ num_base_tokens = len(mergeable_ranks)
106
+ self.special_tokens = {
107
+ special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
108
+ for i in range(
109
+ num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
110
+ )
111
+ }
112
+
113
+ self.model = tiktoken.Encoding(
114
+ name=Path(vocab_file).name,
115
+ pat_str=self.pat_str,
116
+ mergeable_ranks=mergeable_ranks,
117
+ special_tokens=self.special_tokens,
118
+ )
119
+ logger.info(f"Reloaded tiktoken model from {vocab_file}")
120
+
121
+ self.n_words: int = self.model.n_vocab
122
+ # BOS / EOS token IDs
123
+ self.bos_id: int = self.special_tokens[str(bos_token)]
124
+ self.eos_id: int = self.special_tokens[str(eos_token)]
125
+ logger.info(
126
+ f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
127
+ )
128
+
129
+ self.pad_id: int = self.special_tokens[str(pad_token)]
130
+ self.unk_id: int = self.special_tokens[str(unk_token)]
131
+
132
+ self.byte_encoder = bytes_to_unicode()
133
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
134
+
135
+ self.decoder = {}
136
+ for i in range(self.n_words):
137
+ # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
138
+ decoding = "".join(
139
+ [
140
+ self.byte_encoder[ord(char)]
141
+ for char in self.model.decode_single_token_bytes(i).decode(
142
+ "latin-1"
143
+ )
144
+ ]
145
+ )
146
+ self.decoder[i] = decoding
147
+
148
+ self.encoder = {}
149
+ for i in range(self.n_words):
150
+ if i in self.decoder:
151
+ self.encoder[self.decoder[i]] = i
152
+
153
+ super().__init__(
154
+ bos_token=bos_token,
155
+ eos_token=eos_token,
156
+ unk_token=unk_token,
157
+ pad_token=pad_token,
158
+ additional_special_tokens=additional_special_tokens,
159
+ **kwargs,
160
+ )
161
+ self.all_special_ids_set = set(self.all_special_ids)
162
+
163
+ def encode(
164
+ self, text: str, allow_special_tokens: bool = True, **kwargs
165
+ ) -> List[int]:
166
+ """
167
+ Encodes a string into a list of token IDs.
168
+
169
+ Args:
170
+ text (str): The input string to be encoded.
171
+
172
+ Returns:
173
+ list[int]: A list of token IDs.
174
+ """
175
+ # If there are other args, we should call super().encode because there is a lot of code
176
+ # to handle those args. super().encode eventually calls _tokenize and _convert_token_to_id.
177
+ if len(kwargs) > 0:
178
+ return super().encode(text, **kwargs)
179
+
180
+ assert type(text) is str
181
+
182
+ # The tiktoken tokenizer can handle <=400k chars without
183
+ # pyo3_runtime.PanicException.
184
+ TIKTOKEN_MAX_ENCODE_CHARS = 400_000
185
+
186
+ # https://github.com/openai/tiktoken/issues/195
187
+ # Here we iterate over subsequences and split if we exceed the limit
188
+ # of max consecutive non-whitespace or whitespace characters.
189
+ MAX_NO_WHITESPACES_CHARS = 25_000
190
+
191
+ substrs = (
192
+ substr
193
+ for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
194
+ for substr in self._split_whitespaces_or_nonwhitespaces(
195
+ text[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
196
+ )
197
+ )
198
+ t: List[int] = []
199
+ for substr in substrs:
200
+ if allow_special_tokens:
201
+ t.extend(
202
+ # we should consider special token as a common token
203
+ self.model.encode(
204
+ substr,
205
+ allowed_special="all",
206
+ )
207
+ )
208
+ else:
209
+ t.extend(
210
+ # we should consider special token as a common token
211
+ self.model.encode(
212
+ substr,
213
+ disallowed_special=(),
214
+ )
215
+ )
216
+ return t
217
+
218
+ def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
219
+ """
220
+ Decodes a list of token IDs into a string.
221
+
222
+ Args:
223
+ token_ids (`Union[int, List[int]]`): The token ID or list of token IDs to be decoded.
224
+
225
+ Returns:
226
+ str: The decoded string.
227
+ """
228
+ # If there are other args, we should call super().decode because there is a lot of code
229
+ # to handle those args. super().decode eventually calls convert_tokens_to_string and _convert_id_to_token.
230
+ if len(kwargs) > 0:
231
+ return super().decode(token_ids, **kwargs)
232
+
233
+ token_ids = to_py_obj(token_ids)
234
+
235
+ if type(token_ids) is int:
236
+ token_ids = [token_ids]
237
+
238
+ return self.model.decode(cast(List[int], token_ids))
239
+
240
+ @staticmethod
241
+ def _split_whitespaces_or_nonwhitespaces(
242
+ s: str, max_consecutive_slice_len: int
243
+ ) -> Iterator[str]:
244
+ """
245
+ Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
246
+ consecutive whitespaces or consecutive non-whitespaces.
247
+ """
248
+ current_slice_len = 0
249
+ current_slice_is_space = s[0].isspace() if len(s) > 0 else False
250
+ slice_start = 0
251
+
252
+ for i in range(len(s)):
253
+ is_now_space = s[i].isspace()
254
+
255
+ if current_slice_is_space ^ is_now_space:
256
+ current_slice_len = 1
257
+ current_slice_is_space = is_now_space
258
+ else:
259
+ current_slice_len += 1
260
+ if current_slice_len > max_consecutive_slice_len:
261
+ yield s[slice_start:i]
262
+ slice_start = i
263
+ current_slice_len = 1
264
+ yield s[slice_start:]
265
+
266
+ """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
267
+
268
+ @property
269
+ def vocab_size(self) -> int:
270
+ return self.n_words
271
+
272
+ def get_vocab(self) -> Dict[str, int]:
273
+ return self.encoder
274
+
275
+ def _tokenize(self, text: str, **kwargs) -> List[str]:
276
+ return [self.decoder[t] for t in self.encode(text)]
277
+
278
+ def _convert_token_to_id(self, token: str) -> int:
279
+ return self.encoder.get(token, self.unk_id)
280
+
281
+ def _convert_id_to_token(self, index: int) -> str:
282
+ return self.decoder.get(index)
283
+
284
+ @staticmethod
285
+ def clean_up_tokenization(out_string: str) -> str:
286
+ return out_string
287
+
288
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
289
+ text = "".join(tokens).replace(SPIECE_UNDERLINE, "")
290
+ text = bytearray([self.byte_decoder[c] for c in text]).decode(
291
+ "utf-8", "replace"
292
+ )
293
+ return text
294
+
295
+ def save_vocabulary(
296
+ self, save_directory: str, filename_prefix: Optional[str] = None
297
+ ) -> Tuple[str]:
298
+ if not os.path.isdir(save_directory):
299
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
300
+ return
301
+ out_vocab_file = os.path.join(
302
+ save_directory,
303
+ (filename_prefix + "-" if filename_prefix else "")
304
+ + VOCAB_FILES_NAMES["vocab_file"],
305
+ )
306
+
307
+ if os.path.abspath(self.vocab_file) != os.path.abspath(
308
+ out_vocab_file
309
+ ) and os.path.isfile(self.vocab_file):
310
+ copyfile(self.vocab_file, out_vocab_file)
311
+
312
+ return (out_vocab_file,)
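A short round-trip sketch for the tokenizer above (requires the `tiktoken` dependency and `trust_remote_code=True`, since the class is resolved from this repository via the `auto_map` in tokenizer_config.json):

```python
from transformers import AutoTokenizer

# TikTokenTokenizer is resolved through the auto_map in tokenizer_config.json.
tok = AutoTokenizer.from_pretrained(
    "mlx-community/Kimi-VL-A3B-Thinking-8bit",
    trust_remote_code=True,
)

text = "<|im_user|>user<|im_middle|>Hello<|im_end|>"

# encode() allows special tokens by default (allowed_special="all"), so the
# chat markers map to single ids; decode() round-trips the string.
ids = tok.encode(text)
print(len(ids), ids[:4])
print(tok.decode(ids) == text)  # True
```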
tokenizer_config.json ADDED
@@ -0,0 +1,135 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "163584": {
4
+ "content": "[BOS]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "163585": {
12
+ "content": "[EOS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "163586": {
20
+ "content": "<|im_end|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "163587": {
28
+ "content": "<|im_user|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "163588": {
36
+ "content": "<|im_assistant|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "163594": {
44
+ "content": "<|im_system|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "163601": {
52
+ "content": "<|im_middle|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "163602": {
60
+ "content": "<|media_start|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "163603": {
68
+ "content": "<|media_content|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "163604": {
76
+ "content": "<|media_end|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "163605": {
84
+ "content": "<|media_pad|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "163838": {
92
+ "content": "[PAD]",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "163839": {
100
+ "content": "[UNK]",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ }
107
+ },
108
+ "additional_special_tokens": [
109
+ "<|im_end|>",
110
+ "<|im_user|>",
111
+ "<|im_assistant|>",
112
+ "<|im_system|>",
113
+ "<|im_middle|>",
114
+ "<|media_start|>",
115
+ "<|media_content|>",
116
+ "<|media_end|>",
117
+ "<|media_pad|>"
118
+ ],
119
+ "auto_map": {
120
+ "AutoProcessor": "processing_kimi_vl.KimiVLProcessor",
121
+ "AutoTokenizer": [
122
+ "tokenization_moonshot.TikTokenTokenizer",
123
+ null
124
+ ]
125
+ },
126
+ "bos_token": "[BOS]",
127
+ "clean_up_tokenization_spaces": false,
128
+ "eos_token": "[EOS]",
129
+ "extra_special_tokens": {},
130
+ "model_max_length": 1048576,
131
+ "pad_token": "[PAD]",
132
+ "processor_class": "KimiVLProcessor",
133
+ "tokenizer_class": "TikTokenTokenizer",
134
+ "unk_token": "[UNK]"
135
+ }