wenhuach committed on
Commit
1537268
·
1 Parent(s): b92fb61

add model file

Browse files

Signed-off-by: wenhuach <[email protected]>

Files changed (50) hide show
  1. configuration_deepseek.py +210 -0
  2. generation_config.json +9 -0
  3. model-00001-of-00042.safetensors +3 -0
  4. model-00002-of-00042.safetensors +3 -0
  5. model-00003-of-00042.safetensors +3 -0
  6. model-00004-of-00042.safetensors +3 -0
  7. model-00005-of-00042.safetensors +3 -0
  8. model-00006-of-00042.safetensors +3 -0
  9. model-00007-of-00042.safetensors +3 -0
  10. model-00008-of-00042.safetensors +3 -0
  11. model-00009-of-00042.safetensors +3 -0
  12. model-00010-of-00042.safetensors +3 -0
  13. model-00011-of-00042.safetensors +3 -0
  14. model-00012-of-00042.safetensors +3 -0
  15. model-00013-of-00042.safetensors +3 -0
  16. model-00014-of-00042.safetensors +3 -0
  17. model-00015-of-00042.safetensors +3 -0
  18. model-00016-of-00042.safetensors +3 -0
  19. model-00017-of-00042.safetensors +3 -0
  20. model-00018-of-00042.safetensors +3 -0
  21. model-00019-of-00042.safetensors +3 -0
  22. model-00020-of-00042.safetensors +3 -0
  23. model-00021-of-00042.safetensors +3 -0
  24. model-00022-of-00042.safetensors +3 -0
  25. model-00023-of-00042.safetensors +3 -0
  26. model-00024-of-00042.safetensors +3 -0
  27. model-00025-of-00042.safetensors +3 -0
  28. model-00026-of-00042.safetensors +3 -0
  29. model-00027-of-00042.safetensors +3 -0
  30. model-00028-of-00042.safetensors +3 -0
  31. model-00029-of-00042.safetensors +3 -0
  32. model-00030-of-00042.safetensors +3 -0
  33. model-00031-of-00042.safetensors +3 -0
  34. model-00032-of-00042.safetensors +3 -0
  35. model-00033-of-00042.safetensors +3 -0
  36. model-00034-of-00042.safetensors +3 -0
  37. model-00035-of-00042.safetensors +3 -0
  38. model-00036-of-00042.safetensors +3 -0
  39. model-00037-of-00042.safetensors +3 -0
  40. model-00038-of-00042.safetensors +3 -0
  41. model-00039-of-00042.safetensors +3 -0
  42. model-00040-of-00042.safetensors +3 -0
  43. model-00041-of-00042.safetensors +3 -0
  44. model-00042-of-00042.safetensors +3 -0
  45. model.safetensors.index.json +3 -0
  46. modeling_deepseek.py +1849 -0
  47. quantization_config.json +3002 -0
  48. special_tokens_map.json +23 -0
  49. tokenizer.json +0 -0
  50. tokenizer_config.json +0 -0
configuration_deepseek.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.configuration_utils import PretrainedConfig
2
+ from transformers.utils import logging
3
+
4
+ logger = logging.get_logger(__name__)
5
+
6
+ DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
7
class DeepseekV3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a
    DeepSeek model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the DeepSeek-V3.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`DeepseekV3Model`].
        hidden_size (`int`, *optional*, defaults to 7168):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 18432):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 61):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of nextn predict layers in the DeepSeekV3 Model.
        num_attention_heads (`int`, *optional*, defaults to 128):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 128):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
            constructed by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed, it falls back to
            `num_attention_heads`.
        n_shared_experts (`int`, *optional*, defaults to 1):
            Number of shared experts, None means dense model.
        n_routed_experts (`int`, *optional*, defaults to 256):
            Number of routed experts, None means dense model.
        ep_size (`int`, *optional*, defaults to 1):
            Stored on the configuration unchanged. Presumably the expert-parallel group size; its use is not visible
            in this file, so confirm against the modeling code.
        routed_scaling_factor (`float`, *optional*, defaults to 2.5):
            Scaling factor of routed experts.
        kv_lora_rank (`int`, *optional*, defaults to 512):
            Stored on the configuration unchanged. Presumably the rank of the low-rank key/value projection; confirm
            against the modeling code.
        q_lora_rank (`int`, *optional*, defaults to 1536):
            Stored on the configuration unchanged. Presumably the rank of the low-rank query projection; confirm
            against the modeling code.
        qk_rope_head_dim (`int`, *optional*, defaults to 64):
            Stored on the configuration unchanged. Presumably the per-head query/key dimension that receives rotary
            embeddings; confirm against the modeling code.
        v_head_dim (`int`, *optional*, defaults to 128):
            Stored on the configuration unchanged. Presumably the per-head value dimension; confirm against the
            modeling code.
        qk_nope_head_dim (`int`, *optional*, defaults to 128):
            Stored on the configuration unchanged. Presumably the per-head query/key dimension that does not receive
            rotary embeddings; confirm against the modeling code.
        topk_method (`str`, *optional*, defaults to `'noaux_tc'`):
            Topk method used in routed gate.
        n_group (`int`, *optional*, defaults to 8):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to 4):
            Number of selected groups for each token (for each token, ensuring the selected experts is only within
            `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to 8):
            Number of selected experts, None means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 3):
            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                            \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to `'sigmoid'`):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux (`bool`, *optional*, defaults to `True`):
            Whether to compute the auxiliary loss for each individual sample.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 0):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value
            is necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import DeepseekV3Config

    >>> # Initializing a Deepseek-V3 style configuration
    >>> configuration = DeepseekV3Config()

    >>> # Reading a value back from the configuration
    >>> configuration.hidden_size
    7168
    ```"""

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method="noaux_tc",
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func="sigmoid",
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Every architecture hyper-parameter is stored verbatim; no validation
        # or cross-checking between values happens here.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility: an explicit None means "one KV head per
        # attention head".
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # Token ids, weight tying and any remaining keyword arguments are
        # handled by the PretrainedConfig base class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "do_sample": true,
5
+ "eos_token_id": 1,
6
+ "temperature": 0.6,
7
+ "top_p": 0.95,
8
+ "transformers_version": "4.47.0"
9
+ }
model-00001-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bb27077bc56f49f2744863b919dd6fa084d0369b4249a842727aceabedebb88
3
+ size 4998883416
model-00002-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51f5283206fcb48a378a53839e5d17a7af1b7af40c44699d0eedb9de74975407
3
+ size 4999321128
model-00003-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f07c2f51f0a42e390c15d7863aa6cf372ef02f69a0dcefc417caf4437df26d24
3
+ size 4997367192
model-00004-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2908af22ce6a584a03e7092dff8990b06f3c7d9732f29507d18ca043dd224c99
3
+ size 4999300704
model-00005-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ef64fbf4388fb116e369f3ddded814c9ef2b171ae1a247cab4802fb2cc6cbd3
3
+ size 4997367120
model-00006-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:119a806fe8a0a3cb5eb2cf6517cfcdc6cdaea3f899df254bdb7a613d8aa26cbd
3
+ size 4999324656
model-00007-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcf34c23949057b86386831b32ce93badb5eecee971cebdfbacf3156beb67331
3
+ size 4997371720
model-00008-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:305d049387e77b955596bdfbd8aa4cfda228c0c78fbee9e750f4d4e7166387ed
3
+ size 4999325912
model-00009-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd4de03360b199f3d6ae8556cd66ded71c63aa88765e908491a0b49db2217788
3
+ size 4997371656
model-00010-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c2fc5d533cdd2242eb1e550ae3768b5fc37ed7e1a2cf9327d9d4e6a4c7863d5
3
+ size 4999305504
model-00011-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5c2fd47f0d25eeacaf1f709ce4e8348fe5e0fa249472fbe93484f9a2ae7e3b4
3
+ size 4997371584
model-00012-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51a2c53272d0c1fffe82eba227a4da94f1983bfd7efcb1a39cfa9690f6f31110
3
+ size 4999326112
model-00013-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d107d7a0cc2d0e14a265ac190caff73a36dcd00eea1de4a343a5bb1673817ec7
3
+ size 4997371464
model-00014-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5f27d603a2c74cd9598104947c2053ca51dc45a106bf94f9a728de7dc640fa0
3
+ size 4942441016
model-00015-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0183de5d45f10d13aa01299f5d263aa21a302e5d96cbfc0e6be66cbd24643602
3
+ size 4999539888
model-00016-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8060861e48398df7d5c7577080815c8cebefda79da29ebebba0191503b29776d
3
+ size 4997372656
model-00017-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac35ce24b167293abf878b3ce28ae3a05bddedfa6c4cb9d4051e99142981f6ed
3
+ size 4999324944
model-00018-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49109c2e81dc5bca4e1ec4b654bd77d2f20c2c9369896645ac5ce00f911fa239
3
+ size 4997372656
model-00019-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8cd1135b50085a7c0924671531f95ee068095be185311117095dc8bc97473a0
3
+ size 4999304472
model-00020-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c321129ff087da0dc8790c95dc91fde9b557650879bbd842d783bb3d3061bfc1
3
+ size 4997372656
model-00021-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20971be2983c328599d686c35d3938e0bb7d9e55ec36ecc7e8697a9477854275
3
+ size 4999324936
model-00022-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fbee72ac46370e772dd9146ae25250b576e98794c1a6bae0da89c71c2d6b54b
3
+ size 4997372656
model-00023-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2067c15233726f548aac4608870f38100667f41bed2860c68dd515041d9a6b7
3
+ size 4999324928
model-00024-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce3888d0e7bfcdddd348e3aa4eb4fc8c79a85f500117ce3be7efc5b61ee9087a
3
+ size 4997372656
model-00025-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1d5624d21b874b7ee9972ee680b1dc48302416f9a9cb9a4ee4e5ba69c7f22b2
3
+ size 4999304488
model-00026-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c7ed25703e417a203f88647e1ee00061fec15574fd9cf01df94d759ac3ba365
3
+ size 4997372616
model-00027-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e4793525389d1060e2768fc582a9fe045599bb38065ebb600f7c87c0b0eaa68
3
+ size 4999325040
model-00028-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65689de7b34592a5eab9d10fe868169efa562e4d9216d7de0c7ccbb9e62c4e77
3
+ size 4997372552
model-00029-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f0be652ecab20c613838dd36f20c7a135ca1d982e083bc6191601fe1328057a
3
+ size 4999325096
model-00030-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cc8146338272399815e2db0af88a4830848b58f9928cc44b6f445165e265ac8
3
+ size 4997372480
model-00031-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf24548b36daf79b90843b212469c5aadc3c96a9baa395e581502cbcbdadbb43
3
+ size 4999304696
model-00032-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:367ac146f23eca878aaac91d07929fa835ba6c497219bb2c644ab8f395f6e662
3
+ size 4997372416
model-00033-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb4bab2aa79235ca941298c6c7cd0ee600272d9812601b24dcf9756e8b0e02b8
3
+ size 4999325240
model-00034-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb0f67a0ceaf1f3073eaa7dc6030c2b1107ae7939da63a1a9437ba648ad2cdc5
3
+ size 4997372344
model-00035-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:032a37995008e9de9c95fb761e9916cbaad60739291cb95e88e2ef7337a1fffd
3
+ size 4999325304
model-00036-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b4b398fe8cd1c1bd828b54ac6bd24dec38e141b3106eb5f8d8007007504b53a
3
+ size 4997372280
model-00037-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:368484a5c85c37cd1a261c98e4c447f6a54ac198a823a691abbe919d301ac77a
3
+ size 4999304896
model-00038-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33f4914a33b79522dad864be6ee9ffd58507e7befed932d4449189086b0093ff
3
+ size 4997372208
model-00039-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62e303254788c8ed534400771573725e2d314e3831179e694eea15fd7f192e6a
3
+ size 4997706504
model-00040-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b5b5954207bda2a0d4ff44701dbc5754c46c08aa0479b0042ce694f446d4a1f
3
+ size 4992599072
model-00041-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:897b40eb5e452c80de3f71abe9612f4efdcf283e352ddaea8e7fc66ff66b91ac
3
+ size 4007988760
model-00042-of-00042.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebaf911197b76095e80b3d394907736b8736049ff2d32615f5e5781282da29c2
3
+ size 1853358208
model.safetensors.index.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d04f43b65815a8dc6b406c2e25ed0492fe7ae196e70291fdd9f1ea36a7935a5
3
+ size 16310312
modeling_deepseek.py ADDED
@@ -0,0 +1,1849 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ PyTorch DeepSeek model."""
21
+ import math
22
+ import warnings
23
+ from typing import List, Optional, Tuple, Union
24
+
25
+ import torch
26
+ import torch.nn.functional as F
27
+ import torch.utils.checkpoint
28
+ from torch import nn
29
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
30
+
31
+ from transformers.activations import ACT2FN
32
+ from transformers.cache_utils import Cache, DynamicCache
33
+ from transformers.modeling_attn_mask_utils import (
34
+ AttentionMaskConverter,
35
+ _prepare_4d_attention_mask,
36
+ _prepare_4d_causal_attention_mask,
37
+ )
38
+ from transformers.modeling_outputs import (
39
+ BaseModelOutputWithPast,
40
+ CausalLMOutputWithPast,
41
+ SequenceClassifierOutputWithPast,
42
+ )
43
+ from transformers.modeling_utils import PreTrainedModel
44
+ from transformers.pytorch_utils import (
45
+ ALL_LAYERNORM_LAYERS,
46
+ is_torch_greater_or_equal_than_1_13,
47
+ )
48
+ from transformers.utils import (
49
+ add_start_docstrings,
50
+ add_start_docstrings_to_model_forward,
51
+ is_flash_attn_2_available,
52
+ is_flash_attn_greater_or_equal_2_10,
53
+ logging,
54
+ replace_return_docstrings,
55
+ )
56
+ from transformers.utils.import_utils import is_torch_fx_available
57
+ from .configuration_deepseek import DeepseekV3Config
58
+ import torch.distributed as dist
59
+ import numpy as np
60
+
61
+ if is_flash_attn_2_available():
62
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
63
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
64
+
65
+
66
+ # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
67
+ # It means that the function will not be traced through and simply appear as a node in the graph.
68
+ if is_torch_fx_available():
69
+ if not is_torch_greater_or_equal_than_1_13:
70
+ import torch.fx
71
+
72
+ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
73
+
74
+
75
+ logger = logging.get_logger(__name__)
76
+
77
+ _CONFIG_FOR_DOC = "DeepseekV3Config"
78
+
79
+
80
def _get_unpad_data(attention_mask):
    """Derive the index metadata needed to strip padding from a batch.

    Args:
        attention_mask: tensor whose last dimension is summed to obtain one
            length per row. Assumed to be a (batch, seq_len) mask of 0/1
            values; the callers are outside this view, so confirm there.

    Returns:
        A tuple `(indices, cu_seqlens, max_seqlen_in_batch)` where
        `indices` is a 1-D tensor with the flat positions of the non-zero mask
        entries, `cu_seqlens` is the int32 cumulative sum of the per-row
        lengths with a leading 0, and `max_seqlen_in_batch` is the largest
        per-row length as a Python number.
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    # Prepend a zero so consecutive entries delimit each sequence.
    cu_seqlens = F.pad(
        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
    )
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )
92
+
93
+
94
class DeepseekV3RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learnable gain."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        DeepseekV3RMSNorm is equivalent to T5LayerNorm

        Args:
            hidden_size: length of the learnable gain vector (initialised to ones).
            eps: constant added to the mean square before the reciprocal square root.
        """
        super().__init__()
        self.variance_epsilon = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states):
        """Normalise `hidden_states` in float32, cast back, then apply the gain."""
        original_dtype = hidden_states.dtype
        # The statistics are computed in float32 regardless of the input dtype.
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        inverse_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        normalised = upcast * inverse_rms
        return self.weight * normalised.to(original_dtype)
109
+
110
+
111
+ ALL_LAYERNORM_LAYERS.append(DeepseekV3RMSNorm)
112
+
113
+
114
class DeepseekV3RotaryEmbedding(nn.Module):
    """Rotary position embedding table: caches cos/sin values per position.

    Subclasses customise the table by overriding `_set_cos_sin_cache`.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        """
        Args:
            dim: size of the rotary dimension; `dim // 2` inverse frequencies are created.
            max_position_embeddings: length used for the initial cache build.
            base: base of the geometric frequency progression.
            device: device on which the inverse-frequency buffer is created.
        """
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (
            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )
        # The cached length is cleared on purpose: the first `forward` call
        # therefore always rebuilds the table on the input's device and dtype.
        self.max_seq_len_cached = None

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the `cos_cached` / `sin_cached` buffers for `seq_len` positions."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return the `(cos, sin)` tables for the first `seq_len` positions, cast to `x.dtype`.

        Args:
            x: tensor providing the target device and dtype.
            seq_len: number of positions requested. When omitted it is read
                from `x.shape[-2]`, following the layout noted below.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The signature advertises `None`, but passing it on would make
            # `torch.arange(None)` (or `None > int`) raise.
            seq_len = x.shape[-2]

        if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
155
+
156
+
157
+ # Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV3
158
class DeepseekV3LinearScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
    """DeepseekV3RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        """
        Args:
            dim, max_position_embeddings, base, device: forwarded unchanged to
                `DeepseekV3RotaryEmbedding.__init__`.
            scaling_factor: divisor applied to the position indices before the
                frequencies are computed.
        """
        # Must be set before the parent constructor runs, because that
        # constructor calls `_set_cos_sin_cache`, which reads it.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin buffers using positions divided by `scaling_factor`."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        t = t / self.scaling_factor

        # Move `inv_freq` to the position tensor's device, as the base class
        # does, so a rebuild requested for another device does not fail.
        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
184
+
185
+
186
+ # Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV3
187
class DeepseekV3DynamicNTKScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
    """DeepseekV3RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    When the requested length exceeds `max_position_embeddings`, the frequency
    base is enlarged as a function of that length and `inv_freq` is recomputed
    before the cos/sin tables are rebuilt.
    """

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        # Must be assigned before super().__init__(), because the base
        # constructor calls _set_cos_sin_cache(), which reads it.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin tables, rescaling the base for long sequences."""
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings)
                - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (
                base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        # When inv_freq was not recomputed above it may still live on the
        # construction device; align it with t as the base class does.
        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
223
+
224
+
225
+ # Inverse dim formula to find dim based on number of rotations
226
def yarn_find_correction_dim(
    num_rotations, dim, base=10000, max_position_embeddings=2048
):
    """Return the (fractional) rotary dimension index for a rotation count.

    Evaluates
    `dim * ln(max_position_embeddings / (num_rotations * 2 * pi)) / (2 * ln(base))`.
    """
    angle_span = num_rotations * 2 * math.pi
    numerator = dim * math.log(max_position_embeddings / angle_span)
    denominator = 2 * math.log(base)
    return numerator / denominator
232
+
233
+
234
+ # Find dim range bounds based on rotations
235
def yarn_find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    """Return integer `(low, high)` dimension bounds for two rotation counts.

    The value for `low_rot` is floored and the value for `high_rot` is ceiled;
    both come from `yarn_find_correction_dim`. The results are clamped to
    `[0, dim - 1]`.
    """
    raw_low, raw_high = (
        yarn_find_correction_dim(rotations, dim, base, max_position_embeddings)
        for rotations in (low_rot, high_rot)
    )
    lower_bound = max(math.floor(raw_low), 0)  # Clamp values just in case
    upper_bound = min(math.ceil(raw_high), dim - 1)
    return lower_bound, upper_bound
245
+
246
+
247
def yarn_get_mscale(scale=1, mscale=1):
    """Return `0.1 * mscale * ln(scale) + 1.0`, or exactly 1.0 when `scale <= 1`."""
    return 1.0 if scale <= 1 else 0.1 * mscale * math.log(scale) + 1.0
251
+
252
+
253
def yarn_linear_ramp_mask(min, max, dim):
    """Return a float32 ramp of length `dim` rising linearly from 0 at `min` to 1 at `max`.

    Entries outside `[min, max]` are clamped to 0 or 1.
    """
    # `min` / `max` shadow the builtins, but they are part of the public
    # signature, so the names are kept.
    upper = max + 0.001 if min == max else max  # Prevent singularity
    positions = torch.arange(dim, dtype=torch.float32)
    return ((positions - min) / (upper - min)).clamp(0, 1)
260
+
261
+
262
class DeepseekV3YarnRotaryEmbedding(DeepseekV3RotaryEmbedding):
    """Rotary embedding whose inverse frequencies blend scaled and unscaled values.

    A per-dimension ramp (from `yarn_find_correction_range` and
    `yarn_linear_ramp_mask`) mixes the plain inverse frequencies with ones
    divided by `scaling_factor`. The cos/sin tables are multiplied by the ratio
    of two `yarn_get_mscale` values.
    """

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        # All of these are read by _set_cos_sin_cache(), which the base
        # constructor calls, so they are stored before super().__init__().
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build `inv_freq`, `cos_cached` and `sin_cached` for `seq_len` positions."""
        self.max_seq_len_cached = seq_len
        rope_dim = self.dim

        exponents = (
            torch.arange(0, rope_dim, 2, dtype=torch.float32, device=device) / rope_dim
        )
        base_powers = self.base**exponents
        freq_extra = 1.0 / base_powers
        freq_inter = 1.0 / (self.scaling_factor * base_powers)

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            rope_dim,
            self.base,
            self.original_max_position_embeddings,
        )
        ramp = yarn_linear_ramp_mask(low, high, rope_dim // 2).to(
            device=device, dtype=torch.float32
        )
        inv_freq_mask = 1.0 - ramp
        # mask == 1 keeps the unscaled frequency, mask == 0 the scaled one
        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        positions = torch.arange(seq_len, device=device, dtype=torch.float32)
        freqs = torch.outer(positions, inv_freq)

        _mscale = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )

        emb = torch.cat((freqs, freqs), dim=-1)
        for buffer_name, table in (("cos_cached", emb.cos()), ("sin_cached", emb.sin())):
            self.register_buffer(
                buffer_name, (table * _mscale).to(dtype), persistent=False
            )
328
+
329
+
330
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
331
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the second half.

    For a last dimension `[a, b]` split at `len // 2`, returns `[-b, a]`.
    """
    split_at = x.shape[-1] // 2
    front, back = x[..., :split_at], x[..., split_at:]
    return torch.cat((-back, front), dim=-1)
336
+
337
+
338
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
339
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Apply rotary position embedding to the query and key tensors.

    Before rotating, the last dimension of `q` and `k` is regrouped from
    interleaved pairs `(x0, x1, x2, x3, ...)` into two halves
    `(x0, x2, ..., x1, x3, ...)`, the layout `rotate_half` operates on.

    Args:
        q (`torch.Tensor`): The query tensor (unpacked as a 4-D tensor).
        k (`torch.Tensor`): The key tensor (unpacked as a 4-D tensor).
        cos (`torch.Tensor`): The cosine table, indexed by `position_ids`.
        sin (`torch.Tensor`): The sine table, indexed by `position_ids`.
        position_ids (`torch.Tensor`):
            Position indices of the tokens in `q` and `k`; may be offset when
            a KV-cache is in use.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which `cos[position_ids]` and `sin[position_ids]`
            are unsqueezed so that they broadcast against `q` and `k`.
    Returns:
        `tuple(torch.Tensor)`: the rotated query and key tensors.
    """
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)

    def _pairs_to_halves(tensor):
        # (..., d) seen as (..., d/2, 2) -> (..., 2, d/2) -> (..., d)
        b, h, s, d = tensor.shape
        return tensor.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)

    q = _pairs_to_halves(q)
    k = _pairs_to_halves(k)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
372
+
373
+
374
class DeepseekV3MLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`.

    `hidden_size` and `intermediate_size` default to the values in `config`
    when not passed explicitly. All three projections are bias-free.
    """

    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = (
            hidden_size if hidden_size is not None else config.hidden_size
        )
        self.intermediate_size = (
            intermediate_size
            if intermediate_size is not None
            else config.intermediate_size
        )

        inner, outer = self.intermediate_size, self.hidden_size
        self.gate_proj = nn.Linear(outer, inner, bias=False)
        self.up_proj = nn.Linear(outer, inner, bias=False)
        self.down_proj = nn.Linear(inner, outer, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Project `x` up through the activated gate, then back down."""
        gated = self.act_fn(self.gate_proj(x))
        return self.down_proj(gated * self.up_proj(x))
391
+
392
+
393
class MoEGate(nn.Module):
    """Router that picks `top_k` experts per token with group-limited selection.

    Only `scoring_func == "sigmoid"` and `topk_method == "noaux_tc"` are
    implemented; anything else raises `NotImplementedError` in `forward`.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.scoring_func = config.scoring_func
        self.seq_aux = config.seq_aux
        self.topk_method = config.topk_method
        self.n_group = config.n_group
        self.topk_group = config.topk_group

        # topk selection algorithm
        self.norm_topk_prob = config.norm_topk_prob
        self.gating_dim = config.hidden_size
        self.weight = nn.Parameter(
            torch.empty((self.n_routed_experts, self.gating_dim))
        )
        if self.topk_method == "noaux_tc":
            self.e_score_correction_bias = nn.Parameter(
                torch.empty((self.n_routed_experts))
            )
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Initialise the gate weight and, when present, the correction bias."""
        import torch.nn.init as init

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.topk_method == "noaux_tc":
            # torch.empty() leaves arbitrary memory (possibly NaN/inf) behind;
            # start from a neutral bias until a checkpoint overwrites it.
            init.zeros_(self.e_score_correction_bias)

    def forward(self, hidden_states):
        """Route every token to `top_k` experts.

        Args:
            hidden_states: Tensor unpacked as `(bsz, seq_len, h)`.

        Returns:
            `(topk_idx, topk_weight)`, each of shape `(bsz * seq_len, top_k)`:
            the selected expert indices and their gate weights, multiplied by
            `routed_scaling_factor`.

        Raises:
            NotImplementedError: for unsupported scoring or top-k methods.
        """
        bsz, seq_len, h = hidden_states.shape
        ### compute gating score
        hidden_states = hidden_states.view(-1, h)
        logits = F.linear(
            hidden_states.type(torch.float32), self.weight.type(torch.float32), None
        )
        if self.scoring_func == "sigmoid":
            scores = logits.sigmoid()
        else:
            raise NotImplementedError(
                f"insupportable scoring function for MoE gating: {self.scoring_func}"
            )

        ### select top-k experts
        if self.topk_method == "noaux_tc":
            assert not self.training
            # The bias only influences WHICH experts are chosen; the returned
            # weights are gathered from the unbiased `scores` below.
            scores_for_choice = scores.view(
                bsz * seq_len, -1
            ) + self.e_score_correction_bias.unsqueeze(0)
            # A group's score is the sum of its two best biased expert scores.
            group_scores = (
                scores_for_choice.view(bsz * seq_len, self.n_group, -1)
                .topk(2, dim=-1)[0]
                .sum(dim=-1)
            )  # [n, n_group]
            group_idx = torch.topk(
                group_scores, k=self.topk_group, dim=-1, sorted=False
            )[
                1
            ]  # [n, top_k_group]
            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
            score_mask = (
                group_mask.unsqueeze(-1)
                .expand(
                    bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
                )
                .reshape(bsz * seq_len, -1)
            )  # [n, e]
            # Mask with -inf rather than 0.0: the biased scores can be
            # negative, and a 0.0 filler would then let an expert from a
            # non-selected group outrank every expert of the selected groups.
            tmp_scores = scores_for_choice.masked_fill(
                ~score_mask.bool(), float("-inf")
            )  # [n, e]
            _, topk_idx = torch.topk(
                tmp_scores, k=self.top_k, dim=-1, sorted=False
            )
            topk_weight = scores.gather(1, topk_idx)
        else:
            raise NotImplementedError(
                f"insupportable TopK function for MoE gating: {self.topk_method}"
            )

        ### norm gate to sum 1
        if self.top_k > 1 and self.norm_topk_prob:
            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
            topk_weight = topk_weight / denominator
        topk_weight = topk_weight * self.routed_scaling_factor  # must multiply the scaling factor

        return topk_idx, topk_weight
475
+
476
class DeepseekV3MoE(nn.Module):
    """
    A mixed expert module containing shared experts.

    Tokens are routed by `MoEGate` to `num_experts_per_tok` routed experts and
    the weighted expert outputs are summed. When `config.n_shared_experts` is
    set, the output of a shared MLP applied to every token is added. Only the
    inference path is implemented.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_experts_per_tok = config.num_experts_per_tok

        if hasattr(config, "ep_size") and config.ep_size > 1:
            assert config.ep_size == dist.get_world_size()
            self.ep_size = config.ep_size
            self.experts_per_rank = config.n_routed_experts // config.ep_size
            self.ep_rank = dist.get_rank()
            first_local = self.ep_rank * self.experts_per_rank
            last_local = (self.ep_rank + 1) * self.experts_per_rank
            # Only this rank's slice of experts is instantiated; the other
            # slots hold None so that global expert indices stay valid.
            self.experts = nn.ModuleList(
                [
                    (
                        DeepseekV3MLP(
                            config, intermediate_size=config.moe_intermediate_size
                        )
                        if first_local <= i < last_local
                        else None
                    )
                    for i in range(config.n_routed_experts)
                ]
            )
        else:
            self.ep_size = 1
            self.experts_per_rank = config.n_routed_experts
            self.ep_rank = 0
            self.experts = nn.ModuleList(
                [
                    DeepseekV3MLP(
                        config, intermediate_size=config.moe_intermediate_size
                    )
                    for _ in range(config.n_routed_experts)
                ]
            )
        self.gate = MoEGate(config)
        if config.n_shared_experts is not None:
            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
            self.shared_experts = DeepseekV3MLP(
                config=config, intermediate_size=intermediate_size
            )

    def forward(self, hidden_states):
        """Run the routed (and optional shared) experts on `hidden_states`.

        Returns a tensor with the same shape as `hidden_states`.

        Raises:
            NotImplementedError: if the module is in training mode. No
                training path exists; previously this surfaced as an
                UnboundLocalError on `y`.
        """
        if self.training:
            raise NotImplementedError(
                f"{self.__class__.__name__} only implements the inference path; "
                "call `.eval()` before running forward."
            )
        identity = hidden_states
        orig_shape = hidden_states.shape
        topk_idx, topk_weight = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
        if self.config.n_shared_experts is not None:
            y = y + self.shared_experts(identity)
        return y

    @torch.no_grad()
    def moe_infer(self, x, topk_ids, topk_weight):
        """Dispatch tokens to experts, run them, and combine the weighted outputs.

        Args:
            x: Flattened tokens, indexed by row.
            topk_ids: Expert index per (token, slot).
            topk_weight: Gate weight per (token, slot).

        Returns:
            One combined output row per token, in the dtype of the expert outputs.
        """
        # cnts[t, e] == 1 iff token t is routed to expert e.
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        # Order the flattened (token, slot) assignments by expert id;
        # `idxs // top_k` maps a flattened slot back to its token row.
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        sorted_tokens_shape = sorted_tokens.shape
        if self.ep_size > 1:
            # Expert parallelism: exchange per-expert counts, then the tokens.
            tokens_per_ep_rank = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
            tokens_per_expert_group = tokens_per_expert.new_empty(
                tokens_per_expert.shape[0]
            )
            dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert)
            output_splits = (
                tokens_per_expert_group.view(self.ep_size, -1)
                .sum(1)
                .cpu()
                .numpy()
                .tolist()
            )
            gathered_tokens = sorted_tokens.new_empty(
                tokens_per_expert_group.sum(dim=0).cpu().item(), sorted_tokens.shape[1]
            )
            input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist()
            dist.all_to_all(
                list(gathered_tokens.split(output_splits)),
                list(sorted_tokens.split(input_split_sizes)),
            )
            tokens_per_expert_post_gather = tokens_per_expert_group.view(
                self.ep_size, self.experts_per_rank
            ).sum(dim=0)
            # Label every received row with its local expert index, then sort
            # so that rows for the same local expert become contiguous.
            gathered_idxs = np.zeros(shape=(gathered_tokens.shape[0],), dtype=np.int32)
            s = 0
            for i, k in enumerate(tokens_per_expert_group.cpu().numpy()):
                gathered_idxs[s : s + k] = i % self.experts_per_rank
                s += k
            gathered_idxs = gathered_idxs.argsort()
            sorted_tokens = gathered_tokens[gathered_idxs]
            tokens_per_expert = tokens_per_expert_post_gather
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        # Run each local expert on its contiguous slice of the sorted tokens.
        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
        if self.ep_size > 1:
            # Undo the local sort, then send the results back to their source ranks.
            new_x = torch.empty_like(outs)
            new_x[gathered_idxs] = outs
            gathered_tokens = new_x.new_empty(*sorted_tokens_shape)
            dist.all_to_all(
                list(gathered_tokens.split(input_split_sizes)),
                list(new_x.split(output_splits)),
            )
            outs = gathered_tokens

        # Scatter back to the original (token, slot) order and combine.
        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out
610
+
611
+
612
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
613
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along dim 1, equivalent to
    torch.repeat_interleave(x, dim=1, repeats=n_rep): the input goes from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads * n_rep,
    seqlen, head_dim). With `n_rep == 1` the input tensor is returned unchanged.
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep != 1:
        # Insert a repeat axis after the head axis, broadcast it, then fold
        # it into the head axis.
        expanded = hidden_states.unsqueeze(2).expand(
            batch, num_key_value_heads, n_rep, slen, head_dim
        )
        return expanded.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
    return hidden_states
625
+
626
+
627
+ # Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV3
628
class DeepseekV3Attention(nn.Module):
    """Multi-head attention with low-rank query / key-value projections.

    Queries come either from a single `q_proj` or, when `q_lora_rank` is set,
    from `q_a_proj -> RMSNorm -> q_b_proj`. Keys and values come from
    `kv_a_proj_with_mqa`, whose output is split into a compressed part
    (normalised and expanded by `kv_b_proj`) and one rotary key slice that is
    shared by all heads. Each head's query/key is the concatenation of a
    non-rotary part (`qk_nope_head_dim`) and a rotary part (`qk_rope_head_dim`).
    """

    def __init__(self, config: DeepseekV3Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.q_lora_rank = config.q_lora_rank
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.kv_lora_rank = config.kv_lora_rank
        self.v_head_dim = config.v_head_dim
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim

        self.is_causal = True

        if self.q_lora_rank is None:
            self.q_proj = nn.Linear(
                self.hidden_size, self.num_heads * self.q_head_dim, bias=False
            )
        else:
            self.q_a_proj = nn.Linear(
                self.hidden_size, config.q_lora_rank, bias=config.attention_bias
            )
            self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank)
            self.q_b_proj = nn.Linear(
                config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
            )

        # One projection yields both the compressed KV latent and the single
        # rotary key slice (hence the extra qk_rope_head_dim outputs).
        self.kv_a_proj_with_mqa = nn.Linear(
            self.hidden_size,
            config.kv_lora_rank + config.qk_rope_head_dim,
            bias=config.attention_bias,
        )
        self.kv_a_layernorm = DeepseekV3RMSNorm(config.kv_lora_rank)
        self.kv_b_proj = nn.Linear(
            config.kv_lora_rank,
            self.num_heads
            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
            bias=False,
        )

        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            self.hidden_size,
            bias=config.attention_bias,
        )
        self._init_rope()

        self.softmax_scale = self.q_head_dim ** (-0.5)
        if self.config.rope_scaling is not None:
            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
            scaling_factor = self.config.rope_scaling["factor"]
            if mscale_all_dim:
                # Fold the squared mscale correction into the softmax scale.
                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
                self.softmax_scale = self.softmax_scale * mscale * mscale

    def _init_rope(self):
        """Create `self.rotary_emb` according to `config.rope_scaling`.

        Raises:
            ValueError: if `rope_scaling["type"]` is not one of
                `"linear"`, `"dynamic"` or `"yarn"`.
        """
        if self.config.rope_scaling is None:
            self.rotary_emb = DeepseekV3RotaryEmbedding(
                self.qk_rope_head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = DeepseekV3LinearScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = DeepseekV3DynamicNTKScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "yarn":
                # Forward only the YaRN options that are present in the
                # config; the embedding class supplies defaults for the rest.
                kwargs = {
                    key: self.config.rope_scaling[key]
                    for key in [
                        "original_max_position_embeddings",
                        "beta_fast",
                        "beta_slow",
                        "mscale",
                        "mscale_all_dim",
                    ]
                    if key in self.config.rope_scaling
                }
                self.rotary_emb = DeepseekV3YarnRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                    **kwargs,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `tensor` to `(bsz, num_heads, seq_len, v_head_dim)`, contiguous."""
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute softmax attention over `hidden_states`.

        Args:
            hidden_states: Input unpacked as `(bsz, q_len, hidden)`.
            attention_mask: Additive mask of shape `(bsz, 1, q_len, kv_seq_len)`.
                Although typed as optional, it is asserted to be non-None.
            position_ids: Positions used to index the rotary cos/sin tables.
            past_key_value: Optional cache; it is updated with this layer's
                keys and values and returned.
            output_attentions: When False, the returned weights are None.
            use_cache: Accepted for interface compatibility; not read here.

        Returns:
            `(attn_output, attn_weights, past_key_value)`.

        Raises:
            ValueError: if a cache is passed but `layer_idx` is None, or if
                an intermediate tensor has an unexpected shape.
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        # A single rotary key "head"; it broadcasts over all heads below.
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)

        # Assemble full per-head queries / keys: [non-rotary | rotary].
        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        attn_weights = (
            torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale
        )

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )
        assert attention_mask is not None
        # The check below still matters when assertions are disabled (-O).
        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.attention_dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
858
+
859
+
860
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV3
861
+ class DeepseekV3FlashAttention2(DeepseekV3Attention):
862
+ """
863
+ DeepseekV3 flash attention module. This module inherits from `DeepseekV3Attention` as the weights of the module stays
864
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
865
+ flash attention and deal with padding tokens in case the input contains any of them.
866
+ """
867
+
868
def __init__(self, *args, **kwargs):
    """Build the parent attention module and record the flash-attn mask alignment."""
    super().__init__(*args, **kwargs)

    # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
    # flash_attn<2.1 builds a top-left aligned causal mask, whereas bottom-right
    # alignment (the default from flash_attn>=2.1) is what is needed here; this
    # flag records which case applies.
    # Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
    # Beware that with flash_attn<2.1, q_seqlen != k_seqlen (except q_seqlen == 1)
    # produces a wrong (top-left) mask.
    bottom_right_is_default = is_flash_attn_greater_or_equal_2_10()
    self._flash_attn_uses_top_left_mask = not bottom_right_is_default
875
+
876
def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Cache] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    **kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Compute attention through `self._flash_attention_forward`.

    Args:
        hidden_states: Input unpacked as `(bsz, q_len, hidden)`.
        attention_mask: Mask forwarded to `_flash_attention_forward`; replaced
            by the deprecated `padding_mask` keyword when that is supplied.
        position_ids: Positions used to index the rotary cos/sin tables.
        past_key_value: Optional cache; updated with this layer's keys and
            (padded) values and returned.
        output_attentions: Ignored; attention weights are never returned.
        use_cache: Accepted for interface compatibility; not read here.

    Returns:
        `(attn_output, None, past_key_value)`.

    Raises:
        ValueError: if a cache is passed but `layer_idx` is None.
    """
    # DeepseekV3FlashAttention2 attention does not support output_attentions
    if "padding_mask" in kwargs:
        warnings.warn(
            "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
        )

        # overwrite attention_mask with padding_mask
        attention_mask = kwargs.pop("padding_mask")

    bsz, q_len, _ = hidden_states.size()

    if self.q_lora_rank is None:
        q = self.q_proj(hidden_states)
    else:
        q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
    q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
    q_nope, q_pe = torch.split(
        q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
    )

    # Flash attention requires the input to have the shape
    # batch_size x seq_length x head_dim x hidden_dim
    # therefore we just need to keep the original shape
    compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
    compressed_kv, k_pe = torch.split(
        compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
    )
    k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
    kv = (
        self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
        .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
        .transpose(1, 2)
    )

    k_nope, value_states = torch.split(
        kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
    )

    kv_seq_len = value_states.shape[-2]
    if past_key_value is not None:
        # Same guard as the eager attention class: the cache is indexed by
        # layer, so fail clearly rather than deep inside the cache code.
        if self.layer_idx is None:
            raise ValueError(
                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                "with a layer index."
            )
        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)

    # Assemble full per-head queries / keys: [non-rotary | rotary].
    query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
    query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
    query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

    key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
    key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
    key_states[:, :, :, self.qk_nope_head_dim :] = k_pe

    # Pad values up to the query/key head size; the padding is sliced off
    # the attention output again below.
    if self.q_head_dim != self.v_head_dim:
        value_states = F.pad(value_states, [0, self.q_head_dim - self.v_head_dim])

    if past_key_value is not None:
        cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
        key_states, value_states = past_key_value.update(
            key_states, value_states, self.layer_idx, cache_kwargs
        )

    # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
    # to be able to avoid many of these transpose/reshape/view.
    query_states = query_states.transpose(1, 2)
    key_states = key_states.transpose(1, 2)
    value_states = value_states.transpose(1, 2)

    dropout_rate = self.attention_dropout if self.training else 0.0

    # In PEFT, usually we cast the layer norms in float32 for training stability reasons
    # therefore the input hidden states gets silently casted in float32. Hence, we need
    # cast them back in the correct dtype just to be sure everything works as expected.
    # This might slowdown training & inference so it is recommended to not cast the LayerNorms
    # in fp32. (DeepseekV3RMSNorm handles it correctly)

    input_dtype = query_states.dtype
    if input_dtype == torch.float32:
        # Handle the case where the model is quantized
        if hasattr(self.config, "_pre_quantization_dtype"):
            target_dtype = self.config._pre_quantization_dtype
        elif torch.is_autocast_enabled():
            target_dtype = torch.get_autocast_gpu_dtype()
        else:
            target_dtype = (
                self.q_proj.weight.dtype
                if self.q_lora_rank is None
                else self.q_a_proj.weight.dtype
            )

        logger.warning_once(
            f"The input hidden states seems to be silently casted in float32, this might be related to"
            f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
            f" {target_dtype}."
        )

        query_states = query_states.to(target_dtype)
        key_states = key_states.to(target_dtype)
        value_states = value_states.to(target_dtype)

    attn_output = self._flash_attention_forward(
        query_states,
        key_states,
        value_states,
        attention_mask,
        q_len,
        dropout=dropout_rate,
        softmax_scale=self.softmax_scale,
    )
    if self.q_head_dim != self.v_head_dim:
        attn_output = attn_output[:, :, :, : self.v_head_dim]

    attn_output = attn_output.reshape(
        bsz, q_len, self.num_heads * self.v_head_dim
    ).contiguous()
    attn_output = self.o_proj(attn_output)

    # No attention weights are available on this path.
    return attn_output, None, past_key_value
1010
+
1011
def _flash_attention_forward(
    self,
    query_states,
    key_states,
    value_states,
    attention_mask,
    query_length,
    dropout=0.0,
    softmax_scale=None,
):
    """
    Dispatch to the Flash Attention kernels.

    Without a padding mask the dense kernel (`flash_attn_func`) is called directly. With a padding mask the
    inputs are first un-padded through `self._upad_input`, the variable-length kernel
    (`flash_attn_varlen_func`) is run, and the result is padded back with `pad_input`.

    Args:
        query_states (`torch.Tensor`):
            Input query states to be passed to Flash Attention API
        key_states (`torch.Tensor`):
            Input key states to be passed to Flash Attention API
        value_states (`torch.Tensor`):
            Input value states to be passed to Flash Attention API
        attention_mask (`torch.Tensor`):
            The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
            position of padding tokens and 1 for the position of non-padding tokens. `None` selects the dense
            (no padding) path.
        query_length (`int`):
            Query sequence length; forwarded to `_upad_input` and `pad_input`, and used to turn causal masking
            off for single-token queries when the top-left-mask flag is set.
        dropout (`float`, *optional*):
            Attention dropout
        softmax_scale (`float`, *optional*):
            The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
    """
    causal = self.is_causal
    if self._flash_attn_uses_top_left_mask:
        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV3FlashAttention2 __init__.
        causal = causal and query_length != 1

    if attention_mask is None:
        # No mask was supplied: the dense kernel can be used as-is.
        return flash_attn_func(
            query_states,
            key_states,
            value_states,
            dropout,
            softmax_scale=softmax_scale,
            causal=causal,
        )

    # A mask was supplied: strip the padded positions before calling the varlen kernel.
    batch_size = query_states.shape[0]
    unpadded = self._upad_input(
        query_states, key_states, value_states, attention_mask, query_length
    )
    q_unpad, k_unpad, v_unpad, indices_q, cu_seq_lens, max_seq_lens = unpadded
    cu_seqlens_q, cu_seqlens_k = cu_seq_lens
    max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

    attn_output_unpad = flash_attn_varlen_func(
        q_unpad,
        k_unpad,
        v_unpad,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_in_batch_q,
        max_seqlen_k=max_seqlen_in_batch_k,
        dropout_p=dropout,
        softmax_scale=softmax_scale,
        causal=causal,
    )

    # Scatter the packed result back into the padded (batch, query_length, ...) layout.
    return pad_input(attn_output_unpad, indices_q, batch_size, query_length)
1090
+
1091
def _upad_input(
    self, query_layer, key_layer, value_layer, attention_mask, query_length
):
    """
    Remove padded positions from the query/key/value tensors ahead of the varlen Flash Attention call.

    Keys and values are flattened over their first two axes and gathered with the indices that
    `_get_unpad_data` derives from `attention_mask`. The query is handled in one of three ways:
    same length as the keys (reuse the key indices), a single token per row (just drop the length axis),
    or anything else (un-pad against the last `query_length` mask columns, which assumes left padding).

    Returns:
        A 6-tuple `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`.
    """
    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
    batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
    flat_tokens = batch_size * kv_seq_len

    key_layer = index_first_axis(
        key_layer.reshape(flat_tokens, num_key_value_heads, head_dim), indices_k
    )
    value_layer = index_first_axis(
        value_layer.reshape(flat_tokens, num_key_value_heads, head_dim), indices_k
    )

    if query_length == kv_seq_len:
        # Query and key cover the same positions, so the key bookkeeping is reused verbatim.
        query_layer = index_first_axis(
            query_layer.reshape(flat_tokens, self.num_heads, head_dim), indices_k
        )
        indices_q = indices_k
        cu_seqlens_q = cu_seqlens_k
        max_seqlen_in_batch_q = max_seqlen_in_batch_k
    elif query_length == 1:
        # One query token per batch row: offsets are simply 0, 1, ..., batch_size.
        max_seqlen_in_batch_q = 1
        cu_seqlens_q = torch.arange(
            batch_size + 1, dtype=torch.int32, device=query_layer.device
        )  # There is a memcpy here, that is very bad.
        indices_q = cu_seqlens_q[:-1]
        query_layer = query_layer.squeeze(1)
    else:
        # The -q_len: slice assumes left padding.
        query_mask = attention_mask[:, -query_length:]
        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
            query_layer, query_mask
        )

    cu_seq_lens = (cu_seqlens_q, cu_seqlens_k)
    max_seq_lens = (max_seqlen_in_batch_q, max_seqlen_in_batch_k)
    return query_layer, key_layer, value_layer, indices_q, cu_seq_lens, max_seq_lens
1135
+
1136
+
1137
# Attention implementations, keyed by the value of `config._attn_implementation`.
ATTENTION_CLASSES = dict(
    eager=DeepseekV3Attention,
    flash_attention_2=DeepseekV3FlashAttention2,
)
1141
+
1142
+
1143
class DeepseekV3DecoderLayer(nn.Module):
    """
    A single decoder block: RMSNorm -> self-attention -> residual add, then RMSNorm -> MLP -> residual add.

    The MLP is a `DeepseekV3MoE` when routed experts are configured and the layer index qualifies
    (at or past `first_k_dense_replace` and a multiple of `moe_layer_freq`); otherwise a `DeepseekV3MLP`.
    """

    def __init__(self, config: DeepseekV3Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        # The attention class is selected by the configured attention implementation.
        attention_cls = ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)

        uses_moe = (
            config.n_routed_experts is not None
            and layer_idx >= config.first_k_dense_replace
            and layer_idx % config.moe_layer_freq == 0
        )
        if uses_moe:
            self.mlp = DeepseekV3MoE(config)
        else:
            self.mlp = DeepseekV3MLP(config)

        norm_eps = config.rms_norm_eps
        self.input_layernorm = DeepseekV3RMSNorm(config.hidden_size, eps=norm_eps)
        self.post_attention_layernorm = DeepseekV3RMSNorm(
            config.hidden_size, eps=norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        **kwargs,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Run the block on `hidden_states`.

        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            position_ids (`torch.LongTensor`, *optional*): forwarded unchanged to the attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).

        Returns:
            A tuple starting with the new hidden states, followed by the attention weights when
            `output_attentions` is set, followed by the present key/value cache when `use_cache` is set.
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

        # Self Attention (pre-norm, residual added afterwards)
        attn_output, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            **kwargs,
        )
        hidden_states = hidden_states + attn_output

        # Fully Connected (pre-norm, residual added afterwards)
        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + mlp_output

        outputs = (hidden_states,)
        if output_attentions:
            outputs = outputs + (self_attn_weights,)
        if use_cache:
            outputs = outputs + (present_key_value,)
        return outputs
1229
+
1230
+
1231
# Shared class-level documentation; passed to `add_start_docstrings` on the model classes in this file.
DeepseekV3_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`DeepseekV3Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
1246
+
1247
+
1248
@add_start_docstrings(
    "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
    DeepseekV3_START_DOCSTRING,
)
class DeepseekV3PreTrainedModel(PreTrainedModel):
    """Common base for the DeepseekV3 models: class-level flags plus the weight initialisation rule."""

    config_class = DeepseekV3Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["DeepseekV3DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_cache_class = True

    def _init_weights(self, module):
        """
        Initialise `module` in place.

        `nn.Linear` and `nn.Embedding` weights are drawn from a normal distribution with mean 0 and standard
        deviation `config.initializer_range`; linear biases and the embedding padding row are zeroed.
        Any other module type is left untouched.
        """
        init_std = self.config.initializer_range

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            pad_row = module.padding_idx
            if pad_row is not None:
                module.weight.data[pad_row].zero_()
1271
+
1272
+
1273
# Shared documentation of the `forward` arguments; attached to the model `forward` methods in this file
# through `add_start_docstrings_to_model_forward`.
DeepseekV3_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
1341
+
1342
+
1343
@add_start_docstrings(
    "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
    DeepseekV3_START_DOCSTRING,
)
class DeepseekV3Model(DeepseekV3PreTrainedModel):
    """
    Stack of *config.num_hidden_layers* [`DeepseekV3DecoderLayer`] blocks on top of a token embedding, followed
    by a final RMSNorm.

    Args:
        config: DeepseekV3Config
    """

    def __init__(self, config: DeepseekV3Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        decoder_layers = [
            DeepseekV3DecoderLayer(config, layer_idx)
            for layer_idx in range(config.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(decoder_layers)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.norm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        # Any flag left as None falls back to the value stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if use_cache is None:
            use_cache = self.config.use_cache
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # retrieve input_ids and inputs_embeds: exactly one of them must be given
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        if input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_key_values_length = 0
        if use_cache:
            # Anything that is not a `Cache` is treated as the legacy format and wrapped.
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            # Default positions continue right after the cached tokens.
            source = input_ids if input_ids is not None else inputs_embeds
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=source.device,
            ).unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self._use_flash_attention_2:
            # 2d mask is passed through the layers; it is dropped when it contains no 0 entry
            if attention_mask is None or 0 not in attention_mask:
                attention_mask = None
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )

        # embed positions
        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None
        # Position of the cache entry in a layer's output tuple.
        cache_slot = 2 if output_attentions else 1

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )
            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[cache_slot]
            if output_attentions:
                all_self_attns = all_self_attns + (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = None
        if use_cache:
            if use_legacy_cache:
                next_cache = next_decoder_cache.to_legacy_cache()
            else:
                next_cache = next_decoder_cache

        if return_dict:
            return BaseModelOutputWithPast(
                last_hidden_state=hidden_states,
                past_key_values=next_cache,
                hidden_states=all_hidden_states,
                attentions=all_self_attns,
            )

        # Plain-tuple output: entries that are None are left out.
        candidates = [hidden_states, next_cache, all_hidden_states, all_self_attns]
        return tuple(v for v in candidates if v is not None)
1513
+
1514
+
1515
class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
    """[`DeepseekV3Model`] with a bias-free linear language-modeling head on top."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = DeepseekV3Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DeepseekV3ForCausalLM

        >>> model = DeepseekV3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        # Any flag left as None falls back to the value stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Logits are returned in float32 regardless of the dtype the head computes in.
        logits = self.lm_head(outputs[0]).float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n, then flatten the tokens
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            flat_logits = shift_logits.view(-1, self.config.vocab_size)
            # Enable model parallelism: labels follow the logits' device
            flat_labels = shift_labels.view(-1).to(flat_logits.device)
            loss = CrossEntropyLoss()(flat_logits, flat_labels)

        if return_dict:
            return CausalLMOutputWithPast(
                loss=loss,
                logits=logits,
                past_key_values=outputs.past_key_values,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        output = (logits,) + outputs[1:]
        if loss is None:
            return output
        return (loss,) + output

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        """
        Build the keyword arguments for the next `forward` call during generation.

        When a cache is present, `input_ids` is trimmed to the tokens that have not been processed yet and the
        attention mask is cropped if the cache is about to exceed its maximum length. Missing `position_ids`
        are derived from the attention mask. `inputs_embeds` is only forwarded while there is no cache.
        """
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                max_cache_length = past_key_values.get_max_length()
            else:
                # Legacy tuple cache: length is read from the first layer's first tensor.
                past_length = past_key_values[0][0].shape[2]
                cache_length = past_length
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            mask_is_longer = (
                attention_mask is not None
                and attention_mask.shape[1] > input_ids.shape[1]
            )
            if mask_is_longer:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            would_overflow = (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            )
            if would_overflow:
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        first_step_with_embeds = inputs_embeds is not None and past_key_values is None
        if first_step_with_embeds:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs["position_ids"] = position_ids
        model_inputs["past_key_values"] = past_key_values
        model_inputs["use_cache"] = kwargs.get("use_cache")
        model_inputs["attention_mask"] = attention_mask
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Re-index every cached tensor along dim 0 with `beam_idx`, layer by layer, returning a tuple of tuples."""
        return tuple(
            tuple(
                past_state.index_select(0, beam_idx.to(past_state.device))
                for past_state in layer_past
            )
            for layer_past in past_key_values
        )
1718
+
1719
+
1720
@add_start_docstrings(
    """
    The DeepseekV3 Model transformer with a sequence classification head on top (linear layer).

    [`DeepseekV3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    DeepseekV3_START_DOCSTRING,
)
class DeepseekV3ForSequenceClassification(DeepseekV3PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = DeepseekV3Model(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Per-token scores; one position per row is selected below.
        logits = self.score(transformer_outputs[0])

        batch_size = (
            input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
        )

        pad_token_id = self.config.pad_token_id
        if pad_token_id is None and batch_size != 1:
            raise ValueError(
                "Cannot handle batch sizes > 1 if no padding token is defined."
            )

        if pad_token_id is None or input_ids is None:
            # Without a pad token or without token ids, fall back to the last position of each row.
            sequence_lengths = -1
        else:
            # Index of the first pad token in each row, minus one.
            first_pad = torch.eq(input_ids, pad_token_id).int().argmax(-1)
            sequence_lengths = (first_pad - 1).to(logits.device)

        row_index = torch.arange(batch_size, device=logits.device)
        pooled_logits = logits[row_index, sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)

            # Infer the problem type once and remember it on the config.
            if self.config.problem_type is None:
                integer_labels = labels.dtype in (torch.long, torch.int)
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and integer_labels:
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(
                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
                )
            elif problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if return_dict:
            return SequenceClassifierOutputWithPast(
                loss=loss,
                logits=pooled_logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
            )

        output = (pooled_logits,) + transformer_outputs[1:]
        if loss is None:
            return output
        return (loss,) + output
quantization_config.json ADDED
@@ -0,0 +1,3002 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bits": 2,
3
+ "group_size": 64,
4
+ "sym": true,
5
+ "data_type": "int",
6
+ "enable_quanted_input": true,
7
+ "enable_minmax_tuning": true,
8
+ "seqlen": 512,
9
+ "batch_size": 4,
10
+ "scale_dtype": "torch.float16",
11
+ "lr": 0.0025,
12
+ "minmax_lr": 0.0025,
13
+ "gradient_accumulate_steps": 1,
14
+ "iters": 400,
15
+ "amp": true,
16
+ "nsamples": 512,
17
+ "low_gpu_mem_usage": false,
18
+ "to_quant_block_names": null,
19
+ "enable_norm_bias_tuning": false,
20
+ "dataset": "NeelNanda/pile-10k",
21
+ "autoround_version": "0.4.5",
22
+ "quant_method": "intel/auto-round",
23
+ "backend": "auto_round:gptq:exllamav2",
24
+ "extra_config": {
25
+ "model.layers.0.self_attn.q_a_proj": {
26
+ "bits": 4,
27
+ "group_size": 128
28
+ },
29
+ "model.layers.0.self_attn.q_b_proj": {
30
+ "bits": 4,
31
+ "group_size": 128
32
+ },
33
+ "model.layers.0.self_attn.kv_a_proj_with_mqa": {
34
+ "bits": 4,
35
+ "group_size": 128
36
+ },
37
+ "model.layers.0.self_attn.kv_b_proj": {
38
+ "bits": 4,
39
+ "group_size": 128
40
+ },
41
+ "model.layers.0.self_attn.o_proj": {
42
+ "bits": 4,
43
+ "group_size": 128
44
+ },
45
+ "model.layers.0.mlp.gate_proj": {
46
+ "bits": 4,
47
+ "group_size": 128
48
+ },
49
+ "model.layers.0.mlp.up_proj": {
50
+ "bits": 4,
51
+ "group_size": 128
52
+ },
53
+ "model.layers.0.mlp.down_proj": {
54
+ "bits": 4,
55
+ "group_size": 128
56
+ },
57
+ "model.layers.1.self_attn.q_a_proj": {
58
+ "bits": 4,
59
+ "group_size": 128
60
+ },
61
+ "model.layers.1.self_attn.q_b_proj": {
62
+ "bits": 4,
63
+ "group_size": 128
64
+ },
65
+ "model.layers.1.self_attn.kv_a_proj_with_mqa": {
66
+ "bits": 4,
67
+ "group_size": 128
68
+ },
69
+ "model.layers.1.self_attn.kv_b_proj": {
70
+ "bits": 4,
71
+ "group_size": 128
72
+ },
73
+ "model.layers.1.self_attn.o_proj": {
74
+ "bits": 4,
75
+ "group_size": 128
76
+ },
77
+ "model.layers.1.mlp.gate_proj": {
78
+ "bits": 4,
79
+ "group_size": 128
80
+ },
81
+ "model.layers.1.mlp.up_proj": {
82
+ "bits": 4,
83
+ "group_size": 128
84
+ },
85
+ "model.layers.1.mlp.down_proj": {
86
+ "bits": 4,
87
+ "group_size": 128
88
+ },
89
+ "model.layers.2.self_attn.q_a_proj": {
90
+ "bits": 4,
91
+ "group_size": 128
92
+ },
93
+ "model.layers.2.self_attn.q_b_proj": {
94
+ "bits": 4,
95
+ "group_size": 128
96
+ },
97
+ "model.layers.2.self_attn.kv_a_proj_with_mqa": {
98
+ "bits": 4,
99
+ "group_size": 128
100
+ },
101
+ "model.layers.2.self_attn.kv_b_proj": {
102
+ "bits": 4,
103
+ "group_size": 128
104
+ },
105
+ "model.layers.2.self_attn.o_proj": {
106
+ "bits": 4,
107
+ "group_size": 128
108
+ },
109
+ "model.layers.2.mlp.gate_proj": {
110
+ "bits": 4,
111
+ "group_size": 128
112
+ },
113
+ "model.layers.2.mlp.up_proj": {
114
+ "bits": 4,
115
+ "group_size": 128
116
+ },
117
+ "model.layers.2.mlp.down_proj": {
118
+ "bits": 4,
119
+ "group_size": 128
120
+ },
121
+ "model.layers.3.self_attn.q_a_proj": {
122
+ "bits": 4,
123
+ "group_size": 128
124
+ },
125
+ "model.layers.3.self_attn.q_b_proj": {
126
+ "bits": 4,
127
+ "group_size": 128
128
+ },
129
+ "model.layers.3.self_attn.kv_a_proj_with_mqa": {
130
+ "bits": 4,
131
+ "group_size": 128
132
+ },
133
+ "model.layers.3.self_attn.kv_b_proj": {
134
+ "bits": 4,
135
+ "group_size": 128
136
+ },
137
+ "model.layers.3.self_attn.o_proj": {
138
+ "bits": 4,
139
+ "group_size": 128
140
+ },
141
+ "model.layers.3.mlp.shared_experts.gate_proj": {
142
+ "bits": 4,
143
+ "group_size": 128
144
+ },
145
+ "model.layers.3.mlp.shared_experts.up_proj": {
146
+ "bits": 4,
147
+ "group_size": 128
148
+ },
149
+ "model.layers.3.mlp.shared_experts.down_proj": {
150
+ "bits": 4,
151
+ "group_size": 128
152
+ },
153
+ "model.layers.4.self_attn.q_a_proj": {
154
+ "bits": 4,
155
+ "group_size": 128
156
+ },
157
+ "model.layers.4.self_attn.q_b_proj": {
158
+ "bits": 4,
159
+ "group_size": 128
160
+ },
161
+ "model.layers.4.self_attn.kv_a_proj_with_mqa": {
162
+ "bits": 4,
163
+ "group_size": 128
164
+ },
165
+ "model.layers.4.self_attn.kv_b_proj": {
166
+ "bits": 4,
167
+ "group_size": 128
168
+ },
169
+ "model.layers.4.self_attn.o_proj": {
170
+ "bits": 4,
171
+ "group_size": 128
172
+ },
173
+ "model.layers.4.mlp.shared_experts.gate_proj": {
174
+ "bits": 4,
175
+ "group_size": 128
176
+ },
177
+ "model.layers.4.mlp.shared_experts.up_proj": {
178
+ "bits": 4,
179
+ "group_size": 128
180
+ },
181
+ "model.layers.4.mlp.shared_experts.down_proj": {
182
+ "bits": 4,
183
+ "group_size": 128
184
+ },
185
+ "model.layers.5.self_attn.q_a_proj": {
186
+ "bits": 4,
187
+ "group_size": 128
188
+ },
189
+ "model.layers.5.self_attn.q_b_proj": {
190
+ "bits": 4,
191
+ "group_size": 128
192
+ },
193
+ "model.layers.5.self_attn.kv_a_proj_with_mqa": {
194
+ "bits": 4,
195
+ "group_size": 128
196
+ },
197
+ "model.layers.5.self_attn.kv_b_proj": {
198
+ "bits": 4,
199
+ "group_size": 128
200
+ },
201
+ "model.layers.5.self_attn.o_proj": {
202
+ "bits": 4,
203
+ "group_size": 128
204
+ },
205
+ "model.layers.5.mlp.shared_experts.gate_proj": {
206
+ "bits": 4,
207
+ "group_size": 128
208
+ },
209
+ "model.layers.5.mlp.shared_experts.up_proj": {
210
+ "bits": 4,
211
+ "group_size": 128
212
+ },
213
+ "model.layers.5.mlp.shared_experts.down_proj": {
214
+ "bits": 4,
215
+ "group_size": 128
216
+ },
217
+ "model.layers.6.self_attn.q_a_proj": {
218
+ "bits": 4,
219
+ "group_size": 128
220
+ },
221
+ "model.layers.6.self_attn.q_b_proj": {
222
+ "bits": 4,
223
+ "group_size": 128
224
+ },
225
+ "model.layers.6.self_attn.kv_a_proj_with_mqa": {
226
+ "bits": 4,
227
+ "group_size": 128
228
+ },
229
+ "model.layers.6.self_attn.kv_b_proj": {
230
+ "bits": 4,
231
+ "group_size": 128
232
+ },
233
+ "model.layers.6.self_attn.o_proj": {
234
+ "bits": 4,
235
+ "group_size": 128
236
+ },
237
+ "model.layers.6.mlp.shared_experts.gate_proj": {
238
+ "bits": 4,
239
+ "group_size": 128
240
+ },
241
+ "model.layers.6.mlp.shared_experts.up_proj": {
242
+ "bits": 4,
243
+ "group_size": 128
244
+ },
245
+ "model.layers.6.mlp.shared_experts.down_proj": {
246
+ "bits": 4,
247
+ "group_size": 128
248
+ },
249
+ "model.layers.7.self_attn.q_a_proj": {
250
+ "bits": 4,
251
+ "group_size": 128
252
+ },
253
+ "model.layers.7.self_attn.q_b_proj": {
254
+ "bits": 4,
255
+ "group_size": 128
256
+ },
257
+ "model.layers.7.self_attn.kv_a_proj_with_mqa": {
258
+ "bits": 4,
259
+ "group_size": 128
260
+ },
261
+ "model.layers.7.self_attn.kv_b_proj": {
262
+ "bits": 4,
263
+ "group_size": 128
264
+ },
265
+ "model.layers.7.self_attn.o_proj": {
266
+ "bits": 4,
267
+ "group_size": 128
268
+ },
269
+ "model.layers.7.mlp.shared_experts.gate_proj": {
270
+ "bits": 4,
271
+ "group_size": 128
272
+ },
273
+ "model.layers.7.mlp.shared_experts.up_proj": {
274
+ "bits": 4,
275
+ "group_size": 128
276
+ },
277
+ "model.layers.7.mlp.shared_experts.down_proj": {
278
+ "bits": 4,
279
+ "group_size": 128
280
+ },
281
+ "model.layers.8.self_attn.q_a_proj": {
282
+ "bits": 4,
283
+ "group_size": 128
284
+ },
285
+ "model.layers.8.self_attn.q_b_proj": {
286
+ "bits": 4,
287
+ "group_size": 128
288
+ },
289
+ "model.layers.8.self_attn.kv_a_proj_with_mqa": {
290
+ "bits": 4,
291
+ "group_size": 128
292
+ },
293
+ "model.layers.8.self_attn.kv_b_proj": {
294
+ "bits": 4,
295
+ "group_size": 128
296
+ },
297
+ "model.layers.8.self_attn.o_proj": {
298
+ "bits": 4,
299
+ "group_size": 128
300
+ },
301
+ "model.layers.8.mlp.shared_experts.gate_proj": {
302
+ "bits": 4,
303
+ "group_size": 128
304
+ },
305
+ "model.layers.8.mlp.shared_experts.up_proj": {
306
+ "bits": 4,
307
+ "group_size": 128
308
+ },
309
+ "model.layers.8.mlp.shared_experts.down_proj": {
310
+ "bits": 4,
311
+ "group_size": 128
312
+ },
313
+ "model.layers.9.self_attn.q_a_proj": {
314
+ "bits": 4,
315
+ "group_size": 128
316
+ },
317
+ "model.layers.9.self_attn.q_b_proj": {
318
+ "bits": 4,
319
+ "group_size": 128
320
+ },
321
+ "model.layers.9.self_attn.kv_a_proj_with_mqa": {
322
+ "bits": 4,
323
+ "group_size": 128
324
+ },
325
+ "model.layers.9.self_attn.kv_b_proj": {
326
+ "bits": 4,
327
+ "group_size": 128
328
+ },
329
+ "model.layers.9.self_attn.o_proj": {
330
+ "bits": 4,
331
+ "group_size": 128
332
+ },
333
+ "model.layers.9.mlp.shared_experts.gate_proj": {
334
+ "bits": 4,
335
+ "group_size": 128
336
+ },
337
+ "model.layers.9.mlp.shared_experts.up_proj": {
338
+ "bits": 4,
339
+ "group_size": 128
340
+ },
341
+ "model.layers.9.mlp.shared_experts.down_proj": {
342
+ "bits": 4,
343
+ "group_size": 128
344
+ },
345
+ "model.layers.10.self_attn.q_a_proj": {
346
+ "bits": 4,
347
+ "group_size": 128
348
+ },
349
+ "model.layers.10.self_attn.q_b_proj": {
350
+ "bits": 4,
351
+ "group_size": 128
352
+ },
353
+ "model.layers.10.self_attn.kv_a_proj_with_mqa": {
354
+ "bits": 4,
355
+ "group_size": 128
356
+ },
357
+ "model.layers.10.self_attn.kv_b_proj": {
358
+ "bits": 4,
359
+ "group_size": 128
360
+ },
361
+ "model.layers.10.self_attn.o_proj": {
362
+ "bits": 4,
363
+ "group_size": 128
364
+ },
365
+ "model.layers.10.mlp.shared_experts.gate_proj": {
366
+ "bits": 4,
367
+ "group_size": 128
368
+ },
369
+ "model.layers.10.mlp.shared_experts.up_proj": {
370
+ "bits": 4,
371
+ "group_size": 128
372
+ },
373
+ "model.layers.10.mlp.shared_experts.down_proj": {
374
+ "bits": 4,
375
+ "group_size": 128
376
+ },
377
+ "model.layers.11.self_attn.q_a_proj": {
378
+ "bits": 4,
379
+ "group_size": 128
380
+ },
381
+ "model.layers.11.self_attn.q_b_proj": {
382
+ "bits": 4,
383
+ "group_size": 128
384
+ },
385
+ "model.layers.11.self_attn.kv_a_proj_with_mqa": {
386
+ "bits": 4,
387
+ "group_size": 128
388
+ },
389
+ "model.layers.11.self_attn.kv_b_proj": {
390
+ "bits": 4,
391
+ "group_size": 128
392
+ },
393
+ "model.layers.11.self_attn.o_proj": {
394
+ "bits": 4,
395
+ "group_size": 128
396
+ },
397
+ "model.layers.11.mlp.shared_experts.gate_proj": {
398
+ "bits": 4,
399
+ "group_size": 128
400
+ },
401
+ "model.layers.11.mlp.shared_experts.up_proj": {
402
+ "bits": 4,
403
+ "group_size": 128
404
+ },
405
+ "model.layers.11.mlp.shared_experts.down_proj": {
406
+ "bits": 4,
407
+ "group_size": 128
408
+ },
409
+ "model.layers.12.self_attn.q_a_proj": {
410
+ "bits": 4,
411
+ "group_size": 128
412
+ },
413
+ "model.layers.12.self_attn.q_b_proj": {
414
+ "bits": 4,
415
+ "group_size": 128
416
+ },
417
+ "model.layers.12.self_attn.kv_a_proj_with_mqa": {
418
+ "bits": 4,
419
+ "group_size": 128
420
+ },
421
+ "model.layers.12.self_attn.kv_b_proj": {
422
+ "bits": 4,
423
+ "group_size": 128
424
+ },
425
+ "model.layers.12.self_attn.o_proj": {
426
+ "bits": 4,
427
+ "group_size": 128
428
+ },
429
+ "model.layers.12.mlp.shared_experts.gate_proj": {
430
+ "bits": 4,
431
+ "group_size": 128
432
+ },
433
+ "model.layers.12.mlp.shared_experts.up_proj": {
434
+ "bits": 4,
435
+ "group_size": 128
436
+ },
437
+ "model.layers.12.mlp.shared_experts.down_proj": {
438
+ "bits": 4,
439
+ "group_size": 128
440
+ },
441
+ "model.layers.13.self_attn.q_a_proj": {
442
+ "bits": 4,
443
+ "group_size": 128
444
+ },
445
+ "model.layers.13.self_attn.q_b_proj": {
446
+ "bits": 4,
447
+ "group_size": 128
448
+ },
449
+ "model.layers.13.self_attn.kv_a_proj_with_mqa": {
450
+ "bits": 4,
451
+ "group_size": 128
452
+ },
453
+ "model.layers.13.self_attn.kv_b_proj": {
454
+ "bits": 4,
455
+ "group_size": 128
456
+ },
457
+ "model.layers.13.self_attn.o_proj": {
458
+ "bits": 4,
459
+ "group_size": 128
460
+ },
461
+ "model.layers.13.mlp.shared_experts.gate_proj": {
462
+ "bits": 4,
463
+ "group_size": 128
464
+ },
465
+ "model.layers.13.mlp.shared_experts.up_proj": {
466
+ "bits": 4,
467
+ "group_size": 128
468
+ },
469
+ "model.layers.13.mlp.shared_experts.down_proj": {
470
+ "bits": 4,
471
+ "group_size": 128
472
+ },
473
+ "model.layers.14.self_attn.q_a_proj": {
474
+ "bits": 4,
475
+ "group_size": 128
476
+ },
477
+ "model.layers.14.self_attn.q_b_proj": {
478
+ "bits": 4,
479
+ "group_size": 128
480
+ },
481
+ "model.layers.14.self_attn.kv_a_proj_with_mqa": {
482
+ "bits": 4,
483
+ "group_size": 128
484
+ },
485
+ "model.layers.14.self_attn.kv_b_proj": {
486
+ "bits": 4,
487
+ "group_size": 128
488
+ },
489
+ "model.layers.14.self_attn.o_proj": {
490
+ "bits": 4,
491
+ "group_size": 128
492
+ },
493
+ "model.layers.14.mlp.shared_experts.gate_proj": {
494
+ "bits": 4,
495
+ "group_size": 128
496
+ },
497
+ "model.layers.14.mlp.shared_experts.up_proj": {
498
+ "bits": 4,
499
+ "group_size": 128
500
+ },
501
+ "model.layers.14.mlp.shared_experts.down_proj": {
502
+ "bits": 4,
503
+ "group_size": 128
504
+ },
505
+ "model.layers.15.self_attn.q_a_proj": {
506
+ "bits": 4,
507
+ "group_size": 128
508
+ },
509
+ "model.layers.15.self_attn.q_b_proj": {
510
+ "bits": 4,
511
+ "group_size": 128
512
+ },
513
+ "model.layers.15.self_attn.kv_a_proj_with_mqa": {
514
+ "bits": 4,
515
+ "group_size": 128
516
+ },
517
+ "model.layers.15.self_attn.kv_b_proj": {
518
+ "bits": 4,
519
+ "group_size": 128
520
+ },
521
+ "model.layers.15.self_attn.o_proj": {
522
+ "bits": 4,
523
+ "group_size": 128
524
+ },
525
+ "model.layers.15.mlp.shared_experts.gate_proj": {
526
+ "bits": 4,
527
+ "group_size": 128
528
+ },
529
+ "model.layers.15.mlp.shared_experts.up_proj": {
530
+ "bits": 4,
531
+ "group_size": 128
532
+ },
533
+ "model.layers.15.mlp.shared_experts.down_proj": {
534
+ "bits": 4,
535
+ "group_size": 128
536
+ },
537
+ "model.layers.16.self_attn.q_a_proj": {
538
+ "bits": 4,
539
+ "group_size": 128
540
+ },
541
+ "model.layers.16.self_attn.q_b_proj": {
542
+ "bits": 4,
543
+ "group_size": 128
544
+ },
545
+ "model.layers.16.self_attn.kv_a_proj_with_mqa": {
546
+ "bits": 4,
547
+ "group_size": 128
548
+ },
549
+ "model.layers.16.self_attn.kv_b_proj": {
550
+ "bits": 4,
551
+ "group_size": 128
552
+ },
553
+ "model.layers.16.self_attn.o_proj": {
554
+ "bits": 4,
555
+ "group_size": 128
556
+ },
557
+ "model.layers.16.mlp.shared_experts.gate_proj": {
558
+ "bits": 4,
559
+ "group_size": 128
560
+ },
561
+ "model.layers.16.mlp.shared_experts.up_proj": {
562
+ "bits": 4,
563
+ "group_size": 128
564
+ },
565
+ "model.layers.16.mlp.shared_experts.down_proj": {
566
+ "bits": 4,
567
+ "group_size": 128
568
+ },
569
+ "model.layers.17.self_attn.q_a_proj": {
570
+ "bits": 4,
571
+ "group_size": 128
572
+ },
573
+ "model.layers.17.self_attn.q_b_proj": {
574
+ "bits": 4,
575
+ "group_size": 128
576
+ },
577
+ "model.layers.17.self_attn.kv_a_proj_with_mqa": {
578
+ "bits": 4,
579
+ "group_size": 128
580
+ },
581
+ "model.layers.17.self_attn.kv_b_proj": {
582
+ "bits": 4,
583
+ "group_size": 128
584
+ },
585
+ "model.layers.17.self_attn.o_proj": {
586
+ "bits": 4,
587
+ "group_size": 128
588
+ },
589
+ "model.layers.17.mlp.shared_experts.gate_proj": {
590
+ "bits": 4,
591
+ "group_size": 128
592
+ },
593
+ "model.layers.17.mlp.shared_experts.up_proj": {
594
+ "bits": 4,
595
+ "group_size": 128
596
+ },
597
+ "model.layers.17.mlp.shared_experts.down_proj": {
598
+ "bits": 4,
599
+ "group_size": 128
600
+ },
601
+ "model.layers.18.self_attn.q_a_proj": {
602
+ "bits": 4,
603
+ "group_size": 128
604
+ },
605
+ "model.layers.18.self_attn.q_b_proj": {
606
+ "bits": 4,
607
+ "group_size": 128
608
+ },
609
+ "model.layers.18.self_attn.kv_a_proj_with_mqa": {
610
+ "bits": 4,
611
+ "group_size": 128
612
+ },
613
+ "model.layers.18.self_attn.kv_b_proj": {
614
+ "bits": 4,
615
+ "group_size": 128
616
+ },
617
+ "model.layers.18.self_attn.o_proj": {
618
+ "bits": 4,
619
+ "group_size": 128
620
+ },
621
+ "model.layers.18.mlp.shared_experts.gate_proj": {
622
+ "bits": 4,
623
+ "group_size": 128
624
+ },
625
+ "model.layers.18.mlp.shared_experts.up_proj": {
626
+ "bits": 4,
627
+ "group_size": 128
628
+ },
629
+ "model.layers.18.mlp.shared_experts.down_proj": {
630
+ "bits": 4,
631
+ "group_size": 128
632
+ },
633
+ "model.layers.19.self_attn.q_a_proj": {
634
+ "bits": 4,
635
+ "group_size": 128
636
+ },
637
+ "model.layers.19.self_attn.q_b_proj": {
638
+ "bits": 4,
639
+ "group_size": 128
640
+ },
641
+ "model.layers.19.self_attn.kv_a_proj_with_mqa": {
642
+ "bits": 4,
643
+ "group_size": 128
644
+ },
645
+ "model.layers.19.self_attn.kv_b_proj": {
646
+ "bits": 4,
647
+ "group_size": 128
648
+ },
649
+ "model.layers.19.self_attn.o_proj": {
650
+ "bits": 4,
651
+ "group_size": 128
652
+ },
653
+ "model.layers.19.mlp.shared_experts.gate_proj": {
654
+ "bits": 4,
655
+ "group_size": 128
656
+ },
657
+ "model.layers.19.mlp.shared_experts.up_proj": {
658
+ "bits": 4,
659
+ "group_size": 128
660
+ },
661
+ "model.layers.19.mlp.shared_experts.down_proj": {
662
+ "bits": 4,
663
+ "group_size": 128
664
+ },
665
+ "model.layers.20.self_attn.q_a_proj": {
666
+ "bits": 4,
667
+ "group_size": 128
668
+ },
669
+ "model.layers.20.self_attn.q_b_proj": {
670
+ "bits": 4,
671
+ "group_size": 128
672
+ },
673
+ "model.layers.20.self_attn.kv_a_proj_with_mqa": {
674
+ "bits": 4,
675
+ "group_size": 128
676
+ },
677
+ "model.layers.20.self_attn.kv_b_proj": {
678
+ "bits": 4,
679
+ "group_size": 128
680
+ },
681
+ "model.layers.20.self_attn.o_proj": {
682
+ "bits": 4,
683
+ "group_size": 128
684
+ },
685
+ "model.layers.20.mlp.shared_experts.gate_proj": {
686
+ "bits": 4,
687
+ "group_size": 128
688
+ },
689
+ "model.layers.20.mlp.shared_experts.up_proj": {
690
+ "bits": 4,
691
+ "group_size": 128
692
+ },
693
+ "model.layers.20.mlp.shared_experts.down_proj": {
694
+ "bits": 4,
695
+ "group_size": 128
696
+ },
697
+ "model.layers.21.self_attn.q_a_proj": {
698
+ "bits": 4,
699
+ "group_size": 128
700
+ },
701
+ "model.layers.21.self_attn.q_b_proj": {
702
+ "bits": 4,
703
+ "group_size": 128
704
+ },
705
+ "model.layers.21.self_attn.kv_a_proj_with_mqa": {
706
+ "bits": 4,
707
+ "group_size": 128
708
+ },
709
+ "model.layers.21.self_attn.kv_b_proj": {
710
+ "bits": 4,
711
+ "group_size": 128
712
+ },
713
+ "model.layers.21.self_attn.o_proj": {
714
+ "bits": 4,
715
+ "group_size": 128
716
+ },
717
+ "model.layers.21.mlp.shared_experts.gate_proj": {
718
+ "bits": 4,
719
+ "group_size": 128
720
+ },
721
+ "model.layers.21.mlp.shared_experts.up_proj": {
722
+ "bits": 4,
723
+ "group_size": 128
724
+ },
725
+ "model.layers.21.mlp.shared_experts.down_proj": {
726
+ "bits": 4,
727
+ "group_size": 128
728
+ },
729
+ "model.layers.22.self_attn.q_a_proj": {
730
+ "bits": 4,
731
+ "group_size": 128
732
+ },
733
+ "model.layers.22.self_attn.q_b_proj": {
734
+ "bits": 4,
735
+ "group_size": 128
736
+ },
737
+ "model.layers.22.self_attn.kv_a_proj_with_mqa": {
738
+ "bits": 4,
739
+ "group_size": 128
740
+ },
741
+ "model.layers.22.self_attn.kv_b_proj": {
742
+ "bits": 4,
743
+ "group_size": 128
744
+ },
745
+ "model.layers.22.self_attn.o_proj": {
746
+ "bits": 4,
747
+ "group_size": 128
748
+ },
749
+ "model.layers.22.mlp.shared_experts.gate_proj": {
750
+ "bits": 4,
751
+ "group_size": 128
752
+ },
753
+ "model.layers.22.mlp.shared_experts.up_proj": {
754
+ "bits": 4,
755
+ "group_size": 128
756
+ },
757
+ "model.layers.22.mlp.shared_experts.down_proj": {
758
+ "bits": 4,
759
+ "group_size": 128
760
+ },
761
+ "model.layers.23.self_attn.q_a_proj": {
762
+ "bits": 4,
763
+ "group_size": 128
764
+ },
765
+ "model.layers.23.self_attn.q_b_proj": {
766
+ "bits": 4,
767
+ "group_size": 128
768
+ },
769
+ "model.layers.23.self_attn.kv_a_proj_with_mqa": {
770
+ "bits": 4,
771
+ "group_size": 128
772
+ },
773
+ "model.layers.23.self_attn.kv_b_proj": {
774
+ "bits": 4,
775
+ "group_size": 128
776
+ },
777
+ "model.layers.23.self_attn.o_proj": {
778
+ "bits": 4,
779
+ "group_size": 128
780
+ },
781
+ "model.layers.23.mlp.shared_experts.gate_proj": {
782
+ "bits": 4,
783
+ "group_size": 128
784
+ },
785
+ "model.layers.23.mlp.shared_experts.up_proj": {
786
+ "bits": 4,
787
+ "group_size": 128
788
+ },
789
+ "model.layers.23.mlp.shared_experts.down_proj": {
790
+ "bits": 4,
791
+ "group_size": 128
792
+ },
793
+ "model.layers.24.self_attn.q_a_proj": {
794
+ "bits": 4,
795
+ "group_size": 128
796
+ },
797
+ "model.layers.24.self_attn.q_b_proj": {
798
+ "bits": 4,
799
+ "group_size": 128
800
+ },
801
+ "model.layers.24.self_attn.kv_a_proj_with_mqa": {
802
+ "bits": 4,
803
+ "group_size": 128
804
+ },
805
+ "model.layers.24.self_attn.kv_b_proj": {
806
+ "bits": 4,
807
+ "group_size": 128
808
+ },
809
+ "model.layers.24.self_attn.o_proj": {
810
+ "bits": 4,
811
+ "group_size": 128
812
+ },
813
+ "model.layers.24.mlp.shared_experts.gate_proj": {
814
+ "bits": 4,
815
+ "group_size": 128
816
+ },
817
+ "model.layers.24.mlp.shared_experts.up_proj": {
818
+ "bits": 4,
819
+ "group_size": 128
820
+ },
821
+ "model.layers.24.mlp.shared_experts.down_proj": {
822
+ "bits": 4,
823
+ "group_size": 128
824
+ },
825
+ "model.layers.25.self_attn.q_a_proj": {
826
+ "bits": 4,
827
+ "group_size": 128
828
+ },
829
+ "model.layers.25.self_attn.q_b_proj": {
830
+ "bits": 4,
831
+ "group_size": 128
832
+ },
833
+ "model.layers.25.self_attn.kv_a_proj_with_mqa": {
834
+ "bits": 4,
835
+ "group_size": 128
836
+ },
837
+ "model.layers.25.self_attn.kv_b_proj": {
838
+ "bits": 4,
839
+ "group_size": 128
840
+ },
841
+ "model.layers.25.self_attn.o_proj": {
842
+ "bits": 4,
843
+ "group_size": 128
844
+ },
845
+ "model.layers.25.mlp.shared_experts.gate_proj": {
846
+ "bits": 4,
847
+ "group_size": 128
848
+ },
849
+ "model.layers.25.mlp.shared_experts.up_proj": {
850
+ "bits": 4,
851
+ "group_size": 128
852
+ },
853
+ "model.layers.25.mlp.shared_experts.down_proj": {
854
+ "bits": 4,
855
+ "group_size": 128
856
+ },
857
+ "model.layers.26.self_attn.q_a_proj": {
858
+ "bits": 4,
859
+ "group_size": 128
860
+ },
861
+ "model.layers.26.self_attn.q_b_proj": {
862
+ "bits": 4,
863
+ "group_size": 128
864
+ },
865
+ "model.layers.26.self_attn.kv_a_proj_with_mqa": {
866
+ "bits": 4,
867
+ "group_size": 128
868
+ },
869
+ "model.layers.26.self_attn.kv_b_proj": {
870
+ "bits": 4,
871
+ "group_size": 128
872
+ },
873
+ "model.layers.26.self_attn.o_proj": {
874
+ "bits": 4,
875
+ "group_size": 128
876
+ },
877
+ "model.layers.26.mlp.shared_experts.gate_proj": {
878
+ "bits": 4,
879
+ "group_size": 128
880
+ },
881
+ "model.layers.26.mlp.shared_experts.up_proj": {
882
+ "bits": 4,
883
+ "group_size": 128
884
+ },
885
+ "model.layers.26.mlp.shared_experts.down_proj": {
886
+ "bits": 4,
887
+ "group_size": 128
888
+ },
889
+ "model.layers.27.self_attn.q_a_proj": {
890
+ "bits": 4,
891
+ "group_size": 128
892
+ },
893
+ "model.layers.27.self_attn.q_b_proj": {
894
+ "bits": 4,
895
+ "group_size": 128
896
+ },
897
+ "model.layers.27.self_attn.kv_a_proj_with_mqa": {
898
+ "bits": 4,
899
+ "group_size": 128
900
+ },
901
+ "model.layers.27.self_attn.kv_b_proj": {
902
+ "bits": 4,
903
+ "group_size": 128
904
+ },
905
+ "model.layers.27.self_attn.o_proj": {
906
+ "bits": 4,
907
+ "group_size": 128
908
+ },
909
+ "model.layers.27.mlp.shared_experts.gate_proj": {
910
+ "bits": 4,
911
+ "group_size": 128
912
+ },
913
+ "model.layers.27.mlp.shared_experts.up_proj": {
914
+ "bits": 4,
915
+ "group_size": 128
916
+ },
917
+ "model.layers.27.mlp.shared_experts.down_proj": {
918
+ "bits": 4,
919
+ "group_size": 128
920
+ },
921
+ "model.layers.28.self_attn.q_a_proj": {
922
+ "bits": 4,
923
+ "group_size": 128
924
+ },
925
+ "model.layers.28.self_attn.q_b_proj": {
926
+ "bits": 4,
927
+ "group_size": 128
928
+ },
929
+ "model.layers.28.self_attn.kv_a_proj_with_mqa": {
930
+ "bits": 4,
931
+ "group_size": 128
932
+ },
933
+ "model.layers.28.self_attn.kv_b_proj": {
934
+ "bits": 4,
935
+ "group_size": 128
936
+ },
937
+ "model.layers.28.self_attn.o_proj": {
938
+ "bits": 4,
939
+ "group_size": 128
940
+ },
941
+ "model.layers.28.mlp.shared_experts.gate_proj": {
942
+ "bits": 4,
943
+ "group_size": 128
944
+ },
945
+ "model.layers.28.mlp.shared_experts.up_proj": {
946
+ "bits": 4,
947
+ "group_size": 128
948
+ },
949
+ "model.layers.28.mlp.shared_experts.down_proj": {
950
+ "bits": 4,
951
+ "group_size": 128
952
+ },
953
+ "model.layers.29.self_attn.q_a_proj": {
954
+ "bits": 4,
955
+ "group_size": 128
956
+ },
957
+ "model.layers.29.self_attn.q_b_proj": {
958
+ "bits": 4,
959
+ "group_size": 128
960
+ },
961
+ "model.layers.29.self_attn.kv_a_proj_with_mqa": {
962
+ "bits": 4,
963
+ "group_size": 128
964
+ },
965
+ "model.layers.29.self_attn.kv_b_proj": {
966
+ "bits": 4,
967
+ "group_size": 128
968
+ },
969
+ "model.layers.29.self_attn.o_proj": {
970
+ "bits": 4,
971
+ "group_size": 128
972
+ },
973
+ "model.layers.29.mlp.shared_experts.gate_proj": {
974
+ "bits": 4,
975
+ "group_size": 128
976
+ },
977
+ "model.layers.29.mlp.shared_experts.up_proj": {
978
+ "bits": 4,
979
+ "group_size": 128
980
+ },
981
+ "model.layers.29.mlp.shared_experts.down_proj": {
982
+ "bits": 4,
983
+ "group_size": 128
984
+ },
985
+ "model.layers.30.self_attn.q_a_proj": {
986
+ "bits": 4,
987
+ "group_size": 128
988
+ },
989
+ "model.layers.30.self_attn.q_b_proj": {
990
+ "bits": 4,
991
+ "group_size": 128
992
+ },
993
+ "model.layers.30.self_attn.kv_a_proj_with_mqa": {
994
+ "bits": 4,
995
+ "group_size": 128
996
+ },
997
+ "model.layers.30.self_attn.kv_b_proj": {
998
+ "bits": 4,
999
+ "group_size": 128
1000
+ },
1001
+ "model.layers.30.self_attn.o_proj": {
1002
+ "bits": 4,
1003
+ "group_size": 128
1004
+ },
1005
+ "model.layers.30.mlp.shared_experts.gate_proj": {
1006
+ "bits": 4,
1007
+ "group_size": 128
1008
+ },
1009
+ "model.layers.30.mlp.shared_experts.up_proj": {
1010
+ "bits": 4,
1011
+ "group_size": 128
1012
+ },
1013
+ "model.layers.30.mlp.shared_experts.down_proj": {
1014
+ "bits": 4,
1015
+ "group_size": 128
1016
+ },
1017
+ "model.layers.31.self_attn.q_a_proj": {
1018
+ "bits": 4,
1019
+ "group_size": 128
1020
+ },
1021
+ "model.layers.31.self_attn.q_b_proj": {
1022
+ "bits": 4,
1023
+ "group_size": 128
1024
+ },
1025
+ "model.layers.31.self_attn.kv_a_proj_with_mqa": {
1026
+ "bits": 4,
1027
+ "group_size": 128
1028
+ },
1029
+ "model.layers.31.self_attn.kv_b_proj": {
1030
+ "bits": 4,
1031
+ "group_size": 128
1032
+ },
1033
+ "model.layers.31.self_attn.o_proj": {
1034
+ "bits": 4,
1035
+ "group_size": 128
1036
+ },
1037
+ "model.layers.31.mlp.shared_experts.gate_proj": {
1038
+ "bits": 4,
1039
+ "group_size": 128
1040
+ },
1041
+ "model.layers.31.mlp.shared_experts.up_proj": {
1042
+ "bits": 4,
1043
+ "group_size": 128
1044
+ },
1045
+ "model.layers.31.mlp.shared_experts.down_proj": {
1046
+ "bits": 4,
1047
+ "group_size": 128
1048
+ },
1049
+ "model.layers.32.self_attn.q_a_proj": {
1050
+ "bits": 4,
1051
+ "group_size": 128
1052
+ },
1053
+ "model.layers.32.self_attn.q_b_proj": {
1054
+ "bits": 4,
1055
+ "group_size": 128
1056
+ },
1057
+ "model.layers.32.self_attn.kv_a_proj_with_mqa": {
1058
+ "bits": 4,
1059
+ "group_size": 128
1060
+ },
1061
+ "model.layers.32.self_attn.kv_b_proj": {
1062
+ "bits": 4,
1063
+ "group_size": 128
1064
+ },
1065
+ "model.layers.32.self_attn.o_proj": {
1066
+ "bits": 4,
1067
+ "group_size": 128
1068
+ },
1069
+ "model.layers.32.mlp.shared_experts.gate_proj": {
1070
+ "bits": 4,
1071
+ "group_size": 128
1072
+ },
1073
+ "model.layers.32.mlp.shared_experts.up_proj": {
1074
+ "bits": 4,
1075
+ "group_size": 128
1076
+ },
1077
+ "model.layers.32.mlp.shared_experts.down_proj": {
1078
+ "bits": 4,
1079
+ "group_size": 128
1080
+ },
1081
+ "model.layers.33.self_attn.q_a_proj": {
1082
+ "bits": 4,
1083
+ "group_size": 128
1084
+ },
1085
+ "model.layers.33.self_attn.q_b_proj": {
1086
+ "bits": 4,
1087
+ "group_size": 128
1088
+ },
1089
+ "model.layers.33.self_attn.kv_a_proj_with_mqa": {
1090
+ "bits": 4,
1091
+ "group_size": 128
1092
+ },
1093
+ "model.layers.33.self_attn.kv_b_proj": {
1094
+ "bits": 4,
1095
+ "group_size": 128
1096
+ },
1097
+ "model.layers.33.self_attn.o_proj": {
1098
+ "bits": 4,
1099
+ "group_size": 128
1100
+ },
1101
+ "model.layers.33.mlp.shared_experts.gate_proj": {
1102
+ "bits": 4,
1103
+ "group_size": 128
1104
+ },
1105
+ "model.layers.33.mlp.shared_experts.up_proj": {
1106
+ "bits": 4,
1107
+ "group_size": 128
1108
+ },
1109
+ "model.layers.33.mlp.shared_experts.down_proj": {
1110
+ "bits": 4,
1111
+ "group_size": 128
1112
+ },
1113
+ "model.layers.34.self_attn.q_a_proj": {
1114
+ "bits": 4,
1115
+ "group_size": 128
1116
+ },
1117
+ "model.layers.34.self_attn.q_b_proj": {
1118
+ "bits": 4,
1119
+ "group_size": 128
1120
+ },
1121
+ "model.layers.34.self_attn.kv_a_proj_with_mqa": {
1122
+ "bits": 4,
1123
+ "group_size": 128
1124
+ },
1125
+ "model.layers.34.self_attn.kv_b_proj": {
1126
+ "bits": 4,
1127
+ "group_size": 128
1128
+ },
1129
+ "model.layers.34.self_attn.o_proj": {
1130
+ "bits": 4,
1131
+ "group_size": 128
1132
+ },
1133
+ "model.layers.34.mlp.shared_experts.gate_proj": {
1134
+ "bits": 4,
1135
+ "group_size": 128
1136
+ },
1137
+ "model.layers.34.mlp.shared_experts.up_proj": {
1138
+ "bits": 4,
1139
+ "group_size": 128
1140
+ },
1141
+ "model.layers.34.mlp.shared_experts.down_proj": {
1142
+ "bits": 4,
1143
+ "group_size": 128
1144
+ },
1145
+ "model.layers.35.self_attn.q_a_proj": {
1146
+ "bits": 4,
1147
+ "group_size": 128
1148
+ },
1149
+ "model.layers.35.self_attn.q_b_proj": {
1150
+ "bits": 4,
1151
+ "group_size": 128
1152
+ },
1153
+ "model.layers.35.self_attn.kv_a_proj_with_mqa": {
1154
+ "bits": 4,
1155
+ "group_size": 128
1156
+ },
1157
+ "model.layers.35.self_attn.kv_b_proj": {
1158
+ "bits": 4,
1159
+ "group_size": 128
1160
+ },
1161
+ "model.layers.35.self_attn.o_proj": {
1162
+ "bits": 4,
1163
+ "group_size": 128
1164
+ },
1165
+ "model.layers.35.mlp.shared_experts.gate_proj": {
1166
+ "bits": 4,
1167
+ "group_size": 128
1168
+ },
1169
+ "model.layers.35.mlp.shared_experts.up_proj": {
1170
+ "bits": 4,
1171
+ "group_size": 128
1172
+ },
1173
+ "model.layers.35.mlp.shared_experts.down_proj": {
1174
+ "bits": 4,
1175
+ "group_size": 128
1176
+ },
1177
+ "model.layers.36.self_attn.q_a_proj": {
1178
+ "bits": 4,
1179
+ "group_size": 128
1180
+ },
1181
+ "model.layers.36.self_attn.q_b_proj": {
1182
+ "bits": 4,
1183
+ "group_size": 128
1184
+ },
1185
+ "model.layers.36.self_attn.kv_a_proj_with_mqa": {
1186
+ "bits": 4,
1187
+ "group_size": 128
1188
+ },
1189
+ "model.layers.36.self_attn.kv_b_proj": {
1190
+ "bits": 4,
1191
+ "group_size": 128
1192
+ },
1193
+ "model.layers.36.self_attn.o_proj": {
1194
+ "bits": 4,
1195
+ "group_size": 128
1196
+ },
1197
+ "model.layers.36.mlp.shared_experts.gate_proj": {
1198
+ "bits": 4,
1199
+ "group_size": 128
1200
+ },
1201
+ "model.layers.36.mlp.shared_experts.up_proj": {
1202
+ "bits": 4,
1203
+ "group_size": 128
1204
+ },
1205
+ "model.layers.36.mlp.shared_experts.down_proj": {
1206
+ "bits": 4,
1207
+ "group_size": 128
1208
+ },
1209
+ "model.layers.37.self_attn.q_a_proj": {
1210
+ "bits": 4,
1211
+ "group_size": 128
1212
+ },
1213
+ "model.layers.37.self_attn.q_b_proj": {
1214
+ "bits": 4,
1215
+ "group_size": 128
1216
+ },
1217
+ "model.layers.37.self_attn.kv_a_proj_with_mqa": {
1218
+ "bits": 4,
1219
+ "group_size": 128
1220
+ },
1221
+ "model.layers.37.self_attn.kv_b_proj": {
1222
+ "bits": 4,
1223
+ "group_size": 128
1224
+ },
1225
+ "model.layers.37.self_attn.o_proj": {
1226
+ "bits": 4,
1227
+ "group_size": 128
1228
+ },
1229
+ "model.layers.37.mlp.shared_experts.gate_proj": {
1230
+ "bits": 4,
1231
+ "group_size": 128
1232
+ },
1233
+ "model.layers.37.mlp.shared_experts.up_proj": {
1234
+ "bits": 4,
1235
+ "group_size": 128
1236
+ },
1237
+ "model.layers.37.mlp.shared_experts.down_proj": {
1238
+ "bits": 4,
1239
+ "group_size": 128
1240
+ },
1241
+ "model.layers.38.self_attn.q_a_proj": {
1242
+ "bits": 4,
1243
+ "group_size": 128
1244
+ },
1245
+ "model.layers.38.self_attn.q_b_proj": {
1246
+ "bits": 4,
1247
+ "group_size": 128
1248
+ },
1249
+ "model.layers.38.self_attn.kv_a_proj_with_mqa": {
1250
+ "bits": 4,
1251
+ "group_size": 128
1252
+ },
1253
+ "model.layers.38.self_attn.kv_b_proj": {
1254
+ "bits": 4,
1255
+ "group_size": 128
1256
+ },
1257
+ "model.layers.38.self_attn.o_proj": {
1258
+ "bits": 4,
1259
+ "group_size": 128
1260
+ },
1261
+ "model.layers.38.mlp.shared_experts.gate_proj": {
1262
+ "bits": 4,
1263
+ "group_size": 128
1264
+ },
1265
+ "model.layers.38.mlp.shared_experts.up_proj": {
1266
+ "bits": 4,
1267
+ "group_size": 128
1268
+ },
1269
+ "model.layers.38.mlp.shared_experts.down_proj": {
1270
+ "bits": 4,
1271
+ "group_size": 128
1272
+ },
1273
+ "model.layers.39.self_attn.q_a_proj": {
1274
+ "bits": 4,
1275
+ "group_size": 128
1276
+ },
1277
+ "model.layers.39.self_attn.q_b_proj": {
1278
+ "bits": 4,
1279
+ "group_size": 128
1280
+ },
1281
+ "model.layers.39.self_attn.kv_a_proj_with_mqa": {
1282
+ "bits": 4,
1283
+ "group_size": 128
1284
+ },
1285
+ "model.layers.39.self_attn.kv_b_proj": {
1286
+ "bits": 4,
1287
+ "group_size": 128
1288
+ },
1289
+ "model.layers.39.self_attn.o_proj": {
1290
+ "bits": 4,
1291
+ "group_size": 128
1292
+ },
1293
+ "model.layers.39.mlp.shared_experts.gate_proj": {
1294
+ "bits": 4,
1295
+ "group_size": 128
1296
+ },
1297
+ "model.layers.39.mlp.shared_experts.up_proj": {
1298
+ "bits": 4,
1299
+ "group_size": 128
1300
+ },
1301
+ "model.layers.39.mlp.shared_experts.down_proj": {
1302
+ "bits": 4,
1303
+ "group_size": 128
1304
+ },
1305
+ "model.layers.40.self_attn.q_a_proj": {
1306
+ "bits": 4,
1307
+ "group_size": 128
1308
+ },
1309
+ "model.layers.40.self_attn.q_b_proj": {
1310
+ "bits": 4,
1311
+ "group_size": 128
1312
+ },
1313
+ "model.layers.40.self_attn.kv_a_proj_with_mqa": {
1314
+ "bits": 4,
1315
+ "group_size": 128
1316
+ },
1317
+ "model.layers.40.self_attn.kv_b_proj": {
1318
+ "bits": 4,
1319
+ "group_size": 128
1320
+ },
1321
+ "model.layers.40.self_attn.o_proj": {
1322
+ "bits": 4,
1323
+ "group_size": 128
1324
+ },
1325
+ "model.layers.40.mlp.shared_experts.gate_proj": {
1326
+ "bits": 4,
1327
+ "group_size": 128
1328
+ },
1329
+ "model.layers.40.mlp.shared_experts.up_proj": {
1330
+ "bits": 4,
1331
+ "group_size": 128
1332
+ },
1333
+ "model.layers.40.mlp.shared_experts.down_proj": {
1334
+ "bits": 4,
1335
+ "group_size": 128
1336
+ },
1337
+ "model.layers.41.self_attn.q_a_proj": {
1338
+ "bits": 4,
1339
+ "group_size": 128
1340
+ },
1341
+ "model.layers.41.self_attn.q_b_proj": {
1342
+ "bits": 4,
1343
+ "group_size": 128
1344
+ },
1345
+ "model.layers.41.self_attn.kv_a_proj_with_mqa": {
1346
+ "bits": 4,
1347
+ "group_size": 128
1348
+ },
1349
+ "model.layers.41.self_attn.kv_b_proj": {
1350
+ "bits": 4,
1351
+ "group_size": 128
1352
+ },
1353
+ "model.layers.41.self_attn.o_proj": {
1354
+ "bits": 4,
1355
+ "group_size": 128
1356
+ },
1357
+ "model.layers.41.mlp.shared_experts.gate_proj": {
1358
+ "bits": 4,
1359
+ "group_size": 128
1360
+ },
1361
+ "model.layers.41.mlp.shared_experts.up_proj": {
1362
+ "bits": 4,
1363
+ "group_size": 128
1364
+ },
1365
+ "model.layers.41.mlp.shared_experts.down_proj": {
1366
+ "bits": 4,
1367
+ "group_size": 128
1368
+ },
1369
+ "model.layers.42.self_attn.q_a_proj": {
1370
+ "bits": 4,
1371
+ "group_size": 128
1372
+ },
1373
+ "model.layers.42.self_attn.q_b_proj": {
1374
+ "bits": 4,
1375
+ "group_size": 128
1376
+ },
1377
+ "model.layers.42.self_attn.kv_a_proj_with_mqa": {
1378
+ "bits": 4,
1379
+ "group_size": 128
1380
+ },
1381
+ "model.layers.42.self_attn.kv_b_proj": {
1382
+ "bits": 4,
1383
+ "group_size": 128
1384
+ },
1385
+ "model.layers.42.self_attn.o_proj": {
1386
+ "bits": 4,
1387
+ "group_size": 128
1388
+ },
1389
+ "model.layers.42.mlp.shared_experts.gate_proj": {
1390
+ "bits": 4,
1391
+ "group_size": 128
1392
+ },
1393
+ "model.layers.42.mlp.shared_experts.up_proj": {
1394
+ "bits": 4,
1395
+ "group_size": 128
1396
+ },
1397
+ "model.layers.42.mlp.shared_experts.down_proj": {
1398
+ "bits": 4,
1399
+ "group_size": 128
1400
+ },
1401
+ "model.layers.43.self_attn.q_a_proj": {
1402
+ "bits": 4,
1403
+ "group_size": 128
1404
+ },
1405
+ "model.layers.43.self_attn.q_b_proj": {
1406
+ "bits": 4,
1407
+ "group_size": 128
1408
+ },
1409
+ "model.layers.43.self_attn.kv_a_proj_with_mqa": {
1410
+ "bits": 4,
1411
+ "group_size": 128
1412
+ },
1413
+ "model.layers.43.self_attn.kv_b_proj": {
1414
+ "bits": 4,
1415
+ "group_size": 128
1416
+ },
1417
+ "model.layers.43.self_attn.o_proj": {
1418
+ "bits": 4,
1419
+ "group_size": 128
1420
+ },
1421
+ "model.layers.43.mlp.shared_experts.gate_proj": {
1422
+ "bits": 4,
1423
+ "group_size": 128
1424
+ },
1425
+ "model.layers.43.mlp.shared_experts.up_proj": {
1426
+ "bits": 4,
1427
+ "group_size": 128
1428
+ },
1429
+ "model.layers.43.mlp.shared_experts.down_proj": {
1430
+ "bits": 4,
1431
+ "group_size": 128
1432
+ },
1433
+ "model.layers.44.self_attn.q_a_proj": {
1434
+ "bits": 4,
1435
+ "group_size": 128
1436
+ },
1437
+ "model.layers.44.self_attn.q_b_proj": {
1438
+ "bits": 4,
1439
+ "group_size": 128
1440
+ },
1441
+ "model.layers.44.self_attn.kv_a_proj_with_mqa": {
1442
+ "bits": 4,
1443
+ "group_size": 128
1444
+ },
1445
+ "model.layers.44.self_attn.kv_b_proj": {
1446
+ "bits": 4,
1447
+ "group_size": 128
1448
+ },
1449
+ "model.layers.44.self_attn.o_proj": {
1450
+ "bits": 4,
1451
+ "group_size": 128
1452
+ },
1453
+ "model.layers.44.mlp.shared_experts.gate_proj": {
1454
+ "bits": 4,
1455
+ "group_size": 128
1456
+ },
1457
+ "model.layers.44.mlp.shared_experts.up_proj": {
1458
+ "bits": 4,
1459
+ "group_size": 128
1460
+ },
1461
+ "model.layers.44.mlp.shared_experts.down_proj": {
1462
+ "bits": 4,
1463
+ "group_size": 128
1464
+ },
1465
+ "model.layers.45.self_attn.q_a_proj": {
1466
+ "bits": 4,
1467
+ "group_size": 128
1468
+ },
1469
+ "model.layers.45.self_attn.q_b_proj": {
1470
+ "bits": 4,
1471
+ "group_size": 128
1472
+ },
1473
+ "model.layers.45.self_attn.kv_a_proj_with_mqa": {
1474
+ "bits": 4,
1475
+ "group_size": 128
1476
+ },
1477
+ "model.layers.45.self_attn.kv_b_proj": {
1478
+ "bits": 4,
1479
+ "group_size": 128
1480
+ },
1481
+ "model.layers.45.self_attn.o_proj": {
1482
+ "bits": 4,
1483
+ "group_size": 128
1484
+ },
1485
+ "model.layers.45.mlp.shared_experts.gate_proj": {
1486
+ "bits": 4,
1487
+ "group_size": 128
1488
+ },
1489
+ "model.layers.45.mlp.shared_experts.up_proj": {
1490
+ "bits": 4,
1491
+ "group_size": 128
1492
+ },
1493
+ "model.layers.45.mlp.shared_experts.down_proj": {
1494
+ "bits": 4,
1495
+ "group_size": 128
1496
+ },
1497
+ "model.layers.46.self_attn.q_a_proj": {
1498
+ "bits": 4,
1499
+ "group_size": 128
1500
+ },
1501
+ "model.layers.46.self_attn.q_b_proj": {
1502
+ "bits": 4,
1503
+ "group_size": 128
1504
+ },
1505
+ "model.layers.46.self_attn.kv_a_proj_with_mqa": {
1506
+ "bits": 4,
1507
+ "group_size": 128
1508
+ },
1509
+ "model.layers.46.self_attn.kv_b_proj": {
1510
+ "bits": 4,
1511
+ "group_size": 128
1512
+ },
1513
+ "model.layers.46.self_attn.o_proj": {
1514
+ "bits": 4,
1515
+ "group_size": 128
1516
+ },
1517
+ "model.layers.46.mlp.shared_experts.gate_proj": {
1518
+ "bits": 4,
1519
+ "group_size": 128
1520
+ },
1521
+ "model.layers.46.mlp.shared_experts.up_proj": {
1522
+ "bits": 4,
1523
+ "group_size": 128
1524
+ },
1525
+ "model.layers.46.mlp.shared_experts.down_proj": {
1526
+ "bits": 4,
1527
+ "group_size": 128
1528
+ },
1529
+ "model.layers.47.self_attn.q_a_proj": {
1530
+ "bits": 4,
1531
+ "group_size": 128
1532
+ },
1533
+ "model.layers.47.self_attn.q_b_proj": {
1534
+ "bits": 4,
1535
+ "group_size": 128
1536
+ },
1537
+ "model.layers.47.self_attn.kv_a_proj_with_mqa": {
1538
+ "bits": 4,
1539
+ "group_size": 128
1540
+ },
1541
+ "model.layers.47.self_attn.kv_b_proj": {
1542
+ "bits": 4,
1543
+ "group_size": 128
1544
+ },
1545
+ "model.layers.47.self_attn.o_proj": {
1546
+ "bits": 4,
1547
+ "group_size": 128
1548
+ },
1549
+ "model.layers.47.mlp.shared_experts.gate_proj": {
1550
+ "bits": 4,
1551
+ "group_size": 128
1552
+ },
1553
+ "model.layers.47.mlp.shared_experts.up_proj": {
1554
+ "bits": 4,
1555
+ "group_size": 128
1556
+ },
1557
+ "model.layers.47.mlp.shared_experts.down_proj": {
1558
+ "bits": 4,
1559
+ "group_size": 128
1560
+ },
1561
+ "model.layers.48.self_attn.q_a_proj": {
1562
+ "bits": 4,
1563
+ "group_size": 128
1564
+ },
1565
+ "model.layers.48.self_attn.q_b_proj": {
1566
+ "bits": 4,
1567
+ "group_size": 128
1568
+ },
1569
+ "model.layers.48.self_attn.kv_a_proj_with_mqa": {
1570
+ "bits": 4,
1571
+ "group_size": 128
1572
+ },
1573
+ "model.layers.48.self_attn.kv_b_proj": {
1574
+ "bits": 4,
1575
+ "group_size": 128
1576
+ },
1577
+ "model.layers.48.self_attn.o_proj": {
1578
+ "bits": 4,
1579
+ "group_size": 128
1580
+ },
1581
+ "model.layers.48.mlp.shared_experts.gate_proj": {
1582
+ "bits": 4,
1583
+ "group_size": 128
1584
+ },
1585
+ "model.layers.48.mlp.shared_experts.up_proj": {
1586
+ "bits": 4,
1587
+ "group_size": 128
1588
+ },
1589
+ "model.layers.48.mlp.shared_experts.down_proj": {
1590
+ "bits": 4,
1591
+ "group_size": 128
1592
+ },
1593
+ "model.layers.49.self_attn.q_a_proj": {
1594
+ "bits": 4,
1595
+ "group_size": 128
1596
+ },
1597
+ "model.layers.49.self_attn.q_b_proj": {
1598
+ "bits": 4,
1599
+ "group_size": 128
1600
+ },
1601
+ "model.layers.49.self_attn.kv_a_proj_with_mqa": {
1602
+ "bits": 4,
1603
+ "group_size": 128
1604
+ },
1605
+ "model.layers.49.self_attn.kv_b_proj": {
1606
+ "bits": 4,
1607
+ "group_size": 128
1608
+ },
1609
+ "model.layers.49.self_attn.o_proj": {
1610
+ "bits": 4,
1611
+ "group_size": 128
1612
+ },
1613
+ "model.layers.49.mlp.shared_experts.gate_proj": {
1614
+ "bits": 4,
1615
+ "group_size": 128
1616
+ },
1617
+ "model.layers.49.mlp.shared_experts.up_proj": {
1618
+ "bits": 4,
1619
+ "group_size": 128
1620
+ },
1621
+ "model.layers.49.mlp.shared_experts.down_proj": {
1622
+ "bits": 4,
1623
+ "group_size": 128
1624
+ },
1625
+ "model.layers.50.self_attn.q_a_proj": {
1626
+ "bits": 4,
1627
+ "group_size": 128
1628
+ },
1629
+ "model.layers.50.self_attn.q_b_proj": {
1630
+ "bits": 4,
1631
+ "group_size": 128
1632
+ },
1633
+ "model.layers.50.self_attn.kv_a_proj_with_mqa": {
1634
+ "bits": 4,
1635
+ "group_size": 128
1636
+ },
1637
+ "model.layers.50.self_attn.kv_b_proj": {
1638
+ "bits": 4,
1639
+ "group_size": 128
1640
+ },
1641
+ "model.layers.50.self_attn.o_proj": {
1642
+ "bits": 4,
1643
+ "group_size": 128
1644
+ },
1645
+ "model.layers.50.mlp.shared_experts.gate_proj": {
1646
+ "bits": 4,
1647
+ "group_size": 128
1648
+ },
1649
+ "model.layers.50.mlp.shared_experts.up_proj": {
1650
+ "bits": 4,
1651
+ "group_size": 128
1652
+ },
1653
+ "model.layers.50.mlp.shared_experts.down_proj": {
1654
+ "bits": 4,
1655
+ "group_size": 128
1656
+ },
1657
+ "model.layers.51.self_attn.q_a_proj": {
1658
+ "bits": 4,
1659
+ "group_size": 128
1660
+ },
1661
+ "model.layers.51.self_attn.q_b_proj": {
1662
+ "bits": 4,
1663
+ "group_size": 128
1664
+ },
1665
+ "model.layers.51.self_attn.kv_a_proj_with_mqa": {
1666
+ "bits": 4,
1667
+ "group_size": 128
1668
+ },
1669
+ "model.layers.51.self_attn.kv_b_proj": {
1670
+ "bits": 4,
1671
+ "group_size": 128
1672
+ },
1673
+ "model.layers.51.self_attn.o_proj": {
1674
+ "bits": 4,
1675
+ "group_size": 128
1676
+ },
1677
+ "model.layers.51.mlp.shared_experts.gate_proj": {
1678
+ "bits": 4,
1679
+ "group_size": 128
1680
+ },
1681
+ "model.layers.51.mlp.shared_experts.up_proj": {
1682
+ "bits": 4,
1683
+ "group_size": 128
1684
+ },
1685
+ "model.layers.51.mlp.shared_experts.down_proj": {
1686
+ "bits": 4,
1687
+ "group_size": 128
1688
+ },
1689
+ "model.layers.52.self_attn.q_a_proj": {
1690
+ "bits": 4,
1691
+ "group_size": 128
1692
+ },
1693
+ "model.layers.52.self_attn.q_b_proj": {
1694
+ "bits": 4,
1695
+ "group_size": 128
1696
+ },
1697
+ "model.layers.52.self_attn.kv_a_proj_with_mqa": {
1698
+ "bits": 4,
1699
+ "group_size": 128
1700
+ },
1701
+ "model.layers.52.self_attn.kv_b_proj": {
1702
+ "bits": 4,
1703
+ "group_size": 128
1704
+ },
1705
+ "model.layers.52.self_attn.o_proj": {
1706
+ "bits": 4,
1707
+ "group_size": 128
1708
+ },
1709
+ "model.layers.52.mlp.shared_experts.gate_proj": {
1710
+ "bits": 4,
1711
+ "group_size": 128
1712
+ },
1713
+ "model.layers.52.mlp.shared_experts.up_proj": {
1714
+ "bits": 4,
1715
+ "group_size": 128
1716
+ },
1717
+ "model.layers.52.mlp.shared_experts.down_proj": {
1718
+ "bits": 4,
1719
+ "group_size": 128
1720
+ },
1721
+ "model.layers.53.self_attn.q_a_proj": {
1722
+ "bits": 4,
1723
+ "group_size": 128
1724
+ },
1725
+ "model.layers.53.self_attn.q_b_proj": {
1726
+ "bits": 4,
1727
+ "group_size": 128
1728
+ },
1729
+ "model.layers.53.self_attn.kv_a_proj_with_mqa": {
1730
+ "bits": 4,
1731
+ "group_size": 128
1732
+ },
1733
+ "model.layers.53.self_attn.kv_b_proj": {
1734
+ "bits": 4,
1735
+ "group_size": 128
1736
+ },
1737
+ "model.layers.53.self_attn.o_proj": {
1738
+ "bits": 4,
1739
+ "group_size": 128
1740
+ },
1741
+ "model.layers.53.mlp.shared_experts.gate_proj": {
1742
+ "bits": 4,
1743
+ "group_size": 128
1744
+ },
1745
+ "model.layers.53.mlp.shared_experts.up_proj": {
1746
+ "bits": 4,
1747
+ "group_size": 128
1748
+ },
1749
+ "model.layers.53.mlp.shared_experts.down_proj": {
1750
+ "bits": 4,
1751
+ "group_size": 128
1752
+ },
1753
+ "model.layers.54.self_attn.q_a_proj": {
1754
+ "bits": 4,
1755
+ "group_size": 128
1756
+ },
1757
+ "model.layers.54.self_attn.q_b_proj": {
1758
+ "bits": 4,
1759
+ "group_size": 128
1760
+ },
1761
+ "model.layers.54.self_attn.kv_a_proj_with_mqa": {
1762
+ "bits": 4,
1763
+ "group_size": 128
1764
+ },
1765
+ "model.layers.54.self_attn.kv_b_proj": {
1766
+ "bits": 4,
1767
+ "group_size": 128
1768
+ },
1769
+ "model.layers.54.self_attn.o_proj": {
1770
+ "bits": 4,
1771
+ "group_size": 128
1772
+ },
1773
+ "model.layers.54.mlp.shared_experts.gate_proj": {
1774
+ "bits": 4,
1775
+ "group_size": 128
1776
+ },
1777
+ "model.layers.54.mlp.shared_experts.up_proj": {
1778
+ "bits": 4,
1779
+ "group_size": 128
1780
+ },
1781
+ "model.layers.54.mlp.shared_experts.down_proj": {
1782
+ "bits": 4,
1783
+ "group_size": 128
1784
+ },
1785
+ "model.layers.55.self_attn.q_a_proj": {
1786
+ "bits": 4,
1787
+ "group_size": 128
1788
+ },
1789
+ "model.layers.55.self_attn.q_b_proj": {
1790
+ "bits": 4,
1791
+ "group_size": 128
1792
+ },
1793
+ "model.layers.55.self_attn.kv_a_proj_with_mqa": {
1794
+ "bits": 4,
1795
+ "group_size": 128
1796
+ },
1797
+ "model.layers.55.self_attn.kv_b_proj": {
1798
+ "bits": 4,
1799
+ "group_size": 128
1800
+ },
1801
+ "model.layers.55.self_attn.o_proj": {
1802
+ "bits": 4,
1803
+ "group_size": 128
1804
+ },
1805
+ "model.layers.55.mlp.shared_experts.gate_proj": {
1806
+ "bits": 4,
1807
+ "group_size": 128
1808
+ },
1809
+ "model.layers.55.mlp.shared_experts.up_proj": {
1810
+ "bits": 4,
1811
+ "group_size": 128
1812
+ },
1813
+ "model.layers.55.mlp.shared_experts.down_proj": {
1814
+ "bits": 4,
1815
+ "group_size": 128
1816
+ },
1817
+ "model.layers.56.self_attn.q_a_proj": {
1818
+ "bits": 4,
1819
+ "group_size": 128
1820
+ },
1821
+ "model.layers.56.self_attn.q_b_proj": {
1822
+ "bits": 4,
1823
+ "group_size": 128
1824
+ },
1825
+ "model.layers.56.self_attn.kv_a_proj_with_mqa": {
1826
+ "bits": 4,
1827
+ "group_size": 128
1828
+ },
1829
+ "model.layers.56.self_attn.kv_b_proj": {
1830
+ "bits": 4,
1831
+ "group_size": 128
1832
+ },
1833
+ "model.layers.56.self_attn.o_proj": {
1834
+ "bits": 4,
1835
+ "group_size": 128
1836
+ },
1837
+ "model.layers.56.mlp.shared_experts.gate_proj": {
1838
+ "bits": 4,
1839
+ "group_size": 128
1840
+ },
1841
+ "model.layers.56.mlp.shared_experts.up_proj": {
1842
+ "bits": 4,
1843
+ "group_size": 128
1844
+ },
1845
+ "model.layers.56.mlp.shared_experts.down_proj": {
1846
+ "bits": 4,
1847
+ "group_size": 128
1848
+ },
1849
+ "model.layers.57.self_attn.q_a_proj": {
1850
+ "bits": 4,
1851
+ "group_size": 128
1852
+ },
1853
+ "model.layers.57.self_attn.q_b_proj": {
1854
+ "bits": 4,
1855
+ "group_size": 128
1856
+ },
1857
+ "model.layers.57.self_attn.kv_a_proj_with_mqa": {
1858
+ "bits": 4,
1859
+ "group_size": 128
1860
+ },
1861
+ "model.layers.57.self_attn.kv_b_proj": {
1862
+ "bits": 4,
1863
+ "group_size": 128
1864
+ },
1865
+ "model.layers.57.self_attn.o_proj": {
1866
+ "bits": 4,
1867
+ "group_size": 128
1868
+ },
1869
+ "model.layers.57.mlp.shared_experts.gate_proj": {
1870
+ "bits": 4,
1871
+ "group_size": 128
1872
+ },
1873
+ "model.layers.57.mlp.shared_experts.up_proj": {
1874
+ "bits": 4,
1875
+ "group_size": 128
1876
+ },
1877
+ "model.layers.57.mlp.shared_experts.down_proj": {
1878
+ "bits": 4,
1879
+ "group_size": 128
1880
+ },
1881
+ "model.layers.58.self_attn.q_a_proj": {
1882
+ "bits": 4,
1883
+ "group_size": 128
1884
+ },
1885
+ "model.layers.58.self_attn.q_b_proj": {
1886
+ "bits": 4,
1887
+ "group_size": 128
1888
+ },
1889
+ "model.layers.58.self_attn.kv_a_proj_with_mqa": {
1890
+ "bits": 4,
1891
+ "group_size": 128
1892
+ },
1893
+ "model.layers.58.self_attn.kv_b_proj": {
1894
+ "bits": 4,
1895
+ "group_size": 128
1896
+ },
1897
+ "model.layers.58.self_attn.o_proj": {
1898
+ "bits": 4,
1899
+ "group_size": 128
1900
+ },
1901
+ "model.layers.58.mlp.shared_experts.gate_proj": {
1902
+ "bits": 4,
1903
+ "group_size": 128
1904
+ },
1905
+ "model.layers.58.mlp.shared_experts.up_proj": {
1906
+ "bits": 4,
1907
+ "group_size": 128
1908
+ },
1909
+ "model.layers.58.mlp.shared_experts.down_proj": {
1910
+ "bits": 4,
1911
+ "group_size": 128
1912
+ },
1913
+ "model.layers.59.self_attn.q_a_proj": {
1914
+ "bits": 4,
1915
+ "group_size": 128
1916
+ },
1917
+ "model.layers.59.self_attn.q_b_proj": {
1918
+ "bits": 4,
1919
+ "group_size": 128
1920
+ },
1921
+ "model.layers.59.self_attn.kv_a_proj_with_mqa": {
1922
+ "bits": 4,
1923
+ "group_size": 128
1924
+ },
1925
+ "model.layers.59.self_attn.kv_b_proj": {
1926
+ "bits": 4,
1927
+ "group_size": 128
1928
+ },
1929
+ "model.layers.59.self_attn.o_proj": {
1930
+ "bits": 4,
1931
+ "group_size": 128
1932
+ },
1933
+ "model.layers.59.mlp.shared_experts.gate_proj": {
1934
+ "bits": 4,
1935
+ "group_size": 128
1936
+ },
1937
+ "model.layers.59.mlp.shared_experts.up_proj": {
1938
+ "bits": 4,
1939
+ "group_size": 128
1940
+ },
1941
+ "model.layers.59.mlp.shared_experts.down_proj": {
1942
+ "bits": 4,
1943
+ "group_size": 128
1944
+ },
1945
+ "model.layers.60.self_attn.q_a_proj": {
1946
+ "bits": 4,
1947
+ "group_size": 128
1948
+ },
1949
+ "model.layers.60.self_attn.q_b_proj": {
1950
+ "bits": 4,
1951
+ "group_size": 128
1952
+ },
1953
+ "model.layers.60.self_attn.kv_a_proj_with_mqa": {
1954
+ "bits": 4,
1955
+ "group_size": 128
1956
+ },
1957
+ "model.layers.60.self_attn.kv_b_proj": {
1958
+ "bits": 4,
1959
+ "group_size": 128
1960
+ },
1961
+ "model.layers.60.self_attn.o_proj": {
1962
+ "bits": 4,
1963
+ "group_size": 128
1964
+ },
1965
+ "model.layers.60.mlp.experts.0.down_proj": {
1966
+ "data_type": "bfloat",
1967
+ "bits": 16
1968
+ },
1969
+ "model.layers.60.mlp.experts.1.down_proj": {
1970
+ "data_type": "bfloat",
1971
+ "bits": 16
1972
+ },
1973
+ "model.layers.60.mlp.experts.2.down_proj": {
1974
+ "data_type": "bfloat",
1975
+ "bits": 16
1976
+ },
1977
+ "model.layers.60.mlp.experts.3.down_proj": {
1978
+ "data_type": "bfloat",
1979
+ "bits": 16
1980
+ },
1981
+ "model.layers.60.mlp.experts.4.down_proj": {
1982
+ "data_type": "bfloat",
1983
+ "bits": 16
1984
+ },
1985
+ "model.layers.60.mlp.experts.5.down_proj": {
1986
+ "data_type": "bfloat",
1987
+ "bits": 16
1988
+ },
1989
+ "model.layers.60.mlp.experts.6.down_proj": {
1990
+ "data_type": "bfloat",
1991
+ "bits": 16
1992
+ },
1993
+ "model.layers.60.mlp.experts.7.down_proj": {
1994
+ "data_type": "bfloat",
1995
+ "bits": 16
1996
+ },
1997
+ "model.layers.60.mlp.experts.8.down_proj": {
1998
+ "data_type": "bfloat",
1999
+ "bits": 16
2000
+ },
2001
+ "model.layers.60.mlp.experts.9.down_proj": {
2002
+ "data_type": "bfloat",
2003
+ "bits": 16
2004
+ },
2005
+ "model.layers.60.mlp.experts.10.down_proj": {
2006
+ "data_type": "bfloat",
2007
+ "bits": 16
2008
+ },
2009
+ "model.layers.60.mlp.experts.11.down_proj": {
2010
+ "data_type": "bfloat",
2011
+ "bits": 16
2012
+ },
2013
+ "model.layers.60.mlp.experts.12.down_proj": {
2014
+ "data_type": "bfloat",
2015
+ "bits": 16
2016
+ },
2017
+ "model.layers.60.mlp.experts.13.down_proj": {
2018
+ "data_type": "bfloat",
2019
+ "bits": 16
2020
+ },
2021
+ "model.layers.60.mlp.experts.14.down_proj": {
2022
+ "data_type": "bfloat",
2023
+ "bits": 16
2024
+ },
2025
+ "model.layers.60.mlp.experts.15.down_proj": {
2026
+ "data_type": "bfloat",
2027
+ "bits": 16
2028
+ },
2029
+ "model.layers.60.mlp.experts.16.down_proj": {
2030
+ "data_type": "bfloat",
2031
+ "bits": 16
2032
+ },
2033
+ "model.layers.60.mlp.experts.17.down_proj": {
2034
+ "data_type": "bfloat",
2035
+ "bits": 16
2036
+ },
2037
+ "model.layers.60.mlp.experts.18.down_proj": {
2038
+ "data_type": "bfloat",
2039
+ "bits": 16
2040
+ },
2041
+ "model.layers.60.mlp.experts.19.down_proj": {
2042
+ "data_type": "bfloat",
2043
+ "bits": 16
2044
+ },
2045
+ "model.layers.60.mlp.experts.20.down_proj": {
2046
+ "data_type": "bfloat",
2047
+ "bits": 16
2048
+ },
2049
+ "model.layers.60.mlp.experts.21.down_proj": {
2050
+ "data_type": "bfloat",
2051
+ "bits": 16
2052
+ },
2053
+ "model.layers.60.mlp.experts.22.down_proj": {
2054
+ "data_type": "bfloat",
2055
+ "bits": 16
2056
+ },
2057
+ "model.layers.60.mlp.experts.23.down_proj": {
2058
+ "data_type": "bfloat",
2059
+ "bits": 16
2060
+ },
2061
+ "model.layers.60.mlp.experts.24.down_proj": {
2062
+ "data_type": "bfloat",
2063
+ "bits": 16
2064
+ },
2065
+ "model.layers.60.mlp.experts.25.down_proj": {
2066
+ "data_type": "bfloat",
2067
+ "bits": 16
2068
+ },
2069
+ "model.layers.60.mlp.experts.26.down_proj": {
2070
+ "data_type": "bfloat",
2071
+ "bits": 16
2072
+ },
2073
+ "model.layers.60.mlp.experts.27.down_proj": {
2074
+ "data_type": "bfloat",
2075
+ "bits": 16
2076
+ },
2077
+ "model.layers.60.mlp.experts.28.down_proj": {
2078
+ "data_type": "bfloat",
2079
+ "bits": 16
2080
+ },
2081
+ "model.layers.60.mlp.experts.29.down_proj": {
2082
+ "data_type": "bfloat",
2083
+ "bits": 16
2084
+ },
2085
+ "model.layers.60.mlp.experts.30.down_proj": {
2086
+ "data_type": "bfloat",
2087
+ "bits": 16
2088
+ },
2089
+ "model.layers.60.mlp.experts.31.down_proj": {
2090
+ "data_type": "bfloat",
2091
+ "bits": 16
2092
+ },
2093
+ "model.layers.60.mlp.experts.32.down_proj": {
2094
+ "data_type": "bfloat",
2095
+ "bits": 16
2096
+ },
2097
+ "model.layers.60.mlp.experts.33.down_proj": {
2098
+ "data_type": "bfloat",
2099
+ "bits": 16
2100
+ },
2101
+ "model.layers.60.mlp.experts.34.down_proj": {
2102
+ "data_type": "bfloat",
2103
+ "bits": 16
2104
+ },
2105
+ "model.layers.60.mlp.experts.35.down_proj": {
2106
+ "data_type": "bfloat",
2107
+ "bits": 16
2108
+ },
2109
+ "model.layers.60.mlp.experts.36.down_proj": {
2110
+ "data_type": "bfloat",
2111
+ "bits": 16
2112
+ },
2113
+ "model.layers.60.mlp.experts.37.down_proj": {
2114
+ "data_type": "bfloat",
2115
+ "bits": 16
2116
+ },
2117
+ "model.layers.60.mlp.experts.38.down_proj": {
2118
+ "data_type": "bfloat",
2119
+ "bits": 16
2120
+ },
2121
+ "model.layers.60.mlp.experts.39.down_proj": {
2122
+ "data_type": "bfloat",
2123
+ "bits": 16
2124
+ },
2125
+ "model.layers.60.mlp.experts.40.down_proj": {
2126
+ "data_type": "bfloat",
2127
+ "bits": 16
2128
+ },
2129
+ "model.layers.60.mlp.experts.41.down_proj": {
2130
+ "data_type": "bfloat",
2131
+ "bits": 16
2132
+ },
2133
+ "model.layers.60.mlp.experts.42.down_proj": {
2134
+ "data_type": "bfloat",
2135
+ "bits": 16
2136
+ },
2137
+ "model.layers.60.mlp.experts.43.down_proj": {
2138
+ "data_type": "bfloat",
2139
+ "bits": 16
2140
+ },
2141
+ "model.layers.60.mlp.experts.44.down_proj": {
2142
+ "data_type": "bfloat",
2143
+ "bits": 16
2144
+ },
2145
+ "model.layers.60.mlp.experts.45.down_proj": {
2146
+ "data_type": "bfloat",
2147
+ "bits": 16
2148
+ },
2149
+ "model.layers.60.mlp.experts.46.down_proj": {
2150
+ "data_type": "bfloat",
2151
+ "bits": 16
2152
+ },
2153
+ "model.layers.60.mlp.experts.47.down_proj": {
2154
+ "data_type": "bfloat",
2155
+ "bits": 16
2156
+ },
2157
+ "model.layers.60.mlp.experts.48.down_proj": {
2158
+ "data_type": "bfloat",
2159
+ "bits": 16
2160
+ },
2161
+ "model.layers.60.mlp.experts.49.down_proj": {
2162
+ "data_type": "bfloat",
2163
+ "bits": 16
2164
+ },
2165
+ "model.layers.60.mlp.experts.50.down_proj": {
2166
+ "data_type": "bfloat",
2167
+ "bits": 16
2168
+ },
2169
+ "model.layers.60.mlp.experts.51.down_proj": {
2170
+ "data_type": "bfloat",
2171
+ "bits": 16
2172
+ },
2173
+ "model.layers.60.mlp.experts.52.down_proj": {
2174
+ "data_type": "bfloat",
2175
+ "bits": 16
2176
+ },
2177
+ "model.layers.60.mlp.experts.53.down_proj": {
2178
+ "data_type": "bfloat",
2179
+ "bits": 16
2180
+ },
2181
+ "model.layers.60.mlp.experts.54.down_proj": {
2182
+ "data_type": "bfloat",
2183
+ "bits": 16
2184
+ },
2185
+ "model.layers.60.mlp.experts.55.down_proj": {
2186
+ "data_type": "bfloat",
2187
+ "bits": 16
2188
+ },
2189
+ "model.layers.60.mlp.experts.56.down_proj": {
2190
+ "data_type": "bfloat",
2191
+ "bits": 16
2192
+ },
2193
+ "model.layers.60.mlp.experts.57.down_proj": {
2194
+ "data_type": "bfloat",
2195
+ "bits": 16
2196
+ },
2197
+ "model.layers.60.mlp.experts.58.down_proj": {
2198
+ "data_type": "bfloat",
2199
+ "bits": 16
2200
+ },
2201
+ "model.layers.60.mlp.experts.59.down_proj": {
2202
+ "data_type": "bfloat",
2203
+ "bits": 16
2204
+ },
2205
+ "model.layers.60.mlp.experts.60.down_proj": {
2206
+ "data_type": "bfloat",
2207
+ "bits": 16
2208
+ },
2209
+ "model.layers.60.mlp.experts.61.down_proj": {
2210
+ "data_type": "bfloat",
2211
+ "bits": 16
2212
+ },
2213
+ "model.layers.60.mlp.experts.62.down_proj": {
2214
+ "data_type": "bfloat",
2215
+ "bits": 16
2216
+ },
2217
+ "model.layers.60.mlp.experts.63.down_proj": {
2218
+ "data_type": "bfloat",
2219
+ "bits": 16
2220
+ },
2221
+ "model.layers.60.mlp.experts.64.down_proj": {
2222
+ "data_type": "bfloat",
2223
+ "bits": 16
2224
+ },
2225
+ "model.layers.60.mlp.experts.65.down_proj": {
2226
+ "data_type": "bfloat",
2227
+ "bits": 16
2228
+ },
2229
+ "model.layers.60.mlp.experts.66.down_proj": {
2230
+ "data_type": "bfloat",
2231
+ "bits": 16
2232
+ },
2233
+ "model.layers.60.mlp.experts.67.down_proj": {
2234
+ "data_type": "bfloat",
2235
+ "bits": 16
2236
+ },
2237
+ "model.layers.60.mlp.experts.68.down_proj": {
2238
+ "data_type": "bfloat",
2239
+ "bits": 16
2240
+ },
2241
+ "model.layers.60.mlp.experts.69.down_proj": {
2242
+ "data_type": "bfloat",
2243
+ "bits": 16
2244
+ },
2245
+ "model.layers.60.mlp.experts.70.down_proj": {
2246
+ "data_type": "bfloat",
2247
+ "bits": 16
2248
+ },
2249
+ "model.layers.60.mlp.experts.71.down_proj": {
2250
+ "data_type": "bfloat",
2251
+ "bits": 16
2252
+ },
2253
+ "model.layers.60.mlp.experts.72.down_proj": {
2254
+ "data_type": "bfloat",
2255
+ "bits": 16
2256
+ },
2257
+ "model.layers.60.mlp.experts.73.down_proj": {
2258
+ "data_type": "bfloat",
2259
+ "bits": 16
2260
+ },
2261
+ "model.layers.60.mlp.experts.74.down_proj": {
2262
+ "data_type": "bfloat",
2263
+ "bits": 16
2264
+ },
2265
+ "model.layers.60.mlp.experts.75.down_proj": {
2266
+ "data_type": "bfloat",
2267
+ "bits": 16
2268
+ },
2269
+ "model.layers.60.mlp.experts.76.down_proj": {
2270
+ "data_type": "bfloat",
2271
+ "bits": 16
2272
+ },
2273
+ "model.layers.60.mlp.experts.77.down_proj": {
2274
+ "data_type": "bfloat",
2275
+ "bits": 16
2276
+ },
2277
+ "model.layers.60.mlp.experts.78.down_proj": {
2278
+ "data_type": "bfloat",
2279
+ "bits": 16
2280
+ },
2281
+ "model.layers.60.mlp.experts.79.down_proj": {
2282
+ "data_type": "bfloat",
2283
+ "bits": 16
2284
+ },
2285
+ "model.layers.60.mlp.experts.80.down_proj": {
2286
+ "data_type": "bfloat",
2287
+ "bits": 16
2288
+ },
2289
+ "model.layers.60.mlp.experts.81.down_proj": {
2290
+ "data_type": "bfloat",
2291
+ "bits": 16
2292
+ },
2293
+ "model.layers.60.mlp.experts.82.down_proj": {
2294
+ "data_type": "bfloat",
2295
+ "bits": 16
2296
+ },
2297
+ "model.layers.60.mlp.experts.83.down_proj": {
2298
+ "data_type": "bfloat",
2299
+ "bits": 16
2300
+ },
2301
+ "model.layers.60.mlp.experts.84.down_proj": {
2302
+ "data_type": "bfloat",
2303
+ "bits": 16
2304
+ },
2305
+ "model.layers.60.mlp.experts.85.down_proj": {
2306
+ "data_type": "bfloat",
2307
+ "bits": 16
2308
+ },
2309
+ "model.layers.60.mlp.experts.86.down_proj": {
2310
+ "data_type": "bfloat",
2311
+ "bits": 16
2312
+ },
2313
+ "model.layers.60.mlp.experts.87.down_proj": {
2314
+ "data_type": "bfloat",
2315
+ "bits": 16
2316
+ },
2317
+ "model.layers.60.mlp.experts.88.down_proj": {
2318
+ "data_type": "bfloat",
2319
+ "bits": 16
2320
+ },
2321
+ "model.layers.60.mlp.experts.89.down_proj": {
2322
+ "data_type": "bfloat",
2323
+ "bits": 16
2324
+ },
2325
+ "model.layers.60.mlp.experts.90.down_proj": {
2326
+ "data_type": "bfloat",
2327
+ "bits": 16
2328
+ },
2329
+ "model.layers.60.mlp.experts.91.down_proj": {
2330
+ "data_type": "bfloat",
2331
+ "bits": 16
2332
+ },
2333
+ "model.layers.60.mlp.experts.92.down_proj": {
2334
+ "data_type": "bfloat",
2335
+ "bits": 16
2336
+ },
2337
+ "model.layers.60.mlp.experts.93.down_proj": {
2338
+ "data_type": "bfloat",
2339
+ "bits": 16
2340
+ },
2341
+ "model.layers.60.mlp.experts.94.down_proj": {
2342
+ "data_type": "bfloat",
2343
+ "bits": 16
2344
+ },
2345
+ "model.layers.60.mlp.experts.95.down_proj": {
2346
+ "data_type": "bfloat",
2347
+ "bits": 16
2348
+ },
2349
+ "model.layers.60.mlp.experts.96.down_proj": {
2350
+ "data_type": "bfloat",
2351
+ "bits": 16
2352
+ },
2353
+ "model.layers.60.mlp.experts.97.down_proj": {
2354
+ "data_type": "bfloat",
2355
+ "bits": 16
2356
+ },
2357
+ "model.layers.60.mlp.experts.98.down_proj": {
2358
+ "data_type": "bfloat",
2359
+ "bits": 16
2360
+ },
2361
+ "model.layers.60.mlp.experts.99.down_proj": {
2362
+ "data_type": "bfloat",
2363
+ "bits": 16
2364
+ },
2365
+ "model.layers.60.mlp.experts.100.down_proj": {
2366
+ "data_type": "bfloat",
2367
+ "bits": 16
2368
+ },
2369
+ "model.layers.60.mlp.experts.101.down_proj": {
2370
+ "data_type": "bfloat",
2371
+ "bits": 16
2372
+ },
2373
+ "model.layers.60.mlp.experts.102.down_proj": {
2374
+ "data_type": "bfloat",
2375
+ "bits": 16
2376
+ },
2377
+ "model.layers.60.mlp.experts.103.down_proj": {
2378
+ "data_type": "bfloat",
2379
+ "bits": 16
2380
+ },
2381
+ "model.layers.60.mlp.experts.104.down_proj": {
2382
+ "data_type": "bfloat",
2383
+ "bits": 16
2384
+ },
2385
+ "model.layers.60.mlp.experts.105.down_proj": {
2386
+ "data_type": "bfloat",
2387
+ "bits": 16
2388
+ },
2389
+ "model.layers.60.mlp.experts.106.down_proj": {
2390
+ "data_type": "bfloat",
2391
+ "bits": 16
2392
+ },
2393
+ "model.layers.60.mlp.experts.107.down_proj": {
2394
+ "data_type": "bfloat",
2395
+ "bits": 16
2396
+ },
2397
+ "model.layers.60.mlp.experts.108.down_proj": {
2398
+ "data_type": "bfloat",
2399
+ "bits": 16
2400
+ },
2401
+ "model.layers.60.mlp.experts.109.down_proj": {
2402
+ "data_type": "bfloat",
2403
+ "bits": 16
2404
+ },
2405
+ "model.layers.60.mlp.experts.110.down_proj": {
2406
+ "data_type": "bfloat",
2407
+ "bits": 16
2408
+ },
2409
+ "model.layers.60.mlp.experts.111.down_proj": {
2410
+ "data_type": "bfloat",
2411
+ "bits": 16
2412
+ },
2413
+ "model.layers.60.mlp.experts.112.down_proj": {
2414
+ "data_type": "bfloat",
2415
+ "bits": 16
2416
+ },
2417
+ "model.layers.60.mlp.experts.113.down_proj": {
2418
+ "data_type": "bfloat",
2419
+ "bits": 16
2420
+ },
2421
+ "model.layers.60.mlp.experts.114.down_proj": {
2422
+ "data_type": "bfloat",
2423
+ "bits": 16
2424
+ },
2425
+ "model.layers.60.mlp.experts.115.down_proj": {
2426
+ "data_type": "bfloat",
2427
+ "bits": 16
2428
+ },
2429
+ "model.layers.60.mlp.experts.116.down_proj": {
2430
+ "data_type": "bfloat",
2431
+ "bits": 16
2432
+ },
2433
+ "model.layers.60.mlp.experts.117.down_proj": {
2434
+ "data_type": "bfloat",
2435
+ "bits": 16
2436
+ },
2437
+ "model.layers.60.mlp.experts.118.down_proj": {
2438
+ "data_type": "bfloat",
2439
+ "bits": 16
2440
+ },
2441
+ "model.layers.60.mlp.experts.119.down_proj": {
2442
+ "data_type": "bfloat",
2443
+ "bits": 16
2444
+ },
2445
+ "model.layers.60.mlp.experts.120.down_proj": {
2446
+ "data_type": "bfloat",
2447
+ "bits": 16
2448
+ },
2449
+ "model.layers.60.mlp.experts.121.down_proj": {
2450
+ "data_type": "bfloat",
2451
+ "bits": 16
2452
+ },
2453
+ "model.layers.60.mlp.experts.122.down_proj": {
2454
+ "data_type": "bfloat",
2455
+ "bits": 16
2456
+ },
2457
+ "model.layers.60.mlp.experts.123.down_proj": {
2458
+ "data_type": "bfloat",
2459
+ "bits": 16
2460
+ },
2461
+ "model.layers.60.mlp.experts.124.down_proj": {
2462
+ "data_type": "bfloat",
2463
+ "bits": 16
2464
+ },
2465
+ "model.layers.60.mlp.experts.125.down_proj": {
2466
+ "data_type": "bfloat",
2467
+ "bits": 16
2468
+ },
2469
+ "model.layers.60.mlp.experts.126.down_proj": {
2470
+ "data_type": "bfloat",
2471
+ "bits": 16
2472
+ },
2473
+ "model.layers.60.mlp.experts.127.down_proj": {
2474
+ "data_type": "bfloat",
2475
+ "bits": 16
2476
+ },
2477
+ "model.layers.60.mlp.experts.128.down_proj": {
2478
+ "data_type": "bfloat",
2479
+ "bits": 16
2480
+ },
2481
+ "model.layers.60.mlp.experts.129.down_proj": {
2482
+ "data_type": "bfloat",
2483
+ "bits": 16
2484
+ },
2485
+ "model.layers.60.mlp.experts.130.down_proj": {
2486
+ "data_type": "bfloat",
2487
+ "bits": 16
2488
+ },
2489
+ "model.layers.60.mlp.experts.131.down_proj": {
2490
+ "data_type": "bfloat",
2491
+ "bits": 16
2492
+ },
2493
+ "model.layers.60.mlp.experts.132.down_proj": {
2494
+ "data_type": "bfloat",
2495
+ "bits": 16
2496
+ },
2497
+ "model.layers.60.mlp.experts.133.down_proj": {
2498
+ "data_type": "bfloat",
2499
+ "bits": 16
2500
+ },
2501
+ "model.layers.60.mlp.experts.134.down_proj": {
2502
+ "data_type": "bfloat",
2503
+ "bits": 16
2504
+ },
2505
+ "model.layers.60.mlp.experts.135.down_proj": {
2506
+ "data_type": "bfloat",
2507
+ "bits": 16
2508
+ },
2509
+ "model.layers.60.mlp.experts.136.down_proj": {
2510
+ "data_type": "bfloat",
2511
+ "bits": 16
2512
+ },
2513
+ "model.layers.60.mlp.experts.137.down_proj": {
2514
+ "data_type": "bfloat",
2515
+ "bits": 16
2516
+ },
2517
+ "model.layers.60.mlp.experts.138.down_proj": {
2518
+ "data_type": "bfloat",
2519
+ "bits": 16
2520
+ },
2521
+ "model.layers.60.mlp.experts.139.down_proj": {
2522
+ "data_type": "bfloat",
2523
+ "bits": 16
2524
+ },
2525
+ "model.layers.60.mlp.experts.140.down_proj": {
2526
+ "data_type": "bfloat",
2527
+ "bits": 16
2528
+ },
2529
+ "model.layers.60.mlp.experts.141.down_proj": {
2530
+ "data_type": "bfloat",
2531
+ "bits": 16
2532
+ },
2533
+ "model.layers.60.mlp.experts.142.down_proj": {
2534
+ "data_type": "bfloat",
2535
+ "bits": 16
2536
+ },
2537
+ "model.layers.60.mlp.experts.143.down_proj": {
2538
+ "data_type": "bfloat",
2539
+ "bits": 16
2540
+ },
2541
+ "model.layers.60.mlp.experts.144.down_proj": {
2542
+ "data_type": "bfloat",
2543
+ "bits": 16
2544
+ },
2545
+ "model.layers.60.mlp.experts.145.down_proj": {
2546
+ "data_type": "bfloat",
2547
+ "bits": 16
2548
+ },
2549
+ "model.layers.60.mlp.experts.146.down_proj": {
2550
+ "data_type": "bfloat",
2551
+ "bits": 16
2552
+ },
2553
+ "model.layers.60.mlp.experts.147.down_proj": {
2554
+ "data_type": "bfloat",
2555
+ "bits": 16
2556
+ },
2557
+ "model.layers.60.mlp.experts.148.down_proj": {
2558
+ "data_type": "bfloat",
2559
+ "bits": 16
2560
+ },
2561
+ "model.layers.60.mlp.experts.149.down_proj": {
2562
+ "data_type": "bfloat",
2563
+ "bits": 16
2564
+ },
2565
+ "model.layers.60.mlp.experts.150.down_proj": {
2566
+ "data_type": "bfloat",
2567
+ "bits": 16
2568
+ },
2569
+ "model.layers.60.mlp.experts.151.down_proj": {
2570
+ "data_type": "bfloat",
2571
+ "bits": 16
2572
+ },
2573
+ "model.layers.60.mlp.experts.152.down_proj": {
2574
+ "data_type": "bfloat",
2575
+ "bits": 16
2576
+ },
2577
+ "model.layers.60.mlp.experts.153.down_proj": {
2578
+ "data_type": "bfloat",
2579
+ "bits": 16
2580
+ },
2581
+ "model.layers.60.mlp.experts.154.down_proj": {
2582
+ "data_type": "bfloat",
2583
+ "bits": 16
2584
+ },
2585
+ "model.layers.60.mlp.experts.155.down_proj": {
2586
+ "data_type": "bfloat",
2587
+ "bits": 16
2588
+ },
2589
+ "model.layers.60.mlp.experts.156.down_proj": {
2590
+ "data_type": "bfloat",
2591
+ "bits": 16
2592
+ },
2593
+ "model.layers.60.mlp.experts.157.down_proj": {
2594
+ "data_type": "bfloat",
2595
+ "bits": 16
2596
+ },
2597
+ "model.layers.60.mlp.experts.158.down_proj": {
2598
+ "data_type": "bfloat",
2599
+ "bits": 16
2600
+ },
2601
+ "model.layers.60.mlp.experts.159.down_proj": {
2602
+ "data_type": "bfloat",
2603
+ "bits": 16
2604
+ },
2605
+ "model.layers.60.mlp.experts.160.down_proj": {
2606
+ "data_type": "bfloat",
2607
+ "bits": 16
2608
+ },
2609
+ "model.layers.60.mlp.experts.161.down_proj": {
2610
+ "data_type": "bfloat",
2611
+ "bits": 16
2612
+ },
2613
+ "model.layers.60.mlp.experts.162.down_proj": {
2614
+ "data_type": "bfloat",
2615
+ "bits": 16
2616
+ },
2617
+ "model.layers.60.mlp.experts.163.down_proj": {
2618
+ "data_type": "bfloat",
2619
+ "bits": 16
2620
+ },
2621
+ "model.layers.60.mlp.experts.164.down_proj": {
2622
+ "data_type": "bfloat",
2623
+ "bits": 16
2624
+ },
2625
+ "model.layers.60.mlp.experts.165.down_proj": {
2626
+ "data_type": "bfloat",
2627
+ "bits": 16
2628
+ },
2629
+ "model.layers.60.mlp.experts.166.down_proj": {
2630
+ "data_type": "bfloat",
2631
+ "bits": 16
2632
+ },
2633
+ "model.layers.60.mlp.experts.167.down_proj": {
2634
+ "data_type": "bfloat",
2635
+ "bits": 16
2636
+ },
2637
+ "model.layers.60.mlp.experts.168.down_proj": {
2638
+ "data_type": "bfloat",
2639
+ "bits": 16
2640
+ },
2641
+ "model.layers.60.mlp.experts.169.down_proj": {
2642
+ "data_type": "bfloat",
2643
+ "bits": 16
2644
+ },
2645
+ "model.layers.60.mlp.experts.170.down_proj": {
2646
+ "data_type": "bfloat",
2647
+ "bits": 16
2648
+ },
2649
+ "model.layers.60.mlp.experts.171.down_proj": {
2650
+ "data_type": "bfloat",
2651
+ "bits": 16
2652
+ },
2653
+ "model.layers.60.mlp.experts.172.down_proj": {
2654
+ "data_type": "bfloat",
2655
+ "bits": 16
2656
+ },
2657
+ "model.layers.60.mlp.experts.173.down_proj": {
2658
+ "data_type": "bfloat",
2659
+ "bits": 16
2660
+ },
2661
+ "model.layers.60.mlp.experts.174.down_proj": {
2662
+ "data_type": "bfloat",
2663
+ "bits": 16
2664
+ },
2665
+ "model.layers.60.mlp.experts.175.down_proj": {
2666
+ "data_type": "bfloat",
2667
+ "bits": 16
2668
+ },
2669
+ "model.layers.60.mlp.experts.176.down_proj": {
2670
+ "data_type": "bfloat",
2671
+ "bits": 16
2672
+ },
2673
+ "model.layers.60.mlp.experts.177.down_proj": {
2674
+ "data_type": "bfloat",
2675
+ "bits": 16
2676
+ },
2677
+ "model.layers.60.mlp.experts.178.down_proj": {
2678
+ "data_type": "bfloat",
2679
+ "bits": 16
2680
+ },
2681
+ "model.layers.60.mlp.experts.179.down_proj": {
2682
+ "data_type": "bfloat",
2683
+ "bits": 16
2684
+ },
2685
+ "model.layers.60.mlp.experts.180.down_proj": {
2686
+ "data_type": "bfloat",
2687
+ "bits": 16
2688
+ },
2689
+ "model.layers.60.mlp.experts.181.down_proj": {
2690
+ "data_type": "bfloat",
2691
+ "bits": 16
2692
+ },
2693
+ "model.layers.60.mlp.experts.182.down_proj": {
2694
+ "data_type": "bfloat",
2695
+ "bits": 16
2696
+ },
2697
+ "model.layers.60.mlp.experts.183.down_proj": {
2698
+ "data_type": "bfloat",
2699
+ "bits": 16
2700
+ },
2701
+ "model.layers.60.mlp.experts.184.down_proj": {
2702
+ "data_type": "bfloat",
2703
+ "bits": 16
2704
+ },
2705
+ "model.layers.60.mlp.experts.185.down_proj": {
2706
+ "data_type": "bfloat",
2707
+ "bits": 16
2708
+ },
2709
+ "model.layers.60.mlp.experts.186.down_proj": {
2710
+ "data_type": "bfloat",
2711
+ "bits": 16
2712
+ },
2713
+ "model.layers.60.mlp.experts.187.down_proj": {
2714
+ "data_type": "bfloat",
2715
+ "bits": 16
2716
+ },
2717
+ "model.layers.60.mlp.experts.188.down_proj": {
2718
+ "data_type": "bfloat",
2719
+ "bits": 16
2720
+ },
2721
+ "model.layers.60.mlp.experts.189.down_proj": {
2722
+ "data_type": "bfloat",
2723
+ "bits": 16
2724
+ },
2725
+ "model.layers.60.mlp.experts.190.down_proj": {
2726
+ "data_type": "bfloat",
2727
+ "bits": 16
2728
+ },
2729
+ "model.layers.60.mlp.experts.191.down_proj": {
2730
+ "data_type": "bfloat",
2731
+ "bits": 16
2732
+ },
2733
+ "model.layers.60.mlp.experts.192.down_proj": {
2734
+ "data_type": "bfloat",
2735
+ "bits": 16
2736
+ },
2737
+ "model.layers.60.mlp.experts.193.down_proj": {
2738
+ "data_type": "bfloat",
2739
+ "bits": 16
2740
+ },
2741
+ "model.layers.60.mlp.experts.194.down_proj": {
2742
+ "data_type": "bfloat",
2743
+ "bits": 16
2744
+ },
2745
+ "model.layers.60.mlp.experts.195.down_proj": {
2746
+ "data_type": "bfloat",
2747
+ "bits": 16
2748
+ },
2749
+ "model.layers.60.mlp.experts.196.down_proj": {
2750
+ "data_type": "bfloat",
2751
+ "bits": 16
2752
+ },
2753
+ "model.layers.60.mlp.experts.197.down_proj": {
2754
+ "data_type": "bfloat",
2755
+ "bits": 16
2756
+ },
2757
+ "model.layers.60.mlp.experts.198.down_proj": {
2758
+ "data_type": "bfloat",
2759
+ "bits": 16
2760
+ },
2761
+ "model.layers.60.mlp.experts.199.down_proj": {
2762
+ "data_type": "bfloat",
2763
+ "bits": 16
2764
+ },
2765
+ "model.layers.60.mlp.experts.200.down_proj": {
2766
+ "data_type": "bfloat",
2767
+ "bits": 16
2768
+ },
2769
+ "model.layers.60.mlp.experts.201.down_proj": {
2770
+ "data_type": "bfloat",
2771
+ "bits": 16
2772
+ },
2773
+ "model.layers.60.mlp.experts.202.down_proj": {
2774
+ "data_type": "bfloat",
2775
+ "bits": 16
2776
+ },
2777
+ "model.layers.60.mlp.experts.203.down_proj": {
2778
+ "data_type": "bfloat",
2779
+ "bits": 16
2780
+ },
2781
+ "model.layers.60.mlp.experts.204.down_proj": {
2782
+ "data_type": "bfloat",
2783
+ "bits": 16
2784
+ },
2785
+ "model.layers.60.mlp.experts.205.down_proj": {
2786
+ "data_type": "bfloat",
2787
+ "bits": 16
2788
+ },
2789
+ "model.layers.60.mlp.experts.206.down_proj": {
2790
+ "data_type": "bfloat",
2791
+ "bits": 16
2792
+ },
2793
+ "model.layers.60.mlp.experts.207.down_proj": {
2794
+ "data_type": "bfloat",
2795
+ "bits": 16
2796
+ },
2797
+ "model.layers.60.mlp.experts.208.down_proj": {
2798
+ "data_type": "bfloat",
2799
+ "bits": 16
2800
+ },
2801
+ "model.layers.60.mlp.experts.209.down_proj": {
2802
+ "data_type": "bfloat",
2803
+ "bits": 16
2804
+ },
2805
+ "model.layers.60.mlp.experts.210.down_proj": {
2806
+ "data_type": "bfloat",
2807
+ "bits": 16
2808
+ },
2809
+ "model.layers.60.mlp.experts.211.down_proj": {
2810
+ "data_type": "bfloat",
2811
+ "bits": 16
2812
+ },
2813
+ "model.layers.60.mlp.experts.212.down_proj": {
2814
+ "data_type": "bfloat",
2815
+ "bits": 16
2816
+ },
2817
+ "model.layers.60.mlp.experts.213.down_proj": {
2818
+ "data_type": "bfloat",
2819
+ "bits": 16
2820
+ },
2821
+ "model.layers.60.mlp.experts.214.down_proj": {
2822
+ "data_type": "bfloat",
2823
+ "bits": 16
2824
+ },
2825
+ "model.layers.60.mlp.experts.215.down_proj": {
2826
+ "data_type": "bfloat",
2827
+ "bits": 16
2828
+ },
2829
+ "model.layers.60.mlp.experts.216.down_proj": {
2830
+ "data_type": "bfloat",
2831
+ "bits": 16
2832
+ },
2833
+ "model.layers.60.mlp.experts.217.down_proj": {
2834
+ "data_type": "bfloat",
2835
+ "bits": 16
2836
+ },
2837
+ "model.layers.60.mlp.experts.218.down_proj": {
2838
+ "data_type": "bfloat",
2839
+ "bits": 16
2840
+ },
2841
+ "model.layers.60.mlp.experts.219.down_proj": {
2842
+ "data_type": "bfloat",
2843
+ "bits": 16
2844
+ },
2845
+ "model.layers.60.mlp.experts.220.down_proj": {
2846
+ "data_type": "bfloat",
2847
+ "bits": 16
2848
+ },
2849
+ "model.layers.60.mlp.experts.221.down_proj": {
2850
+ "data_type": "bfloat",
2851
+ "bits": 16
2852
+ },
2853
+ "model.layers.60.mlp.experts.222.down_proj": {
2854
+ "data_type": "bfloat",
2855
+ "bits": 16
2856
+ },
2857
+ "model.layers.60.mlp.experts.223.down_proj": {
2858
+ "data_type": "bfloat",
2859
+ "bits": 16
2860
+ },
2861
+ "model.layers.60.mlp.experts.224.down_proj": {
2862
+ "data_type": "bfloat",
2863
+ "bits": 16
2864
+ },
2865
+ "model.layers.60.mlp.experts.225.down_proj": {
2866
+ "data_type": "bfloat",
2867
+ "bits": 16
2868
+ },
2869
+ "model.layers.60.mlp.experts.226.down_proj": {
2870
+ "data_type": "bfloat",
2871
+ "bits": 16
2872
+ },
2873
+ "model.layers.60.mlp.experts.227.down_proj": {
2874
+ "data_type": "bfloat",
2875
+ "bits": 16
2876
+ },
2877
+ "model.layers.60.mlp.experts.228.down_proj": {
2878
+ "data_type": "bfloat",
2879
+ "bits": 16
2880
+ },
2881
+ "model.layers.60.mlp.experts.229.down_proj": {
2882
+ "data_type": "bfloat",
2883
+ "bits": 16
2884
+ },
2885
+ "model.layers.60.mlp.experts.230.down_proj": {
2886
+ "data_type": "bfloat",
2887
+ "bits": 16
2888
+ },
2889
+ "model.layers.60.mlp.experts.231.down_proj": {
2890
+ "data_type": "bfloat",
2891
+ "bits": 16
2892
+ },
2893
+ "model.layers.60.mlp.experts.232.down_proj": {
2894
+ "data_type": "bfloat",
2895
+ "bits": 16
2896
+ },
2897
+ "model.layers.60.mlp.experts.233.down_proj": {
2898
+ "data_type": "bfloat",
2899
+ "bits": 16
2900
+ },
2901
+ "model.layers.60.mlp.experts.234.down_proj": {
2902
+ "data_type": "bfloat",
2903
+ "bits": 16
2904
+ },
2905
+ "model.layers.60.mlp.experts.235.down_proj": {
2906
+ "data_type": "bfloat",
2907
+ "bits": 16
2908
+ },
2909
+ "model.layers.60.mlp.experts.236.down_proj": {
2910
+ "data_type": "bfloat",
2911
+ "bits": 16
2912
+ },
2913
+ "model.layers.60.mlp.experts.237.down_proj": {
2914
+ "data_type": "bfloat",
2915
+ "bits": 16
2916
+ },
2917
+ "model.layers.60.mlp.experts.238.down_proj": {
2918
+ "data_type": "bfloat",
2919
+ "bits": 16
2920
+ },
2921
+ "model.layers.60.mlp.experts.239.down_proj": {
2922
+ "data_type": "bfloat",
2923
+ "bits": 16
2924
+ },
2925
+ "model.layers.60.mlp.experts.240.down_proj": {
2926
+ "data_type": "bfloat",
2927
+ "bits": 16
2928
+ },
2929
+ "model.layers.60.mlp.experts.241.down_proj": {
2930
+ "data_type": "bfloat",
2931
+ "bits": 16
2932
+ },
2933
+ "model.layers.60.mlp.experts.242.down_proj": {
2934
+ "data_type": "bfloat",
2935
+ "bits": 16
2936
+ },
2937
+ "model.layers.60.mlp.experts.243.down_proj": {
2938
+ "data_type": "bfloat",
2939
+ "bits": 16
2940
+ },
2941
+ "model.layers.60.mlp.experts.244.down_proj": {
2942
+ "data_type": "bfloat",
2943
+ "bits": 16
2944
+ },
2945
+ "model.layers.60.mlp.experts.245.down_proj": {
2946
+ "data_type": "bfloat",
2947
+ "bits": 16
2948
+ },
2949
+ "model.layers.60.mlp.experts.246.down_proj": {
2950
+ "data_type": "bfloat",
2951
+ "bits": 16
2952
+ },
2953
+ "model.layers.60.mlp.experts.247.down_proj": {
2954
+ "data_type": "bfloat",
2955
+ "bits": 16
2956
+ },
2957
+ "model.layers.60.mlp.experts.248.down_proj": {
2958
+ "data_type": "bfloat",
2959
+ "bits": 16
2960
+ },
2961
+ "model.layers.60.mlp.experts.249.down_proj": {
2962
+ "data_type": "bfloat",
2963
+ "bits": 16
2964
+ },
2965
+ "model.layers.60.mlp.experts.250.down_proj": {
2966
+ "data_type": "bfloat",
2967
+ "bits": 16
2968
+ },
2969
+ "model.layers.60.mlp.experts.251.down_proj": {
2970
+ "data_type": "bfloat",
2971
+ "bits": 16
2972
+ },
2973
+ "model.layers.60.mlp.experts.252.down_proj": {
2974
+ "data_type": "bfloat",
2975
+ "bits": 16
2976
+ },
2977
+ "model.layers.60.mlp.experts.253.down_proj": {
2978
+ "data_type": "bfloat",
2979
+ "bits": 16
2980
+ },
2981
+ "model.layers.60.mlp.experts.254.down_proj": {
2982
+ "data_type": "bfloat",
2983
+ "bits": 16
2984
+ },
2985
+ "model.layers.60.mlp.experts.255.down_proj": {
2986
+ "data_type": "bfloat",
2987
+ "bits": 16
2988
+ },
2989
+ "model.layers.60.mlp.shared_experts.gate_proj": {
2990
+ "bits": 4,
2991
+ "group_size": 128
2992
+ },
2993
+ "model.layers.60.mlp.shared_experts.up_proj": {
2994
+ "bits": 4,
2995
+ "group_size": 128
2996
+ },
2997
+ "model.layers.60.mlp.shared_experts.down_proj": {
2998
+ "data_type": "bfloat",
2999
+ "bits": 16
3000
+ }
3001
+ }
3002
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|end▁of▁sentence|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end▁of▁sentence|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff