Safetensors
File size: 12,139 Bytes
1af7e96
{
    "regularizer": {
        "_target_": "flextok.regularizers.quantize_fsq.FSQ",
        "latents_read_key": "enc_registers",
        "quants_write_key": "enc_registers_quant",
        "tokens_write_key": "tokens",
        "levels": [
            8,
            8,
            8,
            5,
            5,
            5
        ],
        "drop_quant_p": 0.0,
        "packed_call": false
    },
    "decoder": {
        "_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
        "module_dict": {
            "dec_from_latents": {
                "_target_": "flextok.model.preprocessors.linear.LinearLayer",
                "read_key": "enc_registers_quant",
                "write_key": "dec_registers_proj",
                "dim_in": 6,
                "dim": 1152
            },
            "dec_registers_posemb_module": {
                "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
                "read_key": "dec_registers_proj",
                "write_key": "dec_registers_proj",
                "dim": 1152,
                "max_sizes": [
                    256
                ],
                "posemb_type": "learnable_sum",
                "posemb_scaling": "absolute"
            },
            "dec_nested_dropout": {
                "_target_": "flextok.model.preprocessors.token_dropout.MaskedNestedDropout",
                "read_write_key": "dec_registers_proj",
                "dim": 1152,
                "size_sampling_mode": "pow2"
            },
            "dec_latent_dropout": {
                "_target_": "flextok.model.preprocessors.nullcond.LearnedNullCond",
                "read_write_key": "dec_registers_proj",
                "dim": 1152,
                "dropout_prob": 0.2
            },
            "dec_noise_channels_to_last": {
                "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
                "read_key": "vae_latents_noised",
                "write_key": "vae_latents_noised_bhwc",
                "per_sample_op": {
                    "_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
                    "_partial_": true
                }
            },
            "dec_noise_patch_emb": {
                "_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
                "input_tensor_list_read_key": "vae_latents_noised_bhwc",
                "patches_list_write_key": "vae_latents_noised_patched",
                "n_patches_write_key": "dec_n_patches",
                "channels_in": 16,
                "dim": 1152,
                "patch_sizes": [
                    2,
                    2
                ],
                "flatten_patches": false
            },
            "dec_patches_posemb_module": {
                "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
                "read_key": "vae_latents_noised_patched",
                "write_key": "dec_patches",
                "dim": 1152,
                "max_sizes": [
                    16,
                    16
                ],
                "posemb_type": "sincos",
                "posemb_scaling": "absolute"
            },
            "dec_seq_packer": {
                "_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
                "input_list_read_keys": [
                    "dec_patches",
                    "dec_registers_proj"
                ],
                "packed_seq_write_key": "dec_packed_seq",
                "block_mask_write_key": "dec_block_mask",
                "inner_packed_shapes_write_key": "dec_ps_inner",
                "outer_packed_shapes_write_key": "dec_ps_outer",
                "emb_packing_fn_write_key": "emb_packing_fn",
                "mask_mode": "full",
                "pad_to_multiple": 128,
                "per_subseq_embs": true
            },
            "dec_time_embedder": {
                "_target_": "flextok.model.preprocessors.time_embedding.TimestepEmbedder",
                "timesteps_read_key": "timesteps",
                "time_embedding_write_key": "dec_temb",
                "dim": 1152,
                "frequency_embedding_size": 256,
                "max_timestep": 1000.0
            },
            "dec_transformer": {
                "_target_": "flextok.model.trunks.transformers.FlexTransformer",
                "input_seq_read_key": "dec_packed_seq",
                "output_seq_write_key": "dec_packed_seq",
                "dim": 1152,
                "depth": 18,
                "block_mask_read_key": "dec_block_mask",
                "adaLN_emb_read_key": "dec_temb",
                "adaLN_packing_fn_read_key": "emb_packing_fn",
                "adaLN_expansion": 2,
                "intermediate_layer_write_key": "dec_packed_seq_repa_layer",
                "intermediate_layers": [
                    1
                ],
                "use_act_checkpoint": false
            },
            "dec_unpacker": {
                "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
                "packed_seq_read_key": "dec_packed_seq",
                "inner_seq_write_keys": [
                    "dec_patches",
                    "dec_registers_proj"
                ],
                "inner_packed_shapes_read_key": "dec_ps_inner",
                "outer_packed_shapes_read_key": "dec_ps_outer"
            },
            "dec_repa_unpacker": {
                "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
                "packed_seq_read_key": "dec_packed_seq_repa_layer",
                "inner_seq_write_keys": [
                    "dec_patches_repa_layer",
                    "dec_registers_repa_layer"
                ],
                "inner_packed_shapes_read_key": "dec_ps_inner",
                "outer_packed_shapes_read_key": "dec_ps_outer"
            },
            "dec_to_patches": {
                "_target_": "flextok.model.postprocessors.heads.ToPatchesLinearHead",
                "read_key": "dec_patches",
                "write_key": "dec_patches",
                "dim": 1152,
                "channels_out": 16,
                "patch_sizes": [
                    2,
                    2
                ],
                "use_mup_readout": false,
                "weight_init_style": "zero",
                "adaLN_emb_read_key": "dec_temb"
            },
            "dec_channels_to_first": {
                "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
                "read_key": "dec_patches",
                "write_key": "vae_latents_reconst",
                "per_sample_op": {
                    "_target_": "flextok.model.utils.dict_ops.channels_last_to_first",
                    "_partial_": true
                }
            }
        }
    },
    "vae": {
        "_target_": "flextok.vae_wrapper.StableDiffusionVAE",
        "images_read_key": "rgb",
        "vae_latents_read_key": "vae_latents_reconst",
        "vae_latents_write_key": "vae_latents",
        "images_reconst_write_key": "rgb_reconst",
        "vae_kl_loss_write_key": "kl_loss",
        "dtype_override": null,
        "sample_posterior": true,
        "compile_encode_fn": false,
        "force_vae_encode": true,
        "latent_channels": 16,
        "scaling_factor": 0.88
    },
    "pipeline": {
        "_target_": "flextok.flow_matching.pipelines.MinRFPipeline",
        "_partial_": true,
        "target_sizes_read_key": null,
        "latents_read_key": "enc_registers_quant",
        "timesteps_read_key": "timesteps",
        "noised_images_read_key": "vae_latents_noised",
        "reconst_write_key": "vae_latents_reconst",
        "out_channels": 16
    },
    "flow_matching_noise_module": {
        "_target_": "flextok.flow_matching.noise_modules.MinRFNoiseModule",
        "clean_images_read_key": "vae_latents",
        "noised_images_write_key": "vae_latents_noised",
        "noise_write_key": "flow_noise",
        "timesteps_write_key": "timesteps",
        "sigmas_write_key": "sigmas",
        "ln": false,
        "stratisfied": false,
        "mode_scale": 0.25
    },
    "_target_": "flextok.flextok_wrapper.FlexTok",
    "encoder": {
        "_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
        "module_dict": {
            "enc_channels_to_last": {
                "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
                "read_key": "vae_latents",
                "write_key": "vae_latents_bhwc",
                "per_sample_op": {
                    "_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
                    "_partial_": true
                }
            },
            "enc_patch_emb": {
                "_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
                "input_tensor_list_read_key": "vae_latents_bhwc",
                "patches_list_write_key": "enc_vae_latents_patched",
                "n_patches_write_key": "enc_n_patches",
                "channels_in": 16,
                "dim": 1152,
                "patch_sizes": [
                    2,
                    2
                ],
                "flatten_patches": false
            },
            "enc_posemb_module": {
                "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
                "read_key": "enc_vae_latents_patched",
                "write_key": "enc_vae_latents_patched",
                "dim": 1152,
                "max_sizes": [
                    16,
                    16
                ],
                "posemb_type": "sincos",
                "posemb_scaling": "absolute"
            },
            "enc_register_module": {
                "_target_": "flextok.model.preprocessors.registers.Registers1D",
                "input_tensor_list_read_key": "enc_vae_latents_patched",
                "register_sizes_read_write_key": "register_sizes",
                "registers_write_key": "enc_registers",
                "dim": 1152,
                "n_min": 256,
                "n_max": 256,
                "size_sampling_mode": "uniform",
                "ordering_mode": "nested"
            },
            "enc_seq_packer": {
                "_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
                "input_list_read_keys": [
                    "enc_vae_latents_patched",
                    "enc_registers"
                ],
                "packed_seq_write_key": "enc_packed_seq",
                "block_mask_write_key": "enc_block_mask",
                "inner_packed_shapes_write_key": "enc_ps_inner",
                "outer_packed_shapes_write_key": "enc_ps_outer",
                "mask_mode": "causal_last",
                "pad_to_multiple": 128
            },
            "enc_transformer": {
                "_target_": "flextok.model.trunks.transformers.FlexTransformer",
                "input_seq_read_key": "enc_packed_seq",
                "output_seq_write_key": "enc_packed_seq",
                "dim": 1152,
                "depth": 18,
                "block_mask_read_key": "enc_block_mask",
                "use_act_checkpoint": false
            },
            "enc_unpacker": {
                "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
                "packed_seq_read_key": "enc_packed_seq",
                "inner_seq_write_keys": [
                    "enc_vae_latents_patched",
                    "enc_registers"
                ],
                "inner_packed_shapes_read_key": "enc_ps_inner",
                "outer_packed_shapes_read_key": "enc_ps_outer"
            },
            "enc_to_latents": {
                "_target_": "flextok.model.postprocessors.heads.LinearHead",
                "read_key": "enc_registers",
                "write_key": "enc_registers",
                "dim": 1152,
                "dim_out": 6,
                "use_mup_readout": false,
                "weight_init_style": "zero",
                "dtype_override": null
            }
        }
    }
}
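
Below is a minimal sketch of how a config like this is typically turned into a model object. It assumes the Hydra convention implied by the "_target_" / "_partial_" keys (each block names a class to construct, and nested blocks are built recursively); the filename flextok_config.json is hypothetical. As a side note, the FSQ "levels" of [8, 8, 8, 5, 5, 5] imply a codebook of 8 * 8 * 8 * 5 * 5 * 5 = 64,000 discrete tokens.

    # Minimal sketch, assuming the Hydra "_target_" convention suggested by this config.
    # The filename below is hypothetical.
    import json

    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    with open("flextok_config.json") as f:
        cfg = OmegaConf.create(json.load(f))

    # Recursively builds the FlexTok wrapper: encoder, FSQ regularizer, decoder,
    # StableDiffusionVAE wrapper, flow-matching noise module, and the pipeline
    # (returned as a callable partial because of "_partial_": true).
    model = instantiate(cfg)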