File size: 12,139 Bytes
1af7e96 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 |
{
"regularizer": {
"_target_": "flextok.regularizers.quantize_fsq.FSQ",
"latents_read_key": "enc_registers",
"quants_write_key": "enc_registers_quant",
"tokens_write_key": "tokens",
"levels": [
8,
8,
8,
5,
5,
5
],
"drop_quant_p": 0.0,
"packed_call": false
},
"decoder": {
"_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
"module_dict": {
"dec_from_latents": {
"_target_": "flextok.model.preprocessors.linear.LinearLayer",
"read_key": "enc_registers_quant",
"write_key": "dec_registers_proj",
"dim_in": 6,
"dim": 1152
},
"dec_registers_posemb_module": {
"_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
"read_key": "dec_registers_proj",
"write_key": "dec_registers_proj",
"dim": 1152,
"max_sizes": [
256
],
"posemb_type": "learnable_sum",
"posemb_scaling": "absolute"
},
"dec_nested_dropout": {
"_target_": "flextok.model.preprocessors.token_dropout.MaskedNestedDropout",
"read_write_key": "dec_registers_proj",
"dim": 1152,
"size_sampling_mode": "pow2"
},
"dec_latent_dropout": {
"_target_": "flextok.model.preprocessors.nullcond.LearnedNullCond",
"read_write_key": "dec_registers_proj",
"dim": 1152,
"dropout_prob": 0.2
},
"dec_noise_channels_to_last": {
"_target_": "flextok.model.utils.dict_ops.PerSampleOp",
"read_key": "vae_latents_noised",
"write_key": "vae_latents_noised_bhwc",
"per_sample_op": {
"_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
"_partial_": true
}
},
"dec_noise_patch_emb": {
"_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
"input_tensor_list_read_key": "vae_latents_noised_bhwc",
"patches_list_write_key": "vae_latents_noised_patched",
"n_patches_write_key": "dec_n_patches",
"channels_in": 16,
"dim": 1152,
"patch_sizes": [
2,
2
],
"flatten_patches": false
},
"dec_patches_posemb_module": {
"_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
"read_key": "vae_latents_noised_patched",
"write_key": "dec_patches",
"dim": 1152,
"max_sizes": [
16,
16
],
"posemb_type": "sincos",
"posemb_scaling": "absolute"
},
"dec_seq_packer": {
"_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
"input_list_read_keys": [
"dec_patches",
"dec_registers_proj"
],
"packed_seq_write_key": "dec_packed_seq",
"block_mask_write_key": "dec_block_mask",
"inner_packed_shapes_write_key": "dec_ps_inner",
"outer_packed_shapes_write_key": "dec_ps_outer",
"emb_packing_fn_write_key": "emb_packing_fn",
"mask_mode": "full",
"pad_to_multiple": 128,
"per_subseq_embs": true
},
"dec_time_embedder": {
"_target_": "flextok.model.preprocessors.time_embedding.TimestepEmbedder",
"timesteps_read_key": "timesteps",
"time_embedding_write_key": "dec_temb",
"dim": 1152,
"frequency_embedding_size": 256,
"max_timestep": 1000.0
},
"dec_transformer": {
"_target_": "flextok.model.trunks.transformers.FlexTransformer",
"input_seq_read_key": "dec_packed_seq",
"output_seq_write_key": "dec_packed_seq",
"dim": 1152,
"depth": 18,
"block_mask_read_key": "dec_block_mask",
"adaLN_emb_read_key": "dec_temb",
"adaLN_packing_fn_read_key": "emb_packing_fn",
"adaLN_expansion": 2,
"intermediate_layer_write_key": "dec_packed_seq_repa_layer",
"intermediate_layers": [
1
],
"use_act_checkpoint": false
},
"dec_unpacker": {
"_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
"packed_seq_read_key": "dec_packed_seq",
"inner_seq_write_keys": [
"dec_patches",
"dec_registers_proj"
],
"inner_packed_shapes_read_key": "dec_ps_inner",
"outer_packed_shapes_read_key": "dec_ps_outer"
},
"dec_repa_unpacker": {
"_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
"packed_seq_read_key": "dec_packed_seq_repa_layer",
"inner_seq_write_keys": [
"dec_patches_repa_layer",
"dec_registers_repa_layer"
],
"inner_packed_shapes_read_key": "dec_ps_inner",
"outer_packed_shapes_read_key": "dec_ps_outer"
},
"dec_to_patches": {
"_target_": "flextok.model.postprocessors.heads.ToPatchesLinearHead",
"read_key": "dec_patches",
"write_key": "dec_patches",
"dim": 1152,
"channels_out": 16,
"patch_sizes": [
2,
2
],
"use_mup_readout": false,
"weight_init_style": "zero",
"adaLN_emb_read_key": "dec_temb"
},
"dec_channels_to_first": {
"_target_": "flextok.model.utils.dict_ops.PerSampleOp",
"read_key": "dec_patches",
"write_key": "vae_latents_reconst",
"per_sample_op": {
"_target_": "flextok.model.utils.dict_ops.channels_last_to_first",
"_partial_": true
}
}
}
},
"vae": {
"_target_": "flextok.vae_wrapper.StableDiffusionVAE",
"images_read_key": "rgb",
"vae_latents_read_key": "vae_latents_reconst",
"vae_latents_write_key": "vae_latents",
"images_reconst_write_key": "rgb_reconst",
"vae_kl_loss_write_key": "kl_loss",
"dtype_override": null,
"sample_posterior": true,
"compile_encode_fn": false,
"force_vae_encode": true,
"latent_channels": 16,
"scaling_factor": 0.88
},
"pipeline": {
"_target_": "flextok.flow_matching.pipelines.MinRFPipeline",
"_partial_": true,
"target_sizes_read_key": null,
"latents_read_key": "enc_registers_quant",
"timesteps_read_key": "timesteps",
"noised_images_read_key": "vae_latents_noised",
"reconst_write_key": "vae_latents_reconst",
"out_channels": 16
},
"flow_matching_noise_module": {
"_target_": "flextok.flow_matching.noise_modules.MinRFNoiseModule",
"clean_images_read_key": "vae_latents",
"noised_images_write_key": "vae_latents_noised",
"noise_write_key": "flow_noise",
"timesteps_write_key": "timesteps",
"sigmas_write_key": "sigmas",
"ln": false,
"stratisfied": false,
"mode_scale": 0.25
},
"_target_": "flextok.flextok_wrapper.FlexTok",
"encoder": {
"_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
"module_dict": {
"enc_channels_to_last": {
"_target_": "flextok.model.utils.dict_ops.PerSampleOp",
"read_key": "vae_latents",
"write_key": "vae_latents_bhwc",
"per_sample_op": {
"_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
"_partial_": true
}
},
"enc_patch_emb": {
"_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
"input_tensor_list_read_key": "vae_latents_bhwc",
"patches_list_write_key": "enc_vae_latents_patched",
"n_patches_write_key": "enc_n_patches",
"channels_in": 16,
"dim": 1152,
"patch_sizes": [
2,
2
],
"flatten_patches": false
},
"enc_posemb_module": {
"_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
"read_key": "enc_vae_latents_patched",
"write_key": "enc_vae_latents_patched",
"dim": 1152,
"max_sizes": [
16,
16
],
"posemb_type": "sincos",
"posemb_scaling": "absolute"
},
"enc_register_module": {
"_target_": "flextok.model.preprocessors.registers.Registers1D",
"input_tensor_list_read_key": "enc_vae_latents_patched",
"register_sizes_read_write_key": "register_sizes",
"registers_write_key": "enc_registers",
"dim": 1152,
"n_min": 256,
"n_max": 256,
"size_sampling_mode": "uniform",
"ordering_mode": "nested"
},
"enc_seq_packer": {
"_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
"input_list_read_keys": [
"enc_vae_latents_patched",
"enc_registers"
],
"packed_seq_write_key": "enc_packed_seq",
"block_mask_write_key": "enc_block_mask",
"inner_packed_shapes_write_key": "enc_ps_inner",
"outer_packed_shapes_write_key": "enc_ps_outer",
"mask_mode": "causal_last",
"pad_to_multiple": 128
},
"enc_transformer": {
"_target_": "flextok.model.trunks.transformers.FlexTransformer",
"input_seq_read_key": "enc_packed_seq",
"output_seq_write_key": "enc_packed_seq",
"dim": 1152,
"depth": 18,
"block_mask_read_key": "enc_block_mask",
"use_act_checkpoint": false
},
"enc_unpacker": {
"_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
"packed_seq_read_key": "enc_packed_seq",
"inner_seq_write_keys": [
"enc_vae_latents_patched",
"enc_registers"
],
"inner_packed_shapes_read_key": "enc_ps_inner",
"outer_packed_shapes_read_key": "enc_ps_outer"
},
"enc_to_latents": {
"_target_": "flextok.model.postprocessors.heads.LinearHead",
"read_key": "enc_registers",
"write_key": "enc_registers",
"dim": 1152,
"dim_out": 6,
"use_mup_readout": false,
"weight_init_style": "zero",
"dtype_override": null
}
}
}
} |