{
  "regularizer": {
    "_target_": "flextok.regularizers.quantize_fsq.FSQ",
    "latents_read_key": "enc_registers",
    "quants_write_key": "enc_registers_quant",
    "tokens_write_key": "tokens",
    "levels": [
      8,
      8,
      8,
      5,
      5,
      5
    ],
    "drop_quant_p": 0.0,
    "packed_call": false
  },
  "decoder": {
    "_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
    "module_dict": {
      "dec_from_latents": {
        "_target_": "flextok.model.preprocessors.linear.LinearLayer",
        "read_key": "enc_registers_quant",
        "write_key": "dec_registers_proj",
        "dim_in": 6,
        "dim": 1152
      },
      "dec_registers_posemb_module": {
        "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
        "read_key": "dec_registers_proj",
        "write_key": "dec_registers_proj",
        "dim": 1152,
        "max_sizes": [
          256
        ],
        "posemb_type": "learnable_sum",
        "posemb_scaling": "absolute"
      },
      "dec_nested_dropout": {
        "_target_": "flextok.model.preprocessors.token_dropout.MaskedNestedDropout",
        "read_write_key": "dec_registers_proj",
        "dim": 1152,
        "size_sampling_mode": "pow2"
      },
      "dec_latent_dropout": {
        "_target_": "flextok.model.preprocessors.nullcond.LearnedNullCond",
        "read_write_key": "dec_registers_proj",
        "dim": 1152,
        "dropout_prob": 0.2
      },
      "dec_noise_channels_to_last": {
        "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
        "read_key": "vae_latents_noised",
        "write_key": "vae_latents_noised_bhwc",
        "per_sample_op": {
          "_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
          "_partial_": true
        }
      },
      "dec_noise_patch_emb": {
        "_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
        "input_tensor_list_read_key": "vae_latents_noised_bhwc",
        "patches_list_write_key": "vae_latents_noised_patched",
        "n_patches_write_key": "dec_n_patches",
        "channels_in": 16,
        "dim": 1152,
        "patch_sizes": [
          2,
          2
        ],
        "flatten_patches": false
      },
      "dec_patches_posemb_module": {
        "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
        "read_key": "vae_latents_noised_patched",
        "write_key": "dec_patches",
        "dim": 1152,
        "max_sizes": [
          16,
          16
        ],
        "posemb_type": "sincos",
        "posemb_scaling": "absolute"
      },
      "dec_seq_packer": {
        "_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
        "input_list_read_keys": [
          "dec_patches",
          "dec_registers_proj"
        ],
        "packed_seq_write_key": "dec_packed_seq",
        "block_mask_write_key": "dec_block_mask",
        "inner_packed_shapes_write_key": "dec_ps_inner",
        "outer_packed_shapes_write_key": "dec_ps_outer",
        "emb_packing_fn_write_key": "emb_packing_fn",
        "mask_mode": "full",
        "pad_to_multiple": 128,
        "per_subseq_embs": true
      },
      "dec_time_embedder": {
        "_target_": "flextok.model.preprocessors.time_embedding.TimestepEmbedder",
        "timesteps_read_key": "timesteps",
        "time_embedding_write_key": "dec_temb",
        "dim": 1152,
        "frequency_embedding_size": 256,
        "max_timestep": 1000.0
      },
      "dec_transformer": {
        "_target_": "flextok.model.trunks.transformers.FlexTransformer",
        "input_seq_read_key": "dec_packed_seq",
        "output_seq_write_key": "dec_packed_seq",
        "dim": 1152,
        "depth": 18,
        "block_mask_read_key": "dec_block_mask",
        "adaLN_emb_read_key": "dec_temb",
        "adaLN_packing_fn_read_key": "emb_packing_fn",
        "adaLN_expansion": 2,
        "intermediate_layer_write_key": "dec_packed_seq_repa_layer",
        "intermediate_layers": [
          1
        ],
        "use_act_checkpoint": false
      },
      "dec_unpacker": {
        "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
        "packed_seq_read_key": "dec_packed_seq",
        "inner_seq_write_keys": [
          "dec_patches",
          "dec_registers_proj"
        ],
        "inner_packed_shapes_read_key": "dec_ps_inner",
        "outer_packed_shapes_read_key": "dec_ps_outer"
      },
      "dec_repa_unpacker": {
        "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
        "packed_seq_read_key": "dec_packed_seq_repa_layer",
        "inner_seq_write_keys": [
          "dec_patches_repa_layer",
          "dec_registers_repa_layer"
        ],
        "inner_packed_shapes_read_key": "dec_ps_inner",
        "outer_packed_shapes_read_key": "dec_ps_outer"
      },
      "dec_to_patches": {
        "_target_": "flextok.model.postprocessors.heads.ToPatchesLinearHead",
        "read_key": "dec_patches",
        "write_key": "dec_patches",
        "dim": 1152,
        "channels_out": 16,
        "patch_sizes": [
          2,
          2
        ],
        "use_mup_readout": false,
        "weight_init_style": "zero",
        "adaLN_emb_read_key": "dec_temb"
      },
      "dec_channels_to_first": {
        "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
        "read_key": "dec_patches",
        "write_key": "vae_latents_reconst",
        "per_sample_op": {
          "_target_": "flextok.model.utils.dict_ops.channels_last_to_first",
          "_partial_": true
        }
      }
    }
  },
  "vae": {
    "_target_": "flextok.vae_wrapper.StableDiffusionVAE",
    "images_read_key": "rgb",
    "vae_latents_read_key": "vae_latents_reconst",
    "vae_latents_write_key": "vae_latents",
    "images_reconst_write_key": "rgb_reconst",
    "vae_kl_loss_write_key": "kl_loss",
    "dtype_override": null,
    "sample_posterior": true,
    "compile_encode_fn": false,
    "force_vae_encode": true,
    "latent_channels": 16,
    "scaling_factor": 0.88
  },
  "pipeline": {
    "_target_": "flextok.flow_matching.pipelines.MinRFPipeline",
    "_partial_": true,
    "target_sizes_read_key": null,
    "latents_read_key": "enc_registers_quant",
    "timesteps_read_key": "timesteps",
    "noised_images_read_key": "vae_latents_noised",
    "reconst_write_key": "vae_latents_reconst",
    "out_channels": 16
  },
  "flow_matching_noise_module": {
    "_target_": "flextok.flow_matching.noise_modules.MinRFNoiseModule",
    "clean_images_read_key": "vae_latents",
    "noised_images_write_key": "vae_latents_noised",
    "noise_write_key": "flow_noise",
    "timesteps_write_key": "timesteps",
    "sigmas_write_key": "sigmas",
    "ln": false,
    "stratisfied": false,
    "mode_scale": 0.25
  },
  "_target_": "flextok.flextok_wrapper.FlexTok",
  "encoder": {
    "_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
    "module_dict": {
      "enc_channels_to_last": {
        "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
        "read_key": "vae_latents",
        "write_key": "vae_latents_bhwc",
        "per_sample_op": {
          "_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
          "_partial_": true
        }
      },
      "enc_patch_emb": {
        "_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
        "input_tensor_list_read_key": "vae_latents_bhwc",
        "patches_list_write_key": "enc_vae_latents_patched",
        "n_patches_write_key": "enc_n_patches",
        "channels_in": 16,
        "dim": 1152,
        "patch_sizes": [
          2,
          2
        ],
        "flatten_patches": false
      },
      "enc_posemb_module": {
        "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
        "read_key": "enc_vae_latents_patched",
        "write_key": "enc_vae_latents_patched",
        "dim": 1152,
        "max_sizes": [
          16,
          16
        ],
        "posemb_type": "sincos",
        "posemb_scaling": "absolute"
      },
      "enc_register_module": {
        "_target_": "flextok.model.preprocessors.registers.Registers1D",
        "input_tensor_list_read_key": "enc_vae_latents_patched",
        "register_sizes_read_write_key": "register_sizes",
        "registers_write_key": "enc_registers",
        "dim": 1152,
        "n_min": 256,
        "n_max": 256,
        "size_sampling_mode": "uniform",
        "ordering_mode": "nested"
      },
      "enc_seq_packer": {
        "_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
        "input_list_read_keys": [
          "enc_vae_latents_patched",
          "enc_registers"
        ],
        "packed_seq_write_key": "enc_packed_seq",
        "block_mask_write_key": "enc_block_mask",
        "inner_packed_shapes_write_key": "enc_ps_inner",
        "outer_packed_shapes_write_key": "enc_ps_outer",
        "mask_mode": "causal_last",
        "pad_to_multiple": 128
      },
      "enc_transformer": {
        "_target_": "flextok.model.trunks.transformers.FlexTransformer",
        "input_seq_read_key": "enc_packed_seq",
        "output_seq_write_key": "enc_packed_seq",
        "dim": 1152,
        "depth": 18,
        "block_mask_read_key": "enc_block_mask",
        "use_act_checkpoint": false
      },
      "enc_unpacker": {
        "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
        "packed_seq_read_key": "enc_packed_seq",
        "inner_seq_write_keys": [
          "enc_vae_latents_patched",
          "enc_registers"
        ],
        "inner_packed_shapes_read_key": "enc_ps_inner",
        "outer_packed_shapes_read_key": "enc_ps_outer"
      },
      "enc_to_latents": {
        "_target_": "flextok.model.postprocessors.heads.LinearHead",
        "read_key": "enc_registers",
        "write_key": "enc_registers",
        "dim": 1152,
        "dim_out": 6,
        "use_mup_readout": false,
        "weight_init_style": "zero",
        "dtype_override": null
      }
    }
  }
}