{
  "regularizer": {
    "_target_": "flextok.regularizers.quantize_fsq.FSQ",
    "latents_read_key": "enc_registers",
    "quants_write_key": "enc_registers_quant",
    "tokens_write_key": "tokens",
    "levels": [8, 8, 8, 5, 5, 5],
    "drop_quant_p": 0.0,
    "packed_call": false
  },
  "decoder": {
    "_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
    "module_dict": {
      "dec_from_latents": {
        "_target_": "flextok.model.preprocessors.linear.LinearLayer",
        "read_key": "enc_registers_quant",
        "write_key": "dec_registers_proj",
        "dim_in": 6,
        "dim": 1152
      },
      "dec_registers_posemb_module": {
        "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
        "read_key": "dec_registers_proj",
        "write_key": "dec_registers_proj",
        "dim": 1152,
        "max_sizes": [256],
        "posemb_type": "learnable_sum",
        "posemb_scaling": "absolute"
      },
      "dec_nested_dropout": {
        "_target_": "flextok.model.preprocessors.token_dropout.MaskedNestedDropout",
        "read_write_key": "dec_registers_proj",
        "dim": 1152,
        "size_sampling_mode": "pow2"
      },
      "dec_latent_dropout": {
        "_target_": "flextok.model.preprocessors.nullcond.LearnedNullCond",
        "read_write_key": "dec_registers_proj",
        "dim": 1152,
        "dropout_prob": 0.2
      },
      "dec_noise_channels_to_last": {
        "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
        "read_key": "vae_latents_noised",
        "write_key": "vae_latents_noised_bhwc",
        "per_sample_op": {
          "_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
          "_partial_": true
        }
      },
      "dec_noise_patch_emb": {
        "_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
        "input_tensor_list_read_key": "vae_latents_noised_bhwc",
        "patches_list_write_key": "vae_latents_noised_patched",
        "n_patches_write_key": "dec_n_patches",
        "channels_in": 16,
        "dim": 1152,
        "patch_sizes": [2, 2],
        "flatten_patches": false
      },
      "dec_patches_posemb_module": {
        "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
        "read_key": "vae_latents_noised_patched",
        "write_key": "dec_patches",
        "dim": 1152,
        "max_sizes": [16, 16],
        "posemb_type": "sincos",
        "posemb_scaling": "absolute"
      },
      "dec_seq_packer": {
        "_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
        "input_list_read_keys": ["dec_patches", "dec_registers_proj"],
        "packed_seq_write_key": "dec_packed_seq",
        "block_mask_write_key": "dec_block_mask",
        "inner_packed_shapes_write_key": "dec_ps_inner",
        "outer_packed_shapes_write_key": "dec_ps_outer",
        "emb_packing_fn_write_key": "emb_packing_fn",
        "mask_mode": "full",
        "pad_to_multiple": 128,
        "per_subseq_embs": true
      },
      "dec_time_embedder": {
        "_target_": "flextok.model.preprocessors.time_embedding.TimestepEmbedder",
        "timesteps_read_key": "timesteps",
        "time_embedding_write_key": "dec_temb",
        "dim": 1152,
        "frequency_embedding_size": 256,
        "max_timestep": 1000.0
      },
      "dec_transformer": {
        "_target_": "flextok.model.trunks.transformers.FlexTransformer",
        "input_seq_read_key": "dec_packed_seq",
        "output_seq_write_key": "dec_packed_seq",
        "dim": 1152,
        "depth": 18,
        "block_mask_read_key": "dec_block_mask",
        "adaLN_emb_read_key": "dec_temb",
        "adaLN_packing_fn_read_key": "emb_packing_fn",
        "adaLN_expansion": 2,
        "intermediate_layer_write_key": "dec_packed_seq_repa_layer",
        "intermediate_layers": [1],
        "use_act_checkpoint": false
      },
      "dec_unpacker": {
        "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
        "packed_seq_read_key": "dec_packed_seq",
        "inner_seq_write_keys": ["dec_patches", "dec_registers_proj"],
        "inner_packed_shapes_read_key": "dec_ps_inner",
        "outer_packed_shapes_read_key": "dec_ps_outer"
      },
      "dec_repa_unpacker": {
        "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
        "packed_seq_read_key": "dec_packed_seq_repa_layer",
        "inner_seq_write_keys": ["dec_patches_repa_layer", "dec_registers_repa_layer"],
        "inner_packed_shapes_read_key": "dec_ps_inner",
        "outer_packed_shapes_read_key": "dec_ps_outer"
      },
      "dec_to_patches": {
        "_target_": "flextok.model.postprocessors.heads.ToPatchesLinearHead",
        "read_key": "dec_patches",
        "write_key": "dec_patches",
        "dim": 1152,
        "channels_out": 16,
        "patch_sizes": [2, 2],
        "use_mup_readout": false,
        "weight_init_style": "zero",
        "adaLN_emb_read_key": "dec_temb"
      },
      "dec_channels_to_first": {
        "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
        "read_key": "dec_patches",
        "write_key": "vae_latents_reconst",
        "per_sample_op": {
          "_target_": "flextok.model.utils.dict_ops.channels_last_to_first",
          "_partial_": true
        }
      }
    }
  },
  "vae": {
    "_target_": "flextok.vae_wrapper.StableDiffusionVAE",
    "images_read_key": "rgb",
    "vae_latents_read_key": "vae_latents_reconst",
    "vae_latents_write_key": "vae_latents",
    "images_reconst_write_key": "rgb_reconst",
    "vae_kl_loss_write_key": "kl_loss",
    "dtype_override": null,
    "sample_posterior": true,
    "compile_encode_fn": false,
    "force_vae_encode": true,
    "latent_channels": 16,
    "scaling_factor": 0.88
  },
  "pipeline": {
    "_target_": "flextok.flow_matching.pipelines.MinRFPipeline",
    "_partial_": true,
    "target_sizes_read_key": null,
    "latents_read_key": "enc_registers_quant",
    "timesteps_read_key": "timesteps",
    "noised_images_read_key": "vae_latents_noised",
    "reconst_write_key": "vae_latents_reconst",
    "out_channels": 16
  },
  "flow_matching_noise_module": {
    "_target_": "flextok.flow_matching.noise_modules.MinRFNoiseModule",
    "clean_images_read_key": "vae_latents",
    "noised_images_write_key": "vae_latents_noised",
    "noise_write_key": "flow_noise",
    "timesteps_write_key": "timesteps",
    "sigmas_write_key": "sigmas",
    "ln": false,
    "stratisfied": false,
    "mode_scale": 0.25
  },
  "_target_": "flextok.flextok_wrapper.FlexTok",
  "encoder": {
    "_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
    "module_dict": {
      "enc_channels_to_last": {
        "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
        "read_key": "vae_latents",
        "write_key": "vae_latents_bhwc",
        "per_sample_op": {
          "_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
          "_partial_": true
        }
      },
      "enc_patch_emb": {
        "_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
        "input_tensor_list_read_key": "vae_latents_bhwc",
        "patches_list_write_key": "enc_vae_latents_patched",
        "n_patches_write_key": "enc_n_patches",
        "channels_in": 16,
        "dim": 1152,
        "patch_sizes": [2, 2],
        "flatten_patches": false
      },
      "enc_posemb_module": {
        "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
        "read_key": "enc_vae_latents_patched",
        "write_key": "enc_vae_latents_patched",
        "dim": 1152,
        "max_sizes": [16, 16],
        "posemb_type": "sincos",
        "posemb_scaling": "absolute"
      },
      "enc_register_module": {
        "_target_": "flextok.model.preprocessors.registers.Registers1D",
        "input_tensor_list_read_key": "enc_vae_latents_patched",
        "register_sizes_read_write_key": "register_sizes",
        "registers_write_key": "enc_registers",
        "dim": 1152,
        "n_min": 256,
        "n_max": 256,
        "size_sampling_mode": "uniform",
        "ordering_mode": "nested"
      },
      "enc_seq_packer": {
        "_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
        "input_list_read_keys": ["enc_vae_latents_patched", "enc_registers"],
        "packed_seq_write_key": "enc_packed_seq",
        "block_mask_write_key": "enc_block_mask",
        "inner_packed_shapes_write_key": "enc_ps_inner",
        "outer_packed_shapes_write_key": "enc_ps_outer",
        "mask_mode": "causal_last",
        "pad_to_multiple": 128
      },
      "enc_transformer": {
        "_target_": "flextok.model.trunks.transformers.FlexTransformer",
        "input_seq_read_key": "enc_packed_seq",
        "output_seq_write_key": "enc_packed_seq",
        "dim": 1152,
        "depth": 18,
        "block_mask_read_key": "enc_block_mask",
        "use_act_checkpoint": false
      },
      "enc_unpacker": {
        "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
        "packed_seq_read_key": "enc_packed_seq",
        "inner_seq_write_keys": ["enc_vae_latents_patched", "enc_registers"],
        "inner_packed_shapes_read_key": "enc_ps_inner",
        "outer_packed_shapes_read_key": "enc_ps_outer"
      },
      "enc_to_latents": {
        "_target_": "flextok.model.postprocessors.heads.LinearHead",
        "read_key": "enc_registers",
        "write_key": "enc_registers",
        "dim": 1152,
        "dim_out": 6,
        "use_mup_readout": false,
        "weight_init_style": "zero",
        "dtype_override": null
      }
    }
  }
}