Safetensors
flextok_d18_d18_in1k / config.json
roman-bachmann's picture
Initial commit
1af7e96
{
"regularizer": {
"_target_": "flextok.regularizers.quantize_fsq.FSQ",
"latents_read_key": "enc_registers",
"quants_write_key": "enc_registers_quant",
"tokens_write_key": "tokens",
"levels": [
8,
8,
8,
5,
5,
5
],
"drop_quant_p": 0.0,
"packed_call": false
},
"decoder": {
"_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
"module_dict": {
"dec_from_latents": {
"_target_": "flextok.model.preprocessors.linear.LinearLayer",
"read_key": "enc_registers_quant",
"write_key": "dec_registers_proj",
"dim_in": 6,
"dim": 1152
},
"dec_registers_posemb_module": {
"_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
"read_key": "dec_registers_proj",
"write_key": "dec_registers_proj",
"dim": 1152,
"max_sizes": [
256
],
"posemb_type": "learnable_sum",
"posemb_scaling": "absolute"
},
"dec_nested_dropout": {
"_target_": "flextok.model.preprocessors.token_dropout.MaskedNestedDropout",
"read_write_key": "dec_registers_proj",
"dim": 1152,
"size_sampling_mode": "pow2"
},
"dec_latent_dropout": {
"_target_": "flextok.model.preprocessors.nullcond.LearnedNullCond",
"read_write_key": "dec_registers_proj",
"dim": 1152,
"dropout_prob": 0.2
},
"dec_noise_channels_to_last": {
"_target_": "flextok.model.utils.dict_ops.PerSampleOp",
"read_key": "vae_latents_noised",
"write_key": "vae_latents_noised_bhwc",
"per_sample_op": {
"_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
"_partial_": true
}
},
"dec_noise_patch_emb": {
"_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
"input_tensor_list_read_key": "vae_latents_noised_bhwc",
"patches_list_write_key": "vae_latents_noised_patched",
"n_patches_write_key": "dec_n_patches",
"channels_in": 16,
"dim": 1152,
"patch_sizes": [
2,
2
],
"flatten_patches": false
},
"dec_patches_posemb_module": {
"_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
"read_key": "vae_latents_noised_patched",
"write_key": "dec_patches",
"dim": 1152,
"max_sizes": [
16,
16
],
"posemb_type": "sincos",
"posemb_scaling": "absolute"
},
"dec_seq_packer": {
"_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
"input_list_read_keys": [
"dec_patches",
"dec_registers_proj"
],
"packed_seq_write_key": "dec_packed_seq",
"block_mask_write_key": "dec_block_mask",
"inner_packed_shapes_write_key": "dec_ps_inner",
"outer_packed_shapes_write_key": "dec_ps_outer",
"emb_packing_fn_write_key": "emb_packing_fn",
"mask_mode": "full",
"pad_to_multiple": 128,
"per_subseq_embs": true
},
"dec_time_embedder": {
"_target_": "flextok.model.preprocessors.time_embedding.TimestepEmbedder",
"timesteps_read_key": "timesteps",
"time_embedding_write_key": "dec_temb",
"dim": 1152,
"frequency_embedding_size": 256,
"max_timestep": 1000.0
},
"dec_transformer": {
"_target_": "flextok.model.trunks.transformers.FlexTransformer",
"input_seq_read_key": "dec_packed_seq",
"output_seq_write_key": "dec_packed_seq",
"dim": 1152,
"depth": 18,
"block_mask_read_key": "dec_block_mask",
"adaLN_emb_read_key": "dec_temb",
"adaLN_packing_fn_read_key": "emb_packing_fn",
"adaLN_expansion": 2,
"intermediate_layer_write_key": "dec_packed_seq_repa_layer",
"intermediate_layers": [
1
],
"use_act_checkpoint": false
},
"dec_unpacker": {
"_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
"packed_seq_read_key": "dec_packed_seq",
"inner_seq_write_keys": [
"dec_patches",
"dec_registers_proj"
],
"inner_packed_shapes_read_key": "dec_ps_inner",
"outer_packed_shapes_read_key": "dec_ps_outer"
},
"dec_repa_unpacker": {
"_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
"packed_seq_read_key": "dec_packed_seq_repa_layer",
"inner_seq_write_keys": [
"dec_patches_repa_layer",
"dec_registers_repa_layer"
],
"inner_packed_shapes_read_key": "dec_ps_inner",
"outer_packed_shapes_read_key": "dec_ps_outer"
},
"dec_to_patches": {
"_target_": "flextok.model.postprocessors.heads.ToPatchesLinearHead",
"read_key": "dec_patches",
"write_key": "dec_patches",
"dim": 1152,
"channels_out": 16,
"patch_sizes": [
2,
2
],
"use_mup_readout": false,
"weight_init_style": "zero",
"adaLN_emb_read_key": "dec_temb"
},
"dec_channels_to_first": {
"_target_": "flextok.model.utils.dict_ops.PerSampleOp",
"read_key": "dec_patches",
"write_key": "vae_latents_reconst",
"per_sample_op": {
"_target_": "flextok.model.utils.dict_ops.channels_last_to_first",
"_partial_": true
}
}
}
},
"vae": {
"_target_": "flextok.vae_wrapper.StableDiffusionVAE",
"images_read_key": "rgb",
"vae_latents_read_key": "vae_latents_reconst",
"vae_latents_write_key": "vae_latents",
"images_reconst_write_key": "rgb_reconst",
"vae_kl_loss_write_key": "kl_loss",
"dtype_override": null,
"sample_posterior": true,
"compile_encode_fn": false,
"force_vae_encode": true,
"latent_channels": 16,
"scaling_factor": 0.88
},
"pipeline": {
"_target_": "flextok.flow_matching.pipelines.MinRFPipeline",
"_partial_": true,
"target_sizes_read_key": null,
"latents_read_key": "enc_registers_quant",
"timesteps_read_key": "timesteps",
"noised_images_read_key": "vae_latents_noised",
"reconst_write_key": "vae_latents_reconst",
"out_channels": 16
},
"flow_matching_noise_module": {
"_target_": "flextok.flow_matching.noise_modules.MinRFNoiseModule",
"clean_images_read_key": "vae_latents",
"noised_images_write_key": "vae_latents_noised",
"noise_write_key": "flow_noise",
"timesteps_write_key": "timesteps",
"sigmas_write_key": "sigmas",
"ln": false,
"stratisfied": false,
"mode_scale": 0.25
},
"_target_": "flextok.flextok_wrapper.FlexTok",
"encoder": {
"_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
"module_dict": {
"enc_channels_to_last": {
"_target_": "flextok.model.utils.dict_ops.PerSampleOp",
"read_key": "vae_latents",
"write_key": "vae_latents_bhwc",
"per_sample_op": {
"_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
"_partial_": true
}
},
"enc_patch_emb": {
"_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
"input_tensor_list_read_key": "vae_latents_bhwc",
"patches_list_write_key": "enc_vae_latents_patched",
"n_patches_write_key": "enc_n_patches",
"channels_in": 16,
"dim": 1152,
"patch_sizes": [
2,
2
],
"flatten_patches": false
},
"enc_posemb_module": {
"_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
"read_key": "enc_vae_latents_patched",
"write_key": "enc_vae_latents_patched",
"dim": 1152,
"max_sizes": [
16,
16
],
"posemb_type": "sincos",
"posemb_scaling": "absolute"
},
"enc_register_module": {
"_target_": "flextok.model.preprocessors.registers.Registers1D",
"input_tensor_list_read_key": "enc_vae_latents_patched",
"register_sizes_read_write_key": "register_sizes",
"registers_write_key": "enc_registers",
"dim": 1152,
"n_min": 256,
"n_max": 256,
"size_sampling_mode": "uniform",
"ordering_mode": "nested"
},
"enc_seq_packer": {
"_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
"input_list_read_keys": [
"enc_vae_latents_patched",
"enc_registers"
],
"packed_seq_write_key": "enc_packed_seq",
"block_mask_write_key": "enc_block_mask",
"inner_packed_shapes_write_key": "enc_ps_inner",
"outer_packed_shapes_write_key": "enc_ps_outer",
"mask_mode": "causal_last",
"pad_to_multiple": 128
},
"enc_transformer": {
"_target_": "flextok.model.trunks.transformers.FlexTransformer",
"input_seq_read_key": "enc_packed_seq",
"output_seq_write_key": "enc_packed_seq",
"dim": 1152,
"depth": 18,
"block_mask_read_key": "enc_block_mask",
"use_act_checkpoint": false
},
"enc_unpacker": {
"_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
"packed_seq_read_key": "enc_packed_seq",
"inner_seq_write_keys": [
"enc_vae_latents_patched",
"enc_registers"
],
"inner_packed_shapes_read_key": "enc_ps_inner",
"outer_packed_shapes_read_key": "enc_ps_outer"
},
"enc_to_latents": {
"_target_": "flextok.model.postprocessors.heads.LinearHead",
"read_key": "enc_registers",
"write_key": "enc_registers",
"dim": 1152,
"dim_out": 6,
"use_mup_readout": false,
"weight_init_style": "zero",
"dtype_override": null
}
}
}
}