# Model configuration for sgm.models.diffusion.DiffusionEngine.
#
# Every component is described by a `target` (dotted path to a class) and an
# optional `params` mapping with that component's settings.
# NOTE(review): `params` are presumably forwarded as constructor keyword
# arguments by the project's config loader - confirm against the loader.
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    disable_first_stage_autocast: true
    # No checkpoint is configured here. Written as an explicit null instead of
    # a bare empty value; both parse to null.
    ckpt_path: null

    # Denoiser and the scaling rule it uses.
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DenoiserDub
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    # Wrapper placed around the network defined in `network_config`.
    network_wrapper:
      target: sgm.modules.diffusionmodules.wrappers.InterpolationWrapper
      params:
        im_size: [512, 512]  # USER: adapt this to your dataset
        n_channels: 4
        starting_mask_method: zeros
        add_mask: true

    # Backbone network.
    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 0
        num_classes: sequential
        use_checkpoint: true
        # NOTE(review): 9 input vs. 4 output channels - presumably the extra
        # inputs are the concatenated conditioning and mask channels; confirm.
        in_channels: 9
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: true
        transformer_depth: 1
        # Same value as `linear_dim` of the audio_emb embedder below.
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: true
        use_spatial_context: true
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]
        fine_tuning_method: null
        audio_cond_method: both_keyframes
        additional_audio_frames: 0
        audio_dim: 1024
        unfreeze_blocks: ["input"]

    # Conditioning inputs, one embedder per `input_key`:
    # cond_frames, gt, audio_emb and masks. Only audio_emb is marked trainable.
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - input_key: cond_frames
            is_trainable: false
            target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
            params:
              disable_encoder_autocast: true
              n_cond_frames: 2
              n_copies: 1
              is_ae: true
              encoder_config:
                target: sgm.models.autoencoder.AutoencoderKLModeOnly
                params:
                  embed_dim: 4
                  monitor: val/rec_loss
                  ddconfig:
                    attn_type: vanilla-xformers
                    double_z: true
                    z_channels: 4
                    resolution: 256
                    in_channels: 3
                    out_ch: 3
                    ch: 128
                    ch_mult: [1, 2, 4, 4]
                    num_res_blocks: 2
                    attn_resolutions: []
                    dropout: 0.0
                  lossconfig:
                    target: torch.nn.Identity

          - input_key: gt  # allows using the ground truth as a condition
            is_trainable: false
            target: sgm.modules.encoders.modules.IdentityEncoder
            params:
              cond_type: gt

          - input_key: audio_emb
            is_trainable: true
            target: sgm.modules.encoders.modules.WhisperAudioEmbedder
            params:
              merge_method: mean
              linear_dim: 1024
              cond_type: crossattn
              audio_dim: 768

          - input_key: masks
            is_trainable: false
            target: sgm.modules.encoders.modules.IdentityEncoder
            params:
              cond_type: masks

    # First-stage autoencoder. The encoder and decoder repeat the architecture
    # values used by the cond_frames embedder's `ddconfig` above; only
    # `attn_type` differs, and the decoder additionally sets
    # `video_kernel_size`.
    # NOTE(review): presumably these copies must be kept in sync - confirm.
    first_stage_config:
      target: sgm.models.autoencoder.AutoencodingEngine
      params:
        loss_config:
          target: torch.nn.Identity
        regularizer_config:
          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
        encoder_config:
          target: sgm.modules.diffusionmodules.model.Encoder
          params:
            attn_type: vanilla
            double_z: true
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
        decoder_config:
          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
          params:
            attn_type: vanilla
            double_z: true
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
            video_kernel_size: [3, 1, 1]

    # Sampler with its discretization and guidance settings.
    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 10
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.AYSDiscretization

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.AudioRefMultiCondGuider
          params:
            audio_ratio: 5.0
            ref_ratio: 2.0