# Model definition in target/params form: `target` is a dotted class path and
# `params` holds the values listed for that class.
# NOTE(review): presumably consumed by the project's config instantiation
# helper — confirm against the training/inference entry point.
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    # NOTE(review): 0.18215 looks like the usual Stable Diffusion latent
    # scaling constant — confirm it matches the autoencoder in use.
    scale_factor: 0.18215
    disable_first_stage_autocast: True
    # No checkpoint is configured. An explicit null replaces the previous bare
    # empty value (both load as null) and matches `fine_tuning_method: null`.
    # USER: presumably set this to a checkpoint path to load weights — confirm.
    ckpt_path: null

    # Denoiser: DenoiserDub configured with the VScalingWithEDMcNoise scaling
    # class. No extra params are given to the scaling class, so it is built
    # with whatever defaults that class defines.
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DenoiserDub
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    # InterpolationWrapper settings.
    # NOTE(review): presumably this class wraps the model declared under
    # `network_config` — confirm in sgm.modules.diffusionmodules.wrappers.
    network_wrapper:
      target: sgm.modules.diffusionmodules.wrappers.InterpolationWrapper
      params:
        im_size: [512, 512]  # USER: adapt this to your dataset
        # NOTE(review): 4 equals `z_channels` in the autoencoder configs of this
        # file — presumably the latent channel count; confirm.
        n_channels: 4
        starting_mask_method: zeros
        add_mask: True

    # Backbone network: VideoUNet and its constructor params.
    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 0
        num_classes: sequential
        use_checkpoint: True
        # NOTE(review): 9 input vs 4 output channels — presumably noisy latent
        # (4) + conditioning latent (4) + mask (1, cf. `add_mask` in
        # `network_wrapper`); confirm against InterpolationWrapper.
        in_channels: 9
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 1
        # NOTE(review): 1024 equals `linear_dim` of the `audio_emb` embedder
        # (cond_type crossattn) — presumably the cross-attention context width;
        # keep the two in sync if either changes. Confirm.
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: True
        use_spatial_context: True
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]
        fine_tuning_method: null
        audio_cond_method: both_keyframes
        additional_audio_frames: 0
        # NOTE(review): also 1024, same as `linear_dim` of the audio embedder —
        # presumably the projected audio feature size; confirm.
        audio_dim: 1024
        # NOTE(review): presumably limits which UNet blocks stay trainable —
        # TODO confirm the accepted names in VideoUNet.
        unfreeze_blocks: ["input"]

    # Conditioning: a GeneralConditioner with four embedders, one per input key
    # (cond_frames, gt, audio_emb, masks). Only `audio_emb` is marked trainable.
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        # Conditioning frames, passed through a non-trainable
        # AutoencoderKLModeOnly encoder (n_cond_frames: 2).
        - input_key: cond_frames
          is_trainable: False
          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
          params:
            disable_encoder_autocast: True
            n_cond_frames: 2
            n_copies: 1
            is_ae: True
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
                embed_dim: 4
                monitor: val/rec_loss
                ddconfig:
                  # NOTE(review): this encoder uses `vanilla-xformers` while
                  # `first_stage_config` uses `vanilla`; the other ddconfig
                  # values are the same. Presumably intentional — confirm.
                  attn_type: vanilla-xformers
                  double_z: True
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult: [1, 2, 4, 4]
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                lossconfig:
                  target: torch.nn.Identity

        - input_key: gt  # allows to use the ground truth as a condition
          is_trainable: False
          target: sgm.modules.encoders.modules.IdentityEncoder
          params:
            cond_type: gt

        # Audio embeddings: the only trainable embedder in this list.
        # NOTE(review): audio_dim 768 -> linear_dim 1024 presumably describes a
        # learned projection of Whisper features to the UNet's `context_dim` /
        # `audio_dim` (both 1024); confirm in WhisperAudioEmbedder.
        - input_key: audio_emb
          is_trainable: True
          target: sgm.modules.encoders.modules.WhisperAudioEmbedder
          params:
            merge_method: mean
            linear_dim: 1024
            cond_type: crossattn
            audio_dim: 768

        # Masks are forwarded through an IdentityEncoder with cond_type `masks`.
        - input_key: masks
          is_trainable: False
          target: sgm.modules.encoders.modules.IdentityEncoder
          params:
            cond_type: masks

    # First-stage autoencoder: an AutoencodingEngine built from an image
    # Encoder, a VideoDecoder and a DiagonalGaussianRegularizer. The loss is
    # torch.nn.Identity.
    # NOTE(review): the Identity loss suggests this stage is not trained from
    # this config — confirm.
    first_stage_config:
      target: sgm.models.autoencoder.AutoencodingEngine
      params:
        loss_config:
          target: torch.nn.Identity
        regularizer_config:
          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
        # Encoder and decoder below share the same ddconfig-style values; the
        # decoder additionally sets `video_kernel_size`.
        encoder_config:
          target: sgm.modules.diffusionmodules.model.Encoder
          params:
            attn_type: vanilla
            double_z: True
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
        decoder_config:
          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
          params:
            attn_type: vanilla
            double_z: True
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
            # Same value as `video_kernel_size` in `network_config`.
            video_kernel_size: [3, 1, 1]

    # Sampling: EulerEDMSampler run for 10 steps with the AYSDiscretization
    # schedule and an AudioRefMultiCondGuider.
    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 10
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.AYSDiscretization

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.AudioRefMultiCondGuider
          params:
            # NOTE(review): presumably guidance scales for the audio and
            # reference-frame conditions respectively — confirm in the guider.
            audio_ratio: 5.0
            ref_ratio: 2.0