# keysync-demo / keyframe.yaml
# Last commit: 13f5b7a "add checkpoint download" (Antoni Bigata); file size 4.71 kB
# Keyframe model configuration.
# Every component is described by a `target` (dotted import path of the class)
# plus an optional `params` mapping of constructor arguments.
# NOTE(review): the nesting below was reconstructed from the target/params
# pattern after the indentation was lost — verify against the upstream file.
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    input_key: latents
    scale_factor: 0.18215
    disable_first_stage_autocast: true
    # No checkpoint path is set here; presumably it is supplied at load
    # time — TODO confirm against the loading code.
    ckpt_path: null

    # Denoiser and the scaling rule it applies.
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DenoiserDub
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    # Wrapper placed around the network.
    network_wrapper:
      target: sgm.modules.diffusionmodules.wrappers.DubbingWrapper
      params:
        mask_input: true

    # Backbone network (video UNet) and its hyper-parameters.
    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 0
        num_classes: sequential
        use_checkpoint: true
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: true
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: true
        use_spatial_context: true
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]
        fine_tuning_method: null
        audio_cond_method: both_keyframes
        additional_audio_frames: 0
        audio_dim: 1024
        unfreeze_blocks: []  # Because we changed the input block

    # Conditioning: one embedder entry per input key.
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          # Conditioning frames, embedded through a (non-trainable) autoencoder.
          - input_key: cond_frames
            is_trainable: false
            ucg_rate: 0.1
            target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
            params:
              disable_encoder_autocast: true
              n_cond_frames: 1
              n_copies: 1
              is_ae: true
              load_encoder: false
              encoder_config:
                target: sgm.models.autoencoder.AutoencoderKLModeOnly
                params:
                  embed_dim: 4
                  monitor: val/rec_loss
                  ddconfig:
                    attn_type: vanilla-xformers
                    double_z: true
                    z_channels: 4
                    resolution: 256
                    in_channels: 3
                    out_ch: 3
                    ch: 128
                    ch_mult: [1, 2, 4, 4]
                    num_res_blocks: 2
                    attn_resolutions: []
                    dropout: 0.0
                  lossconfig:
                    target: torch.nn.Identity

          - input_key: gt  # allows to use the ground truth as a condition
            is_trainable: false
            target: sgm.modules.encoders.modules.IdentityEncoder
            params:
              cond_type: gt

          # Audio embeddings; the only trainable embedder in this list.
          - input_key: audio_emb
            is_trainable: true
            ucg_rate: 0.2
            target: sgm.modules.encoders.modules.WhisperAudioEmbedder
            params:
              merge_method: mean
              linear_dim: 1024
              cond_type: crossattn
              audio_dim: 768

          - input_key: masks
            is_trainable: false
            target: sgm.modules.encoders.modules.IdentityEncoder
            params:
              cond_type: masks

    # First-stage autoencoder: image encoder paired with a video decoder.
    first_stage_config:
      target: sgm.models.autoencoder.AutoencodingEngine
      params:
        loss_config:
          target: torch.nn.Identity
        regularizer_config:
          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
        encoder_config:
          target: sgm.modules.diffusionmodules.model.Encoder
          params:
            attn_type: vanilla
            double_z: true
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
        decoder_config:
          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
          params:
            attn_type: vanilla
            double_z: true
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
            video_kernel_size: [3, 1, 1]

    # Sampler, with its discretization schedule and guidance weights.
    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 10
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.AYSDiscretization
        guider_config:
          target: sgm.modules.diffusionmodules.guiders.AudioRefMultiCondGuider
          params:
            audio_ratio: 5.0
            ref_ratio: 2.0