Spaces:

toninio19
/

keysync-demo

Running on Zero

App Files Files Community

Antoni Bigata committed 6 days ago

Commit

13f5b7a

1 Parent(s): 892dd0f

add checkpoint download

Browse files

Files changed (3) hide show

app.py +15 -5
interpolation.yaml +152 -0
keyframe.yaml +154 -0

app.py CHANGED Viewed

@@ -25,6 +25,16 @@ from inference_functions import (
 from wordle_game import WordleGame
 import torch.cuda.amp as amp  # Import amp for mixed precision
 # Set default tensor type to float16 for faster computation
 if torch.cuda.is_available():
@@ -136,7 +146,7 @@ def load_all_models():
         model_size="Base+",
         feed_as_frames=False,
         merge_type="None",
-        model_path="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/pretrained_models/checkpoints/WavLM-Base+.pt",
     ).cuda()
     wavlm_model = wavlm_model.half()  # Convert to half precision
@@ -148,12 +158,12 @@ def load_all_models():
     landmarks_extractor = LandmarksExtractor()
     keyframe_model = load_model(
-        config="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/scripts/sampling/configs/keyframe.yaml",
-        ckpt="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/pretrained_models/checkpoints/keyframe_dub.pt",
     )
     interpolation_model = load_model(
-        config="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/scripts/sampling/configs/interpolation.yaml",
-        ckpt="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/pretrained_models/checkpoints/interpolation_dub.pt",
     )
     keyframe_model.en_and_decode_n_samples_a_time = 2
     interpolation_model.en_and_decode_n_samples_a_time = 2

 from wordle_game import WordleGame
 import torch.cuda.amp as amp  # Import amp for mixed precision
+from huggingface_hub import snapshot_download
+# Define the repository ID
+repo_id = "toninio19/keysync"
+# Download the entire repository
+repo_path = snapshot_download(repo_id=repo_id)
+print(f"Repository downloaded to: {repo_path}")
 # Set default tensor type to float16 for faster computation
 if torch.cuda.is_available():
         model_size="Base+",
         feed_as_frames=False,
         merge_type="None",
+        model_path=os.path.join(repo_path, "checkpoints/WavLM-Base+.pt"),
     ).cuda()
     wavlm_model = wavlm_model.half()  # Convert to half precision
     landmarks_extractor = LandmarksExtractor()
     keyframe_model = load_model(
+        config="keyframe.yaml",
+        ckpt=os.path.join(repo_path, "checkpoints/keyframe_dub.pt"),
     )
     interpolation_model = load_model(
+        config="interpolation.yaml",
+        ckpt=os.path.join(repo_path, "checkpoints/interpolation_dub.pt"),
     )
     keyframe_model.en_and_decode_n_samples_a_time = 2
     interpolation_model.en_and_decode_n_samples_a_time = 2

interpolation.yaml ADDED Viewed

	@@ -0,0 +1,152 @@

+model:
+  target: sgm.models.diffusion.DiffusionEngine
+  params:
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    ckpt_path:
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.DenoiserDub
+      params:
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+    network_wrapper:
+      target: sgm.modules.diffusionmodules.wrappers.InterpolationWrapper
+      params:
+        im_size: [512, 512] # USER: adapt this to your dataset
+        n_channels: 4
+        starting_mask_method: zeros
+        add_mask: True
+    network_config:
+      target: sgm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        adm_in_channels: 0
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 9
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        spatial_transformer_attn_type: softmax-xformers
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [3, 1, 1]
+        fine_tuning_method: null
+        audio_cond_method: both_keyframes
+        additional_audio_frames: 0
+        audio_dim: 1024
+        unfreeze_blocks: ["input"]
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:
+        - input_key: cond_frames
+          is_trainable: False
+          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+          params:
+            disable_encoder_autocast: True
+            n_cond_frames: 2
+            n_copies: 1
+            is_ae: True
+            encoder_config:
+              target: sgm.models.autoencoder.AutoencoderKLModeOnly
+              params:
+                embed_dim: 4
+                monitor: val/rec_loss
+                ddconfig:
+                  attn_type: vanilla-xformers
+                  double_z: True
+                  z_channels: 4
+                  resolution: 256
+                  in_channels: 3
+                  out_ch: 3
+                  ch: 128
+                  ch_mult: [1, 2, 4, 4]
+                  num_res_blocks: 2
+                  attn_resolutions: []
+                  dropout: 0.0
+                lossconfig:
+                  target: torch.nn.Identity
+        - input_key: gt # allows using the ground truth as a condition
+          is_trainable: False
+          target: sgm.modules.encoders.modules.IdentityEncoder
+          params:
+            cond_type: gt
+        - input_key: audio_emb
+          is_trainable: True
+          target: sgm.modules.encoders.modules.WhisperAudioEmbedder
+          params:
+            merge_method: mean
+            linear_dim: 1024
+            cond_type: crossattn
+            audio_dim: 768
+        - input_key: masks
+          is_trainable: False
+          target: sgm.modules.encoders.modules.IdentityEncoder
+          params:
+            cond_type: masks
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencodingEngine
+      params:
+        loss_config:
+          target: torch.nn.Identity
+        regularizer_config:
+          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+        encoder_config:
+          target: sgm.modules.diffusionmodules.model.Encoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+        decoder_config:
+          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+            video_kernel_size: [3, 1, 1]
+    sampler_config:
+      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:
+        num_steps: 10
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.AYSDiscretization
+        guider_config:
+          target: sgm.modules.diffusionmodules.guiders.AudioRefMultiCondGuider
+          params:
+            audio_ratio: 5.0
+            ref_ratio: 2.0

keyframe.yaml ADDED Viewed

	@@ -0,0 +1,154 @@

+model:
+  target: sgm.models.diffusion.DiffusionEngine
+  params:
+    input_key: latents
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    ckpt_path:
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.DenoiserDub
+      params:
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+    network_wrapper:
+      target: sgm.modules.diffusionmodules.wrappers.DubbingWrapper
+      params:
+        mask_input: True
+    network_config:
+      target: sgm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        adm_in_channels: 0
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 8
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        spatial_transformer_attn_type: softmax-xformers
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [3, 1, 1]
+        fine_tuning_method: null
+        audio_cond_method: both_keyframes
+        additional_audio_frames: 0
+        audio_dim: 1024
+        unfreeze_blocks: [] # Because we changed the input block
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:
+        - input_key: cond_frames
+          is_trainable: False
+          ucg_rate: 0.1
+          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+          params:
+            disable_encoder_autocast: True
+            n_cond_frames: 1
+            n_copies: 1
+            is_ae: True
+            load_encoder: False
+            encoder_config:
+              target: sgm.models.autoencoder.AutoencoderKLModeOnly
+              params:
+                embed_dim: 4
+                monitor: val/rec_loss
+                ddconfig:
+                  attn_type: vanilla-xformers
+                  double_z: True
+                  z_channels: 4
+                  resolution: 256
+                  in_channels: 3
+                  out_ch: 3
+                  ch: 128
+                  ch_mult: [1, 2, 4, 4]
+                  num_res_blocks: 2
+                  attn_resolutions: []
+                  dropout: 0.0
+                lossconfig:
+                  target: torch.nn.Identity
+        - input_key: gt # allows using the ground truth as a condition
+          is_trainable: False
+          target: sgm.modules.encoders.modules.IdentityEncoder
+          params:
+            cond_type: gt
+        - input_key: audio_emb
+          is_trainable: True
+          ucg_rate: 0.2
+          target: sgm.modules.encoders.modules.WhisperAudioEmbedder
+          params:
+            merge_method: mean
+            linear_dim: 1024
+            cond_type: crossattn
+            audio_dim: 768
+        - input_key: masks
+          is_trainable: False
+          target: sgm.modules.encoders.modules.IdentityEncoder
+          params:
+            cond_type: masks
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencodingEngine
+      params:
+        loss_config:
+          target: torch.nn.Identity
+        regularizer_config:
+          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+        encoder_config:
+          target: sgm.modules.diffusionmodules.model.Encoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+        decoder_config:
+          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+            video_kernel_size: [3, 1, 1]
+    sampler_config:
+      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:
+        num_steps: 10
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.AYSDiscretization
+        guider_config:
+          target: sgm.modules.diffusionmodules.guiders.AudioRefMultiCondGuider
+          params:
+            audio_ratio: 5.0
+            ref_ratio: 2.0