import argparse
from datetime import datetime
from pathlib import Path

import gradio as gr
from omegaconf import OmegaConf

from scripts.inference import main

CONFIG_PATH = Path("configs/unet/second_stage.yaml")
CHECKPOINT_PATH = Path("checkpoints/latentsync_unet.pt")
def process_video(
    video_path,
    audio_path,
    guidance_scale,
    inference_steps,
    seed,
):
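    """Run LatentSync inference on the uploaded video/audio pair and return the path of the output video."""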
    # Create the temp directory if it doesn't exist
    output_dir = Path("./temp")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Convert paths to absolute POSIX strings so the inference script resolves them reliably
    video_file_path = Path(video_path)
    video_path = video_file_path.absolute().as_posix()
    audio_path = Path(audio_path).absolute().as_posix()

    # Timestamp the output filename to avoid collisions between runs
    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = str(output_dir / f"{video_file_path.stem}_{current_time}.mp4")

    # Override the run-time options from the UI in the loaded config
    config = OmegaConf.load(CONFIG_PATH)
    config["run"].update(
        {
            "guidance_scale": guidance_scale,
            "inference_steps": inference_steps,
        }
    )

    # Build the arguments expected by scripts.inference.main
    args = create_args(video_path, audio_path, output_path, inference_steps, guidance_scale, seed)

    try:
        main(
            config=config,
            args=args,
        )
        print("Processing completed successfully.")
        return output_path  # Gradio displays the video found at this path
    except Exception as e:
        print(f"Error during processing: {str(e)}")
        raise gr.Error(f"Error during processing: {str(e)}")
def create_args(
    video_path: str, audio_path: str, output_path: str, inference_steps: int, guidance_scale: float, seed: int
) -> argparse.Namespace:
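    """Build the argparse.Namespace consumed by scripts.inference.main from the UI inputs."""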
    parser = argparse.ArgumentParser()
    parser.add_argument("--inference_ckpt_path", type=str, required=True)
    parser.add_argument("--video_path", type=str, required=True)
    parser.add_argument("--audio_path", type=str, required=True)
    parser.add_argument("--video_out_path", type=str, required=True)
    parser.add_argument("--inference_steps", type=int, default=20)
    parser.add_argument("--guidance_scale", type=float, default=1.0)
    parser.add_argument("--seed", type=int, default=1247)

    return parser.parse_args(
        [
            "--inference_ckpt_path",
            CHECKPOINT_PATH.absolute().as_posix(),
            "--video_path",
            video_path,
            "--audio_path",
            audio_path,
            "--video_out_path",
            output_path,
            "--inference_steps",
            str(inference_steps),
            "--guidance_scale",
            str(guidance_scale),
            "--seed",
            str(seed),
        ]
    )
# Create the Gradio interface
with gr.Blocks(title="LatentSync Video Processing") as demo:
    gr.Markdown(
        """
        # LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync

        Upload a video and an audio file to process them with the LatentSync model.

        <div align="center">
            <strong>Chunyu Li<sup>1,2</sup> Chao Zhang<sup>1</sup> Weikai Xu<sup>1</sup> Jinghui Xie<sup>1,†</sup> Weiguo Feng<sup>1</sup>
            Bingyue Peng<sup>1</sup> Weiwei Xing<sup>2,†</sup></strong>
        </div>

        <div align="center">
            <strong><sup>1</sup>ByteDance <sup>2</sup>Beijing Jiaotong University</strong>
        </div>

        <div style="display:flex;justify-content:center;column-gap:4px;">
            <a href="https://github.com/bytedance/LatentSync">
                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
            </a>
            <a href="https://arxiv.org/pdf/2412.09262">
                <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
            </a>
        </div>
        """
    )
    with gr.Row():
        with gr.Column():
            video_input = gr.Video(label="Input Video")
            audio_input = gr.Audio(label="Input Audio", type="filepath")

            with gr.Row():
                guidance_scale = gr.Slider(
                    minimum=1.0,
                    maximum=3.5,
                    value=1.5,
                    step=0.5,
                    label="Guidance Scale",
                )
                inference_steps = gr.Slider(minimum=10, maximum=50, value=20, step=1, label="Inference Steps")

            with gr.Row():
                seed = gr.Number(value=1247, label="Random Seed", precision=0)

            process_btn = gr.Button("Process Video")

        with gr.Column():
            video_output = gr.Video(label="Output Video")

            gr.Examples(
                examples=[
                    ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
                    ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
                    ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
                ],
                inputs=[video_input, audio_input],
            )

    process_btn.click(
        fn=process_video,
        inputs=[
            video_input,
            audio_input,
            guidance_scale,
            inference_steps,
            seed,
        ],
        outputs=video_output,
    )
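# Launch the demo when the script is run directly; share=True also exposes a public Gradio link.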
if __name__ == "__main__":
    demo.launch(inbrowser=True, share=True)