ayushman-dashtoon committed
Commit 3c94737 · verified · 1 Parent(s): ad081bc

Upload folder using huggingface_hub

Files changed (4):
  1. .gitattributes +1 -0
  2. README.md +320 -0
  3. i2v.sft +3 -0
  4. inference.py +298 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ i2v.sft filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,320 @@
+ ---
+ base_model:
+ - tencent/HunyuanVideo
+ library_name: diffusers
+ ---
+
+ HunyuanVideo Keyframe Control Lora is an adapter for the HunyuanVideo T2V model that enables keyframe-based video generation. Our architecture builds upon existing models, introducing key enhancements to optimize keyframe-based video generation:
+ * We modify the input patch embedding projection layer to effectively incorporate keyframe information. By adjusting the convolutional input parameters, we enable the model to process image inputs within the Diffusion Transformer (DiT) framework (a minimal sketch of this change follows this list).
+ * We apply Low-Rank Adaptation (LoRA) across all linear layers and the convolutional input layer. This approach facilitates efficient fine-tuning by introducing low-rank matrices that approximate the weight updates, thereby preserving the base model's foundational capabilities while reducing the number of trainable parameters.
+ * The model is conditioned on user-defined keyframes, allowing precise control over the generated video's start and end frames. This conditioning ensures that the generated content aligns seamlessly with the specified keyframes, enhancing the coherence and narrative flow of the video.
+
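+ The sketch below only illustrates the first point: the patch-embedding projection's input channels are doubled so the noisy latents and the keyframe latents can be concatenated along the channel axis, and the new weight slice is zero-initialized so the base model's behaviour is preserved. The layer sizes here are illustrative stand-ins; the full Diffusers example further down patches the actual `HunyuanVideoPatchEmbed` using values from `pipe.transformer.config`, and the LoRA is applied through the standard Diffusers/PEFT loading shown there.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ # Illustrative sizes; the real values come from pipe.transformer.config in the full example below.
+ in_channels, embed_dim, patch = 16, 3072, (1, 2, 2)
+
+ # Stand-in for the DiT's original patch-embedding projection.
+ base_proj = nn.Conv3d(in_channels, embed_dim, kernel_size=patch, stride=patch)
+
+ # Expanded projection: doubled input channels for [latents, keyframe_latents].
+ expanded = nn.Conv3d(in_channels * 2, embed_dim, kernel_size=patch, stride=patch)
+ with torch.no_grad():
+     expanded.weight.zero_()                                   # extra channels start at zero,
+     expanded.weight[:, :in_channels].copy_(base_proj.weight)  # original channels keep the base weights,
+     expanded.bias.copy_(base_proj.bias)                       # so base behaviour is preserved at initialization.
+
+ x = torch.randn(1, in_channels * 2, 9, 64, 64)  # batch, channels, latent frames, height, width
+ print(expanded(x).shape)                        # -> torch.Size([1, 3072, 9, 32, 32])
+ ```
+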
+ ## Recommended Settings
+ 1. The model works best on human subjects. Single-subject images work slightly better.
+ 2. It is recommended to use the following image generation resolutions: `720x1280`, `544x960`, `1280x720`, `960x544`.
+ 3. It is recommended to set the number of frames between 33 and 97. Up to 121 frames is also possible, but has not been tested extensively (see the note after this list).
+ 4. Prompting helps a lot, but the model works even without a prompt. The prompt can be as simple as the name of the object you want to generate, or it can be detailed.
+ 5. A `num_inference_steps` of 50 is recommended; for faster results you can use 30. Anything below 30 is not recommended.
+
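+ The recommended frame counts (33, ..., 97, 121) all have the form 4k + 1. This matches HunyuanVideo's 4x temporal VAE compression as exposed by the Diffusers pipeline (`vae_scale_factor_temporal`), so such counts map cleanly onto whole latent frames. The helper below is a small sketch, not part of this repository, that snaps an arbitrary request to the nearest such value under that assumption.
+
+ ```python
+ def snap_num_frames(requested: int, temporal_compression: int = 4) -> int:
+     """Round to the nearest frame count of the form temporal_compression * k + 1."""
+     k = max(round((requested - 1) / temporal_compression), 1)
+     return k * temporal_compression + 1
+
+ print([snap_num_frames(n) for n in (50, 76, 97)])  # -> [49, 77, 97]
+ ```
+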
+ ## Diffusers
+ HunyuanVideo Keyframe Control Lora can be used directly with Diffusers. Install the latest version of Diffusers.
+
+ ```python
+ from typing import Any, Callable, Dict, List, Optional, Union
+
+ import cv2
+ import numpy as np
+ import torch
+ import torchvision.transforms.v2 as transforms
+ from diffusers import HunyuanVideoPipeline
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+ from diffusers.models.transformers.transformer_hunyuan_video import HunyuanVideoPatchEmbed, HunyuanVideoTransformer3DModel
+ from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video import DEFAULT_PROMPT_TEMPLATE, retrieve_timesteps
+ from diffusers.pipelines.hunyuan_video.pipeline_output import HunyuanVideoPipelineOutput
+ from diffusers.utils import export_to_video, load_image
+ from PIL import Image
+
+ video_transforms = transforms.Compose(
+     [
+         transforms.Lambda(lambda x: x / 255.0),
+         transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
+     ]
+ )
+
+
+ def resize_image_to_bucket(image: Union[Image.Image, np.ndarray], bucket_reso: tuple[int, int]) -> np.ndarray:
+     """
+     Resize the image to the bucket resolution.
+     """
+     is_pil_image = isinstance(image, Image.Image)
+     if is_pil_image:
+         image_width, image_height = image.size
+     else:
+         image_height, image_width = image.shape[:2]
+
+     if bucket_reso == (image_width, image_height):
+         return np.array(image) if is_pil_image else image
+
+     bucket_width, bucket_height = bucket_reso
+
+     scale_width = bucket_width / image_width
+     scale_height = bucket_height / image_height
+     scale = max(scale_width, scale_height)
+     image_width = int(image_width * scale + 0.5)
+     image_height = int(image_height * scale + 0.5)
+
+     if scale > 1:
+         image = Image.fromarray(image) if not is_pil_image else image
+         image = image.resize((image_width, image_height), Image.LANCZOS)
+         image = np.array(image)
+     else:
+         image = np.array(image) if is_pil_image else image
+         image = cv2.resize(image, (image_width, image_height), interpolation=cv2.INTER_AREA)
+
+     # crop the image to the bucket resolution
+     crop_left = (image_width - bucket_width) // 2
+     crop_top = (image_height - bucket_height) // 2
+     image = image[crop_top : crop_top + bucket_height, crop_left : crop_left + bucket_width]
+
+     return image
+
+
+ model_id = "hunyuanvideo-community/HunyuanVideo"
+ transformer = HunyuanVideoTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
+ pipe = HunyuanVideoPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.bfloat16)
+
+ pipe.to("cuda")
+ pipe.vae.enable_tiling()
+ pipe.vae.enable_slicing()
+
+ # Expand the patch-embedding projection so the transformer accepts the noisy latents
+ # concatenated with the keyframe latents (double the input channels). The new weight
+ # slice is zero-initialized so the base model's behaviour is preserved.
+ with torch.no_grad():
+     initial_input_channels = pipe.transformer.config.in_channels
+     new_img_in = HunyuanVideoPatchEmbed(
+         patch_size=(pipe.transformer.config.patch_size_t, pipe.transformer.config.patch_size, pipe.transformer.config.patch_size),
+         in_chans=pipe.transformer.config.in_channels * 2,
+         embed_dim=pipe.transformer.config.num_attention_heads * pipe.transformer.config.attention_head_dim,
+     )
+     new_img_in = new_img_in.to(pipe.device, dtype=pipe.dtype)
+     new_img_in.proj.weight.zero_()
+     new_img_in.proj.weight[:, :initial_input_channels].copy_(pipe.transformer.x_embedder.proj.weight)
+
+     if pipe.transformer.x_embedder.proj.bias is not None:
+         new_img_in.proj.bias.copy_(pipe.transformer.x_embedder.proj.bias)
+
+     pipe.transformer.x_embedder = new_img_in
+
+ # Load the keyframe-control LoRA into the patched transformer and fuse it.
+ LORA_PATH = "<PATH_TO_CONTROL_LORA_SAFETENSORS>"
+ lora_state_dict = pipe.lora_state_dict(LORA_PATH)
+ transformer_lora_state_dict = {f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.") and "lora" in k}
+ pipe.load_lora_into_transformer(transformer_lora_state_dict, transformer=pipe.transformer, adapter_name="i2v", _pipeline=pipe)
+ pipe.set_adapters(["i2v"], adapter_weights=[1.0])
+ pipe.fuse_lora(components=["transformer"], lora_scale=1.0, adapter_names=["i2v"])
+ pipe.unload_lora_weights()
+
+ # Conditioning: the first and last frames are the user-provided keyframes, all other frames are zeros.
+ n_frames, height, width = 77, 1280, 720
+ prompt = "a woman"
+ cond_frame1 = load_image("https://content.dashtoon.ai/stability-images/e524013d-55d4-483a-b80a-dfc51d639158.png")
+ cond_frame1 = resize_image_to_bucket(cond_frame1, bucket_reso=(width, height))
+
+ cond_frame2 = load_image("https://content.dashtoon.ai/stability-images/0b29c296-0a90-4b92-96b9-1ed0ae21e480.png")
+ cond_frame2 = resize_image_to_bucket(cond_frame2, bucket_reso=(width, height))
+
+ cond_video = np.zeros(shape=(n_frames, height, width, 3))
+ cond_video[0], cond_video[-1] = np.array(cond_frame1), np.array(cond_frame2)
+
+ cond_video = torch.from_numpy(cond_video.copy()).permute(0, 3, 1, 2)
+ cond_video = torch.stack([video_transforms(x) for x in cond_video], dim=0).unsqueeze(0)
+
+ # Encode the conditioning frames into VAE latents.
+ with torch.no_grad():
+     image_or_video = cond_video.to(device="cuda", dtype=pipe.dtype)
+     image_or_video = image_or_video.permute(0, 2, 1, 3, 4).contiguous()  # [B, F, C, H, W] -> [B, C, F, H, W]
+     cond_latents = pipe.vae.encode(image_or_video).latent_dist.sample()
+     cond_latents = cond_latents * pipe.vae.config.scaling_factor
+     cond_latents = cond_latents.to(dtype=pipe.dtype)
+
+
+ @torch.no_grad()
+ def call_pipe(
+     pipe,
+     prompt: Union[str, List[str]] = None,
+     prompt_2: Union[str, List[str]] = None,
+     height: int = 720,
+     width: int = 1280,
+     num_frames: int = 129,
+     num_inference_steps: int = 50,
+     sigmas: Optional[List[float]] = None,
+     guidance_scale: float = 6.0,
+     num_videos_per_prompt: Optional[int] = 1,
+     generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+     latents: Optional[torch.Tensor] = None,
+     prompt_embeds: Optional[torch.Tensor] = None,
+     pooled_prompt_embeds: Optional[torch.Tensor] = None,
+     prompt_attention_mask: Optional[torch.Tensor] = None,
+     output_type: Optional[str] = "pil",
+     return_dict: bool = True,
+     attention_kwargs: Optional[Dict[str, Any]] = None,
+     callback_on_step_end: Optional[Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]] = None,
+     callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+     prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
+     max_sequence_length: int = 256,
+     image_latents: Optional[torch.Tensor] = None,
+ ):
+     if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+         callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+     # 1. Check inputs. Raise error if not correct
+     pipe.check_inputs(
+         prompt,
+         prompt_2,
+         height,
+         width,
+         prompt_embeds,
+         callback_on_step_end_tensor_inputs,
+         prompt_template,
+     )
+
+     pipe._guidance_scale = guidance_scale
+     pipe._attention_kwargs = attention_kwargs
+     pipe._current_timestep = None
+     pipe._interrupt = False
+
+     device = pipe._execution_device
+
+     # 2. Define call parameters
+     if prompt is not None and isinstance(prompt, str):
+         batch_size = 1
+     elif prompt is not None and isinstance(prompt, list):
+         batch_size = len(prompt)
+     else:
+         batch_size = prompt_embeds.shape[0]
+
+     # 3. Encode input prompt
+     prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = pipe.encode_prompt(
+         prompt=prompt,
+         prompt_2=prompt_2,
+         prompt_template=prompt_template,
+         num_videos_per_prompt=num_videos_per_prompt,
+         prompt_embeds=prompt_embeds,
+         pooled_prompt_embeds=pooled_prompt_embeds,
+         prompt_attention_mask=prompt_attention_mask,
+         device=device,
+         max_sequence_length=max_sequence_length,
+     )
+
+     transformer_dtype = pipe.transformer.dtype
+     prompt_embeds = prompt_embeds.to(transformer_dtype)
+     prompt_attention_mask = prompt_attention_mask.to(transformer_dtype)
+     if pooled_prompt_embeds is not None:
+         pooled_prompt_embeds = pooled_prompt_embeds.to(transformer_dtype)
+
+     # 4. Prepare timesteps
+     sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas
+     timesteps, num_inference_steps = retrieve_timesteps(
+         pipe.scheduler,
+         num_inference_steps,
+         device,
+         sigmas=sigmas,
+     )
+
+     # 5. Prepare latent variables
+     num_channels_latents = pipe.transformer.config.in_channels
+     num_latent_frames = (num_frames - 1) // pipe.vae_scale_factor_temporal + 1
+     latents = pipe.prepare_latents(
+         batch_size * num_videos_per_prompt,
+         num_channels_latents,
+         height,
+         width,
+         num_latent_frames,
+         torch.float32,
+         device,
+         generator,
+         latents,
+     )
+
+     # 6. Prepare guidance condition
+     guidance = torch.tensor([guidance_scale] * latents.shape[0], dtype=transformer_dtype, device=device) * 1000.0
+
+     # 7. Denoising loop
+     num_warmup_steps = len(timesteps) - num_inference_steps * pipe.scheduler.order
+     pipe._num_timesteps = len(timesteps)
+
+     with pipe.progress_bar(total=num_inference_steps) as progress_bar:
+         for i, t in enumerate(timesteps):
+             if pipe.interrupt:
+                 continue
+
+             pipe._current_timestep = t
+             latent_model_input = latents.to(transformer_dtype)
+             timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+             # The keyframe latents are concatenated with the noisy latents along the channel axis,
+             # matching the expanded patch-embedding projection.
+             noise_pred = pipe.transformer(
+                 hidden_states=torch.cat([latent_model_input, image_latents], dim=1),
+                 timestep=timestep,
+                 encoder_hidden_states=prompt_embeds,
+                 encoder_attention_mask=prompt_attention_mask,
+                 pooled_projections=pooled_prompt_embeds,
+                 guidance=guidance,
+                 attention_kwargs=attention_kwargs,
+                 return_dict=False,
+             )[0]
+
+             # compute the previous noisy sample x_t -> x_t-1
+             latents = pipe.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+             if callback_on_step_end is not None:
+                 callback_kwargs = {}
+                 for k in callback_on_step_end_tensor_inputs:
+                     callback_kwargs[k] = locals()[k]
+                 callback_outputs = callback_on_step_end(pipe, i, t, callback_kwargs)
+
+                 latents = callback_outputs.pop("latents", latents)
+                 prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+             # update the progress bar
+             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % pipe.scheduler.order == 0):
+                 progress_bar.update()
+
+     pipe._current_timestep = None
+
+     if output_type != "latent":
+         latents = latents.to(pipe.vae.dtype) / pipe.vae.config.scaling_factor
+         video = pipe.vae.decode(latents, return_dict=False)[0]
+         video = pipe.video_processor.postprocess_video(video, output_type=output_type)
+     else:
+         video = latents
+
+     # Offload all models
+     pipe.maybe_free_model_hooks()
+
+     if not return_dict:
+         return (video,)
+
+     return HunyuanVideoPipelineOutput(frames=video)
+
+
+ video = call_pipe(
+     pipe,
+     prompt=prompt,
+     num_frames=n_frames,
+     num_inference_steps=50,
+     image_latents=cond_latents,
+     width=width,
+     height=height,
+     guidance_scale=6.0,
+     generator=torch.Generator(device="cuda").manual_seed(0),
+ ).frames[0]
+
+ export_to_video(video, "output.mp4", fps=24)
+ ```
i2v.sft ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b02f04e123365277f5b4a57729759ba96e7b129451788778e78741d1549cc8fd
+ size 1146140784
inference.py ADDED
@@ -0,0 +1,298 @@
+ from typing import Any, Callable, Dict, List, Optional, Union
+
+ import cv2
+ import numpy as np
+ import torch
+ import torchvision.transforms.v2 as transforms
+ from diffusers import HunyuanVideoPipeline
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+ from diffusers.models.transformers.transformer_hunyuan_video import HunyuanVideoPatchEmbed, HunyuanVideoTransformer3DModel
+ from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video import DEFAULT_PROMPT_TEMPLATE, retrieve_timesteps
+ from diffusers.pipelines.hunyuan_video.pipeline_output import HunyuanVideoPipelineOutput
+ from diffusers.utils import export_to_video, load_image
+ from PIL import Image
+
+ video_transforms = transforms.Compose(
+     [
+         transforms.Lambda(lambda x: x / 255.0),
+         transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
+     ]
+ )
+
+
+ def resize_image_to_bucket(image: Union[Image.Image, np.ndarray], bucket_reso: tuple[int, int]) -> np.ndarray:
+     """
+     Resize the image to the bucket resolution.
+     """
+     is_pil_image = isinstance(image, Image.Image)
+     if is_pil_image:
+         image_width, image_height = image.size
+     else:
+         image_height, image_width = image.shape[:2]
+
+     if bucket_reso == (image_width, image_height):
+         return np.array(image) if is_pil_image else image
+
+     bucket_width, bucket_height = bucket_reso
+
+     scale_width = bucket_width / image_width
+     scale_height = bucket_height / image_height
+     scale = max(scale_width, scale_height)
+     image_width = int(image_width * scale + 0.5)
+     image_height = int(image_height * scale + 0.5)
+
+     if scale > 1:
+         image = Image.fromarray(image) if not is_pil_image else image
+         image = image.resize((image_width, image_height), Image.LANCZOS)
+         image = np.array(image)
+     else:
+         image = np.array(image) if is_pil_image else image
+         image = cv2.resize(image, (image_width, image_height), interpolation=cv2.INTER_AREA)
+
+     # crop the image to the bucket resolution
+     crop_left = (image_width - bucket_width) // 2
+     crop_top = (image_height - bucket_height) // 2
+     image = image[crop_top : crop_top + bucket_height, crop_left : crop_left + bucket_width]
+
+     return image
+
+
+ model_id = "hunyuanvideo-community/HunyuanVideo"
+ transformer = HunyuanVideoTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
+ pipe = HunyuanVideoPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.bfloat16)
+
+ # Enable memory savings
+ pipe.vae.enable_tiling()
+ pipe.enable_model_cpu_offload()
+
+ # Expand the patch-embedding projection so the transformer accepts the noisy latents
+ # concatenated with the keyframe latents (double the input channels). The new weight
+ # slice is zero-initialized so the base model's behaviour is preserved.
+ with torch.no_grad():
+     initial_input_channels = pipe.transformer.config.in_channels
+     new_img_in = HunyuanVideoPatchEmbed(
+         patch_size=(pipe.transformer.config.patch_size_t, pipe.transformer.config.patch_size, pipe.transformer.config.patch_size),
+         in_chans=pipe.transformer.config.in_channels * 2,
+         embed_dim=pipe.transformer.config.num_attention_heads * pipe.transformer.config.attention_head_dim,
+     )
+     new_img_in = new_img_in.to(pipe.device, dtype=pipe.dtype)
+     new_img_in.proj.weight.zero_()
+     new_img_in.proj.weight[:, :initial_input_channels].copy_(pipe.transformer.x_embedder.proj.weight)
+
+     if pipe.transformer.x_embedder.proj.bias is not None:
+         new_img_in.proj.bias.copy_(pipe.transformer.x_embedder.proj.bias)
+
+     pipe.transformer.x_embedder = new_img_in
+
+ # Load the keyframe-control LoRA into the patched transformer and fuse it.
+ LORA_PATH = "<PATH TO CONTROL LORA>"
+ lora_state_dict = pipe.lora_state_dict(LORA_PATH)
+ transformer_lora_state_dict = {f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.") and "lora" in k}
+ pipe.load_lora_into_transformer(transformer_lora_state_dict, transformer=pipe.transformer, adapter_name="i2v", _pipeline=pipe)
+ pipe.set_adapters(["i2v"], adapter_weights=[1.0])
+ pipe.fuse_lora(components=["transformer"], lora_scale=1.0, adapter_names=["i2v"])
+ pipe.unload_lora_weights()
+
+ # Conditioning: the first and last frames are the user-provided keyframes, all other frames are zeros.
+ n_frames, height, width = 77, 1280, 720
+ prompt = "a woman"
+ cond_frame1 = load_image("https://content.dashtoon.ai/stability-images/e524013d-55d4-483a-b80a-dfc51d639158.png")
+ cond_frame1 = resize_image_to_bucket(cond_frame1, bucket_reso=(width, height))
+
+ cond_frame2 = load_image("https://content.dashtoon.ai/stability-images/0b29c296-0a90-4b92-96b9-1ed0ae21e480.png")
+ cond_frame2 = resize_image_to_bucket(cond_frame2, bucket_reso=(width, height))
+
+ cond_video = np.zeros(shape=(n_frames, height, width, 3))
+ cond_video[0], cond_video[-1] = np.array(cond_frame1), np.array(cond_frame2)
+
+ cond_video = torch.from_numpy(cond_video.copy()).permute(0, 3, 1, 2)
+ cond_video = torch.stack([video_transforms(x) for x in cond_video], dim=0).unsqueeze(0)
+
+ # Encode the conditioning frames into VAE latents.
+ with torch.inference_mode():
+     image_or_video = cond_video.to(device="cuda", dtype=pipe.dtype)
+     image_or_video = image_or_video.permute(0, 2, 1, 3, 4).contiguous()  # [B, F, C, H, W] -> [B, C, F, H, W]
+     cond_latents = pipe.vae.encode(image_or_video).latent_dist.sample()
+     cond_latents = cond_latents * pipe.vae.config.scaling_factor
+     cond_latents = cond_latents.to(dtype=pipe.dtype)
+     assert not torch.any(torch.isnan(cond_latents))
+
+
+ @torch.inference_mode()
+ def call_pipe(
+     pipe,
+     prompt: Union[str, List[str]] = None,
+     prompt_2: Union[str, List[str]] = None,
+     height: int = 720,
+     width: int = 1280,
+     num_frames: int = 129,
+     num_inference_steps: int = 50,
+     sigmas: Optional[List[float]] = None,
+     guidance_scale: float = 6.0,
+     num_videos_per_prompt: Optional[int] = 1,
+     generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+     latents: Optional[torch.Tensor] = None,
+     prompt_embeds: Optional[torch.Tensor] = None,
+     pooled_prompt_embeds: Optional[torch.Tensor] = None,
+     prompt_attention_mask: Optional[torch.Tensor] = None,
+     output_type: Optional[str] = "pil",
+     return_dict: bool = True,
+     attention_kwargs: Optional[Dict[str, Any]] = None,
+     callback_on_step_end: Optional[Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]] = None,
+     callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+     prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
+     max_sequence_length: int = 256,
+     image_latents: Optional[torch.Tensor] = None,
+ ):
+     if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+         callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+     # 1. Check inputs. Raise error if not correct
+     pipe.check_inputs(
+         prompt,
+         prompt_2,
+         height,
+         width,
+         prompt_embeds,
+         callback_on_step_end_tensor_inputs,
+         prompt_template,
+     )
+
+     pipe._guidance_scale = guidance_scale
+     pipe._attention_kwargs = attention_kwargs
+     pipe._current_timestep = None
+     pipe._interrupt = False
+
+     device = pipe._execution_device
+
+     # 2. Define call parameters
+     if prompt is not None and isinstance(prompt, str):
+         batch_size = 1
+     elif prompt is not None and isinstance(prompt, list):
+         batch_size = len(prompt)
+     else:
+         batch_size = prompt_embeds.shape[0]
+
+     # 3. Encode input prompt
+     prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = pipe.encode_prompt(
+         prompt=prompt,
+         prompt_2=prompt_2,
+         prompt_template=prompt_template,
+         num_videos_per_prompt=num_videos_per_prompt,
+         prompt_embeds=prompt_embeds,
+         pooled_prompt_embeds=pooled_prompt_embeds,
+         prompt_attention_mask=prompt_attention_mask,
+         device=device,
+         max_sequence_length=max_sequence_length,
+     )
+
+     transformer_dtype = pipe.transformer.dtype
+     prompt_embeds = prompt_embeds.to(transformer_dtype)
+     prompt_attention_mask = prompt_attention_mask.to(transformer_dtype)
+     if pooled_prompt_embeds is not None:
+         pooled_prompt_embeds = pooled_prompt_embeds.to(transformer_dtype)
+
+     # 4. Prepare timesteps
+     sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas
+     timesteps, num_inference_steps = retrieve_timesteps(
+         pipe.scheduler,
+         num_inference_steps,
+         device,
+         sigmas=sigmas,
+     )
+
+     # 5. Prepare latent variables
+     num_channels_latents = pipe.transformer.config.in_channels
+     num_latent_frames = (num_frames - 1) // pipe.vae_scale_factor_temporal + 1
+     latents = pipe.prepare_latents(
+         batch_size * num_videos_per_prompt,
+         num_channels_latents,
+         height,
+         width,
+         num_latent_frames,
+         torch.float32,
+         device,
+         generator,
+         latents,
+     )
+
+     # 6. Prepare guidance condition
+     guidance = torch.tensor([guidance_scale] * latents.shape[0], dtype=transformer_dtype, device=device) * 1000.0
+
+     # 7. Denoising loop
+     num_warmup_steps = len(timesteps) - num_inference_steps * pipe.scheduler.order
+     pipe._num_timesteps = len(timesteps)
+
+     with pipe.progress_bar(total=num_inference_steps) as progress_bar:
+         for i, t in enumerate(timesteps):
+             if pipe.interrupt:
+                 continue
+
+             pipe._current_timestep = t
+             latent_model_input = latents.to(transformer_dtype)
+             timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+             # The keyframe latents are concatenated with the noisy latents along the channel axis,
+             # matching the expanded patch-embedding projection.
+             noise_pred = pipe.transformer(
+                 hidden_states=torch.cat([latent_model_input, image_latents], dim=1),
+                 timestep=timestep,
+                 encoder_hidden_states=prompt_embeds,
+                 encoder_attention_mask=prompt_attention_mask,
+                 pooled_projections=pooled_prompt_embeds,
+                 guidance=guidance,
+                 attention_kwargs=attention_kwargs,
+                 return_dict=False,
+             )[0]
+
+             # compute the previous noisy sample x_t -> x_t-1
+             latents = pipe.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+             if callback_on_step_end is not None:
+                 callback_kwargs = {}
+                 for k in callback_on_step_end_tensor_inputs:
+                     callback_kwargs[k] = locals()[k]
+                 callback_outputs = callback_on_step_end(pipe, i, t, callback_kwargs)
+
+                 latents = callback_outputs.pop("latents", latents)
+                 prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+             # update the progress bar
+             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % pipe.scheduler.order == 0):
+                 progress_bar.update()
+
+     pipe._current_timestep = None
+
+     if output_type != "latent":
+         latents = latents.to(pipe.vae.dtype) / pipe.vae.config.scaling_factor
+         video = pipe.vae.decode(latents, return_dict=False)[0]
+         video = pipe.video_processor.postprocess_video(video, output_type=output_type)
+     else:
+         video = latents
+
+     # Offload all models
+     pipe.maybe_free_model_hooks()
+
+     if not return_dict:
+         return (video,)
+
+     return HunyuanVideoPipelineOutput(frames=video)
+
+
+ video = call_pipe(
+     pipe,
+     prompt=prompt,
+     num_frames=n_frames,
+     num_inference_steps=50,
+     image_latents=cond_latents,
+     width=width,
+     height=height,
+     guidance_scale=6.0,
+     generator=torch.Generator(device="cuda").manual_seed(0),
+ ).frames[0]
+
+ export_to_video(video, "output.mp4", fps=24)