naumnaum committed · verified
Commit f456fbb · 1 Parent(s): 96c35bf

Upload folder using huggingface_hub
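(For context: an upload like this is normally produced with the huggingface_hub Python API. A minimal sketch is below; the folder path and repo id are placeholders, not values taken from this commit.)

    from huggingface_hub import HfApi

    api = HfApi()
    # Upload everything under the local run directory, preserving the folder layout in the repo.
    api.upload_folder(
        folder_path="20250312_12-29-01",      # local training-run directory
        path_in_repo="20250312_12-29-01",     # same layout in the repo
        repo_id="naumnaum/example-repo",      # hypothetical repo id
        repo_type="model",
        commit_message="Upload folder using huggingface_hub",
    )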

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. 20250312_12-29-01/epoch10/adapter_config.json +36 -0
  2. 20250312_12-29-01/epoch10/adapter_model.safetensors +3 -0
  3. 20250312_12-29-01/epoch10/wan21.toml +104 -0
  4. 20250312_12-29-01/epoch20/adapter_config.json +36 -0
  5. 20250312_12-29-01/epoch20/adapter_model.safetensors +3 -0
  6. 20250312_12-29-01/epoch20/wan21.toml +104 -0
  7. 20250312_12-29-01/epoch30/adapter_config.json +36 -0
  8. 20250312_12-29-01/epoch30/adapter_model.safetensors +3 -0
  9. 20250312_12-29-01/epoch30/wan21.toml +104 -0
  10. 20250312_12-29-01/epoch40/adapter_config.json +36 -0
  11. 20250312_12-29-01/epoch40/adapter_model.safetensors +3 -0
  12. 20250312_12-29-01/epoch40/wan21.toml +104 -0
  13. 20250312_12-29-01/epoch50/adapter_config.json +36 -0
  14. 20250312_12-29-01/epoch50/adapter_model.safetensors +3 -0
  15. 20250312_12-29-01/epoch50/wan21.toml +104 -0
  16. 20250312_12-29-01/epoch60/adapter_config.json +36 -0
  17. 20250312_12-29-01/epoch60/adapter_model.safetensors +3 -0
  18. 20250312_12-29-01/epoch60/wan21.toml +104 -0
  19. 20250312_12-29-01/epoch70/adapter_config.json +36 -0
  20. 20250312_12-29-01/epoch70/adapter_model.safetensors +3 -0
  21. 20250312_12-29-01/epoch70/wan21.toml +104 -0
  22. 20250312_12-29-01/events.out.tfevents.1741782541.eb3e120b3b16.7516.0 +3 -0
  23. 20250312_12-29-01/global_step1090/layer_00-model_states.pt +3 -0
  24. 20250312_12-29-01/global_step1090/layer_01-model_states.pt +3 -0
  25. 20250312_12-29-01/global_step1090/layer_02-model_states.pt +3 -0
  26. 20250312_12-29-01/global_step1090/layer_03-model_states.pt +3 -0
  27. 20250312_12-29-01/global_step1090/layer_04-model_states.pt +3 -0
  28. 20250312_12-29-01/global_step1090/layer_05-model_states.pt +3 -0
  29. 20250312_12-29-01/global_step1090/layer_06-model_states.pt +3 -0
  30. 20250312_12-29-01/global_step1090/layer_07-model_states.pt +3 -0
  31. 20250312_12-29-01/global_step1090/layer_08-model_states.pt +3 -0
  32. 20250312_12-29-01/global_step1090/layer_09-model_states.pt +3 -0
  33. 20250312_12-29-01/global_step1090/layer_10-model_states.pt +3 -0
  34. 20250312_12-29-01/global_step1090/layer_11-model_states.pt +3 -0
  35. 20250312_12-29-01/global_step1090/layer_12-model_states.pt +3 -0
  36. 20250312_12-29-01/global_step1090/layer_13-model_states.pt +3 -0
  37. 20250312_12-29-01/global_step1090/layer_14-model_states.pt +3 -0
  38. 20250312_12-29-01/global_step1090/layer_15-model_states.pt +3 -0
  39. 20250312_12-29-01/global_step1090/layer_16-model_states.pt +3 -0
  40. 20250312_12-29-01/global_step1090/layer_17-model_states.pt +3 -0
  41. 20250312_12-29-01/global_step1090/layer_18-model_states.pt +3 -0
  42. 20250312_12-29-01/global_step1090/layer_19-model_states.pt +3 -0
  43. 20250312_12-29-01/global_step1090/layer_20-model_states.pt +3 -0
  44. 20250312_12-29-01/global_step1090/layer_21-model_states.pt +3 -0
  45. 20250312_12-29-01/global_step1090/layer_22-model_states.pt +3 -0
  46. 20250312_12-29-01/global_step1090/layer_23-model_states.pt +3 -0
  47. 20250312_12-29-01/global_step1090/layer_24-model_states.pt +3 -0
  48. 20250312_12-29-01/global_step1090/layer_25-model_states.pt +3 -0
  49. 20250312_12-29-01/global_step1090/layer_26-model_states.pt +3 -0
  50. 20250312_12-29-01/global_step1090/layer_27-model_states.pt +3 -0
20250312_12-29-01/epoch10/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
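The adapter_config.json above is a standard PEFT LoRA configuration: rank 32, alpha 32, targeting the attention projections (q, k, v, o) and the feed-forward layers (ffn.0, ffn.2). As a hedged sketch (assuming a recent peft release and a local copy of this folder), it can be inspected like this:

    import json
    from peft import LoraConfig

    # Read the raw adapter metadata saved alongside adapter_model.safetensors.
    with open("20250312_12-29-01/epoch10/adapter_config.json") as f:
        raw = json.load(f)
    print(raw["peft_type"], raw["r"], raw["lora_alpha"])  # LORA 32 32
    print(raw["target_modules"])  # ['q', 'o', 'k', 'ffn.2', 'v', 'ffn.0']

    # The same directory can also be loaded as a peft config object.
    cfg = LoraConfig.from_pretrained("20250312_12-29-01/epoch10")
    assert cfg.r == 32 and cfg.lora_alpha == 32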
20250312_12-29-01/epoch10/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:641ba8362e63fb40e588719ecccdec46c5ae2917f99b5e5a69aee0153eca34f0
+ size 87564160
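The three lines above are a Git LFS pointer, not the weights themselves: the actual adapter_model.safetensors (roughly 88 MB) is identified by its SHA-256 and byte size. A small sketch for checking a downloaded file against such a pointer (the local path is a placeholder):

    import hashlib
    import os

    def matches_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
        """Return True if the file's size and SHA-256 match the LFS pointer."""
        if os.path.getsize(path) != expected_size:
            return False
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest() == expected_oid

    # Values copied from the pointer above.
    print(matches_lfs_pointer(
        "epoch10/adapter_model.safetensors",
        "641ba8362e63fb40e588719ecccdec46c5ae2917f99b5e5a69aee0153eca34f0",
        87564160,
    ))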
20250312_12-29-01/epoch10/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
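This wan21.toml is the diffusion-pipe config the run was launched with: a rank-32 LoRA on Wan2.1-T2V-1.3B in bfloat16, saving an adapter every 10 epochs and checkpointing the full training state every 30 minutes (which is where the global_step*/ directory further down comes from). As a quick sanity check of the batch settings, the number of samples per optimizer step can be read straight out of the file; the sketch below assumes Python 3.11+ for tomllib and a single-GPU run (the config itself does not record the GPU count):

    import tomllib  # stdlib TOML parser, Python 3.11+

    with open("20250312_12-29-01/epoch10/wan21.toml", "rb") as f:
        cfg = tomllib.load(f)

    num_gpus = 1  # assumption; not stored in the config
    samples_per_step = (
        cfg["micro_batch_size_per_gpu"]
        * cfg["gradient_accumulation_steps"]
        * num_gpus
    )
    print(samples_per_step)  # 1 * 4 * 1 = 4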
20250312_12-29-01/epoch20/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
20250312_12-29-01/epoch20/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f233ea785484ddce71c746dfc2a5c08386fafb0ecdc1702d7d432262700de7df
+ size 87564160
20250312_12-29-01/epoch20/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
20250312_12-29-01/epoch30/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
20250312_12-29-01/epoch30/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d6f4b7f3cc28d0138680eaa7552d807f20c9c1bf801d25a8dbd2f17d5f055c60
+ size 87564160
20250312_12-29-01/epoch30/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
20250312_12-29-01/epoch40/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
20250312_12-29-01/epoch40/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d09ea0d00faca0abda41407e3ff3aa98622cd01ee06d788daad57b202d16fcb
+ size 87564160
20250312_12-29-01/epoch40/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
20250312_12-29-01/epoch50/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
20250312_12-29-01/epoch50/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:19ab54f9d397b982a64cc9c4e46ea6ae5260a7a92927a2899e3d5d4947ef3e04
+ size 87564160
20250312_12-29-01/epoch50/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
20250312_12-29-01/epoch60/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
20250312_12-29-01/epoch60/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c33b45a0912caa40149863883da6abe4567cb9d6a6137bbcfa33d3988def5f9
+ size 87564160
20250312_12-29-01/epoch60/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
20250312_12-29-01/epoch70/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
20250312_12-29-01/epoch70/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:529767431e44985d1aa21f6472de8f08edbfddc9d688ab7fb5f9f5f4c4b8873b
+ size 87564160
20250312_12-29-01/epoch70/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
20250312_12-29-01/events.out.tfevents.1741782541.eb3e120b3b16.7516.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:edc10449f7159f89ed461932e8516e9a2338a500927b5b2a2a3cc2c9aa88f3c8
+ size 211879
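The events.out.tfevents.* file is the TensorBoard log for this run. Once the LFS file is downloaded it can be viewed with tensorboard --logdir 20250312_12-29-01, or read programmatically; a hedged sketch (the scalar tag names depend on what diffusion-pipe logged, so list them first):

    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

    # Point at the directory containing the events.out.tfevents.* file.
    acc = EventAccumulator("20250312_12-29-01")
    acc.Reload()

    tags = acc.Tags()["scalars"]
    print(tags)  # discover which scalar series were logged
    for event in acc.Scalars(tags[0]):
        print(event.step, event.value)  # (step, value) pairs for the first tag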
20250312_12-29-01/global_step1090/layer_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c682c9cc8e731c37498845f3a980635b04094468e55e77553c7a12ccff998f5
+ size 920
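The global_step1090/ directory is the DeepSpeed training-state checkpoint taken mid-run; the layer_XX-model_states.pt files are per-layer shards from the pipeline-parallel module (layer_00 is only 920 bytes, so it is essentially empty). They are needed only to resume training; for inference you would load one of the epochNN/adapter_model.safetensors files above instead. A tentative sketch for peeking inside a shard, assuming it is a plain state dict of tensors:

    import torch

    # Assumed local path; each shard holds the tensors belonging to one pipeline layer.
    state = torch.load(
        "20250312_12-29-01/global_step1090/layer_01-model_states.pt",
        map_location="cpu",
    )
    for name, tensor in state.items():
        print(name, tuple(tensor.shape), tensor.dtype)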
20250312_12-29-01/global_step1090/layer_01-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bedc13e12900f3b8ee5095b33e79d28f55201cd1edea9be742a3f04db5fcc98a
+ size 2923434
20250312_12-29-01/global_step1090/layer_02-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc949cb7eb3453189935f28fb668d3689b2b631750c0ae3717e4a84f21744ccb
+ size 2923434
20250312_12-29-01/global_step1090/layer_03-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d2981510dbf3ac72fdcd66076ca65844172957d1d3822b00de5f3beb28f93a3
+ size 2923434
20250312_12-29-01/global_step1090/layer_04-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fc1b82597ac77b5fe70c34ea9e909e154f10d56f1a5cbc0a7bbb684c1817fa2
+ size 2923434
20250312_12-29-01/global_step1090/layer_05-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4fedc146798570b2a0134494b6d8e5eec9c02ea97a404f0fc8a482aef8def2c5
+ size 2923434
20250312_12-29-01/global_step1090/layer_06-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7cefa9ff76a4f91309d81e5d12b6e16803066e2294b97ab5a9789ecdb654385d
+ size 2923434
20250312_12-29-01/global_step1090/layer_07-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa0988153bbc122d50dae42c16d7c685a881d5e2ed1ced56aade6c489f94c091
+ size 2923434
20250312_12-29-01/global_step1090/layer_08-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd04a3922e1f4f06ea94691d559cdaa0cf3ad7edb4289752b0a0c8eb1fe2860e
+ size 2923434
20250312_12-29-01/global_step1090/layer_09-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f01ceb4aee93c9e5a0162dd835aab061473e9af27901ff086169f21b203a11ae
+ size 2923434
20250312_12-29-01/global_step1090/layer_10-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e65ee29182677556a964de713541cb660175e83a489c7fb8f0fadbc847c94b31
+ size 2923434
20250312_12-29-01/global_step1090/layer_11-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c62ecaf88a42fdfa263a2e72834e724b3889317918f16fefd2fac6af8fd7cc2
+ size 2923434
20250312_12-29-01/global_step1090/layer_12-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:daaa31d39df63d3a70894d4e820ea59420644d8444abc702d3a61b21fd738d46
+ size 2923434
20250312_12-29-01/global_step1090/layer_13-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f749757475dab5157b6e143016031c94ec86d25e8a7b496290b04b0f799e3b9f
+ size 2923434
20250312_12-29-01/global_step1090/layer_14-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e0d2bf27053d7a21156628b5b185ece57d63d9f89cfa5c9a5887263f0141b2a
+ size 2923434
20250312_12-29-01/global_step1090/layer_15-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eef58883d8b5f3ec47afb5abd52e41f3b8d083d97631adb26ba8560bde9ef2b2
+ size 2923434
20250312_12-29-01/global_step1090/layer_16-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd968150278251b4b5ffc12aef89ce8ab921fa127ff77cdc1bff8bf2db72d7da
+ size 2923434
20250312_12-29-01/global_step1090/layer_17-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67d5eb552a3c22f286f092a0fb7437da7382e9a1eb68376b2a964859aacb6fd3
+ size 2923434
20250312_12-29-01/global_step1090/layer_18-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b792f09591ac283113ca2442402093b96a16d511e87bf8354baf96b4afc3a929
+ size 2923434
20250312_12-29-01/global_step1090/layer_19-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfafb111c53506c427663ce6d1b71e00840fde82bb2bc7a178528a6eccf86ebd
+ size 2923434
20250312_12-29-01/global_step1090/layer_20-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2db41450cc94f528ac9098046a5e2e98c6b25abf72d7864634e856db3598ee02
+ size 2923434
20250312_12-29-01/global_step1090/layer_21-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db7da2089babe3ff319b4b9198a52fe8ab05a590c2956ceb364e603e1638ab3a
+ size 2923434
20250312_12-29-01/global_step1090/layer_22-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5067d8c45bfbca662efbe50be1b9e555e9de9e669a9d6f0357d3fee41f3dc68d
+ size 2923434
20250312_12-29-01/global_step1090/layer_23-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:209345826004eeb4792ab8a48ca98ad5ef0d053121dd46c0452750ed581e3500
+ size 2923434
20250312_12-29-01/global_step1090/layer_24-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc370c2c74d7c425dfd9b7ee5073fb9dc94bd6aa0f1214fc982d07197906aaed
+ size 2923434
20250312_12-29-01/global_step1090/layer_25-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5b30f7b92973879da28c86cba400d77a3f15ce14ed7f685bd33e47f01811740
+ size 2923434
20250312_12-29-01/global_step1090/layer_26-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ef066b845be9a6db1fe832d3914bdc5566e4a4211122f95f976d45574a1cd8c
+ size 2923434
20250312_12-29-01/global_step1090/layer_27-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd9be3129ac29bda09ccc68213ce81bd9b2c337749ae1b8b0eaaa36a9c43b79d
+ size 2923434