Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- 20250312_12-29-01/epoch10/adapter_config.json +36 -0
- 20250312_12-29-01/epoch10/adapter_model.safetensors +3 -0
- 20250312_12-29-01/epoch10/wan21.toml +104 -0
- 20250312_12-29-01/epoch20/adapter_config.json +36 -0
- 20250312_12-29-01/epoch20/adapter_model.safetensors +3 -0
- 20250312_12-29-01/epoch20/wan21.toml +104 -0
- 20250312_12-29-01/epoch30/adapter_config.json +36 -0
- 20250312_12-29-01/epoch30/adapter_model.safetensors +3 -0
- 20250312_12-29-01/epoch30/wan21.toml +104 -0
- 20250312_12-29-01/epoch40/adapter_config.json +36 -0
- 20250312_12-29-01/epoch40/adapter_model.safetensors +3 -0
- 20250312_12-29-01/epoch40/wan21.toml +104 -0
- 20250312_12-29-01/epoch50/adapter_config.json +36 -0
- 20250312_12-29-01/epoch50/adapter_model.safetensors +3 -0
- 20250312_12-29-01/epoch50/wan21.toml +104 -0
- 20250312_12-29-01/epoch60/adapter_config.json +36 -0
- 20250312_12-29-01/epoch60/adapter_model.safetensors +3 -0
- 20250312_12-29-01/epoch60/wan21.toml +104 -0
- 20250312_12-29-01/epoch70/adapter_config.json +36 -0
- 20250312_12-29-01/epoch70/adapter_model.safetensors +3 -0
- 20250312_12-29-01/epoch70/wan21.toml +104 -0
- 20250312_12-29-01/events.out.tfevents.1741782541.eb3e120b3b16.7516.0 +3 -0
- 20250312_12-29-01/global_step1090/layer_00-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_01-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_02-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_03-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_04-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_05-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_06-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_07-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_08-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_09-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_10-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_11-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_12-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_13-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_14-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_15-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_16-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_17-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_18-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_19-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_20-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_21-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_22-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_23-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_24-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_25-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_26-model_states.pt +3 -0
- 20250312_12-29-01/global_step1090/layer_27-model_states.pt +3 -0
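The commit title above indicates these checkpoints were pushed with huggingface_hub. A minimal sketch of such an upload, assuming a placeholder repo id and that the local folder matches the training output path recorded in wan21.toml:

# Sketch only: repo_id is a placeholder, and the local path is inferred from the config's output_dir.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_folder(
    folder_path="/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized/20250312_12-29-01",
    path_in_repo="20250312_12-29-01",
    repo_id="your-username/wan21-lora-checkpoints",  # placeholder assumption
    repo_type="model",
)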
20250312_12-29-01/epoch10/adapter_config.json
ADDED
@@ -0,0 +1,36 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": false,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q",
+    "o",
+    "k",
+    "ffn.2",
+    "v",
+    "ffn.0"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}
20250312_12-29-01/epoch10/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:641ba8362e63fb40e588719ecccdec46c5ae2917f99b5e5a69aee0153eca34f0
+size 87564160
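Each epoch folder pairs this PEFT-style adapter_config.json with an adapter_model.safetensors holding the LoRA weights. A small sketch of inspecting a downloaded copy (the local path is an assumption; requires the safetensors and torch packages):

import json
from safetensors.torch import load_file  # reads a .safetensors file into a dict of tensors

epoch_dir = "20250312_12-29-01/epoch10"  # assumed local path after downloading the repo

with open(f"{epoch_dir}/adapter_config.json") as f:
    cfg = json.load(f)
print(cfg["peft_type"], "rank:", cfg["r"], "alpha:", cfg["lora_alpha"], "targets:", cfg["target_modules"])

state = load_file(f"{epoch_dir}/adapter_model.safetensors")
print(len(state), "LoRA tensors")
for name, tensor in list(state.items())[:5]:
    print(name, tuple(tensor.shape), tensor.dtype)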
20250312_12-29-01/epoch10/wan21.toml
ADDED
@@ -0,0 +1,104 @@
+# Output path for training runs. Each training run makes a new directory in here.
+output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+# Dataset config file.
+dataset = 'examples/dataset.toml'
+# You can have separate eval datasets. Give them a name for Tensorboard metrics.
+# eval_datasets = [
+#     {name = 'something', config = 'path/to/eval_dataset.toml'},
+# ]
+
+# training settings
+
+# I usually set this to a really high value because I don't know how long I want to train.
+epochs = 1000
+# Batch size of a single forward/backward pass for one GPU.
+micro_batch_size_per_gpu = 1
+# Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+pipeline_stages = 1
+# Number of micro-batches sent through the pipeline for each training step.
+# If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+gradient_accumulation_steps = 4
+# Grad norm clipping.
+gradient_clipping = 1.0
+# Learning rate warmup.
+warmup_steps = 100
+
+# Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+# of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+# exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+# Block swapping only works for LoRA training, and requires pipeline_stages=1.
+#blocks_to_swap = 20
+
+# eval settings
+
+eval_every_n_epochs = 1
+eval_before_first_step = true
+# Might want to set these lower for eval so that fewer images get dropped (eval dataset size is usually much smaller than training set).
+# Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so higher global batch size means
+# more dropped images. Usually doesn't matter for training but the eval set is much smaller so it can matter.
+eval_micro_batch_size_per_gpu = 1
+eval_gradient_accumulation_steps = 1
+
+# misc settings
+
+# Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+save_every_n_epochs = 10
+# Can checkpoint the training state every n number of epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+#checkpoint_every_n_epochs = 1
+checkpoint_every_n_minutes = 30
+# Always set to true unless you have a huge amount of VRAM.
+activation_checkpointing = true
+# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+partition_method = 'parameters'
+# dtype for saving the LoRA or model, if different from training dtype
+save_dtype = 'bfloat16'
+# Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during caching phase but uses more memory.
+caching_batch_size = 1
+# How often deepspeed logs to console.
+steps_per_print = 1
+# How to extract video clips for training from a single input video file.
+# The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+# number of frames for that bucket.
+# single_beginning: one clip starting at the beginning of the video
+# single_middle: one clip from the middle of the video (cutting off the start and end equally)
+# multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+# default is single_beginning
+video_clip_mode = 'single_beginning'
+
+# This is how you configure HunyuanVideo. Other models will be different. See docs/supported_models.md for
+# details on the configuration and options for each model.
+[model]
+type = 'wan'
+ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+dtype = 'bfloat16'
+# You can use fp8 for the transformer when training LoRA.
+#transformer_dtype = 'float8'
+timestep_sample_method = 'logit_normal'
+
+# For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+[adapter]
+type = 'lora'
+rank = 32
+# Dtype for the LoRA weights you are training.
+dtype = 'bfloat16'
+# You can initialize the lora weights from a previously trained lora.
+#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+[optimizer]
+# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+# Look at train.py for other options. You could also easily edit the file and add your own.
+type = 'adamw_optimi'
+lr = 5e-5
+betas = [0.9, 0.99]
+weight_decay = 0.01
+eps = 1e-8
+
+# Can use this optimizer for a bit less memory usage.
+
+# [optimizer]
+# type = 'AdamW8bitKahan'
+# lr = 2e-5
+# betas = [0.9, 0.99]
+# weight_decay = 0.01
+# stabilize = false
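The wan21.toml saved with each epoch records the diffusion-pipe settings used for this run. A small sketch of reading it and computing the effective batch size per optimizer step (Python 3.11+ for tomllib; the single-GPU count is an assumption, since the config does not record it):

import tomllib  # stdlib TOML parser in Python 3.11+

with open("20250312_12-29-01/epoch10/wan21.toml", "rb") as f:
    cfg = tomllib.load(f)

num_gpus = 1  # assumption; adjust to the number of GPUs actually used
effective_batch = cfg["micro_batch_size_per_gpu"] * cfg["gradient_accumulation_steps"] * num_gpus

print("model:", cfg["model"]["type"], cfg["model"]["ckpt_path"])
print("lora rank:", cfg["adapter"]["rank"], "lr:", cfg["optimizer"]["lr"])
print("effective batch size per step:", effective_batch)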
20250312_12-29-01/epoch20/adapter_config.json
ADDED
@@ -0,0 +1,36 @@
(36 added lines identical to 20250312_12-29-01/epoch10/adapter_config.json above)
20250312_12-29-01/epoch20/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f233ea785484ddce71c746dfc2a5c08386fafb0ecdc1702d7d432262700de7df
+size 87564160
20250312_12-29-01/epoch20/wan21.toml
ADDED
@@ -0,0 +1,104 @@
(104 added lines identical to 20250312_12-29-01/epoch10/wan21.toml above)
20250312_12-29-01/epoch30/adapter_config.json
ADDED
@@ -0,0 +1,36 @@
(36 added lines identical to 20250312_12-29-01/epoch10/adapter_config.json above)
20250312_12-29-01/epoch30/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6f4b7f3cc28d0138680eaa7552d807f20c9c1bf801d25a8dbd2f17d5f055c60
+size 87564160
20250312_12-29-01/epoch30/wan21.toml
ADDED
@@ -0,0 +1,104 @@
(104 added lines identical to 20250312_12-29-01/epoch10/wan21.toml above)
20250312_12-29-01/epoch40/adapter_config.json
ADDED
@@ -0,0 +1,36 @@
(36 added lines identical to 20250312_12-29-01/epoch10/adapter_config.json above)
20250312_12-29-01/epoch40/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d09ea0d00faca0abda41407e3ff3aa98622cd01ee06d788daad57b202d16fcb
+size 87564160
20250312_12-29-01/epoch40/wan21.toml
ADDED
@@ -0,0 +1,104 @@
(104 added lines identical to 20250312_12-29-01/epoch10/wan21.toml above)
20250312_12-29-01/epoch50/adapter_config.json
ADDED
@@ -0,0 +1,36 @@
(36 added lines identical to 20250312_12-29-01/epoch10/adapter_config.json above)
20250312_12-29-01/epoch50/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19ab54f9d397b982a64cc9c4e46ea6ae5260a7a92927a2899e3d5d4947ef3e04
+size 87564160
20250312_12-29-01/epoch50/wan21.toml
ADDED
@@ -0,0 +1,104 @@
(104 added lines identical to 20250312_12-29-01/epoch10/wan21.toml above)
20250312_12-29-01/epoch60/adapter_config.json
ADDED
@@ -0,0 +1,36 @@
(36 added lines identical to 20250312_12-29-01/epoch10/adapter_config.json above)
20250312_12-29-01/epoch60/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c33b45a0912caa40149863883da6abe4567cb9d6a6137bbcfa33d3988def5f9
+size 87564160
20250312_12-29-01/epoch60/wan21.toml
ADDED
@@ -0,0 +1,104 @@
(104 added lines identical to 20250312_12-29-01/epoch10/wan21.toml above)
20250312_12-29-01/epoch70/adapter_config.json
ADDED
@@ -0,0 +1,36 @@
(36 added lines identical to 20250312_12-29-01/epoch10/adapter_config.json above)
20250312_12-29-01/epoch70/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:529767431e44985d1aa21f6472de8f08edbfddc9d688ab7fb5f9f5f4c4b8873b
+size 87564160
20250312_12-29-01/epoch70/wan21.toml
ADDED
@@ -0,0 +1,104 @@
(104 added lines identical to 20250312_12-29-01/epoch10/wan21.toml above)
20250312_12-29-01/events.out.tfevents.1741782541.eb3e120b3b16.7516.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edc10449f7159f89ed461932e8516e9a2338a500927b5b2a2a3cc2c9aa88f3c8
+size 211879
20250312_12-29-01/global_step1090/layer_00-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c682c9cc8e731c37498845f3a980635b04094468e55e77553c7a12ccff998f5
+size 920
20250312_12-29-01/global_step1090/layer_01-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bedc13e12900f3b8ee5095b33e79d28f55201cd1edea9be742a3f04db5fcc98a
+size 2923434
20250312_12-29-01/global_step1090/layer_02-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc949cb7eb3453189935f28fb668d3689b2b631750c0ae3717e4a84f21744ccb
+size 2923434
20250312_12-29-01/global_step1090/layer_03-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d2981510dbf3ac72fdcd66076ca65844172957d1d3822b00de5f3beb28f93a3
+size 2923434
20250312_12-29-01/global_step1090/layer_04-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fc1b82597ac77b5fe70c34ea9e909e154f10d56f1a5cbc0a7bbb684c1817fa2
+size 2923434
20250312_12-29-01/global_step1090/layer_05-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fedc146798570b2a0134494b6d8e5eec9c02ea97a404f0fc8a482aef8def2c5
+size 2923434
20250312_12-29-01/global_step1090/layer_06-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7cefa9ff76a4f91309d81e5d12b6e16803066e2294b97ab5a9789ecdb654385d
+size 2923434
20250312_12-29-01/global_step1090/layer_07-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa0988153bbc122d50dae42c16d7c685a881d5e2ed1ced56aade6c489f94c091
+size 2923434
20250312_12-29-01/global_step1090/layer_08-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd04a3922e1f4f06ea94691d559cdaa0cf3ad7edb4289752b0a0c8eb1fe2860e
+size 2923434
20250312_12-29-01/global_step1090/layer_09-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f01ceb4aee93c9e5a0162dd835aab061473e9af27901ff086169f21b203a11ae
+size 2923434
20250312_12-29-01/global_step1090/layer_10-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e65ee29182677556a964de713541cb660175e83a489c7fb8f0fadbc847c94b31
+size 2923434
20250312_12-29-01/global_step1090/layer_11-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c62ecaf88a42fdfa263a2e72834e724b3889317918f16fefd2fac6af8fd7cc2
+size 2923434
20250312_12-29-01/global_step1090/layer_12-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:daaa31d39df63d3a70894d4e820ea59420644d8444abc702d3a61b21fd738d46
+size 2923434
20250312_12-29-01/global_step1090/layer_13-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f749757475dab5157b6e143016031c94ec86d25e8a7b496290b04b0f799e3b9f
+size 2923434
20250312_12-29-01/global_step1090/layer_14-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e0d2bf27053d7a21156628b5b185ece57d63d9f89cfa5c9a5887263f0141b2a
+size 2923434
20250312_12-29-01/global_step1090/layer_15-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eef58883d8b5f3ec47afb5abd52e41f3b8d083d97631adb26ba8560bde9ef2b2
+size 2923434
20250312_12-29-01/global_step1090/layer_16-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd968150278251b4b5ffc12aef89ce8ab921fa127ff77cdc1bff8bf2db72d7da
+size 2923434
20250312_12-29-01/global_step1090/layer_17-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67d5eb552a3c22f286f092a0fb7437da7382e9a1eb68376b2a964859aacb6fd3
+size 2923434
20250312_12-29-01/global_step1090/layer_18-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b792f09591ac283113ca2442402093b96a16d511e87bf8354baf96b4afc3a929
+size 2923434
20250312_12-29-01/global_step1090/layer_19-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfafb111c53506c427663ce6d1b71e00840fde82bb2bc7a178528a6eccf86ebd
+size 2923434
20250312_12-29-01/global_step1090/layer_20-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2db41450cc94f528ac9098046a5e2e98c6b25abf72d7864634e856db3598ee02
+size 2923434
20250312_12-29-01/global_step1090/layer_21-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db7da2089babe3ff319b4b9198a52fe8ab05a590c2956ceb364e603e1638ab3a
+size 2923434
20250312_12-29-01/global_step1090/layer_22-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5067d8c45bfbca662efbe50be1b9e555e9de9e669a9d6f0357d3fee41f3dc68d
+size 2923434
20250312_12-29-01/global_step1090/layer_23-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:209345826004eeb4792ab8a48ca98ad5ef0d053121dd46c0452750ed581e3500
+size 2923434
20250312_12-29-01/global_step1090/layer_24-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc370c2c74d7c425dfd9b7ee5073fb9dc94bd6aa0f1214fc982d07197906aaed
+size 2923434
20250312_12-29-01/global_step1090/layer_25-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5b30f7b92973879da28c86cba400d77a3f15ce14ed7f685bd33e47f01811740
+size 2923434
20250312_12-29-01/global_step1090/layer_26-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ef066b845be9a6db1fe832d3914bdc5566e4a4211122f95f976d45574a1cd8c
+size 2923434
20250312_12-29-01/global_step1090/layer_27-model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd9be3129ac29bda09ccc68213ce81bd9b2c337749ae1b8b0eaaa36a9c43b79d
+size 2923434
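All of the large files above are stored as Git LFS pointers; the actual tensors live in LFS storage and are fetched on download. A hedged sketch of pulling one checkpoint file with huggingface_hub (the repo id is a placeholder assumption):

from huggingface_hub import hf_hub_download

# Placeholder repo id; substitute the repository this commit belongs to.
local_path = hf_hub_download(
    repo_id="your-username/wan21-lora-checkpoints",
    filename="20250312_12-29-01/epoch70/adapter_model.safetensors",
)
print("downloaded to:", local_path)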