naumnaum committed · verified
Commit f456fbb · 1 Parent(s): 96c35bf

Upload folder using huggingface_hub
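(For context: an upload like this is normally produced with the huggingface_hub Python API. A minimal sketch is below; the folder path and repo id are placeholders, not values taken from this commit.)

    from huggingface_hub import HfApi

    api = HfApi()
    # Upload everything under the local run directory, preserving the folder layout in the repo.
    api.upload_folder(
        folder_path="20250312_12-29-01",      # local training-run directory
        path_in_repo="20250312_12-29-01",     # same layout in the repo
        repo_id="naumnaum/example-repo",      # hypothetical repo id
        repo_type="model",
        commit_message="Upload folder using huggingface_hub",
    )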

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. 20250312_12-29-01/epoch10/adapter_config.json +36 -0
  2. 20250312_12-29-01/epoch10/adapter_model.safetensors +3 -0
  3. 20250312_12-29-01/epoch10/wan21.toml +104 -0
  4. 20250312_12-29-01/epoch20/adapter_config.json +36 -0
  5. 20250312_12-29-01/epoch20/adapter_model.safetensors +3 -0
  6. 20250312_12-29-01/epoch20/wan21.toml +104 -0
  7. 20250312_12-29-01/epoch30/adapter_config.json +36 -0
  8. 20250312_12-29-01/epoch30/adapter_model.safetensors +3 -0
  9. 20250312_12-29-01/epoch30/wan21.toml +104 -0
  10. 20250312_12-29-01/epoch40/adapter_config.json +36 -0
  11. 20250312_12-29-01/epoch40/adapter_model.safetensors +3 -0
  12. 20250312_12-29-01/epoch40/wan21.toml +104 -0
  13. 20250312_12-29-01/epoch50/adapter_config.json +36 -0
  14. 20250312_12-29-01/epoch50/adapter_model.safetensors +3 -0
  15. 20250312_12-29-01/epoch50/wan21.toml +104 -0
  16. 20250312_12-29-01/epoch60/adapter_config.json +36 -0
  17. 20250312_12-29-01/epoch60/adapter_model.safetensors +3 -0
  18. 20250312_12-29-01/epoch60/wan21.toml +104 -0
  19. 20250312_12-29-01/epoch70/adapter_config.json +36 -0
  20. 20250312_12-29-01/epoch70/adapter_model.safetensors +3 -0
  21. 20250312_12-29-01/epoch70/wan21.toml +104 -0
  22. 20250312_12-29-01/events.out.tfevents.1741782541.eb3e120b3b16.7516.0 +3 -0
  23. 20250312_12-29-01/global_step1090/layer_00-model_states.pt +3 -0
  24. 20250312_12-29-01/global_step1090/layer_01-model_states.pt +3 -0
  25. 20250312_12-29-01/global_step1090/layer_02-model_states.pt +3 -0
  26. 20250312_12-29-01/global_step1090/layer_03-model_states.pt +3 -0
  27. 20250312_12-29-01/global_step1090/layer_04-model_states.pt +3 -0
  28. 20250312_12-29-01/global_step1090/layer_05-model_states.pt +3 -0
  29. 20250312_12-29-01/global_step1090/layer_06-model_states.pt +3 -0
  30. 20250312_12-29-01/global_step1090/layer_07-model_states.pt +3 -0
  31. 20250312_12-29-01/global_step1090/layer_08-model_states.pt +3 -0
  32. 20250312_12-29-01/global_step1090/layer_09-model_states.pt +3 -0
  33. 20250312_12-29-01/global_step1090/layer_10-model_states.pt +3 -0
  34. 20250312_12-29-01/global_step1090/layer_11-model_states.pt +3 -0
  35. 20250312_12-29-01/global_step1090/layer_12-model_states.pt +3 -0
  36. 20250312_12-29-01/global_step1090/layer_13-model_states.pt +3 -0
  37. 20250312_12-29-01/global_step1090/layer_14-model_states.pt +3 -0
  38. 20250312_12-29-01/global_step1090/layer_15-model_states.pt +3 -0
  39. 20250312_12-29-01/global_step1090/layer_16-model_states.pt +3 -0
  40. 20250312_12-29-01/global_step1090/layer_17-model_states.pt +3 -0
  41. 20250312_12-29-01/global_step1090/layer_18-model_states.pt +3 -0
  42. 20250312_12-29-01/global_step1090/layer_19-model_states.pt +3 -0
  43. 20250312_12-29-01/global_step1090/layer_20-model_states.pt +3 -0
  44. 20250312_12-29-01/global_step1090/layer_21-model_states.pt +3 -0
  45. 20250312_12-29-01/global_step1090/layer_22-model_states.pt +3 -0
  46. 20250312_12-29-01/global_step1090/layer_23-model_states.pt +3 -0
  47. 20250312_12-29-01/global_step1090/layer_24-model_states.pt +3 -0
  48. 20250312_12-29-01/global_step1090/layer_25-model_states.pt +3 -0
  49. 20250312_12-29-01/global_step1090/layer_26-model_states.pt +3 -0
  50. 20250312_12-29-01/global_step1090/layer_27-model_states.pt +3 -0
20250312_12-29-01/epoch10/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
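The adapter_config.json above is a standard PEFT LoRA configuration: rank 32, alpha 32, targeting the attention projections (q, k, v, o) and the feed-forward layers (ffn.0, ffn.2). As a hedged sketch (assuming a recent peft release and a local copy of this folder), it can be inspected like this:

    import json
    from peft import LoraConfig

    # Read the raw adapter metadata saved alongside adapter_model.safetensors.
    with open("20250312_12-29-01/epoch10/adapter_config.json") as f:
        raw = json.load(f)
    print(raw["peft_type"], raw["r"], raw["lora_alpha"])  # LORA 32 32
    print(raw["target_modules"])  # ['q', 'o', 'k', 'ffn.2', 'v', 'ffn.0']

    # The same directory can also be loaded as a peft config object.
    cfg = LoraConfig.from_pretrained("20250312_12-29-01/epoch10")
    assert cfg.r == 32 and cfg.lora_alpha == 32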
20250312_12-29-01/epoch10/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:641ba8362e63fb40e588719ecccdec46c5ae2917f99b5e5a69aee0153eca34f0
+ size 87564160
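The three lines above are a Git LFS pointer, not the weights themselves: the actual adapter_model.safetensors (roughly 88 MB) is identified by its SHA-256 and byte size. A small sketch for checking a downloaded file against such a pointer (the local path is a placeholder):

    import hashlib
    import os

    def matches_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
        """Return True if the file's size and SHA-256 match the LFS pointer."""
        if os.path.getsize(path) != expected_size:
            return False
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest() == expected_oid

    # Values copied from the pointer above.
    print(matches_lfs_pointer(
        "epoch10/adapter_model.safetensors",
        "641ba8362e63fb40e588719ecccdec46c5ae2917f99b5e5a69aee0153eca34f0",
        87564160,
    ))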
20250312_12-29-01/epoch10/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
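This wan21.toml is the diffusion-pipe config the run was launched with: a rank-32 LoRA on Wan2.1-T2V-1.3B in bfloat16, saving an adapter every 10 epochs and checkpointing the full training state every 30 minutes (which is where the global_step*/ directory further down comes from). As a quick sanity check of the batch settings, the number of samples per optimizer step can be read straight out of the file; the sketch below assumes Python 3.11+ for tomllib and a single-GPU run (the config itself does not record the GPU count):

    import tomllib  # stdlib TOML parser, Python 3.11+

    with open("20250312_12-29-01/epoch10/wan21.toml", "rb") as f:
        cfg = tomllib.load(f)

    num_gpus = 1  # assumption; not stored in the config
    samples_per_step = (
        cfg["micro_batch_size_per_gpu"]
        * cfg["gradient_accumulation_steps"]
        * num_gpus
    )
    print(samples_per_step)  # 1 * 4 * 1 = 4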
20250312_12-29-01/epoch20/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
20250312_12-29-01/epoch20/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f233ea785484ddce71c746dfc2a5c08386fafb0ecdc1702d7d432262700de7df
+ size 87564160
20250312_12-29-01/epoch20/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
20250312_12-29-01/epoch30/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
20250312_12-29-01/epoch30/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d6f4b7f3cc28d0138680eaa7552d807f20c9c1bf801d25a8dbd2f17d5f055c60
+ size 87564160
20250312_12-29-01/epoch30/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
20250312_12-29-01/epoch40/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
20250312_12-29-01/epoch40/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d09ea0d00faca0abda41407e3ff3aa98622cd01ee06d788daad57b202d16fcb
+ size 87564160
20250312_12-29-01/epoch40/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
20250312_12-29-01/epoch50/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
20250312_12-29-01/epoch50/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:19ab54f9d397b982a64cc9c4e46ea6ae5260a7a92927a2899e3d5d4947ef3e04
+ size 87564160
20250312_12-29-01/epoch50/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
20250312_12-29-01/epoch60/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
20250312_12-29-01/epoch60/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c33b45a0912caa40149863883da6abe4567cb9d6a6137bbcfa33d3988def5f9
+ size 87564160
20250312_12-29-01/epoch60/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
20250312_12-29-01/epoch70/adapter_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": null,
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q",
+     "o",
+     "k",
+     "ffn.2",
+     "v",
+     "ffn.0"
+   ],
+   "task_type": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
20250312_12-29-01/epoch70/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:529767431e44985d1aa21f6472de8f08edbfddc9d688ab7fb5f9f5f4c4b8873b
+ size 87564160
20250312_12-29-01/epoch70/wan21.toml ADDED
@@ -0,0 +1,104 @@
+ # Output path for training runs. Each training run makes a new directory in here.
+ output_dir = '/workspace/diffusion-pipe/data/output/run_1_20_img_12_03_resized'
+
+ # Dataset config file.
+ dataset = 'examples/dataset.toml'
+ # You can have separate eval datasets. Give them a name for Tensorboard metrics.
+ # eval_datasets = [
+ #     {name = 'something', config = 'path/to/eval_dataset.toml'},
+ # ]
+
+ # training settings
+
+ # I usually set this to a really high value because I don't know how long I want to train.
+ epochs = 1000
+ # Batch size of a single forward/backward pass for one GPU.
+ micro_batch_size_per_gpu = 1
+ # Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
+ pipeline_stages = 1
+ # Number of micro-batches sent through the pipeline for each training step.
+ # If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
+ gradient_accumulation_steps = 4
+ # Grad norm clipping.
+ gradient_clipping = 1.0
+ # Learning rate warmup.
+ warmup_steps = 100
+
+ # Block swapping is supported for Wan, HunyuanVideo, Flux, and Chroma. This value controls the number
+ # of blocks kept offloaded to RAM. Increasing it lowers VRAM use, but has a performance penalty. The
+ # exact performance penalty depends on the model and the type of training you are doing (e.g. images vs video).
+ # Block swapping only works for LoRA training, and requires pipeline_stages=1.
+ #blocks_to_swap = 20
+
+ # eval settings
+
+ eval_every_n_epochs = 1
+ eval_before_first_step = true
+ # Might want to set these lower for eval so that fewer images get dropped (the eval dataset is usually much smaller than the training set).
+ # Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so a higher global batch size means
+ # more dropped images. Usually doesn't matter for training, but the eval set is much smaller, so it can matter.
+ eval_micro_batch_size_per_gpu = 1
+ eval_gradient_accumulation_steps = 1
+
+ # misc settings
+
+ # Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
+ save_every_n_epochs = 10
+ # Can checkpoint the training state every n epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
+ #checkpoint_every_n_epochs = 1
+ checkpoint_every_n_minutes = 30
+ # Always set to true unless you have a huge amount of VRAM.
+ activation_checkpointing = true
+ # Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
+ partition_method = 'parameters'
+ # dtype for saving the LoRA or model, if different from training dtype
+ save_dtype = 'bfloat16'
+ # Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during the caching phase but uses more memory.
+ caching_batch_size = 1
+ # How often deepspeed logs to console.
+ steps_per_print = 1
+ # How to extract video clips for training from a single input video file.
+ # The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
+ # number of frames for that bucket.
+ # single_beginning: one clip starting at the beginning of the video
+ # single_middle: one clip from the middle of the video (cutting off the start and end equally)
+ # multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
+ # default is single_beginning
+ video_clip_mode = 'single_beginning'
+
+ # This is how you configure the model. Other models will be different. See docs/supported_models.md for
+ # details on the configuration and options for each model.
+ [model]
+ type = 'wan'
+ ckpt_path = '/workspace/diffusion-pipe/models/Wan2.1-T2V-1.3B'
+ dtype = 'bfloat16'
+ # You can use fp8 for the transformer when training LoRA.
+ #transformer_dtype = 'float8'
+ timestep_sample_method = 'logit_normal'
+
+ # For models that support full fine tuning, simply delete or comment out the [adapter] table to FFT.
+ [adapter]
+ type = 'lora'
+ rank = 32
+ # Dtype for the LoRA weights you are training.
+ dtype = 'bfloat16'
+ # You can initialize the lora weights from a previously trained lora.
+ #init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
+
+ [optimizer]
+ # AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
+ # Look at train.py for other options. You could also easily edit the file and add your own.
+ type = 'adamw_optimi'
+ lr = 5e-5
+ betas = [0.9, 0.99]
+ weight_decay = 0.01
+ eps = 1e-8
+
+ # Can use this optimizer for a bit less memory usage.
+
+ # [optimizer]
+ # type = 'AdamW8bitKahan'
+ # lr = 2e-5
+ # betas = [0.9, 0.99]
+ # weight_decay = 0.01
+ # stabilize = false
20250312_12-29-01/events.out.tfevents.1741782541.eb3e120b3b16.7516.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:edc10449f7159f89ed461932e8516e9a2338a500927b5b2a2a3cc2c9aa88f3c8
+ size 211879
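The events.out.tfevents.* file is the TensorBoard log for this run. Once the LFS file is downloaded it can be viewed with tensorboard --logdir 20250312_12-29-01, or read programmatically; a hedged sketch (the scalar tag names depend on what diffusion-pipe logged, so list them first):

    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

    # Point at the directory containing the events.out.tfevents.* file.
    acc = EventAccumulator("20250312_12-29-01")
    acc.Reload()

    tags = acc.Tags()["scalars"]
    print(tags)  # discover which scalar series were logged
    for event in acc.Scalars(tags[0]):
        print(event.step, event.value)  # (step, value) pairs for the first tag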
20250312_12-29-01/global_step1090/layer_00-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c682c9cc8e731c37498845f3a980635b04094468e55e77553c7a12ccff998f5
+ size 920
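The global_step1090/ directory is the DeepSpeed training-state checkpoint taken mid-run; the layer_XX-model_states.pt files are per-layer shards from the pipeline-parallel module (layer_00 is only 920 bytes, so it is essentially empty). They are needed only to resume training; for inference you would load one of the epochNN/adapter_model.safetensors files above instead. A tentative sketch for peeking inside a shard, assuming it is a plain state dict of tensors:

    import torch

    # Assumed local path; each shard holds the tensors belonging to one pipeline layer.
    state = torch.load(
        "20250312_12-29-01/global_step1090/layer_01-model_states.pt",
        map_location="cpu",
    )
    for name, tensor in state.items():
        print(name, tuple(tensor.shape), tensor.dtype)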
20250312_12-29-01/global_step1090/layer_01-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bedc13e12900f3b8ee5095b33e79d28f55201cd1edea9be742a3f04db5fcc98a
+ size 2923434
20250312_12-29-01/global_step1090/layer_02-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc949cb7eb3453189935f28fb668d3689b2b631750c0ae3717e4a84f21744ccb
+ size 2923434
20250312_12-29-01/global_step1090/layer_03-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d2981510dbf3ac72fdcd66076ca65844172957d1d3822b00de5f3beb28f93a3
+ size 2923434
20250312_12-29-01/global_step1090/layer_04-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fc1b82597ac77b5fe70c34ea9e909e154f10d56f1a5cbc0a7bbb684c1817fa2
+ size 2923434
20250312_12-29-01/global_step1090/layer_05-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4fedc146798570b2a0134494b6d8e5eec9c02ea97a404f0fc8a482aef8def2c5
+ size 2923434
20250312_12-29-01/global_step1090/layer_06-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7cefa9ff76a4f91309d81e5d12b6e16803066e2294b97ab5a9789ecdb654385d
+ size 2923434
20250312_12-29-01/global_step1090/layer_07-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa0988153bbc122d50dae42c16d7c685a881d5e2ed1ced56aade6c489f94c091
+ size 2923434
20250312_12-29-01/global_step1090/layer_08-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd04a3922e1f4f06ea94691d559cdaa0cf3ad7edb4289752b0a0c8eb1fe2860e
+ size 2923434
20250312_12-29-01/global_step1090/layer_09-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f01ceb4aee93c9e5a0162dd835aab061473e9af27901ff086169f21b203a11ae
+ size 2923434
20250312_12-29-01/global_step1090/layer_10-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e65ee29182677556a964de713541cb660175e83a489c7fb8f0fadbc847c94b31
+ size 2923434
20250312_12-29-01/global_step1090/layer_11-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c62ecaf88a42fdfa263a2e72834e724b3889317918f16fefd2fac6af8fd7cc2
+ size 2923434
20250312_12-29-01/global_step1090/layer_12-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:daaa31d39df63d3a70894d4e820ea59420644d8444abc702d3a61b21fd738d46
+ size 2923434
20250312_12-29-01/global_step1090/layer_13-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f749757475dab5157b6e143016031c94ec86d25e8a7b496290b04b0f799e3b9f
+ size 2923434
20250312_12-29-01/global_step1090/layer_14-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e0d2bf27053d7a21156628b5b185ece57d63d9f89cfa5c9a5887263f0141b2a
+ size 2923434
20250312_12-29-01/global_step1090/layer_15-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eef58883d8b5f3ec47afb5abd52e41f3b8d083d97631adb26ba8560bde9ef2b2
+ size 2923434
20250312_12-29-01/global_step1090/layer_16-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd968150278251b4b5ffc12aef89ce8ab921fa127ff77cdc1bff8bf2db72d7da
+ size 2923434
20250312_12-29-01/global_step1090/layer_17-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67d5eb552a3c22f286f092a0fb7437da7382e9a1eb68376b2a964859aacb6fd3
+ size 2923434
20250312_12-29-01/global_step1090/layer_18-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b792f09591ac283113ca2442402093b96a16d511e87bf8354baf96b4afc3a929
+ size 2923434
20250312_12-29-01/global_step1090/layer_19-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfafb111c53506c427663ce6d1b71e00840fde82bb2bc7a178528a6eccf86ebd
+ size 2923434
20250312_12-29-01/global_step1090/layer_20-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2db41450cc94f528ac9098046a5e2e98c6b25abf72d7864634e856db3598ee02
+ size 2923434
20250312_12-29-01/global_step1090/layer_21-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db7da2089babe3ff319b4b9198a52fe8ab05a590c2956ceb364e603e1638ab3a
+ size 2923434
20250312_12-29-01/global_step1090/layer_22-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5067d8c45bfbca662efbe50be1b9e555e9de9e669a9d6f0357d3fee41f3dc68d
+ size 2923434
20250312_12-29-01/global_step1090/layer_23-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:209345826004eeb4792ab8a48ca98ad5ef0d053121dd46c0452750ed581e3500
+ size 2923434
20250312_12-29-01/global_step1090/layer_24-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc370c2c74d7c425dfd9b7ee5073fb9dc94bd6aa0f1214fc982d07197906aaed
+ size 2923434
20250312_12-29-01/global_step1090/layer_25-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5b30f7b92973879da28c86cba400d77a3f15ce14ed7f685bd33e47f01811740
+ size 2923434
20250312_12-29-01/global_step1090/layer_26-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ef066b845be9a6db1fe832d3914bdc5566e4a4211122f95f976d45574a1cd8c
+ size 2923434
20250312_12-29-01/global_step1090/layer_27-model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd9be3129ac29bda09ccc68213ce81bd9b2c337749ae1b8b0eaaa36a9c43b79d
+ size 2923434