RickyRubini committed (verified)
Commit af2bdcf · 1 Parent(s): 4f51542

Upload folder using huggingface_hub
README.md ADDED
@@ -0,0 +1,59 @@
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: llava-hf/llava-v1.6-mistral-7b-hf
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: train_2025-04-21-17-35-28
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # train_2025-04-21-17-35-28
18
+
19
+ This model is a fine-tuned version of [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) on the docker_command_dataset dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 0.0002
39
+ - train_batch_size: 2
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - gradient_accumulation_steps: 8
43
+ - total_train_batch_size: 16
44
+ - optimizer: AdamW (torch) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
45
+ - lr_scheduler_type: cosine
46
+ - num_epochs: 3.0
47
+ - mixed_precision_training: Native AMP
48
+
49
+ ### Training results
50
+
51
+
52
+
53
+ ### Framework versions
54
+
55
+ - PEFT 0.12.0
56
+ - Transformers 4.49.0
57
+ - Pytorch 2.6.0+cu124
58
+ - Datasets 3.2.0
59
+ - Tokenizers 0.21.0
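For reference, a minimal sketch of attaching this LoRA adapter to the base model with PEFT and Transformers. The adapter repo id below is a hypothetical placeholder and the prompt is only illustrative; a GPU with fp16 support is assumed.

```python
import torch
from peft import PeftModel
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor

BASE_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
ADAPTER_ID = "RickyRubini/train_2025-04-21-17-35-28"  # hypothetical repo id for this adapter

processor = LlavaNextProcessor.from_pretrained(BASE_ID)
model = LlavaNextForConditionalGeneration.from_pretrained(
    BASE_ID, torch_dtype=torch.float16, device_map="auto"
)
model = PeftModel.from_pretrained(model, ADAPTER_ID)  # load the LoRA weights on top
model.eval()

prompt = "[INST] Show me all running containers. [/INST]"
inputs = processor(text=prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=64)
print(processor.decode(output[0], skip_special_tokens=True))
```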
adapter_config.json ADDED
@@ -0,0 +1,251 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "llava-hf/llava-v1.6-mistral-7b-hf",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "language_model.model.layers.2.self_attn.o_proj",
24
+ "language_model.model.layers.30.self_attn.v_proj",
25
+ "language_model.model.layers.18.self_attn.q_proj",
26
+ "language_model.model.layers.9.self_attn.v_proj",
27
+ "language_model.model.layers.30.self_attn.k_proj",
28
+ "language_model.model.layers.0.self_attn.o_proj",
29
+ "language_model.model.layers.19.mlp.up_proj",
30
+ "language_model.model.layers.20.self_attn.v_proj",
31
+ "language_model.model.layers.12.mlp.up_proj",
32
+ "language_model.model.layers.11.mlp.up_proj",
33
+ "language_model.model.layers.5.self_attn.q_proj",
34
+ "language_model.model.layers.18.mlp.gate_proj",
35
+ "language_model.model.layers.4.self_attn.v_proj",
36
+ "language_model.model.layers.12.mlp.down_proj",
37
+ "language_model.model.layers.17.mlp.down_proj",
38
+ "language_model.model.layers.13.self_attn.k_proj",
39
+ "language_model.model.layers.11.self_attn.k_proj",
40
+ "language_model.model.layers.15.self_attn.k_proj",
41
+ "language_model.model.layers.2.self_attn.k_proj",
42
+ "language_model.model.layers.23.self_attn.v_proj",
43
+ "language_model.model.layers.31.mlp.down_proj",
44
+ "language_model.model.layers.0.mlp.up_proj",
45
+ "language_model.model.layers.5.mlp.gate_proj",
46
+ "language_model.model.layers.8.self_attn.q_proj",
47
+ "language_model.model.layers.12.self_attn.o_proj",
48
+ "language_model.model.layers.10.self_attn.q_proj",
49
+ "language_model.model.layers.28.self_attn.q_proj",
50
+ "language_model.model.layers.5.self_attn.v_proj",
51
+ "language_model.model.layers.16.self_attn.v_proj",
52
+ "language_model.model.layers.1.mlp.up_proj",
53
+ "language_model.model.layers.7.mlp.down_proj",
54
+ "language_model.model.layers.18.self_attn.v_proj",
55
+ "language_model.model.layers.23.self_attn.k_proj",
56
+ "language_model.model.layers.11.self_attn.v_proj",
57
+ "language_model.model.layers.8.mlp.down_proj",
58
+ "language_model.model.layers.1.self_attn.q_proj",
59
+ "language_model.model.layers.18.mlp.up_proj",
60
+ "language_model.model.layers.2.mlp.up_proj",
61
+ "language_model.model.layers.20.self_attn.q_proj",
62
+ "language_model.model.layers.11.self_attn.q_proj",
63
+ "language_model.model.layers.12.self_attn.q_proj",
64
+ "language_model.model.layers.3.mlp.up_proj",
65
+ "language_model.model.layers.25.self_attn.v_proj",
66
+ "language_model.model.layers.8.self_attn.o_proj",
67
+ "language_model.model.layers.25.mlp.down_proj",
68
+ "language_model.model.layers.28.self_attn.k_proj",
69
+ "language_model.model.layers.29.mlp.gate_proj",
70
+ "language_model.model.layers.16.mlp.gate_proj",
71
+ "language_model.model.layers.7.mlp.up_proj",
72
+ "language_model.model.layers.4.self_attn.k_proj",
73
+ "language_model.model.layers.31.self_attn.v_proj",
74
+ "language_model.model.layers.25.self_attn.o_proj",
75
+ "language_model.model.layers.28.mlp.up_proj",
76
+ "language_model.model.layers.26.mlp.down_proj",
77
+ "language_model.model.layers.17.self_attn.v_proj",
78
+ "language_model.model.layers.5.mlp.down_proj",
79
+ "language_model.model.layers.3.self_attn.o_proj",
80
+ "language_model.model.layers.28.self_attn.o_proj",
81
+ "language_model.model.layers.28.self_attn.v_proj",
82
+ "language_model.model.layers.0.self_attn.k_proj",
83
+ "language_model.model.layers.16.mlp.up_proj",
84
+ "language_model.model.layers.13.self_attn.q_proj",
85
+ "language_model.model.layers.7.self_attn.k_proj",
86
+ "language_model.model.layers.31.self_attn.o_proj",
87
+ "language_model.model.layers.10.self_attn.o_proj",
88
+ "language_model.model.layers.28.mlp.down_proj",
89
+ "language_model.model.layers.20.mlp.gate_proj",
90
+ "language_model.model.layers.7.self_attn.o_proj",
91
+ "language_model.model.layers.26.mlp.up_proj",
92
+ "language_model.model.layers.13.self_attn.v_proj",
93
+ "language_model.model.layers.13.mlp.down_proj",
94
+ "language_model.model.layers.29.mlp.down_proj",
95
+ "language_model.model.layers.17.self_attn.q_proj",
96
+ "language_model.model.layers.19.mlp.gate_proj",
97
+ "language_model.model.layers.9.mlp.down_proj",
98
+ "language_model.model.layers.7.mlp.gate_proj",
99
+ "language_model.model.layers.26.self_attn.q_proj",
100
+ "language_model.model.layers.14.mlp.gate_proj",
101
+ "language_model.model.layers.24.mlp.gate_proj",
102
+ "language_model.model.layers.9.self_attn.q_proj",
103
+ "language_model.model.layers.18.self_attn.k_proj",
104
+ "language_model.model.layers.4.self_attn.o_proj",
105
+ "language_model.model.layers.10.self_attn.k_proj",
106
+ "language_model.model.layers.29.self_attn.k_proj",
107
+ "language_model.model.layers.1.mlp.gate_proj",
108
+ "language_model.model.layers.2.self_attn.q_proj",
109
+ "language_model.model.layers.16.self_attn.q_proj",
110
+ "language_model.model.layers.13.mlp.up_proj",
111
+ "language_model.model.layers.30.mlp.down_proj",
112
+ "language_model.model.layers.21.mlp.gate_proj",
113
+ "language_model.model.layers.12.mlp.gate_proj",
114
+ "language_model.model.layers.13.self_attn.o_proj",
115
+ "language_model.model.layers.14.self_attn.q_proj",
116
+ "language_model.model.layers.15.self_attn.v_proj",
117
+ "language_model.model.layers.25.self_attn.k_proj",
118
+ "language_model.model.layers.24.mlp.up_proj",
119
+ "language_model.model.layers.26.self_attn.v_proj",
120
+ "language_model.model.layers.27.mlp.gate_proj",
121
+ "language_model.model.layers.15.mlp.down_proj",
122
+ "language_model.model.layers.29.mlp.up_proj",
123
+ "language_model.model.layers.15.self_attn.o_proj",
124
+ "language_model.model.layers.17.mlp.up_proj",
125
+ "language_model.model.layers.18.mlp.down_proj",
126
+ "language_model.model.layers.14.self_attn.o_proj",
127
+ "language_model.model.layers.18.self_attn.o_proj",
128
+ "language_model.model.layers.24.mlp.down_proj",
129
+ "language_model.model.layers.15.mlp.gate_proj",
130
+ "language_model.model.layers.10.mlp.gate_proj",
131
+ "language_model.model.layers.30.mlp.gate_proj",
132
+ "language_model.model.layers.17.mlp.gate_proj",
133
+ "language_model.model.layers.21.self_attn.o_proj",
134
+ "language_model.model.layers.27.mlp.down_proj",
135
+ "language_model.model.layers.10.mlp.down_proj",
136
+ "language_model.model.layers.11.self_attn.o_proj",
137
+ "language_model.model.layers.14.mlp.down_proj",
138
+ "language_model.model.layers.19.self_attn.v_proj",
139
+ "language_model.model.layers.24.self_attn.q_proj",
140
+ "language_model.model.layers.31.self_attn.q_proj",
141
+ "language_model.model.layers.9.self_attn.k_proj",
142
+ "language_model.model.layers.9.mlp.up_proj",
143
+ "language_model.model.layers.0.self_attn.q_proj",
144
+ "language_model.model.layers.21.self_attn.q_proj",
145
+ "language_model.model.layers.22.mlp.down_proj",
146
+ "language_model.model.layers.25.self_attn.q_proj",
147
+ "language_model.model.layers.7.self_attn.q_proj",
148
+ "language_model.model.layers.27.self_attn.o_proj",
149
+ "language_model.model.layers.28.mlp.gate_proj",
150
+ "language_model.model.layers.8.mlp.gate_proj",
151
+ "language_model.model.layers.19.self_attn.k_proj",
152
+ "language_model.model.layers.26.self_attn.o_proj",
153
+ "language_model.model.layers.23.mlp.down_proj",
154
+ "language_model.model.layers.0.mlp.down_proj",
155
+ "language_model.model.layers.2.mlp.down_proj",
156
+ "language_model.model.layers.19.self_attn.o_proj",
157
+ "language_model.model.layers.21.mlp.up_proj",
158
+ "language_model.model.layers.27.self_attn.k_proj",
159
+ "language_model.model.layers.2.mlp.gate_proj",
160
+ "language_model.model.layers.15.mlp.up_proj",
161
+ "language_model.model.layers.1.mlp.down_proj",
162
+ "language_model.model.layers.6.self_attn.v_proj",
163
+ "language_model.model.layers.26.self_attn.k_proj",
164
+ "language_model.model.layers.23.mlp.up_proj",
165
+ "language_model.model.layers.7.self_attn.v_proj",
166
+ "language_model.model.layers.3.self_attn.k_proj",
167
+ "language_model.model.layers.5.mlp.up_proj",
168
+ "language_model.model.layers.27.self_attn.q_proj",
169
+ "language_model.model.layers.11.mlp.down_proj",
170
+ "language_model.model.layers.19.self_attn.q_proj",
171
+ "language_model.model.layers.16.self_attn.k_proj",
172
+ "language_model.model.layers.17.self_attn.o_proj",
173
+ "language_model.model.layers.27.mlp.up_proj",
174
+ "language_model.model.layers.4.mlp.down_proj",
175
+ "language_model.model.layers.22.mlp.up_proj",
176
+ "language_model.model.layers.19.mlp.down_proj",
177
+ "language_model.model.layers.3.self_attn.v_proj",
178
+ "language_model.model.layers.17.self_attn.k_proj",
179
+ "language_model.model.layers.8.self_attn.k_proj",
180
+ "language_model.model.layers.20.mlp.down_proj",
181
+ "language_model.model.layers.29.self_attn.q_proj",
182
+ "language_model.model.layers.14.self_attn.v_proj",
183
+ "language_model.model.layers.25.mlp.gate_proj",
184
+ "language_model.model.layers.29.self_attn.o_proj",
185
+ "language_model.model.layers.16.self_attn.o_proj",
186
+ "language_model.model.layers.16.mlp.down_proj",
187
+ "language_model.model.layers.23.self_attn.q_proj",
188
+ "language_model.model.layers.30.self_attn.q_proj",
189
+ "language_model.model.layers.6.mlp.gate_proj",
190
+ "language_model.model.layers.10.self_attn.v_proj",
191
+ "language_model.model.layers.12.self_attn.k_proj",
192
+ "language_model.model.layers.21.mlp.down_proj",
193
+ "language_model.model.layers.31.mlp.gate_proj",
194
+ "language_model.model.layers.25.mlp.up_proj",
195
+ "language_model.model.layers.5.self_attn.o_proj",
196
+ "language_model.model.layers.15.self_attn.q_proj",
197
+ "language_model.model.layers.4.mlp.gate_proj",
198
+ "language_model.model.layers.29.self_attn.v_proj",
199
+ "language_model.model.layers.30.mlp.up_proj",
200
+ "language_model.model.layers.12.self_attn.v_proj",
201
+ "language_model.model.layers.30.self_attn.o_proj",
202
+ "language_model.model.layers.4.self_attn.q_proj",
203
+ "language_model.model.layers.8.mlp.up_proj",
204
+ "language_model.model.layers.31.mlp.up_proj",
205
+ "language_model.model.layers.6.self_attn.q_proj",
206
+ "language_model.model.layers.6.mlp.down_proj",
207
+ "language_model.model.layers.4.mlp.up_proj",
208
+ "language_model.model.layers.31.self_attn.k_proj",
209
+ "language_model.model.layers.24.self_attn.o_proj",
210
+ "language_model.model.layers.22.self_attn.v_proj",
211
+ "language_model.model.layers.22.self_attn.q_proj",
212
+ "language_model.model.layers.21.self_attn.k_proj",
213
+ "language_model.model.layers.2.self_attn.v_proj",
214
+ "language_model.model.layers.22.mlp.gate_proj",
215
+ "language_model.model.layers.6.mlp.up_proj",
216
+ "language_model.model.layers.6.self_attn.k_proj",
217
+ "language_model.model.layers.26.mlp.gate_proj",
218
+ "language_model.model.layers.24.self_attn.k_proj",
219
+ "language_model.model.layers.3.self_attn.q_proj",
220
+ "language_model.model.layers.14.mlp.up_proj",
221
+ "language_model.model.layers.1.self_attn.k_proj",
222
+ "language_model.model.layers.9.mlp.gate_proj",
223
+ "language_model.model.layers.24.self_attn.v_proj",
224
+ "language_model.model.layers.1.self_attn.v_proj",
225
+ "language_model.model.layers.11.mlp.gate_proj",
226
+ "language_model.model.layers.0.mlp.gate_proj",
227
+ "language_model.model.layers.9.self_attn.o_proj",
228
+ "language_model.model.layers.8.self_attn.v_proj",
229
+ "language_model.model.layers.20.mlp.up_proj",
230
+ "language_model.model.layers.21.self_attn.v_proj",
231
+ "language_model.model.layers.22.self_attn.o_proj",
232
+ "language_model.model.layers.20.self_attn.k_proj",
233
+ "language_model.model.layers.0.self_attn.v_proj",
234
+ "language_model.model.layers.1.self_attn.o_proj",
235
+ "language_model.model.layers.3.mlp.gate_proj",
236
+ "language_model.model.layers.3.mlp.down_proj",
237
+ "language_model.model.layers.6.self_attn.o_proj",
238
+ "language_model.model.layers.13.mlp.gate_proj",
239
+ "language_model.model.layers.20.self_attn.o_proj",
240
+ "language_model.model.layers.22.self_attn.k_proj",
241
+ "language_model.model.layers.23.self_attn.o_proj",
242
+ "language_model.model.layers.5.self_attn.k_proj",
243
+ "language_model.model.layers.10.mlp.up_proj",
244
+ "language_model.model.layers.14.self_attn.k_proj",
245
+ "language_model.model.layers.27.self_attn.v_proj",
246
+ "language_model.model.layers.23.mlp.gate_proj"
247
+ ],
248
+ "task_type": "CAUSAL_LM",
249
+ "use_dora": false,
250
+ "use_rslora": false
251
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd4a7bb5d5c126c0322e7b8e437c498f751c2b0d4984f8ea1ceb63058f802aa5
3
+ size 83952016
added_tokens.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "<image>": 32000,
3
+ "<pad>": 32001
4
+ }
all_results.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "epoch": 0.6423841059602649,
3
+ "num_input_tokens_seen": 1916832,
4
+ "total_flos": 8.57490291718226e+16,
5
+ "train_loss": 1.3051114364997627,
6
+ "train_runtime": 5464.1136,
7
+ "train_samples_per_second": 1.326,
8
+ "train_steps_per_second": 0.083
9
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '<<SYS>>\n' + message['content'][0]['text'] + '\n<</SYS>>\n\n' }}{% elif message['role'] == 'user' %}{{ '[INST] ' }}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] }}{% endfor %}{{' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'][0]['text'] + '</s> '}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
3
+ }
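A short sketch of how this template renders through the processor's `apply_chat_template`; the message content is illustrative and assumes the processor loaded in the earlier usage sketch.

```python
# Assumes `processor` was loaded as in the usage sketch above.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Which docker command stops this container?"},
        ],
    },
]
# Renders to: "[INST] <image>\nWhich docker command stops this container? [/INST]"
prompt = processor.apply_chat_template(messages)
```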
llamaboard_config.yaml ADDED
@@ -0,0 +1,78 @@
1
+ top.booster: auto
2
+ top.checkpoint_path: []
3
+ top.finetuning_type: lora
4
+ top.model_name: LLaVA-NeXT-Mistral-7B-Chat
5
+ top.quantization_bit: '4'
6
+ top.quantization_method: bitsandbytes
7
+ top.rope_scaling: none
8
+ top.template: llava_next_mistral
9
+ train.additional_target: ''
10
+ train.apollo_rank: 16
11
+ train.apollo_scale: 32
12
+ train.apollo_target: all
13
+ train.apollo_update_interval: 200
14
+ train.badam_mode: layer
15
+ train.badam_switch_interval: 50
16
+ train.badam_switch_mode: ascending
17
+ train.badam_update_ratio: 0.05
18
+ train.batch_size: 2
19
+ train.compute_type: fp16
20
+ train.create_new_adapter: false
21
+ train.cutoff_len: 2048
22
+ train.dataset:
23
+ - docker_command_dataset
24
+ train.dataset_dir: data
25
+ train.ds_offload: false
26
+ train.ds_stage: none
27
+ train.extra_args: '{"optim": "adamw_torch"}'
28
+ train.freeze_extra_modules: ''
29
+ train.freeze_trainable_layers: 2
30
+ train.freeze_trainable_modules: all
31
+ train.galore_rank: 16
32
+ train.galore_scale: 2
33
+ train.galore_target: all
34
+ train.galore_update_interval: 200
35
+ train.gradient_accumulation_steps: 8
36
+ train.learning_rate: 2e-4
37
+ train.logging_steps: 5
38
+ train.lora_alpha: 16
39
+ train.lora_dropout: 0
40
+ train.lora_rank: 8
41
+ train.lora_target: ''
42
+ train.loraplus_lr_ratio: 0
43
+ train.lr_scheduler_type: cosine
44
+ train.mask_history: false
45
+ train.max_grad_norm: '0.3'
46
+ train.max_samples: '100000'
47
+ train.neat_packing: false
48
+ train.neftune_alpha: 0
49
+ train.num_train_epochs: '3.0'
50
+ train.packing: false
51
+ train.ppo_score_norm: false
52
+ train.ppo_whiten_rewards: false
53
+ train.pref_beta: 0.1
54
+ train.pref_ftx: 0
55
+ train.pref_loss: sigmoid
56
+ train.report_to:
57
+ - none
58
+ train.resize_vocab: false
59
+ train.reward_model: []
60
+ train.save_steps: 100
61
+ train.swanlab_api_key: ''
62
+ train.swanlab_link: ''
63
+ train.swanlab_mode: cloud
64
+ train.swanlab_project: llamafactory
65
+ train.swanlab_run_name: ''
66
+ train.swanlab_workspace: ''
67
+ train.train_on_prompt: false
68
+ train.training_stage: Supervised Fine-Tuning
69
+ train.use_apollo: false
70
+ train.use_badam: false
71
+ train.use_dora: false
72
+ train.use_galore: false
73
+ train.use_llama_pro: false
74
+ train.use_pissa: false
75
+ train.use_rslora: false
76
+ train.use_swanlab: false
77
+ train.val_size: 0
78
+ train.warmup_steps: 0
preprocessor_config.json ADDED
@@ -0,0 +1,52 @@
1
+ {
2
+ "aspect_ratio_setting": "anyres",
3
+ "crop_size": {
4
+ "height": 336,
5
+ "width": 336
6
+ },
7
+ "do_center_crop": true,
8
+ "do_convert_rgb": true,
9
+ "do_normalize": true,
10
+ "do_pad": true,
11
+ "do_rescale": true,
12
+ "do_resize": true,
13
+ "image_grid_pinpoints": [
14
+ [
15
+ 336,
16
+ 672
17
+ ],
18
+ [
19
+ 672,
20
+ 336
21
+ ],
22
+ [
23
+ 672,
24
+ 672
25
+ ],
26
+ [
27
+ 1008,
28
+ 336
29
+ ],
30
+ [
31
+ 336,
32
+ 1008
33
+ ]
34
+ ],
35
+ "image_mean": [
36
+ 0.48145466,
37
+ 0.4578275,
38
+ 0.40821073
39
+ ],
40
+ "image_processor_type": "LlavaNextImageProcessor",
41
+ "image_std": [
42
+ 0.26862954,
43
+ 0.26130258,
44
+ 0.27577711
45
+ ],
46
+ "processor_class": "LlavaNextProcessor",
47
+ "resample": 3,
48
+ "rescale_factor": 0.00392156862745098,
49
+ "size": {
50
+ "shortest_edge": 336
51
+ }
52
+ }
processor_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "image_token": "<image>",
3
+ "num_additional_image_tokens": 1,
4
+ "patch_size": 14,
5
+ "processor_class": "LlavaNextProcessor",
6
+ "vision_feature_select_strategy": "default"
7
+ }
running_log.txt ADDED
@@ -0,0 +1,516 @@
1
+ [INFO|2025-04-21 17:36:49] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/config.json
2
+
3
+ [INFO|2025-04-21 17:36:49] configuration_utils.py:771 >> Model config LlavaNextConfig {
4
+ "_name_or_path": "llava-hf/llava-v1.6-mistral-7b-hf",
5
+ "architectures": [
6
+ "LlavaNextForConditionalGeneration"
7
+ ],
8
+ "ignore_index": -100,
9
+ "image_grid_pinpoints": [
10
+ [
11
+ 336,
12
+ 672
13
+ ],
14
+ [
15
+ 672,
16
+ 336
17
+ ],
18
+ [
19
+ 672,
20
+ 672
21
+ ],
22
+ [
23
+ 1008,
24
+ 336
25
+ ],
26
+ [
27
+ 336,
28
+ 1008
29
+ ]
30
+ ],
31
+ "image_seq_length": 576,
32
+ "image_token_index": 32000,
33
+ "model_type": "llava_next",
34
+ "multimodal_projector_bias": true,
35
+ "projector_hidden_act": "gelu",
36
+ "text_config": {
37
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
38
+ "architectures": [
39
+ "MistralForCausalLM"
40
+ ],
41
+ "intermediate_size": 14336,
42
+ "max_position_embeddings": 32768,
43
+ "model_type": "mistral",
44
+ "num_key_value_heads": 8,
45
+ "rms_norm_eps": 1e-05,
46
+ "rope_theta": 1000000.0,
47
+ "sliding_window": null,
48
+ "torch_dtype": "bfloat16",
49
+ "vocab_size": 32064
50
+ },
51
+ "tie_word_embeddings": false,
52
+ "torch_dtype": "float16",
53
+ "transformers_version": "4.49.0",
54
+ "use_image_newline_parameter": true,
55
+ "vision_config": {
56
+ "hidden_size": 1024,
57
+ "image_size": 336,
58
+ "intermediate_size": 4096,
59
+ "model_type": "clip_vision_model",
60
+ "num_attention_heads": 16,
61
+ "num_hidden_layers": 24,
62
+ "patch_size": 14,
63
+ "projection_dim": 768,
64
+ "vocab_size": 32000
65
+ },
66
+ "vision_feature_layer": -2,
67
+ "vision_feature_select_strategy": "default",
68
+ "vocab_size": 32064
69
+ }
70
+
71
+
72
+ [INFO|2025-04-21 17:36:49] tokenization_utils_base.py:2050 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/tokenizer.model
73
+
74
+ [INFO|2025-04-21 17:36:49] tokenization_utils_base.py:2050 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/tokenizer.json
75
+
76
+ [INFO|2025-04-21 17:36:49] tokenization_utils_base.py:2050 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/added_tokens.json
77
+
78
+ [INFO|2025-04-21 17:36:49] tokenization_utils_base.py:2050 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/special_tokens_map.json
79
+
80
+ [INFO|2025-04-21 17:36:49] tokenization_utils_base.py:2050 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/tokenizer_config.json
81
+
82
+ [INFO|2025-04-21 17:36:49] tokenization_utils_base.py:2050 >> loading file chat_template.jinja from cache at None
83
+
84
+ [INFO|2025-04-21 17:36:49] tokenization_utils_base.py:2313 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
85
+
86
+ [INFO|2025-04-21 17:36:50] processing_utils.py:816 >> loading configuration file processor_config.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/processor_config.json
87
+
88
+ [INFO|2025-04-21 17:36:50] image_processing_base.py:381 >> loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/preprocessor_config.json
89
+
90
+ [WARNING|2025-04-21 17:36:50] logging.py:329 >> Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
91
+
92
+ [INFO|2025-04-21 17:36:50] image_processing_base.py:434 >> Image processor LlavaNextImageProcessor {
93
+ "aspect_ratio_setting": "anyres",
94
+ "crop_size": {
95
+ "height": 336,
96
+ "width": 336
97
+ },
98
+ "do_center_crop": true,
99
+ "do_convert_rgb": true,
100
+ "do_normalize": true,
101
+ "do_pad": true,
102
+ "do_rescale": true,
103
+ "do_resize": true,
104
+ "image_grid_pinpoints": [
105
+ [
106
+ 336,
107
+ 672
108
+ ],
109
+ [
110
+ 672,
111
+ 336
112
+ ],
113
+ [
114
+ 672,
115
+ 672
116
+ ],
117
+ [
118
+ 1008,
119
+ 336
120
+ ],
121
+ [
122
+ 336,
123
+ 1008
124
+ ]
125
+ ],
126
+ "image_mean": [
127
+ 0.48145466,
128
+ 0.4578275,
129
+ 0.40821073
130
+ ],
131
+ "image_processor_type": "LlavaNextImageProcessor",
132
+ "image_std": [
133
+ 0.26862954,
134
+ 0.26130258,
135
+ 0.27577711
136
+ ],
137
+ "processor_class": "LlavaNextProcessor",
138
+ "resample": 3,
139
+ "rescale_factor": 0.00392156862745098,
140
+ "size": {
141
+ "shortest_edge": 336
142
+ }
143
+ }
144
+
145
+
146
+ [INFO|2025-04-21 17:36:50] tokenization_utils_base.py:2050 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/tokenizer.model
147
+
148
+ [INFO|2025-04-21 17:36:50] tokenization_utils_base.py:2050 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/tokenizer.json
149
+
150
+ [INFO|2025-04-21 17:36:50] tokenization_utils_base.py:2050 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/added_tokens.json
151
+
152
+ [INFO|2025-04-21 17:36:50] tokenization_utils_base.py:2050 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/special_tokens_map.json
153
+
154
+ [INFO|2025-04-21 17:36:50] tokenization_utils_base.py:2050 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/tokenizer_config.json
155
+
156
+ [INFO|2025-04-21 17:36:50] tokenization_utils_base.py:2050 >> loading file chat_template.jinja from cache at None
157
+
158
+ [INFO|2025-04-21 17:36:50] tokenization_utils_base.py:2313 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
159
+
160
+ [INFO|2025-04-21 17:36:50] processing_utils.py:816 >> loading configuration file processor_config.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/processor_config.json
161
+
162
+ [INFO|2025-04-21 17:36:51] processing_utils.py:876 >> Processor LlavaNextProcessor:
163
+ - image_processor: LlavaNextImageProcessor {
164
+ "aspect_ratio_setting": "anyres",
165
+ "crop_size": {
166
+ "height": 336,
167
+ "width": 336
168
+ },
169
+ "do_center_crop": true,
170
+ "do_convert_rgb": true,
171
+ "do_normalize": true,
172
+ "do_pad": true,
173
+ "do_rescale": true,
174
+ "do_resize": true,
175
+ "image_grid_pinpoints": [
176
+ [
177
+ 336,
178
+ 672
179
+ ],
180
+ [
181
+ 672,
182
+ 336
183
+ ],
184
+ [
185
+ 672,
186
+ 672
187
+ ],
188
+ [
189
+ 1008,
190
+ 336
191
+ ],
192
+ [
193
+ 336,
194
+ 1008
195
+ ]
196
+ ],
197
+ "image_mean": [
198
+ 0.48145466,
199
+ 0.4578275,
200
+ 0.40821073
201
+ ],
202
+ "image_processor_type": "LlavaNextImageProcessor",
203
+ "image_std": [
204
+ 0.26862954,
205
+ 0.26130258,
206
+ 0.27577711
207
+ ],
208
+ "processor_class": "LlavaNextProcessor",
209
+ "resample": 3,
210
+ "rescale_factor": 0.00392156862745098,
211
+ "size": {
212
+ "shortest_edge": 336
213
+ }
214
+ }
215
+
216
+ - tokenizer: LlamaTokenizerFast(name_or_path='llava-hf/llava-v1.6-mistral-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'image_token': '<image>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
217
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
218
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
219
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
220
+ 32000: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
221
+ 32001: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
222
+ }
223
+ )
224
+
225
+ {
226
+ "image_token": "<image>",
227
+ "num_additional_image_tokens": 1,
228
+ "patch_size": 14,
229
+ "processor_class": "LlavaNextProcessor",
230
+ "vision_feature_select_strategy": "default"
231
+ }
232
+
233
+
234
+ [INFO|2025-04-21 17:36:51] logging.py:157 >> Loading dataset MattCoddity/dockerNLcommands...
235
+
236
+ [INFO|2025-04-21 17:36:53] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/config.json
237
+
238
+ [INFO|2025-04-21 17:36:53] configuration_utils.py:771 >> Model config LlavaNextConfig {
239
+ "_name_or_path": "llava-hf/llava-v1.6-mistral-7b-hf",
240
+ "architectures": [
241
+ "LlavaNextForConditionalGeneration"
242
+ ],
243
+ "ignore_index": -100,
244
+ "image_grid_pinpoints": [
245
+ [
246
+ 336,
247
+ 672
248
+ ],
249
+ [
250
+ 672,
251
+ 336
252
+ ],
253
+ [
254
+ 672,
255
+ 672
256
+ ],
257
+ [
258
+ 1008,
259
+ 336
260
+ ],
261
+ [
262
+ 336,
263
+ 1008
264
+ ]
265
+ ],
266
+ "image_seq_length": 576,
267
+ "image_token_index": 32000,
268
+ "model_type": "llava_next",
269
+ "multimodal_projector_bias": true,
270
+ "projector_hidden_act": "gelu",
271
+ "text_config": {
272
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
273
+ "architectures": [
274
+ "MistralForCausalLM"
275
+ ],
276
+ "intermediate_size": 14336,
277
+ "max_position_embeddings": 32768,
278
+ "model_type": "mistral",
279
+ "num_key_value_heads": 8,
280
+ "rms_norm_eps": 1e-05,
281
+ "rope_theta": 1000000.0,
282
+ "sliding_window": null,
283
+ "torch_dtype": "bfloat16",
284
+ "vocab_size": 32064
285
+ },
286
+ "tie_word_embeddings": false,
287
+ "torch_dtype": "float16",
288
+ "transformers_version": "4.49.0",
289
+ "use_image_newline_parameter": true,
290
+ "vision_config": {
291
+ "hidden_size": 1024,
292
+ "image_size": 336,
293
+ "intermediate_size": 4096,
294
+ "model_type": "clip_vision_model",
295
+ "num_attention_heads": 16,
296
+ "num_hidden_layers": 24,
297
+ "patch_size": 14,
298
+ "projection_dim": 768,
299
+ "vocab_size": 32000
300
+ },
301
+ "vision_feature_layer": -2,
302
+ "vision_feature_select_strategy": "default",
303
+ "vocab_size": 32064
304
+ }
305
+
306
+
307
+ [INFO|2025-04-21 17:36:53] logging.py:157 >> Quantizing model to 4 bit with bitsandbytes.
308
+
309
+ [INFO|2025-04-21 17:36:53] modeling_utils.py:3982 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/model.safetensors.index.json
310
+
311
+ [INFO|2025-04-21 17:36:53] modeling_utils.py:1633 >> Instantiating LlavaNextForConditionalGeneration model under default dtype torch.float16.
312
+
313
+ [INFO|2025-04-21 17:36:53] configuration_utils.py:1140 >> Generate config GenerationConfig {}
314
+
315
+
316
+ [INFO|2025-04-21 17:36:54] modeling_utils.py:1633 >> Instantiating CLIPVisionModel model under default dtype torch.float16.
317
+
318
+ [INFO|2025-04-21 17:36:54] modeling_utils.py:1633 >> Instantiating MistralForCausalLM model under default dtype torch.float16.
319
+
320
+ [INFO|2025-04-21 17:36:54] configuration_utils.py:1140 >> Generate config GenerationConfig {
321
+ "bos_token_id": 1,
322
+ "eos_token_id": 2
323
+ }
324
+
325
+
326
+ [INFO|2025-04-21 17:37:57] modeling_utils.py:4970 >> All model checkpoint weights were used when initializing LlavaNextForConditionalGeneration.
327
+
328
+
329
+ [INFO|2025-04-21 17:37:57] modeling_utils.py:4978 >> All the weights of LlavaNextForConditionalGeneration were initialized from the model checkpoint at llava-hf/llava-v1.6-mistral-7b-hf.
330
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use LlavaNextForConditionalGeneration for predictions without further training.
331
+
332
+ [INFO|2025-04-21 17:37:58] configuration_utils.py:1095 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/generation_config.json
333
+
334
+ [INFO|2025-04-21 17:37:58] configuration_utils.py:1140 >> Generate config GenerationConfig {
335
+ "bos_token_id": 1,
336
+ "eos_token_id": 2
337
+ }
338
+
339
+
340
+ [INFO|2025-04-21 17:37:58] logging.py:157 >> Gradient checkpointing enabled.
341
+
342
+ [INFO|2025-04-21 17:37:58] logging.py:157 >> Casting multimodal projector outputs in torch.float16.
343
+
344
+ [INFO|2025-04-21 17:37:58] logging.py:157 >> Using torch SDPA for faster training and inference.
345
+
346
+ [INFO|2025-04-21 17:37:58] logging.py:157 >> Upcasting trainable params to float32.
347
+
348
+ [INFO|2025-04-21 17:37:58] logging.py:157 >> Fine-tuning method: LoRA
349
+
350
+ [INFO|2025-04-21 17:37:58] logging.py:157 >> Found linear modules: q_proj,v_proj,k_proj,gate_proj,up_proj,o_proj,down_proj
351
+
352
+ [INFO|2025-04-21 17:37:58] logging.py:157 >> Set vision model not trainable: ['vision_tower'].
353
+
354
+ [INFO|2025-04-21 17:37:58] logging.py:157 >> Set multi model projector not trainable: multi_modal_projector.
355
+
356
+ [INFO|2025-04-21 17:37:58] logging.py:157 >> trainable params: 20,971,520 || all params: 7,587,719,168 || trainable%: 0.2764
357
+
358
+ [INFO|2025-04-21 17:37:58] trainer.py:746 >> Using auto half precision backend
359
+
360
+ [WARNING|2025-04-21 17:37:58] trainer.py:781 >> No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
361
+
362
+ [INFO|2025-04-21 17:37:59] trainer.py:2405 >> ***** Running training *****
363
+
364
+ [INFO|2025-04-21 17:37:59] trainer.py:2406 >> Num examples = 2,415
365
+
366
+ [INFO|2025-04-21 17:37:59] trainer.py:2407 >> Num Epochs = 3
367
+
368
+ [INFO|2025-04-21 17:37:59] trainer.py:2408 >> Instantaneous batch size per device = 2
369
+
370
+ [INFO|2025-04-21 17:37:59] trainer.py:2411 >> Total train batch size (w. parallel, distributed & accumulation) = 16
371
+
372
+ [INFO|2025-04-21 17:37:59] trainer.py:2412 >> Gradient Accumulation steps = 8
373
+
374
+ [INFO|2025-04-21 17:37:59] trainer.py:2413 >> Total optimization steps = 453
375
+
376
+ [INFO|2025-04-21 17:37:59] trainer.py:2414 >> Number of trainable parameters = 20,971,520
377
+
378
+ [WARNING|2025-04-21 17:38:00] logging.py:329 >> `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
379
+
380
+ [INFO|2025-04-21 17:42:39] logging.py:157 >> {'loss': 10.8058, 'learning_rate': 1.9998e-04, 'epoch': 0.03, 'throughput': 350.83}
381
+
382
+ [INFO|2025-04-21 17:47:20] logging.py:157 >> {'loss': 2.5613, 'learning_rate': 1.9985e-04, 'epoch': 0.07, 'throughput': 350.51}
383
+
384
+ [INFO|2025-04-21 17:52:02] logging.py:157 >> {'loss': 1.4579, 'learning_rate': 1.9959e-04, 'epoch': 0.10, 'throughput': 350.51}
385
+
386
+ [INFO|2025-04-21 17:56:43] logging.py:157 >> {'loss': 1.1407, 'learning_rate': 1.9922e-04, 'epoch': 0.13, 'throughput': 350.55}
387
+
388
+ [INFO|2025-04-21 18:01:24] logging.py:157 >> {'loss': 0.7902, 'learning_rate': 1.9873e-04, 'epoch': 0.17, 'throughput': 350.53}
389
+
390
+ [INFO|2025-04-21 18:06:04] logging.py:157 >> {'loss': 0.9830, 'learning_rate': 1.9812e-04, 'epoch': 0.20, 'throughput': 350.53}
391
+
392
+ [INFO|2025-04-21 18:10:45] logging.py:157 >> {'loss': 0.9299, 'learning_rate': 1.9739e-04, 'epoch': 0.23, 'throughput': 350.57}
393
+
394
+ [INFO|2025-04-21 18:15:26] logging.py:157 >> {'loss': 0.6314, 'learning_rate': 1.9655e-04, 'epoch': 0.26, 'throughput': 350.66}
395
+
396
+ [INFO|2025-04-21 18:20:06] logging.py:157 >> {'loss': 0.8311, 'learning_rate': 1.9559e-04, 'epoch': 0.30, 'throughput': 350.69}
397
+
398
+ [INFO|2025-04-21 18:24:46] logging.py:157 >> {'loss': 0.5493, 'learning_rate': 1.9451e-04, 'epoch': 0.33, 'throughput': 350.73}
399
+
400
+ [INFO|2025-04-21 18:29:27] logging.py:157 >> {'loss': 0.4694, 'learning_rate': 1.9332e-04, 'epoch': 0.36, 'throughput': 350.78}
401
+
402
+ [INFO|2025-04-21 18:34:07] logging.py:157 >> {'loss': 0.5595, 'learning_rate': 1.9202e-04, 'epoch': 0.40, 'throughput': 350.83}
403
+
404
+ [INFO|2025-04-21 18:38:47] logging.py:157 >> {'loss': 0.2787, 'learning_rate': 1.9061e-04, 'epoch': 0.43, 'throughput': 350.84}
405
+
406
+ [INFO|2025-04-21 18:43:28] logging.py:157 >> {'loss': 0.5269, 'learning_rate': 1.8971e-04, 'epoch': 0.46, 'throughput': 350.83}
407
+
408
+ [INFO|2025-04-21 18:48:09] logging.py:157 >> {'loss': 0.7782, 'learning_rate': 1.8812e-04, 'epoch': 0.50, 'throughput': 350.82}
409
+
410
+ [INFO|2025-04-21 18:52:48] logging.py:157 >> {'loss': 0.5458, 'learning_rate': 1.8643e-04, 'epoch': 0.53, 'throughput': 350.83}
411
+
412
+ [INFO|2025-04-21 18:57:27] logging.py:157 >> {'loss': 0.3148, 'learning_rate': 1.8463e-04, 'epoch': 0.56, 'throughput': 350.84}
413
+
414
+ [INFO|2025-04-21 19:02:08] logging.py:157 >> {'loss': 0.3010, 'learning_rate': 1.8274e-04, 'epoch': 0.60, 'throughput': 350.83}
415
+
416
+ [INFO|2025-04-21 19:06:49] logging.py:157 >> {'loss': 0.6369, 'learning_rate': 1.8074e-04, 'epoch': 0.63, 'throughput': 350.82}
417
+
418
+ [INFO|2025-04-21 19:09:03] trainer.py:2657 >>
419
+
420
+ Training completed. Do not forget to share your model on huggingface.co/models =)
421
+
422
+
423
+
424
+ [INFO|2025-04-21 19:09:03] image_processing_base.py:261 >> Image processor saved in saves/LLaVA-NeXT-Mistral-7B-Chat/lora/train_2025-04-21-17-35-28/preprocessor_config.json
425
+
426
+ [INFO|2025-04-21 19:09:03] tokenization_utils_base.py:2500 >> tokenizer config file saved in saves/LLaVA-NeXT-Mistral-7B-Chat/lora/train_2025-04-21-17-35-28/tokenizer_config.json
427
+
428
+ [INFO|2025-04-21 19:09:03] tokenization_utils_base.py:2509 >> Special tokens file saved in saves/LLaVA-NeXT-Mistral-7B-Chat/lora/train_2025-04-21-17-35-28/special_tokens_map.json
429
+
430
+ [INFO|2025-04-21 19:09:03] processing_utils.py:638 >> chat template saved in saves/LLaVA-NeXT-Mistral-7B-Chat/lora/train_2025-04-21-17-35-28/chat_template.json
431
+
432
+ [INFO|2025-04-21 19:09:03] processing_utils.py:644 >> processor saved in saves/LLaVA-NeXT-Mistral-7B-Chat/lora/train_2025-04-21-17-35-28/processor_config.json
433
+
434
+ [INFO|2025-04-21 19:09:03] trainer.py:3942 >> Saving model checkpoint to saves/LLaVA-NeXT-Mistral-7B-Chat/lora/train_2025-04-21-17-35-28
435
+
436
+ [INFO|2025-04-21 19:09:03] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--llava-hf--llava-v1.6-mistral-7b-hf/snapshots/144bfb964d4eef1502a22af4c5ff20d0d4a94cc1/config.json
437
+
438
+ [INFO|2025-04-21 19:09:03] configuration_utils.py:771 >> Model config LlavaNextConfig {
439
+ "architectures": [
440
+ "LlavaNextForConditionalGeneration"
441
+ ],
442
+ "ignore_index": -100,
443
+ "image_grid_pinpoints": [
444
+ [
445
+ 336,
446
+ 672
447
+ ],
448
+ [
449
+ 672,
450
+ 336
451
+ ],
452
+ [
453
+ 672,
454
+ 672
455
+ ],
456
+ [
457
+ 1008,
458
+ 336
459
+ ],
460
+ [
461
+ 336,
462
+ 1008
463
+ ]
464
+ ],
465
+ "image_seq_length": 576,
466
+ "image_token_index": 32000,
467
+ "model_type": "llava_next",
468
+ "multimodal_projector_bias": true,
469
+ "projector_hidden_act": "gelu",
470
+ "text_config": {
471
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
472
+ "architectures": [
473
+ "MistralForCausalLM"
474
+ ],
475
+ "intermediate_size": 14336,
476
+ "max_position_embeddings": 32768,
477
+ "model_type": "mistral",
478
+ "num_key_value_heads": 8,
479
+ "rms_norm_eps": 1e-05,
480
+ "rope_theta": 1000000.0,
481
+ "sliding_window": null,
482
+ "torch_dtype": "bfloat16",
483
+ "vocab_size": 32064
484
+ },
485
+ "tie_word_embeddings": false,
486
+ "torch_dtype": "float16",
487
+ "transformers_version": "4.49.0",
488
+ "use_image_newline_parameter": true,
489
+ "vision_config": {
490
+ "hidden_size": 1024,
491
+ "image_size": 336,
492
+ "intermediate_size": 4096,
493
+ "model_type": "clip_vision_model",
494
+ "num_attention_heads": 16,
495
+ "num_hidden_layers": 24,
496
+ "patch_size": 14,
497
+ "projection_dim": 768,
498
+ "vocab_size": 32000
499
+ },
500
+ "vision_feature_layer": -2,
501
+ "vision_feature_select_strategy": "default",
502
+ "vocab_size": 32064
503
+ }
504
+
505
+
506
+ [INFO|2025-04-21 19:09:04] tokenization_utils_base.py:2500 >> tokenizer config file saved in saves/LLaVA-NeXT-Mistral-7B-Chat/lora/train_2025-04-21-17-35-28/tokenizer_config.json
507
+
508
+ [INFO|2025-04-21 19:09:04] tokenization_utils_base.py:2509 >> Special tokens file saved in saves/LLaVA-NeXT-Mistral-7B-Chat/lora/train_2025-04-21-17-35-28/special_tokens_map.json
509
+
510
+ [WARNING|2025-04-21 19:09:04] logging.py:162 >> No metric eval_loss to plot.
511
+
512
+ [WARNING|2025-04-21 19:09:04] logging.py:162 >> No metric eval_accuracy to plot.
513
+
514
+ [INFO|2025-04-21 19:09:04] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
515
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
516
+
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "image_token": "<image>",
17
+ "pad_token": {
18
+ "content": "<pad>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "unk_token": {
25
+ "content": "<unk>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "<image>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "32001": {
39
+ "content": "<pad>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ }
46
+ },
47
+ "additional_special_tokens": [],
48
+ "bos_token": "<s>",
49
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
50
+ "clean_up_tokenization_spaces": false,
51
+ "eos_token": "</s>",
52
+ "extra_special_tokens": {
53
+ "image_token": "<image>"
54
+ },
55
+ "image_token": "<image>",
56
+ "legacy": true,
57
+ "max_length": null,
58
+ "model_max_length": 2048,
59
+ "pad_to_multiple_of": null,
60
+ "pad_token": "<pad>",
61
+ "pad_token_type_id": 0,
62
+ "padding_side": "right",
63
+ "processor_class": "LlavaNextProcessor",
64
+ "sp_model_kwargs": {},
65
+ "spaces_between_special_tokens": false,
66
+ "split_special_tokens": false,
67
+ "tokenizer_class": "LlamaTokenizer",
68
+ "unk_token": "<unk>",
69
+ "use_default_system_prompt": false
70
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "epoch": 0.6423841059602649,
3
+ "num_input_tokens_seen": 1916832,
4
+ "total_flos": 8.57490291718226e+16,
5
+ "train_loss": 1.3051114364997627,
6
+ "train_runtime": 5464.1136,
7
+ "train_samples_per_second": 1.326,
8
+ "train_steps_per_second": 0.083
9
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,20 @@
1
+ {"current_steps": 5, "total_steps": 453, "loss": 10.8058, "lr": 0.0001999783578606323, "epoch": 0.033112582781456956, "percentage": 1.1, "elapsed_time": "0:04:40", "remaining_time": "6:59:06", "throughput": 350.83, "total_tokens": 98464}
2
+ {"current_steps": 10, "total_steps": 453, "loss": 2.5613, "lr": 0.00019984613426472932, "epoch": 0.06622516556291391, "percentage": 2.21, "elapsed_time": "0:09:21", "remaining_time": "6:54:42", "throughput": 350.51, "total_tokens": 196880}
3
+ {"current_steps": 15, "total_steps": 453, "loss": 1.4579, "lr": 0.00019959386925858942, "epoch": 0.09933774834437085, "percentage": 3.31, "elapsed_time": "0:14:02", "remaining_time": "6:50:15", "throughput": 350.51, "total_tokens": 295472}
4
+ {"current_steps": 20, "total_steps": 453, "loss": 1.1407, "lr": 0.0001992218661313415, "epoch": 0.13245033112582782, "percentage": 4.42, "elapsed_time": "0:18:44", "remaining_time": "6:45:39", "throughput": 350.55, "total_tokens": 394096}
5
+ {"current_steps": 25, "total_steps": 453, "loss": 0.7902, "lr": 0.00019873057212894398, "epoch": 0.16556291390728478, "percentage": 5.52, "elapsed_time": "0:23:24", "remaining_time": "6:40:52", "throughput": 350.53, "total_tokens": 492464}
6
+ {"current_steps": 30, "total_steps": 453, "loss": 0.983, "lr": 0.00019812057791647686, "epoch": 0.1986754966887417, "percentage": 6.62, "elapsed_time": "0:28:05", "remaining_time": "6:36:08", "throughput": 350.53, "total_tokens": 590896}
7
+ {"current_steps": 35, "total_steps": 453, "loss": 0.9299, "lr": 0.0001973926168680066, "epoch": 0.23178807947019867, "percentage": 7.73, "elapsed_time": "0:32:46", "remaining_time": "6:31:23", "throughput": 350.57, "total_tokens": 689328}
8
+ {"current_steps": 40, "total_steps": 453, "loss": 0.6314, "lr": 0.00019654756418487667, "epoch": 0.26490066225165565, "percentage": 8.83, "elapsed_time": "0:37:27", "remaining_time": "6:26:43", "throughput": 350.66, "total_tokens": 788032}
9
+ {"current_steps": 45, "total_steps": 453, "loss": 0.8311, "lr": 0.00019558643584348476, "epoch": 0.2980132450331126, "percentage": 9.93, "elapsed_time": "0:42:06", "remaining_time": "6:21:50", "throughput": 350.69, "total_tokens": 886144}
10
+ {"current_steps": 50, "total_steps": 453, "loss": 0.5493, "lr": 0.00019451038737381077, "epoch": 0.33112582781456956, "percentage": 11.04, "elapsed_time": "0:46:47", "remaining_time": "6:17:08", "throughput": 350.73, "total_tokens": 984672}
11
+ {"current_steps": 55, "total_steps": 453, "loss": 0.4694, "lr": 0.00019332071247016476, "epoch": 0.36423841059602646, "percentage": 12.14, "elapsed_time": "0:51:28", "remaining_time": "6:12:26", "throughput": 350.78, "total_tokens": 1083232}
12
+ {"current_steps": 60, "total_steps": 453, "loss": 0.5595, "lr": 0.00019201884143582495, "epoch": 0.3973509933774834, "percentage": 13.25, "elapsed_time": "0:56:07", "remaining_time": "6:07:39", "throughput": 350.83, "total_tokens": 1181568}
13
+ {"current_steps": 65, "total_steps": 453, "loss": 0.2787, "lr": 0.0001906063394634356, "epoch": 0.4304635761589404, "percentage": 14.35, "elapsed_time": "1:00:48", "remaining_time": "6:02:56", "throughput": 350.84, "total_tokens": 1279936}
14
+ {"current_steps": 70, "total_steps": 453, "loss": 0.5269, "lr": 0.00018970643640796642, "epoch": 0.46357615894039733, "percentage": 15.45, "elapsed_time": "1:05:29", "remaining_time": "5:58:19", "throughput": 350.83, "total_tokens": 1378544}
15
+ {"current_steps": 75, "total_steps": 453, "loss": 0.7782, "lr": 0.00018812051176267307, "epoch": 0.4966887417218543, "percentage": 16.56, "elapsed_time": "1:10:09", "remaining_time": "5:53:37", "throughput": 350.82, "total_tokens": 1476896}
16
+ {"current_steps": 80, "total_steps": 453, "loss": 0.5458, "lr": 0.00018642864300065767, "epoch": 0.5298013245033113, "percentage": 17.66, "elapsed_time": "1:14:49", "remaining_time": "5:48:50", "throughput": 350.83, "total_tokens": 1574912}
17
+ {"current_steps": 85, "total_steps": 453, "loss": 0.3148, "lr": 0.00018463286419478255, "epoch": 0.5629139072847682, "percentage": 18.76, "elapsed_time": "1:19:28", "remaining_time": "5:44:05", "throughput": 350.84, "total_tokens": 1673056}
18
+ {"current_steps": 90, "total_steps": 453, "loss": 0.301, "lr": 0.00018273533434521263, "epoch": 0.5960264900662252, "percentage": 19.87, "elapsed_time": "1:24:09", "remaining_time": "5:39:26", "throughput": 350.83, "total_tokens": 1771536}
19
+ {"current_steps": 95, "total_steps": 453, "loss": 0.6369, "lr": 0.0001807383347837268, "epoch": 0.6291390728476821, "percentage": 20.97, "elapsed_time": "1:28:50", "remaining_time": "5:34:46", "throughput": 350.82, "total_tokens": 1869952}
20
+ {"current_steps": 97, "total_steps": 453, "epoch": 0.6423841059602649, "percentage": 21.41, "elapsed_time": "1:31:04", "remaining_time": "5:34:13", "throughput": 350.8, "total_tokens": 1916832}
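The step-level log above pairs with training_loss.png; below is a small sketch for re-plotting the loss curve from trainer_log.jsonl. Field names follow the entries shown, and the final summary record carries no loss field.

```python
import json

import matplotlib.pyplot as plt

steps, losses = [], []
with open("trainer_log.jsonl") as fh:
    for line in fh:
        record = json.loads(line)
        if "loss" in record:  # skip the final summary entry, which has no loss
            steps.append(record["current_steps"])
            losses.append(record["loss"])

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.savefig("training_loss_replot.png")
```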
trainer_state.json ADDED
@@ -0,0 +1,195 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.6423841059602649,
5
+ "eval_steps": 500,
6
+ "global_step": 97,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.033112582781456956,
13
+ "grad_norm": 26.791088104248047,
14
+ "learning_rate": 0.0001999783578606323,
15
+ "loss": 10.8058,
16
+ "num_input_tokens_seen": 98464,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.06622516556291391,
21
+ "grad_norm": 22.559255599975586,
22
+ "learning_rate": 0.00019984613426472932,
23
+ "loss": 2.5613,
24
+ "num_input_tokens_seen": 196880,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.09933774834437085,
29
+ "grad_norm": 8.285969734191895,
30
+ "learning_rate": 0.00019959386925858942,
31
+ "loss": 1.4579,
32
+ "num_input_tokens_seen": 295472,
33
+ "step": 15
34
+ },
35
+ {
36
+ "epoch": 0.13245033112582782,
37
+ "grad_norm": 7.880527496337891,
38
+ "learning_rate": 0.0001992218661313415,
39
+ "loss": 1.1407,
40
+ "num_input_tokens_seen": 394096,
41
+ "step": 20
42
+ },
43
+ {
44
+ "epoch": 0.16556291390728478,
45
+ "grad_norm": 14.62392807006836,
46
+ "learning_rate": 0.00019873057212894398,
47
+ "loss": 0.7902,
48
+ "num_input_tokens_seen": 492464,
49
+ "step": 25
50
+ },
51
+ {
52
+ "epoch": 0.1986754966887417,
53
+ "grad_norm": 12.557646751403809,
54
+ "learning_rate": 0.00019812057791647686,
55
+ "loss": 0.983,
56
+ "num_input_tokens_seen": 590896,
57
+ "step": 30
58
+ },
59
+ {
60
+ "epoch": 0.23178807947019867,
61
+ "grad_norm": 12.462843894958496,
62
+ "learning_rate": 0.0001973926168680066,
63
+ "loss": 0.9299,
64
+ "num_input_tokens_seen": 689328,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.26490066225165565,
69
+ "grad_norm": 3.7140164375305176,
70
+ "learning_rate": 0.00019654756418487667,
71
+ "loss": 0.6314,
72
+ "num_input_tokens_seen": 788032,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 0.2980132450331126,
77
+ "grad_norm": 16.123748779296875,
78
+ "learning_rate": 0.00019558643584348476,
79
+ "loss": 0.8311,
80
+ "num_input_tokens_seen": 886144,
81
+ "step": 45
82
+ },
83
+ {
84
+ "epoch": 0.33112582781456956,
85
+ "grad_norm": 7.482938289642334,
86
+ "learning_rate": 0.00019451038737381077,
87
+ "loss": 0.5493,
88
+ "num_input_tokens_seen": 984672,
89
+ "step": 50
90
+ },
91
+ {
92
+ "epoch": 0.36423841059602646,
93
+ "grad_norm": 4.410764694213867,
94
+ "learning_rate": 0.00019332071247016476,
95
+ "loss": 0.4694,
96
+ "num_input_tokens_seen": 1083232,
97
+ "step": 55
98
+ },
99
+ {
100
+ "epoch": 0.3973509933774834,
101
+ "grad_norm": 6.899860858917236,
102
+ "learning_rate": 0.00019201884143582495,
103
+ "loss": 0.5595,
104
+ "num_input_tokens_seen": 1181568,
105
+ "step": 60
106
+ },
107
+ {
108
+ "epoch": 0.4304635761589404,
109
+ "grad_norm": 3.364258050918579,
110
+ "learning_rate": 0.0001906063394634356,
111
+ "loss": 0.2787,
112
+ "num_input_tokens_seen": 1279936,
113
+ "step": 65
114
+ },
115
+ {
116
+ "epoch": 0.46357615894039733,
117
+ "grad_norm": 20.896175384521484,
118
+ "learning_rate": 0.00018970643640796642,
119
+ "loss": 0.5269,
120
+ "num_input_tokens_seen": 1378544,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.4966887417218543,
125
+ "grad_norm": 3.4167935848236084,
126
+ "learning_rate": 0.00018812051176267307,
127
+ "loss": 0.7782,
128
+ "num_input_tokens_seen": 1476896,
129
+ "step": 75
130
+ },
131
+ {
132
+ "epoch": 0.5298013245033113,
133
+ "grad_norm": 10.354905128479004,
134
+ "learning_rate": 0.00018642864300065767,
135
+ "loss": 0.5458,
136
+ "num_input_tokens_seen": 1574912,
137
+ "step": 80
138
+ },
139
+ {
140
+ "epoch": 0.5629139072847682,
141
+ "grad_norm": 3.3909523487091064,
142
+ "learning_rate": 0.00018463286419478255,
143
+ "loss": 0.3148,
144
+ "num_input_tokens_seen": 1673056,
145
+ "step": 85
146
+ },
147
+ {
148
+ "epoch": 0.5960264900662252,
149
+ "grad_norm": 13.916143417358398,
150
+ "learning_rate": 0.00018273533434521263,
151
+ "loss": 0.301,
152
+ "num_input_tokens_seen": 1771536,
153
+ "step": 90
154
+ },
155
+ {
156
+ "epoch": 0.6291390728476821,
157
+ "grad_norm": 4.097564697265625,
158
+ "learning_rate": 0.0001807383347837268,
159
+ "loss": 0.6369,
160
+ "num_input_tokens_seen": 1869952,
161
+ "step": 95
162
+ },
163
+ {
164
+ "epoch": 0.6423841059602649,
165
+ "num_input_tokens_seen": 1916832,
166
+ "step": 97,
167
+ "total_flos": 8.57490291718226e+16,
168
+ "train_loss": 1.3051114364997627,
169
+ "train_runtime": 5464.1136,
170
+ "train_samples_per_second": 1.326,
171
+ "train_steps_per_second": 0.083
172
+ }
173
+ ],
174
+ "logging_steps": 5,
175
+ "max_steps": 453,
176
+ "num_input_tokens_seen": 1916832,
177
+ "num_train_epochs": 3,
178
+ "save_steps": 100,
179
+ "stateful_callbacks": {
180
+ "TrainerControl": {
181
+ "args": {
182
+ "should_epoch_stop": false,
183
+ "should_evaluate": false,
184
+ "should_log": false,
185
+ "should_save": false,
186
+ "should_training_stop": false
187
+ },
188
+ "attributes": {}
189
+ }
190
+ },
191
+ "total_flos": 8.57490291718226e+16,
192
+ "train_batch_size": 2,
193
+ "trial_name": null,
194
+ "trial_params": null
195
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fa7685ef62ad79b8a271f32dc871c3861c9fd6b862e0819d6ca0e143f6d38cc
3
+ size 5688
training_args.yaml ADDED
@@ -0,0 +1,36 @@
1
+ cutoff_len: 2048
2
+ dataset: docker_command_dataset
3
+ dataset_dir: data
4
+ ddp_timeout: 180000000
5
+ do_train: true
6
+ double_quantization: true
7
+ finetuning_type: lora
8
+ flash_attn: auto
9
+ fp16: true
10
+ gradient_accumulation_steps: 8
11
+ include_num_input_tokens_seen: true
12
+ learning_rate: 0.0002
13
+ logging_steps: 5
14
+ lora_alpha: 16
15
+ lora_dropout: 0
16
+ lora_rank: 8
17
+ lora_target: all
18
+ lr_scheduler_type: cosine
19
+ max_grad_norm: 0.3
20
+ max_samples: 100000
21
+ model_name_or_path: llava-hf/llava-v1.6-mistral-7b-hf
22
+ num_train_epochs: 3.0
23
+ optim: adamw_torch
24
+ output_dir: saves/LLaVA-NeXT-Mistral-7B-Chat/lora/train_2025-04-21-17-35-28
25
+ packing: false
26
+ per_device_train_batch_size: 2
27
+ plot_loss: true
28
+ preprocessing_num_workers: 16
29
+ quantization_bit: 4
30
+ quantization_method: bitsandbytes
31
+ report_to: none
32
+ save_steps: 100
33
+ stage: sft
34
+ template: llava_next_mistral
35
+ trust_remote_code: true
36
+ warmup_steps: 0
training_loss.png ADDED