moussaKam committed on
Commit 7e719e1 · verified
1 Parent(s): 89a52ae
Files changed (6)
  1. config.json +1 -1
  2. latest +1 -1
  3. model.safetensors +1 -1
  4. trainer_state.json +2509 -605
  5. training_args.bin +2 -2
  6. zero_to_fp32.py +14 -84
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "/lustre/fsn1/projects/rech/gkb/uua32zb/grand_challenge/checkpoints/Qwen__Qwen2.5-1.5B-pretraining-fineweb2-0.0001LR-8192CL-1GAS-4BS-1EPOCHS-0.9BETA1-0.95BETA2/",
+  "_name_or_path": "moussaKam/fr-qwen-1.5B-base",
   "architectures": [
     "Qwen2ForCausalLM"
   ],
latest CHANGED
@@ -1 +1 @@
-global_step6237
+global_step3962
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cc5edf1bc45f08dfaeca221b40e789f61302043d25115984818a52a274f213be
+oid sha256:4b7c002ba46b916c79fff8f94759cc0cf8fbec829a682cd017fb9e23609dfab5
 size 3554214752
trainer_state.json CHANGED
@@ -1,887 +1,2791 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
- "eval_steps": 500.0,
6
- "global_step": 6237,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02405002405002405,
13
- "grad_norm": 0.4139963388442993,
14
- "learning_rate": 0.00019996828714700116,
15
- "loss": 1.5971,
16
  "step": 50
17
  },
18
  {
19
- "epoch": 0.0481000481000481,
20
- "grad_norm": 0.3423018157482147,
21
- "learning_rate": 0.00019987316870210547,
22
- "loss": 1.274,
23
- "step": 100
24
  },
25
  {
26
- "epoch": 0.07215007215007214,
27
- "grad_norm": 0.3551710247993469,
28
- "learning_rate": 0.0001997147049948582,
29
- "loss": 1.2519,
30
- "step": 150
31
  },
32
  {
33
- "epoch": 0.0962000962000962,
34
- "grad_norm": 0.32329073548316956,
35
- "learning_rate": 0.0001994929965319844,
36
- "loss": 1.2382,
37
- "step": 200
38
  },
39
  {
40
- "epoch": 0.12025012025012025,
41
- "grad_norm": 0.48585018515586853,
42
- "learning_rate": 0.0001992081839336419,
43
- "loss": 1.2293,
44
- "step": 250
45
  },
46
  {
47
- "epoch": 0.1443001443001443,
48
- "grad_norm": 0.40136224031448364,
49
- "learning_rate": 0.00019886044784423197,
50
- "loss": 1.2214,
51
- "step": 300
52
  },
53
  {
54
- "epoch": 0.16835016835016836,
55
- "grad_norm": 0.574002206325531,
56
- "learning_rate": 0.00019845000881782432,
57
- "loss": 1.2184,
58
- "step": 350
59
  },
60
  {
61
- "epoch": 0.1924001924001924,
62
- "grad_norm": 0.4179827570915222,
63
- "learning_rate": 0.00019797712717826914,
64
- "loss": 1.2064,
65
- "step": 400
66
  },
67
  {
68
- "epoch": 0.21645021645021645,
69
- "grad_norm": 0.33033809065818787,
70
- "learning_rate": 0.00019744210285408488,
71
- "loss": 1.2055,
72
- "step": 450
73
  },
74
  {
75
- "epoch": 0.2405002405002405,
76
- "grad_norm": 0.2719138562679291,
77
- "learning_rate": 0.0001968452751882264,
78
- "loss": 1.2077,
79
- "step": 500
80
  },
81
  {
82
- "epoch": 0.26455026455026454,
83
- "grad_norm": 0.29797521233558655,
84
- "learning_rate": 0.00019618702272285434,
85
- "loss": 1.2096,
86
- "step": 550
87
  },
88
  {
89
- "epoch": 0.2886002886002886,
90
- "grad_norm": 0.3336372673511505,
91
- "learning_rate": 0.00019546776295924212,
92
- "loss": 1.2072,
93
- "step": 600
94
  },
95
  {
96
- "epoch": 0.3126503126503126,
97
- "grad_norm": 0.26755037903785706,
98
- "learning_rate": 0.0001946879520929728,
99
- "loss": 1.1974,
100
- "step": 650
101
  },
102
  {
103
- "epoch": 0.3367003367003367,
104
- "grad_norm": 0.36268576979637146,
105
- "learning_rate": 0.00019384808472459368,
106
- "loss": 1.2045,
107
- "step": 700
108
  },
109
  {
110
- "epoch": 0.36075036075036077,
111
- "grad_norm": 0.3121575713157654,
112
- "learning_rate": 0.0001929486935459127,
113
- "loss": 1.1889,
114
- "step": 750
115
  },
116
  {
117
- "epoch": 0.3848003848003848,
118
- "grad_norm": 0.3159404993057251,
119
- "learning_rate": 0.00019199034900213452,
120
- "loss": 1.1921,
121
- "step": 800
122
  },
123
  {
124
- "epoch": 0.40885040885040885,
125
- "grad_norm": 0.7236579060554504,
126
- "learning_rate": 0.000190973658930052,
127
- "loss": 1.194,
128
- "step": 850
129
  },
130
  {
131
- "epoch": 0.4329004329004329,
132
- "grad_norm": 0.24907168745994568,
133
- "learning_rate": 0.00018989926817252113,
134
- "loss": 1.191,
135
- "step": 900
136
  },
137
  {
138
- "epoch": 0.45695045695045694,
139
- "grad_norm": 0.24481187760829926,
140
- "learning_rate": 0.00018876785816946505,
141
- "loss": 1.1857,
142
- "step": 950
143
  },
144
  {
145
- "epoch": 0.481000481000481,
146
- "grad_norm": 0.2668200731277466,
147
- "learning_rate": 0.00018758014652566597,
148
- "loss": 1.1957,
149
- "step": 1000
150
  },
151
  {
152
- "epoch": 0.5050505050505051,
153
- "grad_norm": 0.2687171399593353,
154
- "learning_rate": 0.0001863368865556191,
155
- "loss": 1.1864,
156
- "step": 1050
157
  },
158
  {
159
- "epoch": 0.5291005291005291,
160
- "grad_norm": 0.23915782570838928,
161
- "learning_rate": 0.0001850388668057379,
162
- "loss": 1.184,
163
- "step": 1100
164
  },
165
  {
166
- "epoch": 0.5531505531505532,
167
- "grad_norm": 0.37159469723701477,
168
- "learning_rate": 0.0001836869105542127,
169
- "loss": 1.1849,
170
- "step": 1150
171
  },
172
  {
173
- "epoch": 0.5772005772005772,
174
- "grad_norm": 0.2752649784088135,
175
- "learning_rate": 0.0001822818752888408,
176
- "loss": 1.1843,
177
- "step": 1200
178
  },
179
  {
180
- "epoch": 0.6012506012506013,
181
- "grad_norm": 0.19733025133609772,
182
- "learning_rate": 0.00018082465216315882,
183
- "loss": 1.1766,
184
- "step": 1250
185
  },
186
  {
187
- "epoch": 0.6253006253006252,
188
- "grad_norm": 0.2180165797472,
189
- "learning_rate": 0.00017931616543122214,
190
- "loss": 1.1865,
191
- "step": 1300
192
  },
193
  {
194
- "epoch": 0.6493506493506493,
195
- "grad_norm": 0.25025510787963867,
196
- "learning_rate": 0.00017775737186139038,
197
- "loss": 1.1723,
198
- "step": 1350
199
  },
200
  {
201
- "epoch": 0.6734006734006734,
202
- "grad_norm": 0.2865007817745209,
203
- "learning_rate": 0.00017614926012949028,
204
- "loss": 1.172,
205
- "step": 1400
206
  },
207
  {
208
- "epoch": 0.6974506974506974,
209
- "grad_norm": 0.3406023681163788,
210
- "learning_rate": 0.00017449285019174098,
211
- "loss": 1.1795,
212
- "step": 1450
213
  },
214
  {
215
- "epoch": 0.7215007215007215,
216
- "grad_norm": 0.19766800105571747,
217
- "learning_rate": 0.00017278919263783978,
218
- "loss": 1.1784,
219
- "step": 1500
220
  },
221
  {
222
- "epoch": 0.7455507455507455,
223
- "grad_norm": 0.1965962052345276,
224
- "learning_rate": 0.00017103936802461797,
225
- "loss": 1.1754,
226
- "step": 1550
227
  },
228
  {
229
- "epoch": 0.7696007696007696,
230
- "grad_norm": 0.2381555736064911,
231
- "learning_rate": 0.00016924448619069023,
232
- "loss": 1.1671,
233
- "step": 1600
234
  },
235
  {
236
- "epoch": 0.7936507936507936,
237
- "grad_norm": 0.20156389474868774,
238
- "learning_rate": 0.00016740568555253155,
239
- "loss": 1.1738,
240
- "step": 1650
241
  },
242
  {
243
- "epoch": 0.8177008177008177,
244
- "grad_norm": 0.18294361233711243,
245
- "learning_rate": 0.00016552413238242857,
246
- "loss": 1.1727,
247
- "step": 1700
248
  },
249
  {
250
- "epoch": 0.8417508417508418,
251
- "grad_norm": 0.2975623309612274,
252
- "learning_rate": 0.00016360102006876317,
253
- "loss": 1.1677,
254
- "step": 1750
255
  },
256
  {
257
- "epoch": 0.8658008658008658,
258
- "grad_norm": 0.1871371865272522,
259
- "learning_rate": 0.0001616375683590974,
260
- "loss": 1.1689,
261
- "step": 1800
262
  },
263
  {
264
- "epoch": 0.8898508898508899,
265
- "grad_norm": 0.21457934379577637,
266
- "learning_rate": 0.00015963502258654005,
267
- "loss": 1.1605,
268
- "step": 1850
269
  },
270
  {
271
- "epoch": 0.9139009139009139,
272
- "grad_norm": 0.20261706411838531,
273
- "learning_rate": 0.0001575946528798853,
274
- "loss": 1.1627,
275
- "step": 1900
276
  },
277
  {
278
- "epoch": 0.937950937950938,
279
- "grad_norm": 0.17685186862945557,
280
- "learning_rate": 0.0001555177533580245,
281
- "loss": 1.1627,
282
- "step": 1950
283
  },
284
  {
285
- "epoch": 0.962000962000962,
286
- "grad_norm": 0.212468221783638,
287
- "learning_rate": 0.00015340564130914233,
288
- "loss": 1.161,
289
- "step": 2000
290
  },
291
  {
292
- "epoch": 0.9860509860509861,
293
- "grad_norm": 0.175174742937088,
294
- "learning_rate": 0.00015125965635521724,
295
- "loss": 1.1688,
296
- "step": 2050
297
  },
298
  {
299
- "epoch": 1.0101010101010102,
300
- "grad_norm": 0.19970253109931946,
301
- "learning_rate": 0.00014908115960235682,
302
- "loss": 1.142,
303
- "step": 2100
304
  },
305
  {
306
- "epoch": 1.034151034151034,
307
- "grad_norm": 0.21254608035087585,
308
- "learning_rate": 0.00014687153277750676,
309
- "loss": 1.1271,
310
- "step": 2150
311
  },
312
  {
313
- "epoch": 1.0582010582010581,
314
- "grad_norm": 0.1651500016450882,
315
- "learning_rate": 0.00014463217735208062,
316
- "loss": 1.121,
317
- "step": 2200
318
  },
319
  {
320
- "epoch": 1.0822510822510822,
321
- "grad_norm": 0.2405405044555664,
322
- "learning_rate": 0.00014236451365306674,
323
- "loss": 1.1313,
324
- "step": 2250
325
  },
326
  {
327
- "epoch": 1.1063011063011063,
328
- "grad_norm": 0.17223596572875977,
329
- "learning_rate": 0.00014006997996217593,
330
- "loss": 1.1344,
331
  "step": 2300
332
  },
333
  {
334
- "epoch": 1.1303511303511304,
335
- "grad_norm": 0.1969347894191742,
336
- "learning_rate": 0.00013775003160360096,
337
- "loss": 1.1176,
338
  "step": 2350
339
  },
340
  {
341
- "epoch": 1.1544011544011543,
342
- "grad_norm": 0.187143936753273,
343
- "learning_rate": 0.00013540614002096701,
344
- "loss": 1.1322,
345
  "step": 2400
346
  },
347
  {
348
- "epoch": 1.1784511784511784,
349
- "grad_norm": 0.1838238537311554,
350
- "learning_rate": 0.00013303979184405826,
351
- "loss": 1.1293,
352
  "step": 2450
353
  },
354
  {
355
- "epoch": 1.2025012025012025,
356
- "grad_norm": 0.17928341031074524,
357
- "learning_rate": 0.00013065248794591223,
358
- "loss": 1.1268,
359
  "step": 2500
360
  },
361
  {
362
- "epoch": 1.2265512265512266,
363
- "grad_norm": 0.2683047950267792,
364
- "learning_rate": 0.00012824574249088063,
365
- "loss": 1.1234,
366
  "step": 2550
367
  },
368
  {
369
- "epoch": 1.2506012506012505,
370
- "grad_norm": 0.18034860491752625,
371
- "learning_rate": 0.0001258210819742599,
372
- "loss": 1.125,
373
  "step": 2600
374
  },
375
  {
376
- "epoch": 1.2746512746512746,
377
- "grad_norm": 0.26357391476631165,
378
- "learning_rate": 0.00012338004425410074,
379
- "loss": 1.1217,
380
  "step": 2650
381
  },
382
  {
383
- "epoch": 1.2987012987012987,
384
- "grad_norm": 0.17828579246997833,
385
- "learning_rate": 0.00012092417757581085,
386
- "loss": 1.1262,
387
  "step": 2700
388
  },
389
  {
390
- "epoch": 1.3227513227513228,
391
- "grad_norm": 0.20247310400009155,
392
- "learning_rate": 0.00011845503959016928,
393
- "loss": 1.1246,
394
  "step": 2750
395
  },
396
  {
397
- "epoch": 1.3468013468013469,
398
- "grad_norm": 0.17381271719932556,
399
- "learning_rate": 0.0001159741963653755,
400
- "loss": 1.1181,
401
  "step": 2800
402
  },
403
  {
404
- "epoch": 1.370851370851371,
405
- "grad_norm": 0.19958114624023438,
406
- "learning_rate": 0.00011348322139375948,
407
- "loss": 1.1307,
408
  "step": 2850
409
  },
410
  {
411
- "epoch": 1.3949013949013949,
412
- "grad_norm": 0.21912401914596558,
413
- "learning_rate": 0.00011098369459378328,
414
- "loss": 1.1264,
415
  "step": 2900
416
  },
417
  {
418
- "epoch": 1.418951418951419,
419
- "grad_norm": 0.1694297194480896,
420
- "learning_rate": 0.00010847720130796631,
421
- "loss": 1.1256,
422
  "step": 2950
423
  },
424
  {
425
- "epoch": 1.443001443001443,
426
- "grad_norm": 0.13446395099163055,
427
- "learning_rate": 0.00010596533129737092,
428
- "loss": 1.1258,
429
  "step": 3000
430
  },
431
  {
432
- "epoch": 1.467051467051467,
433
- "grad_norm": 0.140371173620224,
434
- "learning_rate": 0.00010344967773328507,
435
- "loss": 1.1191,
436
  "step": 3050
437
  },
438
  {
439
- "epoch": 1.491101491101491,
440
- "grad_norm": 0.18016813695430756,
441
- "learning_rate": 0.00010093183618674224,
442
- "loss": 1.114,
443
  "step": 3100
444
  },
445
  {
446
- "epoch": 1.5151515151515151,
447
- "grad_norm": 0.17306862771511078,
448
- "learning_rate": 9.84134036165192e-05,
449
- "loss": 1.1149,
450
  "step": 3150
451
  },
452
  {
453
- "epoch": 1.5392015392015392,
454
- "grad_norm": 0.14116255939006805,
455
- "learning_rate": 9.589597735625377e-05,
456
- "loss": 1.123,
457
  "step": 3200
458
  },
459
  {
460
- "epoch": 1.5632515632515633,
461
- "grad_norm": 0.16819800436496735,
462
- "learning_rate": 9.338115410132441e-05,
463
- "loss": 1.1203,
464
  "step": 3250
465
  },
466
  {
467
- "epoch": 1.5873015873015874,
468
- "grad_norm": 0.21958529949188232,
469
- "learning_rate": 9.087052889613518e-05,
470
- "loss": 1.1226,
471
  "step": 3300
472
  },
473
  {
474
- "epoch": 1.6113516113516113,
475
- "grad_norm": 0.15786272287368774,
476
- "learning_rate": 8.836569412244745e-05,
477
- "loss": 1.1212,
478
  "step": 3350
479
  },
480
  {
481
- "epoch": 1.6354016354016354,
482
- "grad_norm": 0.17366796731948853,
483
- "learning_rate": 8.586823848940047e-05,
484
- "loss": 1.1129,
485
  "step": 3400
486
  },
487
  {
488
- "epoch": 1.6594516594516593,
489
- "grad_norm": 0.21448016166687012,
490
- "learning_rate": 8.337974602586152e-05,
491
- "loss": 1.1216,
492
- "step": 3450
493
  },
494
  {
495
- "epoch": 1.6835016835016834,
496
- "grad_norm": 0.17243099212646484,
497
- "learning_rate": 8.090179507574427e-05,
498
- "loss": 1.1096,
499
- "step": 3500
500
  },
501
  {
502
- "epoch": 1.7075517075517075,
503
- "grad_norm": 0.1429734081029892,
504
- "learning_rate": 7.843595729693316e-05,
505
- "loss": 1.1071,
506
- "step": 3550
507
  },
508
  {
509
- "epoch": 1.7316017316017316,
510
- "grad_norm": 0.15200386941432953,
511
- "learning_rate": 7.598379666444808e-05,
512
- "loss": 1.1158,
513
- "step": 3600
514
  },
515
  {
516
- "epoch": 1.7556517556517557,
517
- "grad_norm": 0.1442406326532364,
518
- "learning_rate": 7.354686847848242e-05,
519
- "loss": 1.112,
520
- "step": 3650
521
  },
522
  {
523
- "epoch": 1.7797017797017798,
524
- "grad_norm": 0.17678239941596985,
525
- "learning_rate": 7.11267183779428e-05,
526
- "loss": 1.1118,
527
- "step": 3700
528
  },
529
  {
530
- "epoch": 1.8037518037518039,
531
- "grad_norm": 0.147593155503273,
532
- "learning_rate": 6.872488136011667e-05,
533
- "loss": 1.1165,
534
- "step": 3750
535
  },
536
  {
537
- "epoch": 1.8278018278018278,
538
- "grad_norm": 0.1334652155637741,
539
- "learning_rate": 6.634288080708952e-05,
540
- "loss": 1.1135,
541
- "step": 3800
542
  },
543
  {
544
- "epoch": 1.8518518518518519,
545
- "grad_norm": 0.14890378713607788,
546
- "learning_rate": 6.398222751952899e-05,
547
- "loss": 1.1086,
548
- "step": 3850
549
  },
550
  {
551
- "epoch": 1.8759018759018757,
552
- "grad_norm": 0.1334807574748993,
553
- "learning_rate": 6.164441875844882e-05,
554
- "loss": 1.1144,
555
- "step": 3900
556
  },
557
  {
558
- "epoch": 1.8999518999518998,
559
- "grad_norm": 0.12897680699825287,
560
- "learning_rate": 5.933093729556062e-05,
561
- "loss": 1.1116,
562
- "step": 3950
563
  },
564
  {
565
- "epoch": 1.924001924001924,
566
- "grad_norm": 0.17530564963817596,
567
- "learning_rate": 5.7043250472815356e-05,
568
- "loss": 1.1039,
569
- "step": 4000
570
  },
571
  {
572
- "epoch": 1.948051948051948,
573
- "grad_norm": 0.15966495871543884,
574
- "learning_rate": 5.478280927173145e-05,
575
- "loss": 1.101,
576
- "step": 4050
577
  },
578
  {
579
- "epoch": 1.9721019721019721,
580
- "grad_norm": 0.18890446424484253,
581
- "learning_rate": 5.255104739309924e-05,
582
- "loss": 1.1077,
583
- "step": 4100
584
  },
585
  {
586
- "epoch": 1.9961519961519962,
587
- "grad_norm": 0.1547369807958603,
588
- "learning_rate": 5.0349380347646494e-05,
589
- "loss": 1.103,
590
- "step": 4150
591
  },
592
  {
593
- "epoch": 2.0202020202020203,
594
- "grad_norm": 0.13888758420944214,
595
- "learning_rate": 4.8179204558240444e-05,
596
- "loss": 1.0826,
597
- "step": 4200
598
  },
599
  {
600
- "epoch": 2.0442520442520444,
601
- "grad_norm": 0.11266086250543594,
602
- "learning_rate": 4.6041896474197e-05,
603
- "loss": 1.071,
604
- "step": 4250
605
  },
606
  {
607
- "epoch": 2.068302068302068,
608
- "grad_norm": 0.14245671033859253,
609
- "learning_rate": 4.393881169825779e-05,
610
- "loss": 1.0759,
611
- "step": 4300
612
  },
613
  {
614
- "epoch": 2.092352092352092,
615
- "grad_norm": 0.1226249411702156,
616
- "learning_rate": 4.187128412678969e-05,
617
- "loss": 1.0742,
618
- "step": 4350
619
  },
620
  {
621
- "epoch": 2.1164021164021163,
622
- "grad_norm": 0.12307476997375488,
623
- "learning_rate": 3.984062510375155e-05,
624
- "loss": 1.0721,
625
- "step": 4400
626
  },
627
  {
628
- "epoch": 2.1404521404521404,
629
- "grad_norm": 0.12813834846019745,
630
- "learning_rate": 3.7848122588965144e-05,
631
- "loss": 1.0726,
632
- "step": 4450
633
  },
634
  {
635
- "epoch": 2.1645021645021645,
636
- "grad_norm": 0.13432885706424713,
637
- "learning_rate": 3.5895040341217543e-05,
638
- "loss": 1.0745,
639
- "step": 4500
640
  },
641
  {
642
- "epoch": 2.1885521885521886,
643
- "grad_norm": 0.11649097502231598,
644
- "learning_rate": 3.398261711671309e-05,
645
- "loss": 1.079,
646
- "step": 4550
647
  },
648
  {
649
- "epoch": 2.2126022126022127,
650
- "grad_norm": 0.11140163242816925,
651
- "learning_rate": 3.211206588338358e-05,
652
- "loss": 1.0748,
653
- "step": 4600
654
  },
655
  {
656
- "epoch": 2.236652236652237,
657
- "grad_norm": 0.10978424549102783,
658
- "learning_rate": 3.028457305155483e-05,
659
- "loss": 1.0726,
660
- "step": 4650
661
  },
662
  {
663
- "epoch": 2.260702260702261,
664
- "grad_norm": 0.11395589262247086,
665
- "learning_rate": 2.8501297721457422e-05,
666
- "loss": 1.0656,
667
- "step": 4700
668
  },
669
  {
670
- "epoch": 2.284752284752285,
671
- "grad_norm": 0.10599405318498611,
672
- "learning_rate": 2.6763370948059353e-05,
673
- "loss": 1.0765,
674
- "step": 4750
675
  },
676
  {
677
- "epoch": 2.3088023088023086,
678
- "grad_norm": 0.11157254874706268,
679
- "learning_rate": 2.5071895023686442e-05,
680
- "loss": 1.0726,
681
- "step": 4800
682
  },
683
  {
684
- "epoch": 2.3328523328523327,
685
- "grad_norm": 0.1390163153409958,
686
- "learning_rate": 2.342794277888547e-05,
687
- "loss": 1.0731,
688
- "step": 4850
689
  },
690
  {
691
- "epoch": 2.356902356902357,
692
- "grad_norm": 0.1519329994916916,
693
- "learning_rate": 2.1832556901973965e-05,
694
- "loss": 1.0704,
695
- "step": 4900
696
  },
697
  {
698
- "epoch": 2.380952380952381,
699
- "grad_norm": 0.1278182566165924,
700
- "learning_rate": 2.0286749277707782e-05,
701
- "loss": 1.0661,
702
- "step": 4950
703
  },
704
  {
705
- "epoch": 2.405002405002405,
706
- "grad_norm": 0.10508263111114502,
707
- "learning_rate": 1.879150034548588e-05,
708
- "loss": 1.0758,
709
- "step": 5000
710
  },
711
  {
712
- "epoch": 2.429052429052429,
713
- "grad_norm": 0.09690719097852707,
714
- "learning_rate": 1.7347758477500044e-05,
715
- "loss": 1.0644,
716
- "step": 5050
717
  },
718
  {
719
- "epoch": 2.4531024531024532,
720
- "grad_norm": 0.10174595564603806,
721
- "learning_rate": 1.5956439377222798e-05,
722
- "loss": 1.0726,
723
- "step": 5100
724
  },
725
  {
726
- "epoch": 2.4771524771524773,
727
- "grad_norm": 0.10294167697429657,
728
- "learning_rate": 1.4618425498616162e-05,
729
- "loss": 1.0655,
730
- "step": 5150
731
  },
732
  {
733
- "epoch": 2.501202501202501,
734
- "grad_norm": 0.11103129386901855,
735
- "learning_rate": 1.3334565486428996e-05,
736
- "loss": 1.0651,
737
- "step": 5200
738
  },
739
  {
740
- "epoch": 2.525252525252525,
741
- "grad_norm": 0.10614852607250214,
742
- "learning_rate": 1.2105673637938053e-05,
743
- "loss": 1.0701,
744
- "step": 5250
745
  },
746
  {
747
- "epoch": 2.549302549302549,
748
- "grad_norm": 0.09437720477581024,
749
- "learning_rate": 1.0932529386474188e-05,
750
- "loss": 1.0673,
751
- "step": 5300
752
  },
753
  {
754
- "epoch": 2.5733525733525733,
755
- "grad_norm": 0.0965106412768364,
756
- "learning_rate": 9.815876807061264e-06,
757
- "loss": 1.0769,
758
- "step": 5350
759
  },
760
  {
761
- "epoch": 2.5974025974025974,
762
- "grad_norm": 0.09335634112358093,
763
- "learning_rate": 8.756424144481312e-06,
764
- "loss": 1.0646,
765
- "step": 5400
766
  },
767
  {
768
- "epoch": 2.6214526214526215,
769
- "grad_norm": 0.09890544414520264,
770
- "learning_rate": 7.75484336406529e-06,
771
- "loss": 1.0757,
772
- "step": 5450
773
  },
774
  {
775
- "epoch": 2.6455026455026456,
776
- "grad_norm": 0.09670912474393845,
777
- "learning_rate": 6.8117697254943106e-06,
778
- "loss": 1.0668,
779
- "step": 5500
780
  },
781
  {
782
- "epoch": 2.6695526695526697,
783
- "grad_norm": 0.09898468106985092,
784
- "learning_rate": 5.927801379881714e-06,
785
- "loss": 1.0745,
786
- "step": 5550
787
  },
788
  {
789
- "epoch": 2.6936026936026938,
790
- "grad_norm": 0.08697386831045151,
791
- "learning_rate": 5.103498990391509e-06,
792
- "loss": 1.0653,
793
- "step": 5600
794
  },
795
  {
796
- "epoch": 2.717652717652718,
797
- "grad_norm": 0.09457134455442429,
798
- "learning_rate": 4.339385376633775e-06,
799
- "loss": 1.0678,
800
- "step": 5650
801
  },
802
  {
803
- "epoch": 2.741702741702742,
804
- "grad_norm": 0.09092475473880768,
805
- "learning_rate": 3.6359451830626723e-06,
806
- "loss": 1.0635,
807
- "step": 5700
808
  },
809
  {
810
- "epoch": 2.7657527657527656,
811
- "grad_norm": 0.08736653625965118,
812
- "learning_rate": 2.993624571587239e-06,
813
- "loss": 1.0639,
814
- "step": 5750
815
  },
816
  {
817
- "epoch": 2.7898027898027897,
818
- "grad_norm": 0.09138292819261551,
819
- "learning_rate": 2.4128309385900717e-06,
820
- "loss": 1.065,
821
- "step": 5800
822
  },
823
  {
824
- "epoch": 2.813852813852814,
825
- "grad_norm": 0.08842656016349792,
826
- "learning_rate": 1.8939326565333037e-06,
827
- "loss": 1.0636,
828
- "step": 5850
829
  },
830
  {
831
- "epoch": 2.837902837902838,
832
- "grad_norm": 0.08870802819728851,
833
- "learning_rate": 1.437258840315714e-06,
834
- "loss": 1.0706,
835
- "step": 5900
836
  },
837
  {
838
- "epoch": 2.861952861952862,
839
- "grad_norm": 0.08659425377845764,
840
- "learning_rate": 1.0430991385293575e-06,
841
- "loss": 1.0673,
842
- "step": 5950
843
  },
844
  {
845
- "epoch": 2.886002886002886,
846
- "grad_norm": 0.08142086863517761,
847
- "learning_rate": 7.117035497478553e-07,
848
- "loss": 1.0697,
849
- "step": 6000
850
  },
851
  {
852
- "epoch": 2.91005291005291,
853
- "grad_norm": 0.080448217689991,
854
- "learning_rate": 4.432822639630407e-07,
855
- "loss": 1.0655,
856
- "step": 6050
857
  },
858
  {
859
- "epoch": 2.934102934102934,
860
- "grad_norm": 0.08980288356542587,
861
- "learning_rate": 2.380055292704575e-07,
862
- "loss": 1.0701,
863
- "step": 6100
864
  },
865
  {
866
- "epoch": 2.958152958152958,
867
- "grad_norm": 0.08309097588062286,
868
- "learning_rate": 9.600354388833443e-08,
869
- "loss": 1.0684,
870
- "step": 6150
871
  },
872
  {
873
- "epoch": 2.982202982202982,
874
- "grad_norm": 0.08456841111183167,
875
- "learning_rate": 1.7366373578442397e-08,
876
- "loss": 1.0684,
877
- "step": 6200
878
  }
879
  ],
880
- "logging_steps": 50,
881
- "max_steps": 6237,
882
  "num_input_tokens_seen": 0,
883
- "num_train_epochs": 3,
884
- "save_steps": 500,
885
  "stateful_callbacks": {
886
  "TrainerControl": {
887
  "args": {
@@ -894,8 +2798,8 @@
894
  "attributes": {}
895
  }
896
  },
897
- "total_flos": 2.056700790948663e+20,
898
- "train_batch_size": 4,
899
  "trial_name": null,
900
  "trial_params": null
901
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9998107374929026,
5
+ "eval_steps": 500,
6
+ "global_step": 3962,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0025235000946312535,
13
+ "grad_norm": 3.572803497314453,
14
+ "learning_rate": 1.2594458438287156e-06,
15
+ "loss": 1.0672,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.005047000189262507,
20
+ "grad_norm": 1.6470932960510254,
21
+ "learning_rate": 2.518891687657431e-06,
22
+ "loss": 1.0001,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.007570500283893761,
27
+ "grad_norm": 1.1262171268463135,
28
+ "learning_rate": 3.7783375314861467e-06,
29
+ "loss": 0.9414,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.010094000378525014,
34
+ "grad_norm": 0.8495129346847534,
35
+ "learning_rate": 5.037783375314862e-06,
36
+ "loss": 0.9321,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.012617500473156268,
41
+ "grad_norm": 0.8612141013145447,
42
+ "learning_rate": 6.297229219143577e-06,
43
+ "loss": 0.8746,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.015141000567787522,
48
+ "grad_norm": 0.8412306308746338,
49
+ "learning_rate": 7.556675062972293e-06,
50
+ "loss": 0.9044,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.017664500662418776,
55
+ "grad_norm": 0.8401440978050232,
56
+ "learning_rate": 8.816120906801008e-06,
57
+ "loss": 0.9022,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.020188000757050028,
62
+ "grad_norm": 0.86940997838974,
63
+ "learning_rate": 1.0075566750629725e-05,
64
+ "loss": 0.8887,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.022711500851681284,
69
+ "grad_norm": 0.8858376741409302,
70
+ "learning_rate": 1.133501259445844e-05,
71
+ "loss": 0.8715,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.025235000946312536,
76
+ "grad_norm": 0.8635324239730835,
77
+ "learning_rate": 1.2594458438287154e-05,
78
+ "loss": 0.8709,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.027758501040943788,
83
+ "grad_norm": 0.9026916027069092,
84
+ "learning_rate": 1.385390428211587e-05,
85
+ "loss": 0.9428,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.030282001135575044,
90
+ "grad_norm": 0.6949167847633362,
91
+ "learning_rate": 1.5113350125944587e-05,
92
+ "loss": 0.9041,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.0328055012302063,
97
+ "grad_norm": 0.9495663046836853,
98
+ "learning_rate": 1.63727959697733e-05,
99
+ "loss": 0.8636,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.03532900132483755,
104
+ "grad_norm": 0.8648976683616638,
105
+ "learning_rate": 1.7632241813602016e-05,
106
+ "loss": 0.8471,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.037852501419468804,
111
+ "grad_norm": 0.8415878415107727,
112
+ "learning_rate": 1.8891687657430733e-05,
113
+ "loss": 0.9108,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.040376001514100056,
118
+ "grad_norm": 0.8484784364700317,
119
+ "learning_rate": 2.015113350125945e-05,
120
+ "loss": 0.8883,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.04289950160873131,
125
+ "grad_norm": 0.8620557188987732,
126
+ "learning_rate": 2.1410579345088162e-05,
127
+ "loss": 0.8657,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.04542300170336257,
132
+ "grad_norm": 0.8222241401672363,
133
+ "learning_rate": 2.267002518891688e-05,
134
+ "loss": 0.9174,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.04794650179799382,
139
+ "grad_norm": 0.8526296019554138,
140
+ "learning_rate": 2.392947103274559e-05,
141
+ "loss": 0.8997,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.05047000189262507,
146
+ "grad_norm": 0.8018633723258972,
147
+ "learning_rate": 2.5188916876574308e-05,
148
+ "loss": 0.9076,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.052993501987256324,
153
+ "grad_norm": 0.859157145023346,
154
+ "learning_rate": 2.6448362720403024e-05,
155
+ "loss": 0.8891,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.055517002081887576,
160
+ "grad_norm": 0.7516281604766846,
161
+ "learning_rate": 2.770780856423174e-05,
162
+ "loss": 0.9026,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.058040502176518835,
167
+ "grad_norm": 0.8353524804115295,
168
+ "learning_rate": 2.8967254408060457e-05,
169
+ "loss": 0.8393,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.06056400227115009,
174
+ "grad_norm": 0.7622519731521606,
175
+ "learning_rate": 3.0226700251889174e-05,
176
+ "loss": 0.8524,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.06308750236578134,
181
+ "grad_norm": 0.8780621290206909,
182
+ "learning_rate": 3.148614609571788e-05,
183
+ "loss": 0.9286,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.0656110024604126,
188
+ "grad_norm": 0.9684115052223206,
189
+ "learning_rate": 3.27455919395466e-05,
190
+ "loss": 0.8974,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.06813450255504384,
195
+ "grad_norm": 0.8870697617530823,
196
+ "learning_rate": 3.4005037783375316e-05,
197
+ "loss": 0.8969,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.0706580026496751,
202
+ "grad_norm": 0.8952546119689941,
203
+ "learning_rate": 3.526448362720403e-05,
204
+ "loss": 0.8689,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.07318150274430635,
209
+ "grad_norm": 0.9221532344818115,
210
+ "learning_rate": 3.652392947103275e-05,
211
+ "loss": 0.883,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.07570500283893761,
216
+ "grad_norm": 0.9585578441619873,
217
+ "learning_rate": 3.7783375314861465e-05,
218
+ "loss": 0.8508,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.07822850293356887,
223
+ "grad_norm": 0.9648734331130981,
224
+ "learning_rate": 3.904282115869018e-05,
225
+ "loss": 0.8982,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.08075200302820011,
230
+ "grad_norm": 0.8147997260093689,
231
+ "learning_rate": 4.03022670025189e-05,
232
+ "loss": 0.8694,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.08327550312283137,
237
+ "grad_norm": 0.8099369406700134,
238
+ "learning_rate": 4.1561712846347615e-05,
239
+ "loss": 0.8929,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.08579900321746262,
244
+ "grad_norm": 0.8512017130851746,
245
+ "learning_rate": 4.2821158690176324e-05,
246
+ "loss": 0.86,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.08832250331209388,
251
+ "grad_norm": 0.8499712347984314,
252
+ "learning_rate": 4.408060453400504e-05,
253
+ "loss": 0.8423,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.09084600340672513,
258
+ "grad_norm": 0.8530069589614868,
259
+ "learning_rate": 4.534005037783376e-05,
260
+ "loss": 0.8852,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.09336950350135638,
265
+ "grad_norm": 0.8837921023368835,
266
+ "learning_rate": 4.659949622166247e-05,
267
+ "loss": 0.8864,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.09589300359598764,
272
+ "grad_norm": 0.8840718865394592,
273
+ "learning_rate": 4.785894206549118e-05,
274
+ "loss": 0.8999,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.09841650369061888,
279
+ "grad_norm": 0.7395176887512207,
280
+ "learning_rate": 4.91183879093199e-05,
281
+ "loss": 0.8954,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 0.10094000378525014,
286
+ "grad_norm": 0.8697903752326965,
287
+ "learning_rate": 4.999991263591223e-05,
288
+ "loss": 0.8353,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 0.1034635038798814,
293
+ "grad_norm": 0.8527745008468628,
294
+ "learning_rate": 4.9998359513560176e-05,
295
+ "loss": 0.8591,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 0.10598700397451265,
300
+ "grad_norm": 0.806920051574707,
301
+ "learning_rate": 4.999486510586282e-05,
302
+ "loss": 0.9076,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 0.1085105040691439,
307
+ "grad_norm": 0.8428360223770142,
308
+ "learning_rate": 4.9989429684183686e-05,
309
+ "loss": 0.9032,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 0.11103400416377515,
314
+ "grad_norm": 0.8436294198036194,
315
+ "learning_rate": 4.9982053670618626e-05,
316
+ "loss": 0.8894,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 0.11355750425840641,
321
+ "grad_norm": 0.7664760947227478,
322
+ "learning_rate": 4.997273763796312e-05,
323
+ "loss": 0.8732,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 0.11608100435303767,
328
+ "grad_norm": 0.9134059548377991,
329
+ "learning_rate": 4.996148230966775e-05,
330
+ "loss": 0.8438,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 0.11860450444766892,
335
+ "grad_norm": 0.849233090877533,
336
+ "learning_rate": 4.994828855978202e-05,
337
+ "loss": 0.9281,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 0.12112800454230017,
342
+ "grad_norm": 0.8473492860794067,
343
+ "learning_rate": 4.99331574128865e-05,
344
+ "loss": 0.8368,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 0.12365150463693142,
349
+ "grad_norm": 0.8474506735801697,
350
+ "learning_rate": 4.991609004401324e-05,
351
+ "loss": 0.8852,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 0.12617500473156268,
356
+ "grad_norm": 0.7737945318222046,
357
+ "learning_rate": 4.989708777855453e-05,
358
+ "loss": 0.8881,
359
+ "step": 500
360
+ },
361
+ {
362
+ "epoch": 0.12869850482619394,
363
+ "grad_norm": 0.6961573958396912,
364
+ "learning_rate": 4.9876152092159994e-05,
365
+ "loss": 0.9173,
366
+ "step": 510
367
+ },
368
+ {
369
+ "epoch": 0.1312220049208252,
370
+ "grad_norm": 0.7320950031280518,
371
+ "learning_rate": 4.985328461062195e-05,
372
+ "loss": 0.8899,
373
+ "step": 520
374
+ },
375
+ {
376
+ "epoch": 0.13374550501545643,
377
+ "grad_norm": 0.7261886596679688,
378
+ "learning_rate": 4.98284871097492e-05,
379
+ "loss": 0.8855,
380
+ "step": 530
381
+ },
382
+ {
383
+ "epoch": 0.1362690051100877,
384
+ "grad_norm": 0.7850842475891113,
385
+ "learning_rate": 4.98017615152291e-05,
386
+ "loss": 0.884,
387
+ "step": 540
388
+ },
389
+ {
390
+ "epoch": 0.13879250520471895,
391
+ "grad_norm": 0.8015512228012085,
392
+ "learning_rate": 4.977310990247807e-05,
393
+ "loss": 0.8767,
394
+ "step": 550
395
+ },
396
+ {
397
+ "epoch": 0.1413160052993502,
398
+ "grad_norm": 0.8083379864692688,
399
+ "learning_rate": 4.974253449648031e-05,
400
+ "loss": 0.8861,
401
+ "step": 560
402
+ },
403
+ {
404
+ "epoch": 0.14383950539398146,
405
+ "grad_norm": 0.7726438045501709,
406
+ "learning_rate": 4.971003767161516e-05,
407
+ "loss": 0.8747,
408
+ "step": 570
409
+ },
410
+ {
411
+ "epoch": 0.1463630054886127,
412
+ "grad_norm": 0.7719607949256897,
413
+ "learning_rate": 4.9675621951472584e-05,
414
+ "loss": 0.8862,
415
+ "step": 580
416
+ },
417
+ {
418
+ "epoch": 0.14888650558324396,
419
+ "grad_norm": 0.7348030209541321,
420
+ "learning_rate": 4.9639290008657304e-05,
421
+ "loss": 0.8915,
422
+ "step": 590
423
+ },
424
+ {
425
+ "epoch": 0.15141000567787521,
426
+ "grad_norm": 0.7903275489807129,
427
+ "learning_rate": 4.960104466458118e-05,
428
+ "loss": 0.8916,
429
+ "step": 600
430
+ },
431
+ {
432
+ "epoch": 0.15393350577250647,
433
+ "grad_norm": 0.778893232345581,
434
+ "learning_rate": 4.956088888924414e-05,
435
+ "loss": 0.8674,
436
+ "step": 610
437
+ },
438
+ {
439
+ "epoch": 0.15645700586713773,
440
+ "grad_norm": 0.7972787618637085,
441
+ "learning_rate": 4.951882580100353e-05,
442
+ "loss": 0.8908,
443
+ "step": 620
444
+ },
445
+ {
446
+ "epoch": 0.15898050596176896,
447
+ "grad_norm": 0.741663932800293,
448
+ "learning_rate": 4.947485866633199e-05,
449
+ "loss": 0.8876,
450
+ "step": 630
451
+ },
452
+ {
453
+ "epoch": 0.16150400605640022,
454
+ "grad_norm": 0.8059448003768921,
455
+ "learning_rate": 4.94289908995637e-05,
456
+ "loss": 0.8164,
457
+ "step": 640
458
+ },
459
+ {
460
+ "epoch": 0.16402750615103148,
461
+ "grad_norm": 0.8004572987556458,
462
+ "learning_rate": 4.938122606262936e-05,
463
+ "loss": 0.9075,
464
+ "step": 650
465
+ },
466
+ {
467
+ "epoch": 0.16655100624566274,
468
+ "grad_norm": 1.5734790563583374,
469
+ "learning_rate": 4.9331567864779457e-05,
470
+ "loss": 0.9146,
471
+ "step": 660
472
+ },
473
+ {
474
+ "epoch": 0.169074506340294,
475
+ "grad_norm": 0.7215042114257812,
476
+ "learning_rate": 4.928002016229634e-05,
477
+ "loss": 0.885,
478
+ "step": 670
479
+ },
480
+ {
481
+ "epoch": 0.17159800643492523,
482
+ "grad_norm": 0.6652220487594604,
483
+ "learning_rate": 4.9226586958194647e-05,
484
+ "loss": 0.9085,
485
+ "step": 680
486
+ },
487
+ {
488
+ "epoch": 0.1741215065295565,
489
+ "grad_norm": 0.676682710647583,
490
+ "learning_rate": 4.9171272401910504e-05,
491
+ "loss": 0.837,
492
+ "step": 690
493
+ },
494
+ {
495
+ "epoch": 0.17664500662418775,
496
+ "grad_norm": 0.7034597396850586,
497
+ "learning_rate": 4.9114080788979284e-05,
498
+ "loss": 0.8905,
499
+ "step": 700
500
+ },
501
+ {
502
+ "epoch": 0.179168506718819,
503
+ "grad_norm": 0.7657853960990906,
504
+ "learning_rate": 4.905501656070202e-05,
505
+ "loss": 0.8945,
506
+ "step": 710
507
+ },
508
+ {
509
+ "epoch": 0.18169200681345027,
510
+ "grad_norm": 0.7395844459533691,
511
+ "learning_rate": 4.8994084303800525e-05,
512
+ "loss": 0.8762,
513
+ "step": 720
514
+ },
515
+ {
516
+ "epoch": 0.1842155069080815,
517
+ "grad_norm": 0.7073786854743958,
518
+ "learning_rate": 4.89312887500612e-05,
519
+ "loss": 0.8824,
520
+ "step": 730
521
+ },
522
+ {
523
+ "epoch": 0.18673900700271276,
524
+ "grad_norm": 0.7239210605621338,
525
+ "learning_rate": 4.8866634775967544e-05,
526
+ "loss": 0.8855,
527
+ "step": 740
528
+ },
529
+ {
530
+ "epoch": 0.18926250709734402,
531
+ "grad_norm": 0.6406372785568237,
532
+ "learning_rate": 4.880012740232154e-05,
533
+ "loss": 0.8775,
534
+ "step": 750
535
+ },
536
+ {
537
+ "epoch": 0.19178600719197528,
538
+ "grad_norm": 0.76404869556427,
539
+ "learning_rate": 4.873177179385368e-05,
540
+ "loss": 0.862,
541
+ "step": 760
542
+ },
543
+ {
544
+ "epoch": 0.19430950728660654,
545
+ "grad_norm": 0.7401562929153442,
546
+ "learning_rate": 4.866157325882192e-05,
547
+ "loss": 0.8734,
548
+ "step": 770
549
+ },
550
+ {
551
+ "epoch": 0.19683300738123777,
552
+ "grad_norm": 0.7563286423683167,
553
+ "learning_rate": 4.858953724859948e-05,
554
+ "loss": 0.8652,
555
+ "step": 780
556
+ },
557
+ {
558
+ "epoch": 0.19935650747586903,
559
+ "grad_norm": 0.7244860529899597,
560
+ "learning_rate": 4.851566935725147e-05,
561
+ "loss": 0.8436,
562
+ "step": 790
563
+ },
564
+ {
565
+ "epoch": 0.20188000757050029,
566
+ "grad_norm": 0.7061064839363098,
567
+ "learning_rate": 4.843997532110051e-05,
568
+ "loss": 0.8717,
569
+ "step": 800
570
+ },
571
+ {
572
+ "epoch": 0.20440350766513155,
573
+ "grad_norm": 0.9287751913070679,
574
+ "learning_rate": 4.836246101828124e-05,
575
+ "loss": 0.884,
576
+ "step": 810
577
+ },
578
+ {
579
+ "epoch": 0.2069270077597628,
580
+ "grad_norm": 0.6689929366111755,
581
+ "learning_rate": 4.828313246828386e-05,
582
+ "loss": 0.8871,
583
+ "step": 820
584
+ },
585
+ {
586
+ "epoch": 0.20945050785439404,
587
+ "grad_norm": 0.7176743149757385,
588
+ "learning_rate": 4.820199583148667e-05,
589
+ "loss": 0.8799,
590
+ "step": 830
591
+ },
592
+ {
593
+ "epoch": 0.2119740079490253,
594
+ "grad_norm": 0.6979175209999084,
595
+ "learning_rate": 4.811905740867769e-05,
596
+ "loss": 0.8874,
597
+ "step": 840
598
+ },
599
+ {
600
+ "epoch": 0.21449750804365655,
601
+ "grad_norm": 0.780451774597168,
602
+ "learning_rate": 4.803432364056535e-05,
603
+ "loss": 0.8843,
604
+ "step": 850
605
+ },
606
+ {
607
+ "epoch": 0.2170210081382878,
608
+ "grad_norm": 0.7175182700157166,
609
+ "learning_rate": 4.794780110727832e-05,
610
+ "loss": 0.8578,
611
+ "step": 860
612
+ },
613
+ {
614
+ "epoch": 0.21954450823291907,
615
+ "grad_norm": 0.724116861820221,
616
+ "learning_rate": 4.785949652785453e-05,
617
+ "loss": 0.8869,
618
+ "step": 870
619
+ },
620
+ {
621
+ "epoch": 0.2220680083275503,
622
+ "grad_norm": 0.8724785447120667,
623
+ "learning_rate": 4.776941675971941e-05,
624
+ "loss": 0.8648,
625
+ "step": 880
626
+ },
627
+ {
628
+ "epoch": 0.22459150842218156,
629
+ "grad_norm": 0.7354777455329895,
630
+ "learning_rate": 4.767756879815334e-05,
631
+ "loss": 0.8683,
632
+ "step": 890
633
+ },
634
+ {
635
+ "epoch": 0.22711500851681282,
636
+ "grad_norm": 0.7593517899513245,
637
+ "learning_rate": 4.758395977574841e-05,
638
+ "loss": 0.9101,
639
+ "step": 900
640
+ },
641
+ {
642
+ "epoch": 0.22963850861144408,
643
+ "grad_norm": 0.7243201732635498,
644
+ "learning_rate": 4.748859696185458e-05,
645
+ "loss": 0.8945,
646
+ "step": 910
647
+ },
648
+ {
649
+ "epoch": 0.23216200870607534,
650
+ "grad_norm": 0.6870005130767822,
651
+ "learning_rate": 4.739148776201512e-05,
652
+ "loss": 0.8158,
653
+ "step": 920
654
+ },
655
+ {
656
+ "epoch": 0.23468550880070657,
657
+ "grad_norm": 0.7116649746894836,
658
+ "learning_rate": 4.729263971739154e-05,
659
+ "loss": 0.8855,
660
+ "step": 930
661
+ },
662
+ {
663
+ "epoch": 0.23720900889533783,
664
+ "grad_norm": 0.6931096911430359,
665
+ "learning_rate": 4.719206050417796e-05,
666
+ "loss": 0.8674,
667
+ "step": 940
668
+ },
669
+ {
670
+ "epoch": 0.2397325089899691,
671
+ "grad_norm": 0.7314079999923706,
672
+ "learning_rate": 4.7089757933005016e-05,
673
+ "loss": 0.8743,
674
+ "step": 950
675
+ },
676
+ {
677
+ "epoch": 0.24225600908460035,
678
+ "grad_norm": 0.7538678646087646,
679
+ "learning_rate": 4.698573994833332e-05,
680
+ "loss": 0.866,
681
+ "step": 960
682
+ },
683
+ {
684
+ "epoch": 0.2447795091792316,
685
+ "grad_norm": 0.6961751580238342,
686
+ "learning_rate": 4.688001462783648e-05,
687
+ "loss": 0.8528,
688
+ "step": 970
689
+ },
690
+ {
691
+ "epoch": 0.24730300927386284,
692
+ "grad_norm": 0.7808176875114441,
693
+ "learning_rate": 4.6772590181773866e-05,
694
+ "loss": 0.8315,
695
+ "step": 980
696
+ },
697
+ {
698
+ "epoch": 0.2498265093684941,
699
+ "grad_norm": 0.716074526309967,
700
+ "learning_rate": 4.6663474952353004e-05,
701
+ "loss": 0.8372,
702
+ "step": 990
703
+ },
704
+ {
705
+ "epoch": 0.25235000946312536,
706
+ "grad_norm": 0.8192372918128967,
707
+ "learning_rate": 4.6552677413081756e-05,
708
+ "loss": 0.902,
709
+ "step": 1000
710
+ },
711
+ {
712
+ "epoch": 0.2548735095577566,
713
+ "grad_norm": 0.7088383436203003,
714
+ "learning_rate": 4.644020616811029e-05,
715
+ "loss": 0.8847,
716
+ "step": 1010
717
+ },
718
+ {
719
+ "epoch": 0.2573970096523879,
720
+ "grad_norm": 0.8579234480857849,
721
+ "learning_rate": 4.6326069951562924e-05,
722
+ "loss": 0.9071,
723
+ "step": 1020
724
+ },
725
+ {
726
+ "epoch": 0.25992050974701914,
727
+ "grad_norm": 0.6537004709243774,
728
+ "learning_rate": 4.6210277626859856e-05,
729
+ "loss": 0.8187,
730
+ "step": 1030
731
+ },
732
+ {
733
+ "epoch": 0.2624440098416504,
734
+ "grad_norm": 0.6265996694564819,
735
+ "learning_rate": 4.609283818602884e-05,
736
+ "loss": 0.8744,
737
+ "step": 1040
738
+ },
739
+ {
740
+ "epoch": 0.2649675099362816,
741
+ "grad_norm": 0.7445203065872192,
742
+ "learning_rate": 4.5973760749006963e-05,
743
+ "loss": 0.8831,
744
+ "step": 1050
745
+ },
746
+ {
747
+ "epoch": 0.26749101003091286,
748
+ "grad_norm": 0.7054116129875183,
749
+ "learning_rate": 4.585305456293235e-05,
750
+ "loss": 0.9171,
751
+ "step": 1060
752
+ },
753
+ {
754
+ "epoch": 0.2700145101255441,
755
+ "grad_norm": 1.429075837135315,
756
+ "learning_rate": 4.5730729001426083e-05,
757
+ "loss": 0.8894,
758
+ "step": 1070
759
+ },
760
+ {
761
+ "epoch": 0.2725380102201754,
762
+ "grad_norm": 0.6793610453605652,
763
+ "learning_rate": 4.5606793563864316e-05,
764
+ "loss": 0.8629,
765
+ "step": 1080
766
+ },
767
+ {
768
+ "epoch": 0.27506151031480663,
769
+ "grad_norm": 0.6932589411735535,
770
+ "learning_rate": 4.548125787464054e-05,
771
+ "loss": 0.8564,
772
+ "step": 1090
773
+ },
774
+ {
775
+ "epoch": 0.2775850104094379,
776
+ "grad_norm": 0.6600730419158936,
777
+ "learning_rate": 4.535413168241821e-05,
778
+ "loss": 0.8685,
779
+ "step": 1100
780
+ },
781
+ {
782
+ "epoch": 0.28010851050406915,
783
+ "grad_norm": 0.6784124970436096,
784
+ "learning_rate": 4.522542485937369e-05,
785
+ "loss": 0.9024,
786
+ "step": 1110
787
+ },
788
+ {
789
+ "epoch": 0.2826320105987004,
790
+ "grad_norm": 0.6841257214546204,
791
+ "learning_rate": 4.509514740042962e-05,
792
+ "loss": 0.8698,
793
+ "step": 1120
794
+ },
795
+ {
796
+ "epoch": 0.28515551069333167,
797
+ "grad_norm": 0.7785212397575378,
798
+ "learning_rate": 4.496330942247873e-05,
799
+ "loss": 0.8731,
800
+ "step": 1130
801
+ },
802
+ {
803
+ "epoch": 0.28767901078796293,
804
+ "grad_norm": 0.730110228061676,
805
+ "learning_rate": 4.482992116359824e-05,
806
+ "loss": 0.8542,
807
+ "step": 1140
808
+ },
809
+ {
810
+ "epoch": 0.29020251088259413,
811
+ "grad_norm": 0.6644122004508972,
812
+ "learning_rate": 4.469499298225473e-05,
813
+ "loss": 0.8246,
814
+ "step": 1150
815
+ },
816
+ {
817
+ "epoch": 0.2927260109772254,
818
+ "grad_norm": 0.7170603275299072,
819
+ "learning_rate": 4.455853535649984e-05,
820
+ "loss": 0.8576,
821
+ "step": 1160
822
+ },
823
+ {
824
+ "epoch": 0.29524951107185665,
825
+ "grad_norm": 0.6883527040481567,
826
+ "learning_rate": 4.442055888315646e-05,
827
+ "loss": 0.8639,
828
+ "step": 1170
829
+ },
830
+ {
831
+ "epoch": 0.2977730111664879,
832
+ "grad_norm": 0.6971318125724792,
833
+ "learning_rate": 4.4281074276995936e-05,
834
+ "loss": 0.8218,
835
+ "step": 1180
836
+ },
837
+ {
838
+ "epoch": 0.30029651126111917,
839
+ "grad_norm": 0.7020850777626038,
840
+ "learning_rate": 4.4140092369905914e-05,
841
+ "loss": 0.8376,
842
+ "step": 1190
843
+ },
844
+ {
845
+ "epoch": 0.30282001135575043,
846
+ "grad_norm": 0.6218104362487793,
847
+ "learning_rate": 4.399762411004922e-05,
848
+ "loss": 0.8741,
849
+ "step": 1200
850
+ },
851
+ {
852
+ "epoch": 0.3053435114503817,
853
+ "grad_norm": 0.8031836152076721,
854
+ "learning_rate": 4.3853680561013647e-05,
855
+ "loss": 0.8977,
856
+ "step": 1210
857
+ },
858
+ {
859
+ "epoch": 0.30786701154501295,
860
+ "grad_norm": 0.6999651789665222,
861
+ "learning_rate": 4.370827290095277e-05,
862
+ "loss": 0.8628,
863
+ "step": 1220
864
+ },
865
+ {
866
+ "epoch": 0.3103905116396442,
867
+ "grad_norm": 0.6727817058563232,
868
+ "learning_rate": 4.356141242171795e-05,
869
+ "loss": 0.8674,
870
+ "step": 1230
871
+ },
872
+ {
873
+ "epoch": 0.31291401173427547,
874
+ "grad_norm": 0.6965411305427551,
875
+ "learning_rate": 4.3413110527981406e-05,
876
+ "loss": 0.8416,
877
+ "step": 1240
878
+ },
879
+ {
880
+ "epoch": 0.31543751182890667,
881
+ "grad_norm": 0.7655733823776245,
882
+ "learning_rate": 4.3263378736350566e-05,
883
+ "loss": 0.8662,
884
+ "step": 1250
885
+ },
886
+ {
887
+ "epoch": 0.31796101192353793,
888
+ "grad_norm": 0.7115268111228943,
889
+ "learning_rate": 4.311222867447375e-05,
890
+ "loss": 0.9022,
891
+ "step": 1260
892
+ },
893
+ {
894
+ "epoch": 0.3204845120181692,
895
+ "grad_norm": 0.7572771310806274,
896
+ "learning_rate": 4.295967208013717e-05,
897
+ "loss": 0.8649,
898
+ "step": 1270
899
+ },
900
+ {
901
+ "epoch": 0.32300801211280045,
902
+ "grad_norm": 0.6894986629486084,
903
+ "learning_rate": 4.280572080035348e-05,
904
+ "loss": 0.8659,
905
+ "step": 1280
906
+ },
907
+ {
908
+ "epoch": 0.3255315122074317,
909
+ "grad_norm": 0.6966748833656311,
910
+ "learning_rate": 4.2650386790441696e-05,
911
+ "loss": 0.8558,
912
+ "step": 1290
913
+ },
914
+ {
915
+ "epoch": 0.32805501230206296,
916
+ "grad_norm": 0.7241553664207458,
917
+ "learning_rate": 4.2493682113098855e-05,
918
+ "loss": 0.8666,
919
+ "step": 1300
920
+ },
921
+ {
922
+ "epoch": 0.3305785123966942,
923
+ "grad_norm": 0.6839144825935364,
924
+ "learning_rate": 4.233561893746323e-05,
925
+ "loss": 0.8879,
926
+ "step": 1310
927
+ },
928
+ {
929
+ "epoch": 0.3331020124913255,
930
+ "grad_norm": 0.6955851912498474,
931
+ "learning_rate": 4.217620953816935e-05,
932
+ "loss": 0.8446,
933
+ "step": 1320
934
+ },
935
+ {
936
+ "epoch": 0.33562551258595674,
937
+ "grad_norm": 0.6097539067268372,
938
+ "learning_rate": 4.2015466294394756e-05,
939
+ "loss": 0.8816,
940
+ "step": 1330
941
+ },
942
+ {
943
+ "epoch": 0.338149012680588,
944
+ "grad_norm": 0.7663230299949646,
945
+ "learning_rate": 4.185340168889868e-05,
946
+ "loss": 0.8518,
947
+ "step": 1340
948
+ },
949
+ {
950
+ "epoch": 0.3406725127752192,
951
+ "grad_norm": 0.6563027501106262,
952
+ "learning_rate": 4.169002830705274e-05,
953
+ "loss": 0.8516,
954
+ "step": 1350
955
+ },
956
+ {
957
+ "epoch": 0.34319601286985046,
958
+ "grad_norm": 0.639011025428772,
959
+ "learning_rate": 4.152535883586352e-05,
960
+ "loss": 0.8324,
961
+ "step": 1360
962
+ },
963
+ {
964
+ "epoch": 0.3457195129644817,
965
+ "grad_norm": 0.7072712779045105,
966
+ "learning_rate": 4.135940606298738e-05,
967
+ "loss": 0.8445,
968
+ "step": 1370
969
+ },
970
+ {
971
+ "epoch": 0.348243013059113,
972
+ "grad_norm": 0.6532591581344604,
973
+ "learning_rate": 4.119218287573743e-05,
974
+ "loss": 0.8293,
975
+ "step": 1380
976
+ },
977
+ {
978
+ "epoch": 0.35076651315374424,
979
+ "grad_norm": 0.6421136260032654,
980
+ "learning_rate": 4.102370226008271e-05,
981
+ "loss": 0.8809,
982
+ "step": 1390
983
+ },
984
+ {
985
+ "epoch": 0.3532900132483755,
986
+ "grad_norm": 0.6466293931007385,
987
+ "learning_rate": 4.085397729963976e-05,
988
+ "loss": 0.8478,
989
+ "step": 1400
990
+ },
991
+ {
992
+ "epoch": 0.35581351334300676,
993
+ "grad_norm": 0.7026222348213196,
994
+ "learning_rate": 4.06830211746566e-05,
995
+ "loss": 0.8855,
996
+ "step": 1410
997
+ },
998
+ {
999
+ "epoch": 0.358337013437638,
1000
+ "grad_norm": 0.7792401313781738,
1001
+ "learning_rate": 4.051084716098921e-05,
1002
+ "loss": 0.8523,
1003
+ "step": 1420
1004
+ },
1005
+ {
1006
+ "epoch": 0.3608605135322693,
1007
+ "grad_norm": 0.641736626625061,
1008
+ "learning_rate": 4.0337468629070496e-05,
1009
+ "loss": 0.8605,
1010
+ "step": 1430
1011
+ },
1012
+ {
1013
+ "epoch": 0.36338401362690054,
1014
+ "grad_norm": 0.6911234855651855,
1015
+ "learning_rate": 4.016289904287212e-05,
1016
+ "loss": 0.8492,
1017
+ "step": 1440
1018
+ },
1019
+ {
1020
+ "epoch": 0.36590751372153174,
1021
+ "grad_norm": 0.7274027466773987,
1022
+ "learning_rate": 3.9987151958858794e-05,
1023
+ "loss": 0.8623,
1024
+ "step": 1450
1025
+ },
1026
+ {
1027
+ "epoch": 0.368431013816163,
1028
+ "grad_norm": 0.6672956347465515,
1029
+ "learning_rate": 3.981024102493566e-05,
1030
+ "loss": 0.8309,
1031
+ "step": 1460
1032
+ },
1033
+ {
1034
+ "epoch": 0.37095451391079426,
1035
+ "grad_norm": 0.7280237078666687,
1036
+ "learning_rate": 3.963217997938834e-05,
1037
+ "loss": 0.8633,
1038
+ "step": 1470
1039
+ },
1040
+ {
1041
+ "epoch": 0.3734780140054255,
1042
+ "grad_norm": 0.749769389629364,
1043
+ "learning_rate": 3.945298264981614e-05,
1044
+ "loss": 0.8433,
1045
+ "step": 1480
1046
+ },
1047
+ {
1048
+ "epoch": 0.3760015141000568,
1049
+ "grad_norm": 0.7026387453079224,
1050
+ "learning_rate": 3.927266295205818e-05,
1051
+ "loss": 0.8665,
1052
+ "step": 1490
1053
+ },
1054
+ {
1055
+ "epoch": 0.37852501419468804,
1056
+ "grad_norm": 0.6626182794570923,
1057
+ "learning_rate": 3.9091234889112815e-05,
1058
+ "loss": 0.8597,
1059
+ "step": 1500
1060
+ },
1061
+ {
1062
+ "epoch": 0.3810485142893193,
1063
+ "grad_norm": 0.6502306461334229,
1064
+ "learning_rate": 3.8908712550050154e-05,
1065
+ "loss": 0.8652,
1066
+ "step": 1510
1067
+ },
1068
+ {
1069
+ "epoch": 0.38357201438395055,
1070
+ "grad_norm": 0.6474471688270569,
1071
+ "learning_rate": 3.8725110108917975e-05,
1072
+ "loss": 0.8258,
1073
+ "step": 1520
1074
+ },
1075
+ {
1076
+ "epoch": 0.3860955144785818,
1077
+ "grad_norm": 0.6739810109138489,
1078
+ "learning_rate": 3.854044182364098e-05,
1079
+ "loss": 0.8578,
1080
+ "step": 1530
1081
+ },
1082
+ {
1083
+ "epoch": 0.3886190145732131,
1084
+ "grad_norm": 0.7030637264251709,
1085
+ "learning_rate": 3.835472203491367e-05,
1086
+ "loss": 0.8468,
1087
+ "step": 1540
1088
+ },
1089
+ {
1090
+ "epoch": 0.3911425146678443,
1091
+ "grad_norm": 0.6305805444717407,
1092
+ "learning_rate": 3.816796516508658e-05,
1093
+ "loss": 0.8476,
1094
+ "step": 1550
1095
+ },
1096
+ {
1097
+ "epoch": 0.39366601476247554,
1098
+ "grad_norm": 0.6209976077079773,
1099
+ "learning_rate": 3.798018571704638e-05,
1100
+ "loss": 0.8376,
1101
+ "step": 1560
1102
+ },
1103
+ {
1104
+ "epoch": 0.3961895148571068,
1105
+ "grad_norm": 0.6698387265205383,
1106
+ "learning_rate": 3.779139827308956e-05,
1107
+ "loss": 0.8744,
1108
+ "step": 1570
1109
+ },
1110
+ {
1111
+ "epoch": 0.39871301495173805,
1112
+ "grad_norm": 0.7300374507904053,
1113
+ "learning_rate": 3.760161749379008e-05,
1114
+ "loss": 0.8609,
1115
+ "step": 1580
1116
+ },
1117
+ {
1118
+ "epoch": 0.4012365150463693,
1119
+ "grad_norm": 0.6837272047996521,
1120
+ "learning_rate": 3.7410858116860836e-05,
1121
+ "loss": 0.837,
1122
+ "step": 1590
1123
+ },
1124
+ {
1125
+ "epoch": 0.40376001514100057,
1126
+ "grad_norm": 0.6649072170257568,
1127
+ "learning_rate": 3.721913495600923e-05,
1128
+ "loss": 0.8694,
1129
+ "step": 1600
1130
+ },
1131
+ {
1132
+ "epoch": 0.40628351523563183,
1133
+ "grad_norm": 0.5960752367973328,
1134
+ "learning_rate": 3.7026462899786726e-05,
1135
+ "loss": 0.8129,
1136
+ "step": 1610
1137
+ },
1138
+ {
1139
+ "epoch": 0.4088070153302631,
1140
+ "grad_norm": 0.6648868322372437,
1141
+ "learning_rate": 3.683285691043272e-05,
1142
+ "loss": 0.8634,
1143
+ "step": 1620
1144
+ },
1145
+ {
1146
+ "epoch": 0.41133051542489435,
1147
+ "grad_norm": 0.7035058736801147,
1148
+ "learning_rate": 3.663833202271257e-05,
1149
+ "loss": 0.8685,
1150
+ "step": 1630
1151
+ },
1152
+ {
1153
+ "epoch": 0.4138540155195256,
1154
+ "grad_norm": 0.6673656702041626,
1155
+ "learning_rate": 3.6442903342750084e-05,
1156
+ "loss": 0.8063,
1157
+ "step": 1640
1158
+ },
1159
+ {
1160
+ "epoch": 0.4163775156141568,
1161
+ "grad_norm": 0.6990562081336975,
1162
+ "learning_rate": 3.624658604685443e-05,
1163
+ "loss": 0.8335,
1164
+ "step": 1650
1165
+ },
1166
+ {
1167
+ "epoch": 0.41890101570878807,
1168
+ "grad_norm": 0.7190445065498352,
1169
+ "learning_rate": 3.604939538034158e-05,
1170
+ "loss": 0.8509,
1171
+ "step": 1660
1172
+ },
1173
+ {
1174
+ "epoch": 0.42142451580341933,
1175
+ "grad_norm": 0.7450734376907349,
1176
+ "learning_rate": 3.585134665635041e-05,
1177
+ "loss": 0.8446,
1178
+ "step": 1670
1179
+ },
1180
+ {
1181
+ "epoch": 0.4239480158980506,
1182
+ "grad_norm": 0.6475887298583984,
1183
+ "learning_rate": 3.565245525465355e-05,
1184
+ "loss": 0.8836,
1185
+ "step": 1680
1186
+ },
1187
+ {
1188
+ "epoch": 0.42647151599268185,
1189
+ "grad_norm": 0.6419990658760071,
1190
+ "learning_rate": 3.5452736620463064e-05,
1191
+ "loss": 0.8428,
1192
+ "step": 1690
1193
+ },
1194
+ {
1195
+ "epoch": 0.4289950160873131,
1196
+ "grad_norm": 0.7428763508796692,
1197
+ "learning_rate": 3.525220626323097e-05,
1198
+ "loss": 0.8247,
1199
+ "step": 1700
1200
+ },
1201
+ {
1202
+ "epoch": 0.43151851618194437,
1203
+ "grad_norm": 0.6717978119850159,
1204
+ "learning_rate": 3.5050879755444877e-05,
1205
+ "loss": 0.881,
1206
+ "step": 1710
1207
+ },
1208
+ {
1209
+ "epoch": 0.4340420162765756,
1210
+ "grad_norm": 0.6862205862998962,
1211
+ "learning_rate": 3.484877273141866e-05,
1212
+ "loss": 0.8511,
1213
+ "step": 1720
1214
+ },
1215
+ {
1216
+ "epoch": 0.4365655163712069,
1217
+ "grad_norm": 0.6874988079071045,
1218
+ "learning_rate": 3.464590088607839e-05,
1219
+ "loss": 0.8649,
1220
+ "step": 1730
1221
+ },
1222
+ {
1223
+ "epoch": 0.43908901646583814,
1224
+ "grad_norm": 0.6635965704917908,
1225
+ "learning_rate": 3.444227997374345e-05,
1226
+ "loss": 0.8719,
1227
+ "step": 1740
1228
+ },
1229
+ {
1230
+ "epoch": 0.44161251656046935,
1231
+ "grad_norm": 0.7285788655281067,
1232
+ "learning_rate": 3.4237925806903184e-05,
1233
+ "loss": 0.8534,
1234
+ "step": 1750
1235
+ },
1236
+ {
1237
+ "epoch": 0.4441360166551006,
1238
+ "grad_norm": 0.6177170872688293,
1239
+ "learning_rate": 3.403285425498889e-05,
1240
+ "loss": 0.8516,
1241
+ "step": 1760
1242
+ },
1243
+ {
1244
+ "epoch": 0.44665951674973187,
1245
+ "grad_norm": 0.7633406519889832,
1246
+ "learning_rate": 3.3827081243141534e-05,
1247
+ "loss": 0.8193,
1248
+ "step": 1770
1249
+ },
1250
+ {
1251
+ "epoch": 0.4491830168443631,
1252
+ "grad_norm": 0.6661052107810974,
1253
+ "learning_rate": 3.362062275097496e-05,
1254
+ "loss": 0.8745,
1255
+ "step": 1780
1256
+ },
1257
+ {
1258
+ "epoch": 0.4517065169389944,
1259
+ "grad_norm": 0.7744668126106262,
1260
+ "learning_rate": 3.341349481133507e-05,
1261
+ "loss": 0.8158,
1262
+ "step": 1790
1263
+ },
1264
+ {
1265
+ "epoch": 0.45423001703362564,
1266
+ "grad_norm": 0.6634140014648438,
1267
+ "learning_rate": 3.320571350905466e-05,
1268
+ "loss": 0.8574,
1269
+ "step": 1800
1270
+ },
1271
+ {
1272
+ "epoch": 0.4567535171282569,
1273
+ "grad_norm": 0.7289906740188599,
1274
+ "learning_rate": 3.299729497970444e-05,
1275
+ "loss": 0.8776,
1276
+ "step": 1810
1277
+ },
1278
+ {
1279
+ "epoch": 0.45927701722288816,
1280
+ "grad_norm": 0.6595107913017273,
1281
+ "learning_rate": 3.278825540833995e-05,
1282
+ "loss": 0.8416,
1283
+ "step": 1820
1284
+ },
1285
+ {
1286
+ "epoch": 0.4618005173175194,
1287
+ "grad_norm": 0.6596432328224182,
1288
+ "learning_rate": 3.2578611028244656e-05,
1289
+ "loss": 0.8295,
1290
+ "step": 1830
1291
+ },
1292
+ {
1293
+ "epoch": 0.4643240174121507,
1294
+ "grad_norm": 0.7007511258125305,
1295
+ "learning_rate": 3.2368378119669363e-05,
1296
+ "loss": 0.8075,
1297
+ "step": 1840
1298
  },
1299
  {
1300
+ "epoch": 0.4668475175067819,
1301
+ "grad_norm": 0.5890100598335266,
1302
+ "learning_rate": 3.215757300856796e-05,
1303
+ "loss": 0.8331,
1304
+ "step": 1850
1305
  },
1306
  {
1307
+ "epoch": 0.46937101760141314,
1308
+ "grad_norm": 0.670438826084137,
1309
+ "learning_rate": 3.194621206532957e-05,
1310
+ "loss": 0.8739,
1311
+ "step": 1860
1312
  },
1313
  {
1314
+ "epoch": 0.4718945176960444,
1315
+ "grad_norm": 0.6237263083457947,
1316
+ "learning_rate": 3.173431170350732e-05,
1317
+ "loss": 0.8377,
1318
+ "step": 1870
1319
  },
1320
  {
1321
+ "epoch": 0.47441801779067566,
1322
+ "grad_norm": 0.7160887122154236,
1323
+ "learning_rate": 3.152188837854369e-05,
1324
+ "loss": 0.8708,
1325
+ "step": 1880
1326
  },
1327
  {
1328
+ "epoch": 0.4769415178853069,
1329
+ "grad_norm": 0.6525737643241882,
1330
+ "learning_rate": 3.130895858649264e-05,
1331
+ "loss": 0.8207,
1332
+ "step": 1890
1333
  },
1334
  {
1335
+ "epoch": 0.4794650179799382,
1336
+ "grad_norm": 0.7249549627304077,
1337
+ "learning_rate": 3.109553886273863e-05,
1338
+ "loss": 0.8516,
1339
+ "step": 1900
1340
  },
1341
  {
1342
+ "epoch": 0.48198851807456944,
1343
+ "grad_norm": 0.6668533682823181,
1344
+ "learning_rate": 3.088164578071246e-05,
1345
+ "loss": 0.8275,
1346
+ "step": 1910
1347
  },
1348
  {
1349
+ "epoch": 0.4845120181692007,
1350
+ "grad_norm": 0.7262100577354431,
1351
+ "learning_rate": 3.066729595060431e-05,
1352
+ "loss": 0.8147,
1353
+ "step": 1920
1354
  },
1355
  {
1356
+ "epoch": 0.48703551826383196,
1357
+ "grad_norm": 0.7166665196418762,
1358
+ "learning_rate": 3.0452506018073833e-05,
1359
+ "loss": 0.8514,
1360
+ "step": 1930
1361
  },
1362
  {
1363
+ "epoch": 0.4895590183584632,
1364
+ "grad_norm": 0.6810010075569153,
1365
+ "learning_rate": 3.0237292662957473e-05,
1366
+ "loss": 0.8323,
1367
+ "step": 1940
1368
  },
1369
  {
1370
+ "epoch": 0.4920825184530944,
1371
+ "grad_norm": 0.6473044157028198,
1372
+ "learning_rate": 3.0021672597973207e-05,
1373
+ "loss": 0.8265,
1374
+ "step": 1950
1375
  },
1376
  {
1377
+ "epoch": 0.4946060185477257,
1378
+ "grad_norm": 0.6784878969192505,
1379
+ "learning_rate": 2.9805662567422676e-05,
1380
+ "loss": 0.8636,
1381
+ "step": 1960
1382
  },
1383
  {
1384
+ "epoch": 0.49712951864235694,
1385
+ "grad_norm": 0.7378344535827637,
1386
+ "learning_rate": 2.9589279345890895e-05,
1387
+ "loss": 0.8483,
1388
+ "step": 1970
1389
  },
1390
  {
1391
+ "epoch": 0.4996530187369882,
1392
+ "grad_norm": 0.5715174078941345,
1393
+ "learning_rate": 2.9372539736943577e-05,
1394
+ "loss": 0.8434,
1395
+ "step": 1980
1396
  },
1397
  {
1398
+ "epoch": 0.5021765188316195,
1399
+ "grad_norm": 0.5842220783233643,
1400
+ "learning_rate": 2.9155460571822245e-05,
1401
+ "loss": 0.8305,
1402
+ "step": 1990
1403
  },
1404
  {
1405
+ "epoch": 0.5047000189262507,
1406
+ "grad_norm": 0.7206842303276062,
1407
+ "learning_rate": 2.893805870813717e-05,
1408
+ "loss": 0.8127,
1409
+ "step": 2000
1410
  },
1411
  {
1412
+ "epoch": 0.5072235190208819,
1413
+ "grad_norm": 0.6641551852226257,
1414
+ "learning_rate": 2.872035102855826e-05,
1415
+ "loss": 0.8272,
1416
+ "step": 2010
1417
  },
1418
  {
1419
+ "epoch": 0.5097470191155132,
1420
+ "grad_norm": 0.6917135119438171,
1421
+ "learning_rate": 2.850235443950402e-05,
1422
+ "loss": 0.7998,
1423
+ "step": 2020
1424
  },
1425
  {
1426
+ "epoch": 0.5122705192101444,
1427
+ "grad_norm": 0.6133066415786743,
1428
+ "learning_rate": 2.8284085869828665e-05,
1429
+ "loss": 0.8413,
1430
+ "step": 2030
1431
  },
1432
  {
1433
+ "epoch": 0.5147940193047758,
1434
+ "grad_norm": 0.6827579140663147,
1435
+ "learning_rate": 2.8065562269507463e-05,
1436
+ "loss": 0.8452,
1437
+ "step": 2040
1438
  },
1439
  {
1440
+ "epoch": 0.517317519399407,
1441
+ "grad_norm": 0.7090153694152832,
1442
+ "learning_rate": 2.7846800608320485e-05,
1443
+ "loss": 0.8293,
1444
+ "step": 2050
1445
  },
1446
  {
1447
+ "epoch": 0.5198410194940383,
1448
+ "grad_norm": 0.6256769299507141,
1449
+ "learning_rate": 2.7627817874534762e-05,
1450
+ "loss": 0.8159,
1451
+ "step": 2060
1452
  },
1453
  {
1454
+ "epoch": 0.5223645195886695,
1455
+ "grad_norm": 0.6957070231437683,
1456
+ "learning_rate": 2.7408631073585068e-05,
1457
+ "loss": 0.8023,
1458
+ "step": 2070
1459
  },
1460
  {
1461
+ "epoch": 0.5248880196833008,
1462
+ "grad_norm": 0.6817536950111389,
1463
+ "learning_rate": 2.7189257226753305e-05,
1464
+ "loss": 0.8334,
1465
+ "step": 2080
1466
  },
1467
  {
1468
+ "epoch": 0.527411519777932,
1469
+ "grad_norm": 0.6535147428512573,
1470
+ "learning_rate": 2.696971336984672e-05,
1471
+ "loss": 0.8558,
1472
+ "step": 2090
1473
  },
1474
  {
1475
+ "epoch": 0.5299350198725632,
1476
+ "grad_norm": 0.7457418441772461,
1477
+ "learning_rate": 2.6750016551874945e-05,
1478
+ "loss": 0.8244,
1479
+ "step": 2100
1480
  },
1481
  {
1482
+ "epoch": 0.5324585199671945,
1483
+ "grad_norm": 0.6570724248886108,
1484
+ "learning_rate": 2.6530183833726025e-05,
1485
+ "loss": 0.8283,
1486
+ "step": 2110
1487
  },
1488
  {
1489
+ "epoch": 0.5349820200618257,
1490
+ "grad_norm": 0.7065024375915527,
1491
+ "learning_rate": 2.6310232286841546e-05,
1492
+ "loss": 0.8565,
1493
+ "step": 2120
1494
  },
1495
  {
1496
+ "epoch": 0.537505520156457,
1497
+ "grad_norm": 0.671667218208313,
1498
+ "learning_rate": 2.609017899189092e-05,
1499
+ "loss": 0.8447,
1500
+ "step": 2130
1501
  },
1502
  {
1503
+ "epoch": 0.5400290202510882,
1504
+ "grad_norm": 0.6672875285148621,
1505
+ "learning_rate": 2.587004103744495e-05,
1506
+ "loss": 0.7912,
1507
+ "step": 2140
1508
  },
1509
  {
1510
+ "epoch": 0.5425525203457195,
1511
+ "grad_norm": 0.6282544732093811,
1512
+ "learning_rate": 2.564983551864882e-05,
1513
+ "loss": 0.8079,
1514
+ "step": 2150
1515
  },
1516
  {
1517
+ "epoch": 0.5450760204403508,
1518
+ "grad_norm": 0.7435926795005798,
1519
+ "learning_rate": 2.54295795358945e-05,
1520
+ "loss": 0.8342,
1521
+ "step": 2160
1522
  },
1523
  {
1524
+ "epoch": 0.5475995205349821,
1525
+ "grad_norm": 0.6785821318626404,
1526
+ "learning_rate": 2.5209290193492834e-05,
1527
+ "loss": 0.8281,
1528
+ "step": 2170
1529
  },
1530
  {
1531
+ "epoch": 0.5501230206296133,
1532
+ "grad_norm": 0.6483226418495178,
1533
+ "learning_rate": 2.4988984598345247e-05,
1534
+ "loss": 0.79,
1535
+ "step": 2180
1536
  },
1537
  {
1538
+ "epoch": 0.5526465207242445,
1539
+ "grad_norm": 0.6465590000152588,
1540
+ "learning_rate": 2.4768679858615304e-05,
1541
+ "loss": 0.841,
1542
+ "step": 2190
1543
  },
1544
  {
1545
+ "epoch": 0.5551700208188758,
1546
+ "grad_norm": 0.7468442916870117,
1547
+ "learning_rate": 2.454839308240014e-05,
1548
+ "loss": 0.8717,
1549
+ "step": 2200
1550
  },
1551
  {
1552
+ "epoch": 0.557693520913507,
1553
+ "grad_norm": 0.6535473465919495,
1554
+ "learning_rate": 2.4328141376401903e-05,
1555
+ "loss": 0.826,
1556
+ "step": 2210
1557
  },
1558
  {
1559
+ "epoch": 0.5602170210081383,
1560
+ "grad_norm": 0.6404563188552856,
1561
+ "learning_rate": 2.4107941844599312e-05,
1562
+ "loss": 0.8062,
1563
+ "step": 2220
1564
  },
1565
  {
1566
+ "epoch": 0.5627405211027695,
1567
+ "grad_norm": 0.6602795720100403,
1568
+ "learning_rate": 2.3887811586919424e-05,
1569
+ "loss": 0.8418,
1570
+ "step": 2230
1571
  },
1572
  {
1573
+ "epoch": 0.5652640211974008,
1574
+ "grad_norm": 0.6988357305526733,
1575
+ "learning_rate": 2.3667767697909694e-05,
1576
+ "loss": 0.8177,
1577
+ "step": 2240
1578
  },
1579
  {
1580
+ "epoch": 0.567787521292032,
1581
+ "grad_norm": 0.6755298376083374,
1582
+ "learning_rate": 2.3447827265410517e-05,
1583
+ "loss": 0.8653,
1584
+ "step": 2250
1585
  },
1586
  {
1587
+ "epoch": 0.5703110213866633,
1588
+ "grad_norm": 0.72756028175354,
1589
+ "learning_rate": 2.3228007369228178e-05,
1590
+ "loss": 0.8896,
1591
+ "step": 2260
1592
  },
1593
  {
1594
+ "epoch": 0.5728345214812945,
1595
+ "grad_norm": 0.6584864854812622,
1596
+ "learning_rate": 2.3008325079808576e-05,
1597
+ "loss": 0.8393,
1598
+ "step": 2270
1599
+ },
1600
+ {
1601
+ "epoch": 0.5753580215759259,
1602
+ "grad_norm": 0.6699262857437134,
1603
+ "learning_rate": 2.2788797456911503e-05,
1604
+ "loss": 0.7976,
1605
+ "step": 2280
1606
  },
1607
  {
1608
+ "epoch": 0.5778815216705571,
1609
+ "grad_norm": 0.7463390827178955,
1610
+ "learning_rate": 2.2569441548285934e-05,
1611
+ "loss": 0.8321,
1612
+ "step": 2290
1613
+ },
1614
+ {
1615
+ "epoch": 0.5804050217651883,
1616
+ "grad_norm": 0.542870283126831,
1617
+ "learning_rate": 2.2350274388346064e-05,
1618
+ "loss": 0.786,
1619
  "step": 2300
1620
  },
1621
  {
1622
+ "epoch": 0.5829285218598196,
1623
+ "grad_norm": 0.652056872844696,
1624
+ "learning_rate": 2.213131299684858e-05,
1625
+ "loss": 0.848,
1626
+ "step": 2310
1627
+ },
1628
+ {
1629
+ "epoch": 0.5854520219544508,
1630
+ "grad_norm": 0.7307469248771667,
1631
+ "learning_rate": 2.191257437757086e-05,
1632
+ "loss": 0.8117,
1633
+ "step": 2320
1634
+ },
1635
+ {
1636
+ "epoch": 0.5879755220490821,
1637
+ "grad_norm": 0.6336262822151184,
1638
+ "learning_rate": 2.16940755169906e-05,
1639
+ "loss": 0.8417,
1640
+ "step": 2330
1641
+ },
1642
+ {
1643
+ "epoch": 0.5904990221437133,
1644
+ "grad_norm": 0.7636166214942932,
1645
+ "learning_rate": 2.1475833382966647e-05,
1646
+ "loss": 0.8786,
1647
+ "step": 2340
1648
+ },
1649
+ {
1650
+ "epoch": 0.5930225222383446,
1651
+ "grad_norm": 0.6622100472450256,
1652
+ "learning_rate": 2.1257864923421404e-05,
1653
+ "loss": 0.8629,
1654
  "step": 2350
1655
  },
1656
  {
1657
+ "epoch": 0.5955460223329758,
1658
+ "grad_norm": 0.602483332157135,
1659
+ "learning_rate": 2.1040187065024605e-05,
1660
+ "loss": 0.7786,
1661
+ "step": 2360
1662
+ },
1663
+ {
1664
+ "epoch": 0.5980695224276071,
1665
+ "grad_norm": 0.6503065824508667,
1666
+ "learning_rate": 2.0822816711878978e-05,
1667
+ "loss": 0.8445,
1668
+ "step": 2370
1669
+ },
1670
+ {
1671
+ "epoch": 0.6005930225222383,
1672
+ "grad_norm": 0.6901794672012329,
1673
+ "learning_rate": 2.0605770744207413e-05,
1674
+ "loss": 0.8259,
1675
+ "step": 2380
1676
+ },
1677
+ {
1678
+ "epoch": 0.6031165226168695,
1679
+ "grad_norm": 0.7173271179199219,
1680
+ "learning_rate": 2.0389066017042192e-05,
1681
+ "loss": 0.802,
1682
+ "step": 2390
1683
+ },
1684
+ {
1685
+ "epoch": 0.6056400227115009,
1686
+ "grad_norm": 0.7431663870811462,
1687
+ "learning_rate": 2.0172719358916042e-05,
1688
+ "loss": 0.8092,
1689
  "step": 2400
1690
  },
1691
  {
1692
+ "epoch": 0.6081635228061321,
1693
+ "grad_norm": 0.7227687239646912,
1694
+ "learning_rate": 1.9956747570555288e-05,
1695
+ "loss": 0.8563,
1696
+ "step": 2410
1697
+ },
1698
+ {
1699
+ "epoch": 0.6106870229007634,
1700
+ "grad_norm": 0.6300061345100403,
1701
+ "learning_rate": 1.9741167423575186e-05,
1702
+ "loss": 0.7849,
1703
+ "step": 2420
1704
+ },
1705
+ {
1706
+ "epoch": 0.6132105229953946,
1707
+ "grad_norm": 0.6208367347717285,
1708
+ "learning_rate": 1.9525995659177484e-05,
1709
+ "loss": 0.8239,
1710
+ "step": 2430
1711
+ },
1712
+ {
1713
+ "epoch": 0.6157340230900259,
1714
+ "grad_norm": 0.6272019147872925,
1715
+ "learning_rate": 1.9311248986850365e-05,
1716
+ "loss": 0.8102,
1717
+ "step": 2440
1718
+ },
1719
+ {
1720
+ "epoch": 0.6182575231846571,
1721
+ "grad_norm": 0.6594968438148499,
1722
+ "learning_rate": 1.9096944083070866e-05,
1723
+ "loss": 0.8266,
1724
  "step": 2450
1725
  },
1726
  {
1727
+ "epoch": 0.6207810232792884,
1728
+ "grad_norm": 0.673553466796875,
1729
+ "learning_rate": 1.8883097590009775e-05,
1730
+ "loss": 0.8375,
1731
+ "step": 2460
1732
+ },
1733
+ {
1734
+ "epoch": 0.6233045233739196,
1735
+ "grad_norm": 0.7199084162712097,
1736
+ "learning_rate": 1.866972611423936e-05,
1737
+ "loss": 0.8188,
1738
+ "step": 2470
1739
+ },
1740
+ {
1741
+ "epoch": 0.6258280234685509,
1742
+ "grad_norm": 0.697413444519043,
1743
+ "learning_rate": 1.8456846225443648e-05,
1744
+ "loss": 0.7709,
1745
+ "step": 2480
1746
+ },
1747
+ {
1748
+ "epoch": 0.6283515235631821,
1749
+ "grad_norm": 0.6711037158966064,
1750
+ "learning_rate": 1.8244474455131792e-05,
1751
+ "loss": 0.8156,
1752
+ "step": 2490
1753
+ },
1754
+ {
1755
+ "epoch": 0.6308750236578133,
1756
+ "grad_norm": 0.7030087113380432,
1757
+ "learning_rate": 1.8032627295354183e-05,
1758
+ "loss": 0.8125,
1759
  "step": 2500
1760
  },
1761
  {
1762
+ "epoch": 0.6333985237524447,
1763
+ "grad_norm": 0.7960418462753296,
1764
+ "learning_rate": 1.7821321197421837e-05,
1765
+ "loss": 0.8604,
1766
+ "step": 2510
1767
+ },
1768
+ {
1769
+ "epoch": 0.6359220238470759,
1770
+ "grad_norm": 0.6948102116584778,
1771
+ "learning_rate": 1.761057257062876e-05,
1772
+ "loss": 0.8301,
1773
+ "step": 2520
1774
+ },
1775
+ {
1776
+ "epoch": 0.6384455239417072,
1777
+ "grad_norm": 0.5919877290725708,
1778
+ "learning_rate": 1.740039778097772e-05,
1779
+ "loss": 0.7821,
1780
+ "step": 2530
1781
+ },
1782
+ {
1783
+ "epoch": 0.6409690240363384,
1784
+ "grad_norm": 0.6569110751152039,
1785
+ "learning_rate": 1.7190813149909274e-05,
1786
+ "loss": 0.8213,
1787
+ "step": 2540
1788
+ },
1789
+ {
1790
+ "epoch": 0.6434925241309697,
1791
+ "grad_norm": 0.677099347114563,
1792
+ "learning_rate": 1.6981834953034344e-05,
1793
+ "loss": 0.8278,
1794
  "step": 2550
1795
  },
1796
  {
1797
+ "epoch": 0.6460160242256009,
1798
+ "grad_norm": 0.7233052253723145,
1799
+ "learning_rate": 1.677347941887028e-05,
1800
+ "loss": 0.7919,
1801
+ "step": 2560
1802
+ },
1803
+ {
1804
+ "epoch": 0.6485395243202322,
1805
+ "grad_norm": 0.7088631987571716,
1806
+ "learning_rate": 1.656576272758061e-05,
1807
+ "loss": 0.8444,
1808
+ "step": 2570
1809
+ },
1810
+ {
1811
+ "epoch": 0.6510630244148634,
1812
+ "grad_norm": 0.6909515857696533,
1813
+ "learning_rate": 1.6358701009718577e-05,
1814
+ "loss": 0.8222,
1815
+ "step": 2580
1816
+ },
1817
+ {
1818
+ "epoch": 0.6535865245094946,
1819
+ "grad_norm": 0.5979318618774414,
1820
+ "learning_rate": 1.615231034497444e-05,
1821
+ "loss": 0.8376,
1822
+ "step": 2590
1823
+ },
1824
+ {
1825
+ "epoch": 0.6561100246041259,
1826
+ "grad_norm": 0.7273426055908203,
1827
+ "learning_rate": 1.5946606760926865e-05,
1828
+ "loss": 0.8037,
1829
  "step": 2600
1830
  },
1831
  {
1832
+ "epoch": 0.6586335246987571,
1833
+ "grad_norm": 0.719450056552887,
1834
+ "learning_rate": 1.574160623179816e-05,
1835
+ "loss": 0.8268,
1836
+ "step": 2610
1837
+ },
1838
+ {
1839
+ "epoch": 0.6611570247933884,
1840
+ "grad_norm": 0.7163055539131165,
1841
+ "learning_rate": 1.553732467721392e-05,
1842
+ "loss": 0.7853,
1843
+ "step": 2620
1844
+ },
1845
+ {
1846
+ "epoch": 0.6636805248880197,
1847
+ "grad_norm": 0.6172025799751282,
1848
+ "learning_rate": 1.5333777960966616e-05,
1849
+ "loss": 0.7926,
1850
+ "step": 2630
1851
+ },
1852
+ {
1853
+ "epoch": 0.666204024982651,
1854
+ "grad_norm": 0.6272744536399841,
1855
+ "learning_rate": 1.5130981889783795e-05,
1856
+ "loss": 0.7982,
1857
+ "step": 2640
1858
+ },
1859
+ {
1860
+ "epoch": 0.6687275250772822,
1861
+ "grad_norm": 0.680596649646759,
1862
+ "learning_rate": 1.4928952212100483e-05,
1863
+ "loss": 0.8312,
1864
  "step": 2650
1865
  },
1866
  {
1867
+ "epoch": 0.6712510251719135,
1868
+ "grad_norm": 0.6080834865570068,
1869
+ "learning_rate": 1.4727704616836296e-05,
1870
+ "loss": 0.8273,
1871
+ "step": 2660
1872
+ },
1873
+ {
1874
+ "epoch": 0.6737745252665447,
1875
+ "grad_norm": 0.6613759398460388,
1876
+ "learning_rate": 1.4527254732177043e-05,
1877
+ "loss": 0.8141,
1878
+ "step": 2670
1879
+ },
1880
+ {
1881
+ "epoch": 0.676298025361176,
1882
+ "grad_norm": 0.6180728077888489,
1883
+ "learning_rate": 1.4327618124361114e-05,
1884
+ "loss": 0.8231,
1885
+ "step": 2680
1886
+ },
1887
+ {
1888
+ "epoch": 0.6788215254558072,
1889
+ "grad_norm": 0.6406080722808838,
1890
+ "learning_rate": 1.412881029647065e-05,
1891
+ "loss": 0.7876,
1892
+ "step": 2690
1893
+ },
1894
+ {
1895
+ "epoch": 0.6813450255504384,
1896
+ "grad_norm": 0.6109746098518372,
1897
+ "learning_rate": 1.3930846687227664e-05,
1898
+ "loss": 0.7957,
1899
  "step": 2700
1900
  },
1901
  {
1902
+ "epoch": 0.6838685256450697,
1903
+ "grad_norm": 0.6827517747879028,
1904
+ "learning_rate": 1.3733742669795049e-05,
1905
+ "loss": 0.8428,
1906
+ "step": 2710
1907
+ },
1908
+ {
1909
+ "epoch": 0.6863920257397009,
1910
+ "grad_norm": 0.7277110815048218,
1911
+ "learning_rate": 1.3537513550582853e-05,
1912
+ "loss": 0.8326,
1913
+ "step": 2720
1914
+ },
1915
+ {
1916
+ "epoch": 0.6889155258343322,
1917
+ "grad_norm": 0.597568154335022,
1918
+ "learning_rate": 1.3342174568059527e-05,
1919
+ "loss": 0.7998,
1920
+ "step": 2730
1921
+ },
1922
+ {
1923
+ "epoch": 0.6914390259289634,
1924
+ "grad_norm": 0.6378962993621826,
1925
+ "learning_rate": 1.3147740891568661e-05,
1926
+ "loss": 0.785,
1927
+ "step": 2740
1928
+ },
1929
+ {
1930
+ "epoch": 0.6939625260235948,
1931
+ "grad_norm": 0.6579405069351196,
1932
+ "learning_rate": 1.2954227620150904e-05,
1933
+ "loss": 0.8332,
1934
  "step": 2750
1935
  },
1936
  {
1937
+ "epoch": 0.696486026118226,
1938
+ "grad_norm": 0.6977427005767822,
1939
+ "learning_rate": 1.2761649781371479e-05,
1940
+ "loss": 0.8095,
1941
+ "step": 2760
1942
+ },
1943
+ {
1944
+ "epoch": 0.6990095262128573,
1945
+ "grad_norm": 0.6410185098648071,
1946
+ "learning_rate": 1.257002233015318e-05,
1947
+ "loss": 0.8341,
1948
+ "step": 2770
1949
+ },
1950
+ {
1951
+ "epoch": 0.7015330263074885,
1952
+ "grad_norm": 0.6869609355926514,
1953
+ "learning_rate": 1.2379360147614994e-05,
1954
+ "loss": 0.8023,
1955
+ "step": 2780
1956
+ },
1957
+ {
1958
+ "epoch": 0.7040565264021197,
1959
+ "grad_norm": 0.6658973097801208,
1960
+ "learning_rate": 1.2189678039916532e-05,
1961
+ "loss": 0.7755,
1962
+ "step": 2790
1963
+ },
1964
+ {
1965
+ "epoch": 0.706580026496751,
1966
+ "grad_norm": 0.6188139915466309,
1967
+ "learning_rate": 1.2000990737108225e-05,
1968
+ "loss": 0.796,
1969
  "step": 2800
1970
  },
1971
  {
1972
+ "epoch": 0.7091035265913822,
1973
+ "grad_norm": 0.7432144284248352,
1974
+ "learning_rate": 1.1813312891987392e-05,
1975
+ "loss": 0.8381,
1976
+ "step": 2810
1977
+ },
1978
+ {
1979
+ "epoch": 0.7116270266860135,
1980
+ "grad_norm": 0.6776263117790222,
1981
+ "learning_rate": 1.1626659078960424e-05,
1982
+ "loss": 0.8087,
1983
+ "step": 2820
1984
+ },
1985
+ {
1986
+ "epoch": 0.7141505267806447,
1987
+ "grad_norm": 0.6468738913536072,
1988
+ "learning_rate": 1.1441043792910936e-05,
1989
+ "loss": 0.8032,
1990
+ "step": 2830
1991
+ },
1992
+ {
1993
+ "epoch": 0.716674026875276,
1994
+ "grad_norm": 0.7177358865737915,
1995
+ "learning_rate": 1.1256481448074179e-05,
1996
+ "loss": 0.8039,
1997
+ "step": 2840
1998
+ },
1999
+ {
2000
+ "epoch": 0.7191975269699072,
2001
+ "grad_norm": 0.6401441693305969,
2002
+ "learning_rate": 1.1072986376917638e-05,
2003
+ "loss": 0.8135,
2004
  "step": 2850
2005
  },
2006
  {
2007
+ "epoch": 0.7217210270645386,
2008
+ "grad_norm": 0.6511224508285522,
2009
+ "learning_rate": 1.0890572829028087e-05,
2010
+ "loss": 0.8496,
2011
+ "step": 2860
2012
+ },
2013
+ {
2014
+ "epoch": 0.7242445271591698,
2015
+ "grad_norm": 0.632625162601471,
2016
+ "learning_rate": 1.0709254970004937e-05,
2017
+ "loss": 0.7964,
2018
+ "step": 2870
2019
+ },
2020
+ {
2021
+ "epoch": 0.7267680272538011,
2022
+ "grad_norm": 0.5535660982131958,
2023
+ "learning_rate": 1.0529046880360263e-05,
2024
+ "loss": 0.7932,
2025
+ "step": 2880
2026
+ },
2027
+ {
2028
+ "epoch": 0.7292915273484323,
2029
+ "grad_norm": 0.5996463298797607,
2030
+ "learning_rate": 1.034996255442529e-05,
2031
+ "loss": 0.8437,
2032
+ "step": 2890
2033
+ },
2034
+ {
2035
+ "epoch": 0.7318150274430635,
2036
+ "grad_norm": 0.6257640719413757,
2037
+ "learning_rate": 1.0172015899263712e-05,
2038
+ "loss": 0.8069,
2039
  "step": 2900
2040
  },
2041
  {
2042
+ "epoch": 0.7343385275376948,
2043
+ "grad_norm": 0.6533858776092529,
2044
+ "learning_rate": 9.995220733591639e-06,
2045
+ "loss": 0.7921,
2046
+ "step": 2910
2047
+ },
2048
+ {
2049
+ "epoch": 0.736862027632326,
2050
+ "grad_norm": 0.6002010107040405,
2051
+ "learning_rate": 9.819590786704572e-06,
2052
+ "loss": 0.8307,
2053
+ "step": 2920
2054
+ },
2055
+ {
2056
+ "epoch": 0.7393855277269573,
2057
+ "grad_norm": 0.6418666243553162,
2058
+ "learning_rate": 9.645139697411149e-06,
2059
+ "loss": 0.8036,
2060
+ "step": 2930
2061
+ },
2062
+ {
2063
+ "epoch": 0.7419090278215885,
2064
+ "grad_norm": 0.6554102897644043,
2065
+ "learning_rate": 9.471881012974071e-06,
2066
+ "loss": 0.8285,
2067
+ "step": 2940
2068
+ },
2069
+ {
2070
+ "epoch": 0.7444325279162198,
2071
+ "grad_norm": 0.6879960894584656,
2072
+ "learning_rate": 9.299828188058013e-06,
2073
+ "loss": 0.8154,
2074
  "step": 2950
2075
  },
2076
  {
2077
+ "epoch": 0.746956028010851,
2078
+ "grad_norm": 0.6418633460998535,
2079
+ "learning_rate": 9.128994583684838e-06,
2080
+ "loss": 0.7945,
2081
+ "step": 2960
2082
+ },
2083
+ {
2084
+ "epoch": 0.7494795281054824,
2085
+ "grad_norm": 0.6467211246490479,
2086
+ "learning_rate": 8.959393466195972e-06,
2087
+ "loss": 0.8464,
2088
+ "step": 2970
2089
+ },
2090
+ {
2091
+ "epoch": 0.7520030282001136,
2092
+ "grad_norm": 0.6477042436599731,
2093
+ "learning_rate": 8.791038006222233e-06,
2094
+ "loss": 0.8235,
2095
+ "step": 2980
2096
+ },
2097
+ {
2098
+ "epoch": 0.7545265282947448,
2099
+ "grad_norm": 0.6426742672920227,
2100
+ "learning_rate": 8.623941277660994e-06,
2101
+ "loss": 0.8001,
2102
+ "step": 2990
2103
+ },
2104
+ {
2105
+ "epoch": 0.7570500283893761,
2106
+ "grad_norm": 0.7026243805885315,
2107
+ "learning_rate": 8.458116256660981e-06,
2108
+ "loss": 0.842,
2109
  "step": 3000
2110
  },
2111
  {
2112
+ "epoch": 0.7595735284840073,
2113
+ "grad_norm": 0.6429437398910522,
2114
+ "learning_rate": 8.293575820614508e-06,
2115
+ "loss": 0.8143,
2116
+ "step": 3010
2117
+ },
2118
+ {
2119
+ "epoch": 0.7620970285786386,
2120
+ "grad_norm": 0.654498815536499,
2121
+ "learning_rate": 8.130332747157542e-06,
2122
+ "loss": 0.7697,
2123
+ "step": 3020
2124
+ },
2125
+ {
2126
+ "epoch": 0.7646205286732698,
2127
+ "grad_norm": 0.8270076513290405,
2128
+ "learning_rate": 7.968399713177366e-06,
2129
+ "loss": 0.825,
2130
+ "step": 3030
2131
+ },
2132
+ {
2133
+ "epoch": 0.7671440287679011,
2134
+ "grad_norm": 0.6423079967498779,
2135
+ "learning_rate": 7.807789293828204e-06,
2136
+ "loss": 0.8366,
2137
+ "step": 3040
2138
+ },
2139
+ {
2140
+ "epoch": 0.7696675288625323,
2141
+ "grad_norm": 0.662451446056366,
2142
+ "learning_rate": 7.648513961554607e-06,
2143
+ "loss": 0.7695,
2144
  "step": 3050
2145
  },
2146
  {
2147
+ "epoch": 0.7721910289571636,
2148
+ "grad_norm": 0.5953843593597412,
2149
+ "learning_rate": 7.4905860851229605e-06,
2150
+ "loss": 0.8296,
2151
+ "step": 3060
2152
+ },
2153
+ {
2154
+ "epoch": 0.7747145290517948,
2155
+ "grad_norm": 0.7210749387741089,
2156
+ "learning_rate": 7.334017928660902e-06,
2157
+ "loss": 0.8201,
2158
+ "step": 3070
2159
+ },
2160
+ {
2161
+ "epoch": 0.7772380291464261,
2162
+ "grad_norm": 0.6214151382446289,
2163
+ "learning_rate": 7.1788216507049865e-06,
2164
+ "loss": 0.8034,
2165
+ "step": 3080
2166
+ },
2167
+ {
2168
+ "epoch": 0.7797615292410573,
2169
+ "grad_norm": 0.6791695356369019,
2170
+ "learning_rate": 7.0250093032564494e-06,
2171
+ "loss": 0.7624,
2172
+ "step": 3090
2173
+ },
2174
+ {
2175
+ "epoch": 0.7822850293356886,
2176
+ "grad_norm": 0.6388612985610962,
2177
+ "learning_rate": 6.872592830845339e-06,
2178
+ "loss": 0.8004,
2179
  "step": 3100
2180
  },
2181
  {
2182
+ "epoch": 0.7848085294303199,
2183
+ "grad_norm": 0.5958021283149719,
2184
+ "learning_rate": 6.72158406960289e-06,
2185
+ "loss": 0.8275,
2186
+ "step": 3110
2187
+ },
2188
+ {
2189
+ "epoch": 0.7873320295249511,
2190
+ "grad_norm": 0.572040855884552,
2191
+ "learning_rate": 6.571994746342439e-06,
2192
+ "loss": 0.8078,
2193
+ "step": 3120
2194
+ },
2195
+ {
2196
+ "epoch": 0.7898555296195824,
2197
+ "grad_norm": 0.6328415274620056,
2198
+ "learning_rate": 6.4238364776486785e-06,
2199
+ "loss": 0.7883,
2200
+ "step": 3130
2201
+ },
2202
+ {
2203
+ "epoch": 0.7923790297142136,
2204
+ "grad_norm": 0.6552072763442993,
2205
+ "learning_rate": 6.277120768975644e-06,
2206
+ "loss": 0.8398,
2207
+ "step": 3140
2208
+ },
2209
+ {
2210
+ "epoch": 0.7949025298088449,
2211
+ "grad_norm": 0.7182049751281738,
2212
+ "learning_rate": 6.131859013753155e-06,
2213
+ "loss": 0.7919,
2214
  "step": 3150
2215
  },
2216
  {
2217
+ "epoch": 0.7974260299034761,
2218
+ "grad_norm": 0.7126038074493408,
2219
+ "learning_rate": 5.988062492502117e-06,
2220
+ "loss": 0.7782,
2221
+ "step": 3160
2222
+ },
2223
+ {
2224
+ "epoch": 0.7999495299981074,
2225
+ "grad_norm": 0.6005820631980896,
2226
+ "learning_rate": 5.8457423719584435e-06,
2227
+ "loss": 0.7979,
2228
+ "step": 3170
2229
+ },
2230
+ {
2231
+ "epoch": 0.8024730300927386,
2232
+ "grad_norm": 0.6624283790588379,
2233
+ "learning_rate": 5.704909704205949e-06,
2234
+ "loss": 0.8297,
2235
+ "step": 3180
2236
+ },
2237
+ {
2238
+ "epoch": 0.8049965301873698,
2239
+ "grad_norm": 0.6289507150650024,
2240
+ "learning_rate": 5.565575425818054e-06,
2241
+ "loss": 0.8147,
2242
+ "step": 3190
2243
+ },
2244
+ {
2245
+ "epoch": 0.8075200302820011,
2246
+ "grad_norm": 0.6975149512290955,
2247
+ "learning_rate": 5.427750357008468e-06,
2248
+ "loss": 0.7733,
2249
  "step": 3200
2250
  },
2251
  {
2252
+ "epoch": 0.8100435303766323,
2253
+ "grad_norm": 0.6802620887756348,
2254
+ "learning_rate": 5.291445200790982e-06,
2255
+ "loss": 0.8226,
2256
+ "step": 3210
2257
+ },
2258
+ {
2259
+ "epoch": 0.8125670304712637,
2260
+ "grad_norm": 0.6158818602561951,
2261
+ "learning_rate": 5.156670542148267e-06,
2262
+ "loss": 0.8282,
2263
+ "step": 3220
2264
+ },
2265
+ {
2266
+ "epoch": 0.8150905305658949,
2267
+ "grad_norm": 0.7228125333786011,
2268
+ "learning_rate": 5.023436847209887e-06,
2269
+ "loss": 0.816,
2270
+ "step": 3230
2271
+ },
2272
+ {
2273
+ "epoch": 0.8176140306605262,
2274
+ "grad_norm": 0.6515725255012512,
2275
+ "learning_rate": 4.891754462439557e-06,
2276
+ "loss": 0.775,
2277
+ "step": 3240
2278
+ },
2279
+ {
2280
+ "epoch": 0.8201375307551574,
2281
+ "grad_norm": 0.6829689741134644,
2282
+ "learning_rate": 4.761633613831645e-06,
2283
+ "loss": 0.8156,
2284
  "step": 3250
2285
  },
2286
  {
2287
+ "epoch": 0.8226610308497887,
2288
+ "grad_norm": 0.7261675596237183,
2289
+ "learning_rate": 4.6330844061170914e-06,
2290
+ "loss": 0.7862,
2291
+ "step": 3260
2292
+ },
2293
+ {
2294
+ "epoch": 0.8251845309444199,
2295
+ "grad_norm": 0.6911167502403259,
2296
+ "learning_rate": 4.506116821978662e-06,
2297
+ "loss": 0.8016,
2298
+ "step": 3270
2299
+ },
2300
+ {
2301
+ "epoch": 0.8277080310390512,
2302
+ "grad_norm": 0.5780116319656372,
2303
+ "learning_rate": 4.380740721275786e-06,
2304
+ "loss": 0.824,
2305
+ "step": 3280
2306
+ },
2307
+ {
2308
+ "epoch": 0.8302315311336824,
2309
+ "grad_norm": 0.6704926490783691,
2310
+ "learning_rate": 4.25696584027882e-06,
2311
+ "loss": 0.8037,
2312
+ "step": 3290
2313
+ },
2314
+ {
2315
+ "epoch": 0.8327550312283136,
2316
+ "grad_norm": 0.7162071466445923,
2317
+ "learning_rate": 4.134801790913006e-06,
2318
+ "loss": 0.7651,
2319
  "step": 3300
2320
  },
2321
  {
2322
+ "epoch": 0.8352785313229449,
2323
+ "grad_norm": 0.7350740432739258,
2324
+ "learning_rate": 4.014258060012005e-06,
2325
+ "loss": 0.8278,
2326
+ "step": 3310
2327
+ },
2328
+ {
2329
+ "epoch": 0.8378020314175761,
2330
+ "grad_norm": 0.6031658053398132,
2331
+ "learning_rate": 3.895344008581222e-06,
2332
+ "loss": 0.7945,
2333
+ "step": 3320
2334
+ },
2335
+ {
2336
+ "epoch": 0.8403255315122075,
2337
+ "grad_norm": 0.6996452212333679,
2338
+ "learning_rate": 3.7780688710708223e-06,
2339
+ "loss": 0.7821,
2340
+ "step": 3330
2341
+ },
2342
+ {
2343
+ "epoch": 0.8428490316068387,
2344
+ "grad_norm": 0.6655017733573914,
2345
+ "learning_rate": 3.6624417546586574e-06,
2346
+ "loss": 0.7526,
2347
+ "step": 3340
2348
+ },
2349
+ {
2350
+ "epoch": 0.84537253170147,
2351
+ "grad_norm": 0.7387165427207947,
2352
+ "learning_rate": 3.548471638542991e-06,
2353
+ "loss": 0.8259,
2354
  "step": 3350
2355
  },
2356
  {
2357
+ "epoch": 0.8478960317961012,
2358
+ "grad_norm": 0.6410266757011414,
2359
+ "learning_rate": 3.436167373245247e-06,
2360
+ "loss": 0.8054,
2361
+ "step": 3360
2362
+ },
2363
+ {
2364
+ "epoch": 0.8504195318907325,
2365
+ "grad_norm": 0.6522373557090759,
2366
+ "learning_rate": 3.325537679922672e-06,
2367
+ "loss": 0.8168,
2368
+ "step": 3370
2369
+ },
2370
+ {
2371
+ "epoch": 0.8529430319853637,
2372
+ "grad_norm": 0.7458412647247314,
2373
+ "learning_rate": 3.2165911496911173e-06,
2374
+ "loss": 0.7892,
2375
+ "step": 3380
2376
+ },
2377
+ {
2378
+ "epoch": 0.8554665320799949,
2379
+ "grad_norm": 0.6441506743431091,
2380
+ "learning_rate": 3.1093362429578414e-06,
2381
+ "loss": 0.8105,
2382
+ "step": 3390
2383
+ },
2384
+ {
2385
+ "epoch": 0.8579900321746262,
2386
+ "grad_norm": 0.5970674753189087,
2387
+ "learning_rate": 3.0037812887645483e-06,
2388
+ "loss": 0.8326,
2389
  "step": 3400
2390
  },
2391
  {
2392
+ "epoch": 0.8605135322692574,
2393
+ "grad_norm": 0.6173757314682007,
2394
+ "learning_rate": 2.8999344841405373e-06,
2395
+ "loss": 0.7956,
2396
+ "step": 3410
2397
  },
2398
  {
2399
+ "epoch": 0.8630370323638887,
2400
+ "grad_norm": 0.6268020868301392,
2401
+ "learning_rate": 2.7978038934662024e-06,
2402
+ "loss": 0.7859,
2403
+ "step": 3420
2404
  },
2405
  {
2406
+ "epoch": 0.8655605324585199,
2407
+ "grad_norm": 0.6534834504127502,
2408
+ "learning_rate": 2.697397447846725e-06,
2409
+ "loss": 0.8041,
2410
+ "step": 3430
2411
  },
2412
  {
2413
+ "epoch": 0.8680840325531513,
2414
+ "grad_norm": 0.6108519434928894,
2415
+ "learning_rate": 2.5987229444962237e-06,
2416
+ "loss": 0.823,
2417
+ "step": 3440
2418
  },
2419
  {
2420
+ "epoch": 0.8706075326477825,
2421
+ "grad_norm": 0.6347935795783997,
2422
+ "learning_rate": 2.501788046132203e-06,
2423
+ "loss": 0.831,
2424
+ "step": 3450
2425
  },
2426
  {
2427
+ "epoch": 0.8731310327424138,
2428
+ "grad_norm": 0.6183903813362122,
2429
+ "learning_rate": 2.4066002803805386e-06,
2430
+ "loss": 0.7974,
2431
+ "step": 3460
2432
  },
2433
  {
2434
+ "epoch": 0.875654532837045,
2435
+ "grad_norm": 0.6723082065582275,
2436
+ "learning_rate": 2.313167039190861e-06,
2437
+ "loss": 0.8058,
2438
+ "step": 3470
2439
  },
2440
  {
2441
+ "epoch": 0.8781780329316763,
2442
+ "grad_norm": 0.6427431702613831,
2443
+ "learning_rate": 2.2214955782625752e-06,
2444
+ "loss": 0.805,
2445
+ "step": 3480
2446
  },
2447
  {
2448
+ "epoch": 0.8807015330263075,
2449
+ "grad_norm": 0.7344009280204773,
2450
+ "learning_rate": 2.1315930164813507e-06,
2451
+ "loss": 0.8366,
2452
+ "step": 3490
2453
  },
2454
  {
2455
+ "epoch": 0.8832250331209387,
2456
+ "grad_norm": 0.6524431109428406,
2457
+ "learning_rate": 2.0434663353663536e-06,
2458
+ "loss": 0.8022,
2459
+ "step": 3500
2460
  },
2461
  {
2462
+ "epoch": 0.88574853321557,
2463
+ "grad_norm": 0.6769471168518066,
2464
+ "learning_rate": 1.9571223785280314e-06,
2465
+ "loss": 0.8062,
2466
+ "step": 3510
2467
  },
2468
  {
2469
+ "epoch": 0.8882720333102012,
2470
+ "grad_norm": 0.6867194771766663,
2471
+ "learning_rate": 1.8725678511367001e-06,
2472
+ "loss": 0.8171,
2473
+ "step": 3520
2474
  },
2475
  {
2476
+ "epoch": 0.8907955334048325,
2477
+ "grad_norm": 0.6660215854644775,
2478
+ "learning_rate": 1.789809319401825e-06,
2479
+ "loss": 0.8169,
2480
+ "step": 3530
2481
  },
2482
  {
2483
+ "epoch": 0.8933190334994637,
2484
+ "grad_norm": 0.6402613520622253,
2485
+ "learning_rate": 1.7088532100621224e-06,
2486
+ "loss": 0.7813,
2487
+ "step": 3540
2488
  },
2489
  {
2490
+ "epoch": 0.895842533594095,
2491
+ "grad_norm": 0.6413708925247192,
2492
+ "learning_rate": 1.629705809886467e-06,
2493
+ "loss": 0.7837,
2494
+ "step": 3550
2495
  },
2496
  {
2497
+ "epoch": 0.8983660336887263,
2498
+ "grad_norm": 0.6048439741134644,
2499
+ "learning_rate": 1.5523732651857082e-06,
2500
+ "loss": 0.7984,
2501
+ "step": 3560
2502
  },
2503
  {
2504
+ "epoch": 0.9008895337833576,
2505
+ "grad_norm": 0.6774916052818298,
2506
+ "learning_rate": 1.4768615813353398e-06,
2507
+ "loss": 0.8033,
2508
+ "step": 3570
2509
  },
2510
  {
2511
+ "epoch": 0.9034130338779888,
2512
+ "grad_norm": 0.6154995560646057,
2513
+ "learning_rate": 1.4031766223091603e-06,
2514
+ "loss": 0.8015,
2515
+ "step": 3580
2516
  },
2517
  {
2518
+ "epoch": 0.90593653397262,
2519
+ "grad_norm": 0.6018934845924377,
2520
+ "learning_rate": 1.3313241102239054e-06,
2521
+ "loss": 0.7761,
2522
+ "step": 3590
2523
  },
2524
  {
2525
+ "epoch": 0.9084600340672513,
2526
+ "grad_norm": 0.658366322517395,
2527
+ "learning_rate": 1.261309624894863e-06,
2528
+ "loss": 0.8173,
2529
+ "step": 3600
2530
  },
2531
  {
2532
+ "epoch": 0.9109835341618825,
2533
+ "grad_norm": 0.6167306900024414,
2534
+ "learning_rate": 1.1931386034025882e-06,
2535
+ "loss": 0.8024,
2536
+ "step": 3610
2537
  },
2538
  {
2539
+ "epoch": 0.9135070342565138,
2540
+ "grad_norm": 0.5509990453720093,
2541
+ "learning_rate": 1.1268163396706583e-06,
2542
+ "loss": 0.8128,
2543
+ "step": 3620
2544
  },
2545
  {
2546
+ "epoch": 0.916030534351145,
2547
+ "grad_norm": 0.6154832243919373,
2548
+ "learning_rate": 1.0623479840545874e-06,
2549
+ "loss": 0.7569,
2550
+ "step": 3630
2551
  },
2552
  {
2553
+ "epoch": 0.9185540344457763,
2554
+ "grad_norm": 0.679389238357544,
2555
+ "learning_rate": 9.997385429418555e-07,
2556
+ "loss": 0.8276,
2557
+ "step": 3640
2558
  },
2559
  {
2560
+ "epoch": 0.9210775345404075,
2561
+ "grad_norm": 0.662276566028595,
2562
+ "learning_rate": 9.389928783631207e-07,
2563
+ "loss": 0.8304,
2564
+ "step": 3650
2565
  },
2566
  {
2567
+ "epoch": 0.9236010346350388,
2568
+ "grad_norm": 0.6233845949172974,
2569
+ "learning_rate": 8.801157076146705e-07,
2570
+ "loss": 0.7851,
2571
+ "step": 3660
2572
  },
2573
  {
2574
+ "epoch": 0.92612453472967,
2575
+ "grad_norm": 0.7036879658699036,
2576
+ "learning_rate": 8.231116028920765e-07,
2577
+ "loss": 0.793,
2578
+ "step": 3670
2579
  },
2580
  {
2581
+ "epoch": 0.9286480348243014,
2582
+ "grad_norm": 0.6103026270866394,
2583
+ "learning_rate": 7.679849909351472e-07,
2584
+ "loss": 0.7818,
2585
+ "step": 3680
2586
  },
2587
  {
2588
+ "epoch": 0.9311715349189326,
2589
+ "grad_norm": 0.6900059580802917,
2590
+ "learning_rate": 7.147401526841485e-07,
2591
+ "loss": 0.773,
2592
+ "step": 3690
2593
  },
2594
  {
2595
+ "epoch": 0.9336950350135638,
2596
+ "grad_norm": 0.681058943271637,
2597
+ "learning_rate": 6.633812229473791e-07,
2598
+ "loss": 0.8357,
2599
+ "step": 3700
2600
  },
2601
  {
2602
+ "epoch": 0.9362185351081951,
2603
+ "grad_norm": 0.7187952995300293,
2604
+ "learning_rate": 6.139121900800515e-07,
2605
+ "loss": 0.7779,
2606
+ "step": 3710
2607
  },
2608
  {
2609
+ "epoch": 0.9387420352028263,
2610
+ "grad_norm": 0.6179840564727783,
2611
+ "learning_rate": 5.663368956745963e-07,
2612
+ "loss": 0.7871,
2613
+ "step": 3720
2614
  },
2615
  {
2616
+ "epoch": 0.9412655352974576,
2617
+ "grad_norm": 0.6663089394569397,
2618
+ "learning_rate": 5.206590342623164e-07,
2619
+ "loss": 0.7901,
2620
+ "step": 3730
2621
  },
2622
  {
2623
+ "epoch": 0.9437890353920888,
2624
+ "grad_norm": 0.6079100370407104,
2625
+ "learning_rate": 4.768821530264977e-07,
2626
+ "loss": 0.8226,
2627
+ "step": 3740
2628
  },
2629
  {
2630
+ "epoch": 0.9463125354867201,
2631
+ "grad_norm": 0.68614262342453,
2632
+ "learning_rate": 4.350096515269325e-07,
2633
+ "loss": 0.8185,
2634
+ "step": 3750
2635
  },
2636
  {
2637
+ "epoch": 0.9488360355813513,
2638
+ "grad_norm": 0.6491347551345825,
2639
+ "learning_rate": 3.950447814359409e-07,
2640
+ "loss": 0.817,
2641
+ "step": 3760
2642
  },
2643
  {
2644
+ "epoch": 0.9513595356759826,
2645
+ "grad_norm": 0.6513685584068298,
2646
+ "learning_rate": 3.5699064628583745e-07,
2647
+ "loss": 0.7997,
2648
+ "step": 3770
2649
  },
2650
  {
2651
+ "epoch": 0.9538830357706138,
2652
+ "grad_norm": 0.6080814003944397,
2653
+ "learning_rate": 3.2085020122793186e-07,
2654
+ "loss": 0.7956,
2655
+ "step": 3780
2656
  },
2657
  {
2658
+ "epoch": 0.956406535865245,
2659
+ "grad_norm": 0.6476254463195801,
2660
+ "learning_rate": 2.8662625280304613e-07,
2661
+ "loss": 0.7888,
2662
+ "step": 3790
2663
  },
2664
  {
2665
+ "epoch": 0.9589300359598764,
2666
+ "grad_norm": 0.6439909934997559,
2667
+ "learning_rate": 2.5432145872355816e-07,
2668
+ "loss": 0.7847,
2669
+ "step": 3800
2670
  },
2671
  {
2672
+ "epoch": 0.9614535360545076,
2673
+ "grad_norm": 0.6744981408119202,
2674
+ "learning_rate": 2.2393832766701706e-07,
2675
+ "loss": 0.8093,
2676
+ "step": 3810
2677
  },
2678
  {
2679
+ "epoch": 0.9639770361491389,
2680
+ "grad_norm": 0.5795860886573792,
2681
+ "learning_rate": 1.9547921908133483e-07,
2682
+ "loss": 0.8082,
2683
+ "step": 3820
2684
  },
2685
  {
2686
+ "epoch": 0.9665005362437701,
2687
+ "grad_norm": 0.6693094968795776,
2688
+ "learning_rate": 1.689463430015442e-07,
2689
+ "loss": 0.7857,
2690
+ "step": 3830
2691
  },
2692
  {
2693
+ "epoch": 0.9690240363384014,
2694
+ "grad_norm": 0.645203173160553,
2695
+ "learning_rate": 1.443417598781971e-07,
2696
+ "loss": 0.8056,
2697
+ "step": 3840
2698
  },
2699
  {
2700
+ "epoch": 0.9715475364330326,
2701
+ "grad_norm": 0.6820341348648071,
2702
+ "learning_rate": 1.2166738041733684e-07,
2703
+ "loss": 0.802,
2704
+ "step": 3850
2705
  },
2706
  {
2707
+ "epoch": 0.9740710365276639,
2708
+ "grad_norm": 0.6292694807052612,
2709
+ "learning_rate": 1.0092496543212814e-07,
2710
+ "loss": 0.7937,
2711
+ "step": 3860
2712
  },
2713
  {
2714
+ "epoch": 0.9765945366222951,
2715
+ "grad_norm": 0.6253132224082947,
2716
+ "learning_rate": 8.211612570611926e-08,
2717
+ "loss": 0.7846,
2718
+ "step": 3870
2719
  },
2720
  {
2721
+ "epoch": 0.9791180367169264,
2722
+ "grad_norm": 0.6571831107139587,
2723
+ "learning_rate": 6.524232186815305e-08,
2724
+ "loss": 0.785,
2725
+ "step": 3880
2726
  },
2727
  {
2728
+ "epoch": 0.9816415368115576,
2729
+ "grad_norm": 0.6356094479560852,
2730
+ "learning_rate": 5.03048642789411e-08,
2731
+ "loss": 0.7789,
2732
+ "step": 3890
2733
  },
2734
  {
2735
+ "epoch": 0.9841650369061888,
2736
+ "grad_norm": 0.8404703140258789,
2737
+ "learning_rate": 3.730491292930072e-08,
2738
+ "loss": 0.7954,
2739
+ "step": 3900
2740
  },
2741
  {
2742
+ "epoch": 0.9866885370008202,
2743
+ "grad_norm": 0.7891058325767517,
2744
+ "learning_rate": 2.624347735007693e-08,
2745
+ "loss": 0.8129,
2746
+ "step": 3910
2747
  },
2748
  {
2749
+ "epoch": 0.9892120370954514,
2750
+ "grad_norm": 0.6858798265457153,
2751
+ "learning_rate": 1.7121416533749658e-08,
2752
+ "loss": 0.8076,
2753
+ "step": 3920
2754
  },
2755
  {
2756
+ "epoch": 0.9917355371900827,
2757
+ "grad_norm": 0.6489024758338928,
2758
+ "learning_rate": 9.939438867723194e-09,
2759
+ "loss": 0.8087,
2760
+ "step": 3930
2761
  },
2762
  {
2763
+ "epoch": 0.9942590372847139,
2764
+ "grad_norm": 0.6204003691673279,
2765
+ "learning_rate": 4.6981020793118725e-09,
2766
+ "loss": 0.8162,
2767
+ "step": 3940
2768
  },
2769
  {
2770
+ "epoch": 0.9967825373793452,
2771
+ "grad_norm": 0.6356140971183777,
2772
+ "learning_rate": 1.3978131924385906e-09,
2773
+ "loss": 0.7862,
2774
+ "step": 3950
2775
  },
2776
  {
2777
+ "epoch": 0.9993060374739764,
2778
+ "grad_norm": 0.6472454071044922,
2779
+ "learning_rate": 3.88284960184393e-11,
2780
+ "loss": 0.8188,
2781
+ "step": 3960
2782
  }
2783
  ],
2784
+ "logging_steps": 10,
2785
+ "max_steps": 3962,
2786
  "num_input_tokens_seen": 0,
2787
+ "num_train_epochs": 1,
2788
+ "save_steps": 1000,
2789
  "stateful_callbacks": {
2790
  "TrainerControl": {
2791
  "args": {
 
2798
  "attributes": {}
2799
  }
2800
  },
2801
+ "total_flos": 2.0380844918675866e+18,
2802
+ "train_batch_size": 2,
2803
  "trial_name": null,
2804
  "trial_params": null
2805
  }
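The updated trainer_state.json above logs loss, grad_norm and learning_rate every 10 optimizer steps (logging_steps 10) across a single epoch of 3,962 steps, saving a checkpoint every 1,000 steps. A minimal sketch for reading that log back out, assuming the file has been downloaded locally; the path and the printed summary are illustrative only:

import json

# Assumed local copy of the trainer_state.json updated in this commit.
with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry carries "epoch", "step", "loss", "learning_rate" and "grad_norm".
logs = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
print(f"{len(logs)} logged points, final loss {losses[-1]:.4f} at step {steps[-1]}")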
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cb09fa3cec0d925b5877a57afba4d17f256716f468a4f84dfa477dd700225e0
3
- size 6968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ba6c8e40dc2d34ebe219f16f05b9e60b23b1741dcfccb9613485fc1e913f881
3
+ size 7032
zero_to_fp32.py CHANGED
@@ -10,10 +10,7 @@
10
  # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
  # application.
12
  #
13
- # example:
14
- # python zero_to_fp32.py . output_dir/
15
- # or
16
- # python zero_to_fp32.py . output_dir/ --safe_serialization
17
 
18
  import argparse
19
  import torch
@@ -21,8 +18,6 @@ import glob
21
  import math
22
  import os
23
  import re
24
- import json
25
- from tqdm import tqdm
26
  from collections import OrderedDict
27
  from dataclasses import dataclass
28
 
@@ -144,6 +139,7 @@ def parse_model_states(files):
144
 
145
 
146
  def parse_optim_states(files, ds_checkpoint_dir):
 
147
  total_files = len(files)
148
  state_dicts = []
149
  for f in files:
@@ -424,10 +420,12 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero
424
  offset = 0
425
  total_numel = 0
426
  total_params = 0
427
- for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'):
 
428
  unpartitioned_numel = shape.numel()
429
  total_numel += unpartitioned_numel
430
  total_params += 1
 
431
  partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
432
 
433
  if debug:
@@ -523,75 +521,21 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_f
523
  return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
524
 
525
 
526
- def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
527
- output_dir,
528
- max_shard_size="5GB",
529
- safe_serialization=False,
530
- tag=None,
531
- exclude_frozen_parameters=False):
532
  """
533
  Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
534
  loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
535
 
536
  Args:
537
  - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
538
- - ``output_dir``: directory to the pytorch fp32 state_dict output files
539
- - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
540
- - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
541
  - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
542
  - ``exclude_frozen_parameters``: exclude frozen parameters
543
  """
544
- # Dependency pre-check
545
- if safe_serialization:
546
- try:
547
- from safetensors.torch import save_file
548
- except ImportError:
549
- print('If you want to use `safe_serialization`, please `pip install safetensors`')
550
- raise
551
- if max_shard_size is not None:
552
- try:
553
- from huggingface_hub import split_torch_state_dict_into_shards
554
- except ImportError:
555
- print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
556
- raise
557
-
558
- # Convert zero checkpoint to state_dict
559
- state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
560
 
561
- # Shard the model if it is too big.
562
- weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
563
- if max_shard_size is not None:
564
- filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
565
- state_dict_split = split_torch_state_dict_into_shards(state_dict,
566
- filename_pattern=filename_pattern,
567
- max_shard_size=max_shard_size)
568
- else:
569
- from collections import namedtuple
570
- StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
571
- state_dict_split = StateDictSplit(is_sharded=False,
572
- filename_to_tensors={weights_name: list(state_dict.keys())})
573
-
574
- # Save the model
575
- filename_to_tensors = state_dict_split.filename_to_tensors.items()
576
- for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
577
- shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors}
578
- output_path = os.path.join(output_dir, shard_file)
579
- if safe_serialization:
580
- save_file(shard, output_path, metadata={"format": "pt"})
581
- else:
582
- torch.save(shard, output_path)
583
-
584
- # Save index if sharded
585
- if state_dict_split.is_sharded:
586
- index = {
587
- "metadata": state_dict_split.metadata,
588
- "weight_map": state_dict_split.tensor_to_filename,
589
- }
590
- save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
591
- save_index_file = os.path.join(output_dir, save_index_file)
592
- with open(save_index_file, "w", encoding="utf-8") as f:
593
- content = json.dumps(index, indent=2, sort_keys=True) + "\n"
594
- f.write(content)
595
 
596
 
597
  def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
@@ -634,27 +578,15 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
634
 
635
 
636
  if __name__ == "__main__":
 
637
  parser = argparse.ArgumentParser()
638
  parser.add_argument("checkpoint_dir",
639
  type=str,
640
  help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
641
- parser.add_argument("output_dir",
642
- type=str,
643
- help="directory to the pytorch fp32 state_dict output files"
644
- "(e.g. path/checkpoint-12-output/)")
645
  parser.add_argument(
646
- "--max_shard_size",
647
  type=str,
648
- default="5GB",
649
- help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
650
- "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
651
- "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
652
- "without CPU OOM issues.")
653
- parser.add_argument(
654
- "--safe_serialization",
655
- default=False,
656
- action='store_true',
657
- help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
658
  parser.add_argument("-t",
659
  "--tag",
660
  type=str,
@@ -667,8 +599,6 @@ if __name__ == "__main__":
667
  debug = args.debug
668
 
669
  convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
670
- args.output_dir,
671
- max_shard_size=args.max_shard_size,
672
- safe_serialization=args.safe_serialization,
673
  tag=args.tag,
674
  exclude_frozen_parameters=args.exclude_frozen_parameters)
 
10
  # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
  # application.
12
  #
13
+ # example: python zero_to_fp32.py . pytorch_model.bin
14
 
15
  import argparse
16
  import torch
 
18
  import math
19
  import os
20
  import re
21
  from collections import OrderedDict
22
  from dataclasses import dataclass
23
 
 
139
 
140
 
141
  def parse_optim_states(files, ds_checkpoint_dir):
142
+
143
  total_files = len(files)
144
  state_dicts = []
145
  for f in files:
 
420
  offset = 0
421
  total_numel = 0
422
  total_params = 0
423
+ for name, shape in param_shapes.items():
424
+
425
  unpartitioned_numel = shape.numel()
426
  total_numel += unpartitioned_numel
427
  total_params += 1
428
+
429
  partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
430
 
431
  if debug:
 
521
  return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
522
 
523
 
524
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
525
  """
526
  Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
527
  loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
528
 
529
  Args:
530
  - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
531
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
532
  - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
533
  - ``exclude_frozen_parameters``: exclude frozen parameters
534
  """
535
 
536
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
537
+ print(f"Saving fp32 state dict to {output_file}")
538
+ torch.save(state_dict, output_file)
539
 
540
 
541
  def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
 
578
 
579
 
580
  if __name__ == "__main__":
581
+
582
  parser = argparse.ArgumentParser()
583
  parser.add_argument("checkpoint_dir",
584
  type=str,
585
  help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
586
  parser.add_argument(
587
+ "output_file",
588
  type=str,
589
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
  parser.add_argument("-t",
591
  "--tag",
592
  type=str,
 
599
  debug = args.debug
600
 
601
  convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
602
+ args.output_file,
603
  tag=args.tag,
604
  exclude_frozen_parameters=args.exclude_frozen_parameters)
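The updated zero_to_fp32.py takes an output file rather than an output directory, writes one unsharded fp32 state_dict via torch.save, and drops the safetensors, sharding and tqdm dependencies. A hedged sketch of the round trip under that interface; the "." config path and the pytorch_model.bin name are placeholders for a local checkout, not files added by this commit:

import torch
from transformers import AutoConfig, AutoModelForCausalLM

# First consolidate the ZeRO shards from inside the checkpoint directory,
# as the restored example comment in the script shows:
#   python zero_to_fp32.py . pytorch_model.bin
#
# The consolidated fp32 state_dict then loads without DeepSpeed.
config = AutoConfig.from_pretrained(".")  # placeholder: local checkout containing config.json
model = AutoModelForCausalLM.from_config(config)
state_dict = torch.load("pytorch_model.bin", map_location="cpu")
model.load_state_dict(state_dict)

The script's load_state_dict_from_zero_checkpoint(model, checkpoint_dir) helper, untouched by this change, performs the same consolidation in-process.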