Dnq2025 commited on
Commit
65ed337
·
verified ·
1 Parent(s): 13cb0b6

End of training

Browse files
README.md CHANGED
@@ -3,6 +3,8 @@ library_name: transformers
3
  license: other
4
  base_model: facebook/mask2former-swin-base-IN21k-ade-semantic
5
  tags:
 
 
6
  - generated_from_trainer
7
  model-index:
8
  - name: mask2former-finetuned-ER-Mito-LD3
@@ -14,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  # mask2former-finetuned-ER-Mito-LD3
16
 
17
- This model is a fine-tuned version of [facebook/mask2former-swin-base-IN21k-ade-semantic](https://huggingface.co/facebook/mask2former-swin-base-IN21k-ade-semantic) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 39.8755
20
  - Dummy: 1.0
21
 
22
  ## Model description
 
3
  license: other
4
  base_model: facebook/mask2former-swin-base-IN21k-ade-semantic
5
  tags:
6
+ - image-segmentation
7
+ - vision
8
  - generated_from_trainer
9
  model-index:
10
  - name: mask2former-finetuned-ER-Mito-LD3
 
16
 
17
  # mask2former-finetuned-ER-Mito-LD3
18
 
19
+ This model is a fine-tuned version of [facebook/mask2former-swin-base-IN21k-ade-semantic](https://huggingface.co/facebook/mask2former-swin-base-IN21k-ade-semantic) on the Dnq2025/Mask2former_Pretrain dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 39.9236
22
  - Dummy: 1.0
23
 
24
  ## Model description
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 124.03846153846153,
3
  "eval_dummy": 1.0,
4
- "eval_loss": 33.54045104980469,
5
- "eval_runtime": 18.5302,
6
- "eval_samples_per_second": 2.59,
7
- "eval_steps_per_second": 0.54,
8
- "total_flos": 1.8148731642810335e+19,
9
- "train_loss": 14.019830729166667,
10
- "train_runtime": 31415.0856,
11
- "train_samples_per_second": 2.053,
12
- "train_steps_per_second": 0.411
13
  }
 
1
  {
2
+ "epoch": 100.0,
3
  "eval_dummy": 1.0,
4
+ "eval_loss": 39.92361831665039,
5
+ "eval_runtime": 17.0469,
6
+ "eval_samples_per_second": 2.816,
7
+ "eval_steps_per_second": 0.704,
8
+ "total_flos": 1.4631500418239693e+19,
9
+ "train_loss": 16.309644344832545,
10
+ "train_runtime": 27199.2721,
11
+ "train_samples_per_second": 1.897,
12
+ "train_steps_per_second": 0.474
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 124.03846153846153,
3
  "eval_dummy": 1.0,
4
- "eval_loss": 33.54045104980469,
5
- "eval_runtime": 18.5302,
6
- "eval_samples_per_second": 2.59,
7
- "eval_steps_per_second": 0.54
8
  }
 
1
  {
2
+ "epoch": 100.0,
3
  "eval_dummy": 1.0,
4
+ "eval_loss": 39.92361831665039,
5
+ "eval_runtime": 17.0469,
6
+ "eval_samples_per_second": 2.816,
7
+ "eval_steps_per_second": 0.704
8
  }
runs/Mar27_08-43-10_cdr2539.int.cedar.computecanada.ca/events.out.tfevents.1743117495.cdr2539.int.cedar.computecanada.ca.2316099.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca0b1280ead51acad1dc481185973289077fe063d3e3d99f1c5fa45ab41ed821
3
+ size 408
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 124.03846153846153,
3
- "total_flos": 1.8148731642810335e+19,
4
- "train_loss": 14.019830729166667,
5
- "train_runtime": 31415.0856,
6
- "train_samples_per_second": 2.053,
7
- "train_steps_per_second": 0.411
8
  }
 
1
  {
2
+ "epoch": 100.0,
3
+ "total_flos": 1.4631500418239693e+19,
4
+ "train_loss": 16.309644344832545,
5
+ "train_runtime": 27199.2721,
6
+ "train_samples_per_second": 1.897,
7
+ "train_steps_per_second": 0.474
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 124.03846153846153,
6
  "eval_steps": 500,
7
  "global_step": 12900,
8
  "is_hyper_param_search": false,
@@ -10,2047 +10,1822 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.9615384615384616,
14
- "grad_norm": 165.56590270996094,
15
- "learning_rate": 9.922558139534884e-05,
16
- "loss": 48.5372,
17
  "step": 100
18
  },
19
  {
20
  "epoch": 1.0,
21
  "eval_dummy": 1.0,
22
- "eval_loss": 36.333778381347656,
23
- "eval_runtime": 21.9927,
24
- "eval_samples_per_second": 2.183,
25
- "eval_steps_per_second": 0.455,
26
- "step": 104
27
  },
28
  {
29
- "epoch": 1.9230769230769231,
30
- "grad_norm": 143.93603515625,
31
- "learning_rate": 9.845116279069768e-05,
32
- "loss": 33.5327,
33
  "step": 200
34
  },
35
  {
36
  "epoch": 2.0,
37
  "eval_dummy": 1.0,
38
- "eval_loss": 31.735132217407227,
39
- "eval_runtime": 16.9414,
40
- "eval_samples_per_second": 2.833,
41
- "eval_steps_per_second": 0.59,
42
- "step": 208
43
  },
44
  {
45
- "epoch": 2.8846153846153846,
46
- "grad_norm": 135.81661987304688,
47
- "learning_rate": 9.767674418604652e-05,
48
- "loss": 29.7691,
49
  "step": 300
50
  },
51
  {
52
  "epoch": 3.0,
53
  "eval_dummy": 1.0,
54
- "eval_loss": 30.485803604125977,
55
- "eval_runtime": 16.7561,
56
- "eval_samples_per_second": 2.865,
57
- "eval_steps_per_second": 0.597,
58
- "step": 312
59
  },
60
  {
61
- "epoch": 3.8461538461538463,
62
- "grad_norm": 110.2176284790039,
63
- "learning_rate": 9.690232558139534e-05,
64
- "loss": 26.3002,
65
  "step": 400
66
  },
67
  {
68
- "epoch": 4.0,
69
- "eval_dummy": 1.0,
70
- "eval_loss": 28.60906219482422,
71
- "eval_runtime": 16.9439,
72
- "eval_samples_per_second": 2.833,
73
- "eval_steps_per_second": 0.59,
74
- "step": 416
75
- },
76
- {
77
- "epoch": 4.8076923076923075,
78
- "grad_norm": 112.39118194580078,
79
- "learning_rate": 9.612790697674419e-05,
80
- "loss": 24.7501,
81
  "step": 500
82
  },
83
  {
84
- "epoch": 5.0,
85
  "eval_dummy": 1.0,
86
- "eval_loss": 27.096742630004883,
87
- "eval_runtime": 16.7521,
88
- "eval_samples_per_second": 2.865,
89
- "eval_steps_per_second": 0.597,
90
- "step": 520
91
  },
92
  {
93
- "epoch": 5.769230769230769,
94
- "grad_norm": 86.3100357055664,
95
- "learning_rate": 9.535348837209303e-05,
96
- "loss": 23.4495,
97
  "step": 600
98
  },
99
  {
100
- "epoch": 6.0,
101
  "eval_dummy": 1.0,
102
- "eval_loss": 26.624135971069336,
103
- "eval_runtime": 16.6013,
104
- "eval_samples_per_second": 2.891,
105
- "eval_steps_per_second": 0.602,
106
- "step": 624
107
  },
108
  {
109
- "epoch": 6.730769230769231,
110
- "grad_norm": 58.464332580566406,
111
- "learning_rate": 9.457906976744187e-05,
112
- "loss": 23.274,
113
  "step": 700
114
  },
115
  {
116
- "epoch": 7.0,
117
  "eval_dummy": 1.0,
118
- "eval_loss": 27.154443740844727,
119
- "eval_runtime": 17.6955,
120
- "eval_samples_per_second": 2.713,
121
- "eval_steps_per_second": 0.565,
122
- "step": 728
123
  },
124
  {
125
- "epoch": 7.6923076923076925,
126
- "grad_norm": 42.398406982421875,
127
- "learning_rate": 9.38046511627907e-05,
128
- "loss": 21.1617,
129
  "step": 800
130
  },
131
  {
132
- "epoch": 8.0,
133
- "eval_dummy": 1.0,
134
- "eval_loss": 27.462488174438477,
135
- "eval_runtime": 16.2589,
136
- "eval_samples_per_second": 2.952,
137
- "eval_steps_per_second": 0.615,
138
- "step": 832
139
- },
140
- {
141
- "epoch": 8.653846153846153,
142
- "grad_norm": 69.99388122558594,
143
- "learning_rate": 9.303023255813954e-05,
144
- "loss": 20.373,
145
  "step": 900
146
  },
147
  {
148
- "epoch": 9.0,
149
  "eval_dummy": 1.0,
150
- "eval_loss": 27.574462890625,
151
- "eval_runtime": 17.0559,
152
- "eval_samples_per_second": 2.814,
153
- "eval_steps_per_second": 0.586,
154
- "step": 936
155
  },
156
  {
157
- "epoch": 9.615384615384615,
158
- "grad_norm": 51.642242431640625,
159
- "learning_rate": 9.225581395348837e-05,
160
- "loss": 20.4295,
161
  "step": 1000
162
  },
163
  {
164
- "epoch": 10.0,
165
  "eval_dummy": 1.0,
166
- "eval_loss": 27.69423484802246,
167
- "eval_runtime": 16.3331,
168
- "eval_samples_per_second": 2.939,
169
- "eval_steps_per_second": 0.612,
170
- "step": 1040
171
  },
172
  {
173
- "epoch": 10.576923076923077,
174
- "grad_norm": 39.88796615600586,
175
- "learning_rate": 9.14813953488372e-05,
176
- "loss": 20.2526,
177
  "step": 1100
178
  },
179
  {
180
- "epoch": 11.0,
181
  "eval_dummy": 1.0,
182
- "eval_loss": 27.782920837402344,
183
- "eval_runtime": 16.9662,
184
- "eval_samples_per_second": 2.829,
185
- "eval_steps_per_second": 0.589,
186
- "step": 1144
187
  },
188
  {
189
- "epoch": 11.538461538461538,
190
- "grad_norm": 50.71794509887695,
191
- "learning_rate": 9.070697674418606e-05,
192
- "loss": 19.2572,
193
  "step": 1200
194
  },
195
  {
196
- "epoch": 12.0,
197
  "eval_dummy": 1.0,
198
- "eval_loss": 27.295957565307617,
199
- "eval_runtime": 16.7472,
200
- "eval_samples_per_second": 2.866,
201
- "eval_steps_per_second": 0.597,
202
- "step": 1248
203
  },
204
  {
205
- "epoch": 12.5,
206
- "grad_norm": 45.02073669433594,
207
- "learning_rate": 8.99325581395349e-05,
208
- "loss": 19.0089,
209
  "step": 1300
210
  },
211
  {
212
- "epoch": 13.0,
213
- "eval_dummy": 1.0,
214
- "eval_loss": 26.0039005279541,
215
- "eval_runtime": 16.8217,
216
- "eval_samples_per_second": 2.853,
217
- "eval_steps_per_second": 0.594,
218
- "step": 1352
219
- },
220
- {
221
- "epoch": 13.461538461538462,
222
- "grad_norm": 33.62112808227539,
223
- "learning_rate": 8.915813953488373e-05,
224
- "loss": 18.3621,
225
  "step": 1400
226
  },
227
  {
228
- "epoch": 14.0,
229
  "eval_dummy": 1.0,
230
- "eval_loss": 26.562255859375,
231
- "eval_runtime": 16.6581,
232
- "eval_samples_per_second": 2.881,
233
- "eval_steps_per_second": 0.6,
234
- "step": 1456
235
  },
236
  {
237
- "epoch": 14.423076923076923,
238
- "grad_norm": 44.2991943359375,
239
- "learning_rate": 8.838372093023257e-05,
240
- "loss": 18.0517,
241
  "step": 1500
242
  },
243
  {
244
- "epoch": 15.0,
245
  "eval_dummy": 1.0,
246
- "eval_loss": 26.27000617980957,
247
- "eval_runtime": 17.496,
248
- "eval_samples_per_second": 2.743,
249
- "eval_steps_per_second": 0.572,
250
- "step": 1560
251
  },
252
  {
253
- "epoch": 15.384615384615385,
254
- "grad_norm": 44.68825912475586,
255
- "learning_rate": 8.76093023255814e-05,
256
- "loss": 18.3139,
257
  "step": 1600
258
  },
259
  {
260
- "epoch": 16.0,
261
  "eval_dummy": 1.0,
262
- "eval_loss": 27.297170639038086,
263
- "eval_runtime": 16.2819,
264
- "eval_samples_per_second": 2.948,
265
- "eval_steps_per_second": 0.614,
266
- "step": 1664
267
  },
268
  {
269
- "epoch": 16.346153846153847,
270
- "grad_norm": 50.15152359008789,
271
- "learning_rate": 8.683488372093023e-05,
272
- "loss": 17.6129,
273
  "step": 1700
274
  },
275
  {
276
- "epoch": 17.0,
277
- "eval_dummy": 1.0,
278
- "eval_loss": 26.486886978149414,
279
- "eval_runtime": 17.6532,
280
- "eval_samples_per_second": 2.719,
281
- "eval_steps_per_second": 0.566,
282
- "step": 1768
283
- },
284
- {
285
- "epoch": 17.307692307692307,
286
- "grad_norm": 31.138574600219727,
287
- "learning_rate": 8.606046511627907e-05,
288
- "loss": 17.8402,
289
  "step": 1800
290
  },
291
  {
292
- "epoch": 18.0,
293
  "eval_dummy": 1.0,
294
- "eval_loss": 27.761808395385742,
295
- "eval_runtime": 16.6098,
296
- "eval_samples_per_second": 2.89,
297
- "eval_steps_per_second": 0.602,
298
- "step": 1872
299
  },
300
  {
301
- "epoch": 18.26923076923077,
302
- "grad_norm": 45.22633361816406,
303
- "learning_rate": 8.52860465116279e-05,
304
- "loss": 16.6494,
305
  "step": 1900
306
  },
307
  {
308
- "epoch": 19.0,
309
  "eval_dummy": 1.0,
310
- "eval_loss": 27.517309188842773,
311
- "eval_runtime": 16.9988,
312
- "eval_samples_per_second": 2.824,
313
- "eval_steps_per_second": 0.588,
314
- "step": 1976
315
  },
316
  {
317
- "epoch": 19.23076923076923,
318
- "grad_norm": 25.58769989013672,
319
- "learning_rate": 8.451162790697674e-05,
320
- "loss": 17.0833,
321
  "step": 2000
322
  },
323
  {
324
- "epoch": 20.0,
325
  "eval_dummy": 1.0,
326
- "eval_loss": 28.124242782592773,
327
- "eval_runtime": 16.6787,
328
- "eval_samples_per_second": 2.878,
329
- "eval_steps_per_second": 0.6,
330
- "step": 2080
331
  },
332
  {
333
- "epoch": 20.192307692307693,
334
- "grad_norm": 30.538772583007812,
335
- "learning_rate": 8.373720930232558e-05,
336
- "loss": 16.5967,
337
  "step": 2100
338
  },
339
  {
340
- "epoch": 21.0,
341
  "eval_dummy": 1.0,
342
- "eval_loss": 29.119504928588867,
343
- "eval_runtime": 16.9877,
344
- "eval_samples_per_second": 2.826,
345
- "eval_steps_per_second": 0.589,
346
- "step": 2184
347
  },
348
  {
349
- "epoch": 21.153846153846153,
350
- "grad_norm": 34.11674499511719,
351
- "learning_rate": 8.296279069767443e-05,
352
- "loss": 16.2634,
353
  "step": 2200
354
  },
355
  {
356
- "epoch": 22.0,
357
- "eval_dummy": 1.0,
358
- "eval_loss": 27.036653518676758,
359
- "eval_runtime": 16.3157,
360
- "eval_samples_per_second": 2.942,
361
- "eval_steps_per_second": 0.613,
362
- "step": 2288
363
- },
364
- {
365
- "epoch": 22.115384615384617,
366
- "grad_norm": 23.20075798034668,
367
- "learning_rate": 8.218837209302326e-05,
368
- "loss": 16.6797,
369
  "step": 2300
370
  },
371
  {
372
- "epoch": 23.0,
373
  "eval_dummy": 1.0,
374
- "eval_loss": 27.179914474487305,
375
- "eval_runtime": 17.6647,
376
- "eval_samples_per_second": 2.717,
377
- "eval_steps_per_second": 0.566,
378
- "step": 2392
379
  },
380
  {
381
- "epoch": 23.076923076923077,
382
- "grad_norm": 38.62594223022461,
383
- "learning_rate": 8.14139534883721e-05,
384
- "loss": 16.0344,
385
  "step": 2400
386
  },
387
  {
388
- "epoch": 24.0,
389
  "eval_dummy": 1.0,
390
- "eval_loss": 26.640764236450195,
391
- "eval_runtime": 17.4121,
392
- "eval_samples_per_second": 2.757,
393
- "eval_steps_per_second": 0.574,
394
- "step": 2496
395
  },
396
  {
397
- "epoch": 24.03846153846154,
398
- "grad_norm": 25.629343032836914,
399
- "learning_rate": 8.063953488372093e-05,
400
- "loss": 15.8706,
401
  "step": 2500
402
  },
403
  {
404
- "epoch": 25.0,
405
- "grad_norm": 108.9260482788086,
406
- "learning_rate": 7.986511627906977e-05,
407
- "loss": 15.7701,
408
- "step": 2600
 
 
409
  },
410
  {
411
- "epoch": 25.0,
412
- "eval_dummy": 1.0,
413
- "eval_loss": 28.403966903686523,
414
- "eval_runtime": 16.6389,
415
- "eval_samples_per_second": 2.885,
416
- "eval_steps_per_second": 0.601,
417
  "step": 2600
418
  },
419
  {
420
- "epoch": 25.96153846153846,
421
- "grad_norm": 39.876651763916016,
422
- "learning_rate": 7.90906976744186e-05,
423
- "loss": 15.6061,
424
  "step": 2700
425
  },
426
  {
427
- "epoch": 26.0,
428
  "eval_dummy": 1.0,
429
- "eval_loss": 28.06866455078125,
430
- "eval_runtime": 17.057,
431
- "eval_samples_per_second": 2.814,
432
- "eval_steps_per_second": 0.586,
433
- "step": 2704
434
  },
435
  {
436
- "epoch": 26.923076923076923,
437
- "grad_norm": 24.31376838684082,
438
- "learning_rate": 7.831627906976746e-05,
439
- "loss": 15.3311,
440
  "step": 2800
441
  },
442
  {
443
- "epoch": 27.0,
444
  "eval_dummy": 1.0,
445
- "eval_loss": 27.17653465270996,
446
- "eval_runtime": 16.4838,
447
- "eval_samples_per_second": 2.912,
448
- "eval_steps_per_second": 0.607,
449
- "step": 2808
450
  },
451
  {
452
- "epoch": 27.884615384615383,
453
- "grad_norm": 21.51975440979004,
454
- "learning_rate": 7.754186046511628e-05,
455
- "loss": 15.2464,
456
  "step": 2900
457
  },
458
  {
459
- "epoch": 28.0,
460
  "eval_dummy": 1.0,
461
- "eval_loss": 28.20502471923828,
462
- "eval_runtime": 17.0908,
463
- "eval_samples_per_second": 2.809,
464
- "eval_steps_per_second": 0.585,
465
- "step": 2912
466
  },
467
  {
468
- "epoch": 28.846153846153847,
469
- "grad_norm": 25.89150047302246,
470
- "learning_rate": 7.676744186046512e-05,
471
- "loss": 15.0459,
472
  "step": 3000
473
  },
474
  {
475
- "epoch": 29.0,
476
  "eval_dummy": 1.0,
477
- "eval_loss": 28.629125595092773,
478
- "eval_runtime": 17.0746,
479
- "eval_samples_per_second": 2.811,
480
- "eval_steps_per_second": 0.586,
481
- "step": 3016
482
  },
483
  {
484
- "epoch": 29.807692307692307,
485
- "grad_norm": 24.885116577148438,
486
- "learning_rate": 7.599302325581396e-05,
487
- "loss": 14.7514,
488
  "step": 3100
489
  },
490
  {
491
- "epoch": 30.0,
492
- "eval_dummy": 1.0,
493
- "eval_loss": 27.824087142944336,
494
- "eval_runtime": 16.6658,
495
- "eval_samples_per_second": 2.88,
496
- "eval_steps_per_second": 0.6,
497
- "step": 3120
498
- },
499
- {
500
- "epoch": 30.76923076923077,
501
- "grad_norm": 23.02336883544922,
502
- "learning_rate": 7.52186046511628e-05,
503
- "loss": 15.0833,
504
  "step": 3200
505
  },
506
  {
507
- "epoch": 31.0,
508
  "eval_dummy": 1.0,
509
- "eval_loss": 29.19361686706543,
510
- "eval_runtime": 19.4392,
511
- "eval_samples_per_second": 2.469,
512
- "eval_steps_per_second": 0.514,
513
- "step": 3224
514
  },
515
  {
516
- "epoch": 31.73076923076923,
517
- "grad_norm": 24.152498245239258,
518
- "learning_rate": 7.444418604651163e-05,
519
- "loss": 15.0817,
520
  "step": 3300
521
  },
522
  {
523
- "epoch": 32.0,
524
  "eval_dummy": 1.0,
525
- "eval_loss": 28.40435218811035,
526
- "eval_runtime": 16.6685,
527
- "eval_samples_per_second": 2.88,
528
- "eval_steps_per_second": 0.6,
529
- "step": 3328
530
  },
531
  {
532
- "epoch": 32.69230769230769,
533
- "grad_norm": 20.687313079833984,
534
- "learning_rate": 7.366976744186047e-05,
535
- "loss": 14.3201,
536
  "step": 3400
537
  },
538
  {
539
- "epoch": 33.0,
540
  "eval_dummy": 1.0,
541
- "eval_loss": 28.370887756347656,
542
- "eval_runtime": 16.9055,
543
- "eval_samples_per_second": 2.839,
544
- "eval_steps_per_second": 0.592,
545
- "step": 3432
546
  },
547
  {
548
- "epoch": 33.65384615384615,
549
- "grad_norm": 18.918672561645508,
550
- "learning_rate": 7.289534883720931e-05,
551
- "loss": 14.5918,
552
  "step": 3500
553
  },
554
  {
555
- "epoch": 34.0,
556
- "eval_dummy": 1.0,
557
- "eval_loss": 29.38982582092285,
558
- "eval_runtime": 17.1503,
559
- "eval_samples_per_second": 2.799,
560
- "eval_steps_per_second": 0.583,
561
- "step": 3536
562
- },
563
- {
564
- "epoch": 34.61538461538461,
565
- "grad_norm": 34.329986572265625,
566
- "learning_rate": 7.212093023255815e-05,
567
- "loss": 14.7177,
568
  "step": 3600
569
  },
570
  {
571
- "epoch": 35.0,
572
  "eval_dummy": 1.0,
573
- "eval_loss": 28.512975692749023,
574
- "eval_runtime": 17.026,
575
- "eval_samples_per_second": 2.819,
576
- "eval_steps_per_second": 0.587,
577
- "step": 3640
578
  },
579
  {
580
- "epoch": 35.57692307692308,
581
- "grad_norm": 14.776827812194824,
582
- "learning_rate": 7.134651162790697e-05,
583
- "loss": 13.9919,
584
  "step": 3700
585
  },
586
  {
587
- "epoch": 36.0,
588
  "eval_dummy": 1.0,
589
- "eval_loss": 27.759668350219727,
590
- "eval_runtime": 16.7517,
591
- "eval_samples_per_second": 2.865,
592
- "eval_steps_per_second": 0.597,
593
- "step": 3744
594
  },
595
  {
596
- "epoch": 36.53846153846154,
597
- "grad_norm": 33.82184982299805,
598
- "learning_rate": 7.057209302325581e-05,
599
- "loss": 14.2267,
600
  "step": 3800
601
  },
602
  {
603
- "epoch": 37.0,
604
  "eval_dummy": 1.0,
605
- "eval_loss": 29.232383728027344,
606
- "eval_runtime": 16.949,
607
- "eval_samples_per_second": 2.832,
608
- "eval_steps_per_second": 0.59,
609
- "step": 3848
610
  },
611
  {
612
- "epoch": 37.5,
613
- "grad_norm": 71.82172393798828,
614
- "learning_rate": 6.979767441860466e-05,
615
- "loss": 13.7801,
616
  "step": 3900
617
  },
618
  {
619
- "epoch": 38.0,
620
  "eval_dummy": 1.0,
621
- "eval_loss": 28.357421875,
622
- "eval_runtime": 16.6259,
623
- "eval_samples_per_second": 2.887,
624
- "eval_steps_per_second": 0.601,
625
- "step": 3952
626
  },
627
  {
628
- "epoch": 38.46153846153846,
629
- "grad_norm": 24.108745574951172,
630
- "learning_rate": 6.90232558139535e-05,
631
- "loss": 14.1839,
632
  "step": 4000
633
  },
634
  {
635
- "epoch": 39.0,
636
- "eval_dummy": 1.0,
637
- "eval_loss": 28.8710994720459,
638
- "eval_runtime": 16.7167,
639
- "eval_samples_per_second": 2.871,
640
- "eval_steps_per_second": 0.598,
641
- "step": 4056
642
- },
643
- {
644
- "epoch": 39.42307692307692,
645
- "grad_norm": 24.0490665435791,
646
- "learning_rate": 6.824883720930233e-05,
647
- "loss": 13.7545,
648
  "step": 4100
649
  },
650
  {
651
- "epoch": 40.0,
652
  "eval_dummy": 1.0,
653
- "eval_loss": 28.294702529907227,
654
- "eval_runtime": 16.5809,
655
- "eval_samples_per_second": 2.895,
656
- "eval_steps_per_second": 0.603,
657
- "step": 4160
658
  },
659
  {
660
- "epoch": 40.38461538461539,
661
- "grad_norm": 17.7755184173584,
662
- "learning_rate": 6.747441860465117e-05,
663
- "loss": 14.1627,
664
  "step": 4200
665
  },
666
  {
667
- "epoch": 41.0,
668
  "eval_dummy": 1.0,
669
- "eval_loss": 29.48655128479004,
670
- "eval_runtime": 16.9288,
671
- "eval_samples_per_second": 2.835,
672
- "eval_steps_per_second": 0.591,
673
- "step": 4264
674
  },
675
  {
676
- "epoch": 41.34615384615385,
677
- "grad_norm": 25.07582664489746,
678
- "learning_rate": 6.670000000000001e-05,
679
- "loss": 13.5155,
680
  "step": 4300
681
  },
682
  {
683
- "epoch": 42.0,
684
  "eval_dummy": 1.0,
685
- "eval_loss": 29.852659225463867,
686
- "eval_runtime": 16.5485,
687
- "eval_samples_per_second": 2.901,
688
- "eval_steps_per_second": 0.604,
689
- "step": 4368
690
  },
691
  {
692
- "epoch": 42.30769230769231,
693
- "grad_norm": 35.709686279296875,
694
- "learning_rate": 6.592558139534885e-05,
695
- "loss": 13.704,
696
  "step": 4400
697
  },
698
  {
699
- "epoch": 43.0,
700
- "eval_dummy": 1.0,
701
- "eval_loss": 29.4291934967041,
702
- "eval_runtime": 16.8607,
703
- "eval_samples_per_second": 2.847,
704
- "eval_steps_per_second": 0.593,
705
- "step": 4472
706
- },
707
- {
708
- "epoch": 43.26923076923077,
709
- "grad_norm": 21.939144134521484,
710
- "learning_rate": 6.515116279069768e-05,
711
- "loss": 13.6644,
712
  "step": 4500
713
  },
714
  {
715
- "epoch": 44.0,
716
  "eval_dummy": 1.0,
717
- "eval_loss": 29.23241424560547,
718
- "eval_runtime": 16.6868,
719
- "eval_samples_per_second": 2.877,
720
- "eval_steps_per_second": 0.599,
721
- "step": 4576
722
  },
723
  {
724
- "epoch": 44.23076923076923,
725
- "grad_norm": 22.5328426361084,
726
- "learning_rate": 6.437674418604652e-05,
727
- "loss": 13.2006,
728
  "step": 4600
729
  },
730
  {
731
- "epoch": 45.0,
732
  "eval_dummy": 1.0,
733
- "eval_loss": 29.541372299194336,
734
- "eval_runtime": 16.3804,
735
- "eval_samples_per_second": 2.93,
736
- "eval_steps_per_second": 0.61,
737
- "step": 4680
738
  },
739
  {
740
- "epoch": 45.19230769230769,
741
- "grad_norm": 19.775171279907227,
742
- "learning_rate": 6.360232558139536e-05,
743
- "loss": 13.1545,
744
  "step": 4700
745
  },
746
  {
747
- "epoch": 46.0,
748
  "eval_dummy": 1.0,
749
- "eval_loss": 29.698814392089844,
750
- "eval_runtime": 17.0544,
751
- "eval_samples_per_second": 2.815,
752
- "eval_steps_per_second": 0.586,
753
- "step": 4784
754
  },
755
  {
756
- "epoch": 46.15384615384615,
757
- "grad_norm": 23.579957962036133,
758
- "learning_rate": 6.28279069767442e-05,
759
- "loss": 13.5744,
760
  "step": 4800
761
  },
762
  {
763
- "epoch": 47.0,
764
- "eval_dummy": 1.0,
765
- "eval_loss": 28.993318557739258,
766
- "eval_runtime": 16.712,
767
- "eval_samples_per_second": 2.872,
768
- "eval_steps_per_second": 0.598,
769
- "step": 4888
770
- },
771
- {
772
- "epoch": 47.11538461538461,
773
- "grad_norm": 16.147850036621094,
774
- "learning_rate": 6.205348837209302e-05,
775
- "loss": 12.8073,
776
  "step": 4900
777
  },
778
  {
779
- "epoch": 48.0,
780
  "eval_dummy": 1.0,
781
- "eval_loss": 28.976961135864258,
782
- "eval_runtime": 17.0515,
783
- "eval_samples_per_second": 2.815,
784
- "eval_steps_per_second": 0.586,
785
- "step": 4992
786
  },
787
  {
788
- "epoch": 48.07692307692308,
789
- "grad_norm": 39.781028747558594,
790
- "learning_rate": 6.127906976744186e-05,
791
- "loss": 13.3773,
792
  "step": 5000
793
  },
794
  {
795
- "epoch": 49.0,
796
  "eval_dummy": 1.0,
797
- "eval_loss": 30.39496421813965,
798
- "eval_runtime": 16.6402,
799
- "eval_samples_per_second": 2.885,
800
- "eval_steps_per_second": 0.601,
801
- "step": 5096
802
  },
803
  {
804
- "epoch": 49.03846153846154,
805
- "grad_norm": 28.625978469848633,
806
- "learning_rate": 6.0504651162790696e-05,
807
- "loss": 12.9449,
808
  "step": 5100
809
  },
810
  {
811
- "epoch": 50.0,
812
- "grad_norm": 30.444610595703125,
813
- "learning_rate": 5.9730232558139533e-05,
814
- "loss": 12.9506,
815
- "step": 5200
 
 
816
  },
817
  {
818
- "epoch": 50.0,
819
- "eval_dummy": 1.0,
820
- "eval_loss": 31.28708839416504,
821
- "eval_runtime": 16.7982,
822
- "eval_samples_per_second": 2.857,
823
- "eval_steps_per_second": 0.595,
824
  "step": 5200
825
  },
826
  {
827
- "epoch": 50.96153846153846,
828
- "grad_norm": 22.916648864746094,
829
- "learning_rate": 5.895581395348837e-05,
830
- "loss": 13.0674,
831
- "step": 5300
 
 
832
  },
833
  {
834
- "epoch": 51.0,
835
- "eval_dummy": 1.0,
836
- "eval_loss": 29.57108497619629,
837
- "eval_runtime": 16.4065,
838
- "eval_samples_per_second": 2.926,
839
- "eval_steps_per_second": 0.61,
840
- "step": 5304
841
  },
842
  {
843
- "epoch": 51.92307692307692,
844
- "grad_norm": 21.012855529785156,
845
- "learning_rate": 5.818139534883721e-05,
846
- "loss": 13.1265,
847
  "step": 5400
848
  },
849
  {
850
- "epoch": 52.0,
851
  "eval_dummy": 1.0,
852
- "eval_loss": 31.08868980407715,
853
- "eval_runtime": 16.8841,
854
- "eval_samples_per_second": 2.843,
855
- "eval_steps_per_second": 0.592,
856
- "step": 5408
857
  },
858
  {
859
- "epoch": 52.88461538461539,
860
- "grad_norm": 16.615829467773438,
861
- "learning_rate": 5.740697674418606e-05,
862
- "loss": 13.1392,
863
  "step": 5500
864
  },
865
  {
866
- "epoch": 53.0,
867
  "eval_dummy": 1.0,
868
- "eval_loss": 29.843284606933594,
869
- "eval_runtime": 17.0042,
870
- "eval_samples_per_second": 2.823,
871
- "eval_steps_per_second": 0.588,
872
- "step": 5512
873
  },
874
  {
875
- "epoch": 53.84615384615385,
876
- "grad_norm": 17.38899040222168,
877
- "learning_rate": 5.6632558139534884e-05,
878
- "loss": 12.6108,
879
  "step": 5600
880
  },
881
  {
882
- "epoch": 54.0,
883
  "eval_dummy": 1.0,
884
- "eval_loss": 29.643640518188477,
885
- "eval_runtime": 16.9549,
886
- "eval_samples_per_second": 2.831,
887
- "eval_steps_per_second": 0.59,
888
- "step": 5616
889
  },
890
  {
891
- "epoch": 54.80769230769231,
892
- "grad_norm": 20.021800994873047,
893
- "learning_rate": 5.585813953488372e-05,
894
- "loss": 12.7608,
895
  "step": 5700
896
  },
897
  {
898
- "epoch": 55.0,
899
- "eval_dummy": 1.0,
900
- "eval_loss": 29.870615005493164,
901
- "eval_runtime": 16.4959,
902
- "eval_samples_per_second": 2.91,
903
- "eval_steps_per_second": 0.606,
904
- "step": 5720
905
- },
906
- {
907
- "epoch": 55.76923076923077,
908
- "grad_norm": 19.976686477661133,
909
- "learning_rate": 5.5083720930232566e-05,
910
- "loss": 12.8723,
911
  "step": 5800
912
  },
913
  {
914
- "epoch": 56.0,
915
  "eval_dummy": 1.0,
916
- "eval_loss": 30.059600830078125,
917
- "eval_runtime": 16.9691,
918
- "eval_samples_per_second": 2.829,
919
- "eval_steps_per_second": 0.589,
920
- "step": 5824
921
  },
922
  {
923
- "epoch": 56.73076923076923,
924
- "grad_norm": 34.17774963378906,
925
- "learning_rate": 5.4309302325581404e-05,
926
- "loss": 12.5437,
927
  "step": 5900
928
  },
929
  {
930
- "epoch": 57.0,
931
  "eval_dummy": 1.0,
932
- "eval_loss": 30.136695861816406,
933
- "eval_runtime": 16.1925,
934
- "eval_samples_per_second": 2.964,
935
- "eval_steps_per_second": 0.618,
936
- "step": 5928
937
  },
938
  {
939
- "epoch": 57.69230769230769,
940
- "grad_norm": 16.256919860839844,
941
- "learning_rate": 5.353488372093024e-05,
942
- "loss": 12.1387,
943
  "step": 6000
944
  },
945
  {
946
- "epoch": 58.0,
947
  "eval_dummy": 1.0,
948
- "eval_loss": 30.4089298248291,
949
- "eval_runtime": 17.0149,
950
- "eval_samples_per_second": 2.821,
951
- "eval_steps_per_second": 0.588,
952
- "step": 6032
953
  },
954
  {
955
- "epoch": 58.65384615384615,
956
- "grad_norm": 24.035823822021484,
957
- "learning_rate": 5.276046511627908e-05,
958
- "loss": 12.948,
959
  "step": 6100
960
  },
961
  {
962
- "epoch": 59.0,
963
  "eval_dummy": 1.0,
964
- "eval_loss": 30.537500381469727,
965
- "eval_runtime": 16.6335,
966
- "eval_samples_per_second": 2.886,
967
- "eval_steps_per_second": 0.601,
968
- "step": 6136
969
  },
970
  {
971
- "epoch": 59.61538461538461,
972
- "grad_norm": 18.700029373168945,
973
- "learning_rate": 5.1986046511627916e-05,
974
- "loss": 12.2869,
975
  "step": 6200
976
  },
977
  {
978
- "epoch": 60.0,
979
- "eval_dummy": 1.0,
980
- "eval_loss": 32.38269805908203,
981
- "eval_runtime": 16.6776,
982
- "eval_samples_per_second": 2.878,
983
- "eval_steps_per_second": 0.6,
984
- "step": 6240
985
- },
986
- {
987
- "epoch": 60.57692307692308,
988
- "grad_norm": 34.31249237060547,
989
- "learning_rate": 5.121162790697675e-05,
990
- "loss": 12.7717,
991
  "step": 6300
992
  },
993
  {
994
- "epoch": 61.0,
995
  "eval_dummy": 1.0,
996
- "eval_loss": 30.639678955078125,
997
- "eval_runtime": 16.6901,
998
- "eval_samples_per_second": 2.876,
999
- "eval_steps_per_second": 0.599,
1000
- "step": 6344
1001
  },
1002
  {
1003
- "epoch": 61.53846153846154,
1004
- "grad_norm": 26.939144134521484,
1005
- "learning_rate": 5.0437209302325585e-05,
1006
- "loss": 12.4924,
1007
  "step": 6400
1008
  },
1009
  {
1010
- "epoch": 62.0,
1011
  "eval_dummy": 1.0,
1012
- "eval_loss": 30.700525283813477,
1013
- "eval_runtime": 16.6341,
1014
- "eval_samples_per_second": 2.886,
1015
- "eval_steps_per_second": 0.601,
1016
- "step": 6448
1017
  },
1018
  {
1019
- "epoch": 62.5,
1020
- "grad_norm": 13.92809009552002,
1021
- "learning_rate": 4.966279069767442e-05,
1022
- "loss": 12.3031,
1023
  "step": 6500
1024
  },
1025
  {
1026
- "epoch": 63.0,
1027
  "eval_dummy": 1.0,
1028
- "eval_loss": 29.986501693725586,
1029
- "eval_runtime": 16.2627,
1030
- "eval_samples_per_second": 2.952,
1031
- "eval_steps_per_second": 0.615,
1032
- "step": 6552
1033
  },
1034
  {
1035
- "epoch": 63.46153846153846,
1036
- "grad_norm": 53.49395751953125,
1037
- "learning_rate": 4.888837209302326e-05,
1038
- "loss": 12.5575,
1039
  "step": 6600
1040
  },
1041
  {
1042
- "epoch": 64.0,
1043
- "eval_dummy": 1.0,
1044
- "eval_loss": 31.06968116760254,
1045
- "eval_runtime": 16.634,
1046
- "eval_samples_per_second": 2.886,
1047
- "eval_steps_per_second": 0.601,
1048
- "step": 6656
1049
- },
1050
- {
1051
- "epoch": 64.42307692307692,
1052
- "grad_norm": 20.313173294067383,
1053
- "learning_rate": 4.811395348837209e-05,
1054
- "loss": 11.9496,
1055
  "step": 6700
1056
  },
1057
  {
1058
- "epoch": 65.0,
1059
  "eval_dummy": 1.0,
1060
- "eval_loss": 31.579355239868164,
1061
- "eval_runtime": 16.5822,
1062
- "eval_samples_per_second": 2.895,
1063
- "eval_steps_per_second": 0.603,
1064
- "step": 6760
1065
  },
1066
  {
1067
- "epoch": 65.38461538461539,
1068
- "grad_norm": 33.59545135498047,
1069
- "learning_rate": 4.733953488372093e-05,
1070
- "loss": 12.0462,
1071
  "step": 6800
1072
  },
1073
  {
1074
- "epoch": 66.0,
1075
  "eval_dummy": 1.0,
1076
- "eval_loss": 31.65366554260254,
1077
- "eval_runtime": 16.6216,
1078
- "eval_samples_per_second": 2.888,
1079
- "eval_steps_per_second": 0.602,
1080
- "step": 6864
1081
  },
1082
  {
1083
- "epoch": 66.34615384615384,
1084
- "grad_norm": 41.989036560058594,
1085
- "learning_rate": 4.656511627906977e-05,
1086
- "loss": 12.7167,
1087
  "step": 6900
1088
  },
1089
  {
1090
- "epoch": 67.0,
1091
  "eval_dummy": 1.0,
1092
- "eval_loss": 30.74114990234375,
1093
- "eval_runtime": 16.9556,
1094
- "eval_samples_per_second": 2.831,
1095
- "eval_steps_per_second": 0.59,
1096
- "step": 6968
1097
  },
1098
  {
1099
- "epoch": 67.3076923076923,
1100
- "grad_norm": 39.545814514160156,
1101
- "learning_rate": 4.579069767441861e-05,
1102
- "loss": 11.8595,
1103
  "step": 7000
1104
  },
1105
  {
1106
- "epoch": 68.0,
1107
  "eval_dummy": 1.0,
1108
- "eval_loss": 30.4969539642334,
1109
- "eval_runtime": 16.5848,
1110
- "eval_samples_per_second": 2.894,
1111
- "eval_steps_per_second": 0.603,
1112
- "step": 7072
1113
  },
1114
  {
1115
- "epoch": 68.26923076923077,
1116
- "grad_norm": 17.500286102294922,
1117
- "learning_rate": 4.501627906976745e-05,
1118
- "loss": 11.7458,
1119
  "step": 7100
1120
  },
1121
  {
1122
- "epoch": 69.0,
1123
- "eval_dummy": 1.0,
1124
- "eval_loss": 30.833221435546875,
1125
- "eval_runtime": 16.7855,
1126
- "eval_samples_per_second": 2.86,
1127
- "eval_steps_per_second": 0.596,
1128
- "step": 7176
1129
- },
1130
- {
1131
- "epoch": 69.23076923076923,
1132
- "grad_norm": 18.128597259521484,
1133
- "learning_rate": 4.4241860465116286e-05,
1134
- "loss": 12.2058,
1135
  "step": 7200
1136
  },
1137
  {
1138
- "epoch": 70.0,
1139
  "eval_dummy": 1.0,
1140
- "eval_loss": 32.0950813293457,
1141
- "eval_runtime": 17.2742,
1142
- "eval_samples_per_second": 2.779,
1143
- "eval_steps_per_second": 0.579,
1144
- "step": 7280
1145
  },
1146
  {
1147
- "epoch": 70.1923076923077,
1148
- "grad_norm": 25.363075256347656,
1149
- "learning_rate": 4.3467441860465117e-05,
1150
- "loss": 12.0874,
1151
  "step": 7300
1152
  },
1153
  {
1154
- "epoch": 71.0,
1155
  "eval_dummy": 1.0,
1156
- "eval_loss": 32.46952438354492,
1157
- "eval_runtime": 17.4937,
1158
- "eval_samples_per_second": 2.744,
1159
- "eval_steps_per_second": 0.572,
1160
- "step": 7384
1161
  },
1162
  {
1163
- "epoch": 71.15384615384616,
1164
- "grad_norm": 9.088305473327637,
1165
- "learning_rate": 4.2693023255813954e-05,
1166
- "loss": 11.705,
1167
  "step": 7400
1168
  },
1169
  {
1170
- "epoch": 72.0,
1171
  "eval_dummy": 1.0,
1172
- "eval_loss": 31.311721801757812,
1173
- "eval_runtime": 16.6157,
1174
- "eval_samples_per_second": 2.889,
1175
- "eval_steps_per_second": 0.602,
1176
- "step": 7488
1177
  },
1178
  {
1179
- "epoch": 72.11538461538461,
1180
- "grad_norm": 21.332366943359375,
1181
- "learning_rate": 4.191860465116279e-05,
1182
- "loss": 12.0,
1183
  "step": 7500
1184
  },
1185
  {
1186
- "epoch": 73.0,
1187
- "eval_dummy": 1.0,
1188
- "eval_loss": 30.654035568237305,
1189
- "eval_runtime": 17.001,
1190
- "eval_samples_per_second": 2.823,
1191
- "eval_steps_per_second": 0.588,
1192
- "step": 7592
1193
- },
1194
- {
1195
- "epoch": 73.07692307692308,
1196
- "grad_norm": 16.92377281188965,
1197
- "learning_rate": 4.114418604651163e-05,
1198
- "loss": 11.9852,
1199
  "step": 7600
1200
  },
1201
  {
1202
- "epoch": 74.0,
1203
  "eval_dummy": 1.0,
1204
- "eval_loss": 34.29496765136719,
1205
- "eval_runtime": 16.6247,
1206
- "eval_samples_per_second": 2.887,
1207
- "eval_steps_per_second": 0.602,
1208
- "step": 7696
1209
  },
1210
  {
1211
- "epoch": 74.03846153846153,
1212
- "grad_norm": 47.94444274902344,
1213
- "learning_rate": 4.0369767441860474e-05,
1214
- "loss": 11.8647,
1215
  "step": 7700
1216
  },
1217
  {
1218
- "epoch": 75.0,
1219
- "grad_norm": 31.79910659790039,
1220
- "learning_rate": 3.959534883720931e-05,
1221
- "loss": 11.7597,
 
 
 
 
 
 
 
 
 
1222
  "step": 7800
1223
  },
1224
  {
1225
- "epoch": 75.0,
1226
  "eval_dummy": 1.0,
1227
- "eval_loss": 31.636077880859375,
1228
- "eval_runtime": 16.64,
1229
- "eval_samples_per_second": 2.885,
1230
- "eval_steps_per_second": 0.601,
1231
- "step": 7800
1232
  },
1233
  {
1234
- "epoch": 75.96153846153847,
1235
- "grad_norm": 17.918701171875,
1236
- "learning_rate": 3.882093023255814e-05,
1237
- "loss": 11.8713,
1238
  "step": 7900
1239
  },
1240
  {
1241
- "epoch": 76.0,
1242
  "eval_dummy": 1.0,
1243
- "eval_loss": 31.10820198059082,
1244
- "eval_runtime": 16.6206,
1245
- "eval_samples_per_second": 2.888,
1246
- "eval_steps_per_second": 0.602,
1247
- "step": 7904
1248
  },
1249
  {
1250
- "epoch": 76.92307692307692,
1251
- "grad_norm": 34.80290985107422,
1252
- "learning_rate": 3.804651162790698e-05,
1253
- "loss": 11.705,
1254
  "step": 8000
1255
  },
1256
  {
1257
- "epoch": 77.0,
1258
- "eval_dummy": 1.0,
1259
- "eval_loss": 31.87137794494629,
1260
- "eval_runtime": 16.986,
1261
- "eval_samples_per_second": 2.826,
1262
- "eval_steps_per_second": 0.589,
1263
- "step": 8008
1264
- },
1265
- {
1266
- "epoch": 77.88461538461539,
1267
- "grad_norm": 16.33148193359375,
1268
- "learning_rate": 3.727209302325582e-05,
1269
- "loss": 11.5474,
1270
  "step": 8100
1271
  },
1272
  {
1273
- "epoch": 78.0,
1274
  "eval_dummy": 1.0,
1275
- "eval_loss": 31.029922485351562,
1276
- "eval_runtime": 16.598,
1277
- "eval_samples_per_second": 2.892,
1278
- "eval_steps_per_second": 0.602,
1279
- "step": 8112
1280
  },
1281
  {
1282
- "epoch": 78.84615384615384,
1283
- "grad_norm": 15.126340866088867,
1284
- "learning_rate": 3.6497674418604655e-05,
1285
- "loss": 11.8387,
1286
  "step": 8200
1287
  },
1288
  {
1289
- "epoch": 79.0,
1290
  "eval_dummy": 1.0,
1291
- "eval_loss": 31.367172241210938,
1292
- "eval_runtime": 16.6089,
1293
- "eval_samples_per_second": 2.89,
1294
- "eval_steps_per_second": 0.602,
1295
- "step": 8216
1296
  },
1297
  {
1298
- "epoch": 79.8076923076923,
1299
- "grad_norm": 13.069074630737305,
1300
- "learning_rate": 3.5723255813953486e-05,
1301
- "loss": 11.7057,
1302
  "step": 8300
1303
  },
1304
  {
1305
- "epoch": 80.0,
1306
  "eval_dummy": 1.0,
1307
- "eval_loss": 31.643484115600586,
1308
- "eval_runtime": 16.4881,
1309
- "eval_samples_per_second": 2.911,
1310
- "eval_steps_per_second": 0.606,
1311
- "step": 8320
1312
  },
1313
  {
1314
- "epoch": 80.76923076923077,
1315
- "grad_norm": 24.73469352722168,
1316
- "learning_rate": 3.4948837209302323e-05,
1317
- "loss": 11.5656,
1318
  "step": 8400
1319
  },
1320
  {
1321
- "epoch": 81.0,
1322
- "eval_dummy": 1.0,
1323
- "eval_loss": 31.194046020507812,
1324
- "eval_runtime": 16.6985,
1325
- "eval_samples_per_second": 2.875,
1326
- "eval_steps_per_second": 0.599,
1327
- "step": 8424
1328
- },
1329
- {
1330
- "epoch": 81.73076923076923,
1331
- "grad_norm": 26.05837631225586,
1332
- "learning_rate": 3.417441860465117e-05,
1333
- "loss": 11.6578,
1334
  "step": 8500
1335
  },
1336
  {
1337
- "epoch": 82.0,
1338
  "eval_dummy": 1.0,
1339
- "eval_loss": 31.818357467651367,
1340
- "eval_runtime": 16.6321,
1341
- "eval_samples_per_second": 2.886,
1342
- "eval_steps_per_second": 0.601,
1343
- "step": 8528
1344
  },
1345
  {
1346
- "epoch": 82.6923076923077,
1347
- "grad_norm": 29.041229248046875,
1348
- "learning_rate": 3.3400000000000005e-05,
1349
- "loss": 11.3049,
1350
  "step": 8600
1351
  },
1352
  {
1353
- "epoch": 83.0,
1354
  "eval_dummy": 1.0,
1355
- "eval_loss": 31.866790771484375,
1356
- "eval_runtime": 16.6756,
1357
- "eval_samples_per_second": 2.878,
1358
- "eval_steps_per_second": 0.6,
1359
- "step": 8632
1360
  },
1361
  {
1362
- "epoch": 83.65384615384616,
1363
- "grad_norm": 14.160347938537598,
1364
- "learning_rate": 3.262558139534884e-05,
1365
- "loss": 11.5542,
1366
  "step": 8700
1367
  },
1368
  {
1369
- "epoch": 84.0,
1370
  "eval_dummy": 1.0,
1371
- "eval_loss": 32.81920623779297,
1372
- "eval_runtime": 16.6408,
1373
- "eval_samples_per_second": 2.884,
1374
- "eval_steps_per_second": 0.601,
1375
- "step": 8736
1376
  },
1377
  {
1378
- "epoch": 84.61538461538461,
1379
- "grad_norm": 11.912644386291504,
1380
- "learning_rate": 3.185116279069768e-05,
1381
- "loss": 11.3942,
1382
  "step": 8800
1383
  },
1384
  {
1385
- "epoch": 85.0,
1386
- "eval_dummy": 1.0,
1387
- "eval_loss": 30.972320556640625,
1388
- "eval_runtime": 16.7795,
1389
- "eval_samples_per_second": 2.861,
1390
- "eval_steps_per_second": 0.596,
1391
- "step": 8840
1392
- },
1393
- {
1394
- "epoch": 85.57692307692308,
1395
- "grad_norm": 19.492916107177734,
1396
- "learning_rate": 3.107674418604651e-05,
1397
- "loss": 11.6955,
1398
  "step": 8900
1399
  },
1400
  {
1401
- "epoch": 86.0,
1402
  "eval_dummy": 1.0,
1403
- "eval_loss": 31.348657608032227,
1404
- "eval_runtime": 16.743,
1405
- "eval_samples_per_second": 2.867,
1406
- "eval_steps_per_second": 0.597,
1407
- "step": 8944
1408
  },
1409
  {
1410
- "epoch": 86.53846153846153,
1411
- "grad_norm": 14.419718742370605,
1412
- "learning_rate": 3.0302325581395346e-05,
1413
- "loss": 11.4862,
1414
  "step": 9000
1415
  },
1416
  {
1417
- "epoch": 87.0,
1418
  "eval_dummy": 1.0,
1419
- "eval_loss": 32.045135498046875,
1420
- "eval_runtime": 16.7069,
1421
- "eval_samples_per_second": 2.873,
1422
- "eval_steps_per_second": 0.599,
1423
- "step": 9048
1424
  },
1425
  {
1426
- "epoch": 87.5,
1427
- "grad_norm": 24.9334659576416,
1428
- "learning_rate": 2.9527906976744187e-05,
1429
- "loss": 11.5867,
1430
  "step": 9100
1431
  },
1432
  {
1433
- "epoch": 88.0,
1434
  "eval_dummy": 1.0,
1435
- "eval_loss": 31.976896286010742,
1436
- "eval_runtime": 16.5865,
1437
- "eval_samples_per_second": 2.894,
1438
- "eval_steps_per_second": 0.603,
1439
- "step": 9152
1440
  },
1441
  {
1442
- "epoch": 88.46153846153847,
1443
- "grad_norm": 10.158917427062988,
1444
- "learning_rate": 2.8753488372093018e-05,
1445
- "loss": 11.0975,
1446
  "step": 9200
1447
  },
1448
  {
1449
- "epoch": 89.0,
1450
  "eval_dummy": 1.0,
1451
- "eval_loss": 31.972078323364258,
1452
- "eval_runtime": 16.6959,
1453
- "eval_samples_per_second": 2.875,
1454
- "eval_steps_per_second": 0.599,
1455
- "step": 9256
1456
  },
1457
  {
1458
- "epoch": 89.42307692307692,
1459
- "grad_norm": 11.408435821533203,
1460
- "learning_rate": 2.797906976744187e-05,
1461
- "loss": 11.5126,
1462
  "step": 9300
1463
  },
1464
  {
1465
- "epoch": 90.0,
1466
- "eval_dummy": 1.0,
1467
- "eval_loss": 35.387664794921875,
1468
- "eval_runtime": 16.6889,
1469
- "eval_samples_per_second": 2.876,
1470
- "eval_steps_per_second": 0.599,
1471
- "step": 9360
1472
- },
1473
- {
1474
- "epoch": 90.38461538461539,
1475
- "grad_norm": 18.4831600189209,
1476
- "learning_rate": 2.72046511627907e-05,
1477
- "loss": 11.067,
1478
  "step": 9400
1479
  },
1480
  {
1481
- "epoch": 91.0,
1482
  "eval_dummy": 1.0,
1483
- "eval_loss": 33.76138687133789,
1484
- "eval_runtime": 16.2325,
1485
- "eval_samples_per_second": 2.957,
1486
- "eval_steps_per_second": 0.616,
1487
- "step": 9464
1488
  },
1489
  {
1490
- "epoch": 91.34615384615384,
1491
- "grad_norm": 14.10232925415039,
1492
- "learning_rate": 2.643023255813954e-05,
1493
- "loss": 11.3857,
1494
  "step": 9500
1495
  },
1496
  {
1497
- "epoch": 92.0,
1498
  "eval_dummy": 1.0,
1499
- "eval_loss": 32.704566955566406,
1500
- "eval_runtime": 16.6368,
1501
- "eval_samples_per_second": 2.885,
1502
- "eval_steps_per_second": 0.601,
1503
- "step": 9568
1504
  },
1505
  {
1506
- "epoch": 92.3076923076923,
1507
- "grad_norm": 10.36620807647705,
1508
- "learning_rate": 2.565581395348837e-05,
1509
- "loss": 11.5511,
1510
  "step": 9600
1511
  },
1512
  {
1513
- "epoch": 93.0,
1514
  "eval_dummy": 1.0,
1515
- "eval_loss": 32.10957336425781,
1516
- "eval_runtime": 16.8564,
1517
- "eval_samples_per_second": 2.848,
1518
- "eval_steps_per_second": 0.593,
1519
- "step": 9672
1520
  },
1521
  {
1522
- "epoch": 93.26923076923077,
1523
- "grad_norm": 13.063793182373047,
1524
- "learning_rate": 2.488139534883721e-05,
1525
- "loss": 11.0961,
1526
  "step": 9700
1527
  },
1528
  {
1529
- "epoch": 94.0,
1530
- "eval_dummy": 1.0,
1531
- "eval_loss": 32.8302001953125,
1532
- "eval_runtime": 16.6194,
1533
- "eval_samples_per_second": 2.888,
1534
- "eval_steps_per_second": 0.602,
1535
- "step": 9776
1536
- },
1537
- {
1538
- "epoch": 94.23076923076923,
1539
- "grad_norm": 47.0684814453125,
1540
- "learning_rate": 2.4106976744186043e-05,
1541
- "loss": 11.2935,
1542
  "step": 9800
1543
  },
1544
  {
1545
- "epoch": 95.0,
1546
  "eval_dummy": 1.0,
1547
- "eval_loss": 32.66879653930664,
1548
- "eval_runtime": 16.8788,
1549
- "eval_samples_per_second": 2.844,
1550
- "eval_steps_per_second": 0.592,
1551
- "step": 9880
1552
  },
1553
  {
1554
- "epoch": 95.1923076923077,
1555
- "grad_norm": 16.18996238708496,
1556
- "learning_rate": 2.333255813953488e-05,
1557
- "loss": 11.2398,
1558
  "step": 9900
1559
  },
1560
  {
1561
- "epoch": 96.0,
1562
  "eval_dummy": 1.0,
1563
- "eval_loss": 32.2806510925293,
1564
- "eval_runtime": 16.4167,
1565
- "eval_samples_per_second": 2.924,
1566
- "eval_steps_per_second": 0.609,
1567
- "step": 9984
1568
  },
1569
  {
1570
- "epoch": 96.15384615384616,
1571
- "grad_norm": 20.301530838012695,
1572
- "learning_rate": 2.2558139534883715e-05,
1573
- "loss": 11.0444,
1574
  "step": 10000
1575
  },
1576
  {
1577
- "epoch": 97.0,
1578
  "eval_dummy": 1.0,
1579
- "eval_loss": 32.276641845703125,
1580
- "eval_runtime": 16.7261,
1581
- "eval_samples_per_second": 2.87,
1582
- "eval_steps_per_second": 0.598,
1583
- "step": 10088
1584
  },
1585
  {
1586
- "epoch": 97.11538461538461,
1587
- "grad_norm": 16.50993537902832,
1588
- "learning_rate": 2.1783720930232563e-05,
1589
- "loss": 11.3157,
1590
  "step": 10100
1591
  },
1592
  {
1593
- "epoch": 98.0,
1594
  "eval_dummy": 1.0,
1595
- "eval_loss": 32.443729400634766,
1596
- "eval_runtime": 16.7525,
1597
- "eval_samples_per_second": 2.865,
1598
- "eval_steps_per_second": 0.597,
1599
- "step": 10192
1600
  },
1601
  {
1602
- "epoch": 98.07692307692308,
1603
- "grad_norm": 13.487881660461426,
1604
- "learning_rate": 2.1009302325581397e-05,
1605
- "loss": 11.0191,
1606
  "step": 10200
1607
  },
1608
  {
1609
- "epoch": 99.0,
1610
- "eval_dummy": 1.0,
1611
- "eval_loss": 32.385108947753906,
1612
- "eval_runtime": 16.6154,
1613
- "eval_samples_per_second": 2.889,
1614
- "eval_steps_per_second": 0.602,
1615
- "step": 10296
1616
  },
1617
  {
1618
- "epoch": 99.03846153846153,
1619
- "grad_norm": 13.905572891235352,
1620
- "learning_rate": 2.0234883720930235e-05,
1621
- "loss": 11.2286,
1622
- "step": 10300
 
 
1623
  },
1624
  {
1625
- "epoch": 100.0,
1626
- "grad_norm": 37.709373474121094,
1627
- "learning_rate": 1.946046511627907e-05,
1628
- "loss": 11.1406,
1629
  "step": 10400
1630
  },
1631
  {
1632
- "epoch": 100.0,
1633
  "eval_dummy": 1.0,
1634
- "eval_loss": 32.138919830322266,
1635
- "eval_runtime": 16.6356,
1636
- "eval_samples_per_second": 2.885,
1637
- "eval_steps_per_second": 0.601,
1638
- "step": 10400
1639
  },
1640
  {
1641
- "epoch": 100.96153846153847,
1642
- "grad_norm": 9.66380500793457,
1643
- "learning_rate": 1.8686046511627907e-05,
1644
- "loss": 11.1237,
1645
  "step": 10500
1646
  },
1647
  {
1648
- "epoch": 101.0,
1649
  "eval_dummy": 1.0,
1650
- "eval_loss": 32.488643646240234,
1651
- "eval_runtime": 16.5945,
1652
- "eval_samples_per_second": 2.893,
1653
- "eval_steps_per_second": 0.603,
1654
- "step": 10504
1655
  },
1656
  {
1657
- "epoch": 101.92307692307692,
1658
- "grad_norm": 8.697158813476562,
1659
- "learning_rate": 1.791162790697674e-05,
1660
- "loss": 10.9485,
1661
  "step": 10600
1662
  },
1663
  {
1664
- "epoch": 102.0,
1665
- "eval_dummy": 1.0,
1666
- "eval_loss": 32.50514602661133,
1667
- "eval_runtime": 16.4431,
1668
- "eval_samples_per_second": 2.919,
1669
- "eval_steps_per_second": 0.608,
1670
- "step": 10608
1671
- },
1672
- {
1673
- "epoch": 102.88461538461539,
1674
- "grad_norm": 7.563882350921631,
1675
- "learning_rate": 1.713720930232558e-05,
1676
- "loss": 10.9188,
1677
  "step": 10700
1678
  },
1679
  {
1680
- "epoch": 103.0,
1681
  "eval_dummy": 1.0,
1682
- "eval_loss": 32.86152648925781,
1683
- "eval_runtime": 16.6025,
1684
- "eval_samples_per_second": 2.891,
1685
- "eval_steps_per_second": 0.602,
1686
- "step": 10712
1687
  },
1688
  {
1689
- "epoch": 103.84615384615384,
1690
- "grad_norm": 9.366110801696777,
1691
- "learning_rate": 1.6362790697674413e-05,
1692
- "loss": 11.3029,
1693
  "step": 10800
1694
  },
1695
  {
1696
- "epoch": 104.0,
1697
  "eval_dummy": 1.0,
1698
- "eval_loss": 33.038761138916016,
1699
- "eval_runtime": 16.8928,
1700
- "eval_samples_per_second": 2.841,
1701
- "eval_steps_per_second": 0.592,
1702
- "step": 10816
1703
  },
1704
  {
1705
- "epoch": 104.8076923076923,
1706
- "grad_norm": 28.598913192749023,
1707
- "learning_rate": 1.558837209302326e-05,
1708
- "loss": 11.2023,
1709
  "step": 10900
1710
  },
1711
  {
1712
- "epoch": 105.0,
1713
  "eval_dummy": 1.0,
1714
- "eval_loss": 32.492279052734375,
1715
- "eval_runtime": 16.6606,
1716
- "eval_samples_per_second": 2.881,
1717
- "eval_steps_per_second": 0.6,
1718
- "step": 10920
1719
  },
1720
  {
1721
- "epoch": 105.76923076923077,
1722
- "grad_norm": 23.246570587158203,
1723
- "learning_rate": 1.4813953488372098e-05,
1724
- "loss": 10.9634,
1725
  "step": 11000
1726
  },
1727
  {
1728
- "epoch": 106.0,
1729
  "eval_dummy": 1.0,
1730
- "eval_loss": 32.32876968383789,
1731
- "eval_runtime": 16.8242,
1732
- "eval_samples_per_second": 2.853,
1733
- "eval_steps_per_second": 0.594,
1734
- "step": 11024
1735
  },
1736
  {
1737
- "epoch": 106.73076923076923,
1738
- "grad_norm": 12.571370124816895,
1739
- "learning_rate": 1.4039534883720934e-05,
1740
- "loss": 11.257,
1741
  "step": 11100
1742
  },
1743
  {
1744
- "epoch": 107.0,
1745
- "eval_dummy": 1.0,
1746
- "eval_loss": 31.88549041748047,
1747
- "eval_runtime": 16.349,
1748
- "eval_samples_per_second": 2.936,
1749
- "eval_steps_per_second": 0.612,
1750
- "step": 11128
1751
- },
1752
- {
1753
- "epoch": 107.6923076923077,
1754
- "grad_norm": 26.309329986572266,
1755
- "learning_rate": 1.326511627906977e-05,
1756
- "loss": 11.0193,
1757
  "step": 11200
1758
  },
1759
  {
1760
- "epoch": 108.0,
1761
  "eval_dummy": 1.0,
1762
- "eval_loss": 34.006710052490234,
1763
- "eval_runtime": 17.4954,
1764
- "eval_samples_per_second": 2.744,
1765
- "eval_steps_per_second": 0.572,
1766
- "step": 11232
1767
  },
1768
  {
1769
- "epoch": 108.65384615384616,
1770
- "grad_norm": 21.006258010864258,
1771
- "learning_rate": 1.2490697674418606e-05,
1772
- "loss": 10.6401,
1773
  "step": 11300
1774
  },
1775
  {
1776
- "epoch": 109.0,
1777
  "eval_dummy": 1.0,
1778
- "eval_loss": 33.29460144042969,
1779
- "eval_runtime": 16.6851,
1780
- "eval_samples_per_second": 2.877,
1781
- "eval_steps_per_second": 0.599,
1782
- "step": 11336
1783
  },
1784
  {
1785
- "epoch": 109.61538461538461,
1786
- "grad_norm": 17.321447372436523,
1787
- "learning_rate": 1.1716279069767442e-05,
1788
- "loss": 11.0542,
1789
  "step": 11400
1790
  },
1791
  {
1792
- "epoch": 110.0,
1793
  "eval_dummy": 1.0,
1794
- "eval_loss": 34.05351638793945,
1795
- "eval_runtime": 16.6555,
1796
- "eval_samples_per_second": 2.882,
1797
- "eval_steps_per_second": 0.6,
1798
- "step": 11440
1799
  },
1800
  {
1801
- "epoch": 110.57692307692308,
1802
- "grad_norm": 15.087108612060547,
1803
- "learning_rate": 1.0941860465116278e-05,
1804
- "loss": 10.888,
1805
  "step": 11500
1806
  },
1807
  {
1808
- "epoch": 111.0,
1809
- "eval_dummy": 1.0,
1810
- "eval_loss": 32.72056198120117,
1811
- "eval_runtime": 16.9072,
1812
- "eval_samples_per_second": 2.839,
1813
- "eval_steps_per_second": 0.591,
1814
- "step": 11544
1815
- },
1816
- {
1817
- "epoch": 111.53846153846153,
1818
- "grad_norm": 8.586106300354004,
1819
- "learning_rate": 1.0167441860465113e-05,
1820
- "loss": 10.9706,
1821
  "step": 11600
1822
  },
1823
  {
1824
- "epoch": 112.0,
1825
  "eval_dummy": 1.0,
1826
- "eval_loss": 33.12382888793945,
1827
- "eval_runtime": 16.5804,
1828
- "eval_samples_per_second": 2.895,
1829
- "eval_steps_per_second": 0.603,
1830
- "step": 11648
1831
  },
1832
  {
1833
- "epoch": 112.5,
1834
- "grad_norm": 7.436498165130615,
1835
- "learning_rate": 9.39302325581396e-06,
1836
- "loss": 11.0075,
1837
  "step": 11700
1838
  },
1839
  {
1840
- "epoch": 113.0,
1841
  "eval_dummy": 1.0,
1842
- "eval_loss": 32.988155364990234,
1843
- "eval_runtime": 16.4227,
1844
- "eval_samples_per_second": 2.923,
1845
- "eval_steps_per_second": 0.609,
1846
- "step": 11752
1847
  },
1848
  {
1849
- "epoch": 113.46153846153847,
1850
- "grad_norm": 8.474443435668945,
1851
- "learning_rate": 8.618604651162795e-06,
1852
- "loss": 10.7895,
1853
  "step": 11800
1854
  },
1855
  {
1856
- "epoch": 114.0,
1857
  "eval_dummy": 1.0,
1858
- "eval_loss": 32.79851150512695,
1859
- "eval_runtime": 17.5126,
1860
- "eval_samples_per_second": 2.741,
1861
- "eval_steps_per_second": 0.571,
1862
- "step": 11856
1863
  },
1864
  {
1865
- "epoch": 114.42307692307692,
1866
- "grad_norm": 17.689861297607422,
1867
- "learning_rate": 7.844186046511631e-06,
1868
- "loss": 10.9181,
1869
  "step": 11900
1870
  },
1871
  {
1872
- "epoch": 115.0,
1873
  "eval_dummy": 1.0,
1874
- "eval_loss": 32.91426086425781,
1875
- "eval_runtime": 17.7135,
1876
- "eval_samples_per_second": 2.71,
1877
- "eval_steps_per_second": 0.565,
1878
- "step": 11960
1879
  },
1880
  {
1881
- "epoch": 115.38461538461539,
1882
- "grad_norm": 10.550300598144531,
1883
- "learning_rate": 7.069767441860467e-06,
1884
- "loss": 10.5938,
1885
  "step": 12000
1886
  },
1887
  {
1888
- "epoch": 116.0,
1889
- "eval_dummy": 1.0,
1890
- "eval_loss": 33.07219314575195,
1891
- "eval_runtime": 16.7996,
1892
- "eval_samples_per_second": 2.857,
1893
- "eval_steps_per_second": 0.595,
1894
- "step": 12064
1895
- },
1896
- {
1897
- "epoch": 116.34615384615384,
1898
- "grad_norm": 10.545084953308105,
1899
- "learning_rate": 6.295348837209302e-06,
1900
- "loss": 10.4932,
1901
  "step": 12100
1902
  },
1903
  {
1904
- "epoch": 117.0,
1905
  "eval_dummy": 1.0,
1906
- "eval_loss": 34.2365837097168,
1907
- "eval_runtime": 17.0204,
1908
- "eval_samples_per_second": 2.82,
1909
- "eval_steps_per_second": 0.588,
1910
- "step": 12168
1911
  },
1912
  {
1913
- "epoch": 117.3076923076923,
1914
- "grad_norm": 11.177499771118164,
1915
- "learning_rate": 5.520930232558138e-06,
1916
- "loss": 10.9761,
1917
  "step": 12200
1918
  },
1919
  {
1920
- "epoch": 118.0,
1921
  "eval_dummy": 1.0,
1922
- "eval_loss": 33.88801956176758,
1923
- "eval_runtime": 16.5143,
1924
- "eval_samples_per_second": 2.907,
1925
- "eval_steps_per_second": 0.606,
1926
- "step": 12272
1927
  },
1928
  {
1929
- "epoch": 118.26923076923077,
1930
- "grad_norm": 9.421356201171875,
1931
- "learning_rate": 4.746511627906974e-06,
1932
- "loss": 10.6918,
1933
  "step": 12300
1934
  },
1935
  {
1936
- "epoch": 119.0,
1937
  "eval_dummy": 1.0,
1938
- "eval_loss": 34.32889175415039,
1939
- "eval_runtime": 17.2621,
1940
- "eval_samples_per_second": 2.781,
1941
- "eval_steps_per_second": 0.579,
1942
- "step": 12376
1943
  },
1944
  {
1945
- "epoch": 119.23076923076923,
1946
- "grad_norm": 11.958416938781738,
1947
- "learning_rate": 3.97209302325581e-06,
1948
- "loss": 10.896,
1949
  "step": 12400
1950
  },
1951
  {
1952
- "epoch": 120.0,
1953
- "eval_dummy": 1.0,
1954
- "eval_loss": 33.60952377319336,
1955
- "eval_runtime": 17.2959,
1956
- "eval_samples_per_second": 2.775,
1957
- "eval_steps_per_second": 0.578,
1958
- "step": 12480
1959
- },
1960
- {
1961
- "epoch": 120.1923076923077,
1962
- "grad_norm": 7.985867023468018,
1963
- "learning_rate": 3.1976744186046562e-06,
1964
- "loss": 10.6876,
1965
  "step": 12500
1966
  },
1967
  {
1968
- "epoch": 121.0,
1969
  "eval_dummy": 1.0,
1970
- "eval_loss": 33.86077880859375,
1971
- "eval_runtime": 17.1725,
1972
- "eval_samples_per_second": 2.795,
1973
- "eval_steps_per_second": 0.582,
1974
- "step": 12584
1975
  },
1976
  {
1977
- "epoch": 121.15384615384616,
1978
- "grad_norm": 7.811230182647705,
1979
- "learning_rate": 2.4232558139534926e-06,
1980
- "loss": 10.5666,
1981
  "step": 12600
1982
  },
1983
  {
1984
- "epoch": 122.0,
1985
  "eval_dummy": 1.0,
1986
- "eval_loss": 33.69937515258789,
1987
- "eval_runtime": 16.6446,
1988
- "eval_samples_per_second": 2.884,
1989
- "eval_steps_per_second": 0.601,
1990
- "step": 12688
1991
  },
1992
  {
1993
- "epoch": 122.11538461538461,
1994
- "grad_norm": 6.883728504180908,
1995
- "learning_rate": 1.6488372093023285e-06,
1996
- "loss": 10.8161,
1997
  "step": 12700
1998
  },
1999
  {
2000
- "epoch": 123.0,
2001
  "eval_dummy": 1.0,
2002
- "eval_loss": 33.61716842651367,
2003
- "eval_runtime": 17.0639,
2004
- "eval_samples_per_second": 2.813,
2005
- "eval_steps_per_second": 0.586,
2006
- "step": 12792
2007
  },
2008
  {
2009
- "epoch": 123.07692307692308,
2010
- "grad_norm": 7.218296527862549,
2011
- "learning_rate": 8.744186046511642e-07,
2012
- "loss": 10.7195,
2013
  "step": 12800
2014
  },
2015
  {
2016
- "epoch": 124.0,
2017
- "eval_dummy": 1.0,
2018
- "eval_loss": 33.539737701416016,
2019
- "eval_runtime": 16.3513,
2020
- "eval_samples_per_second": 2.936,
2021
- "eval_steps_per_second": 0.612,
2022
- "step": 12896
2023
- },
2024
- {
2025
- "epoch": 124.03846153846153,
2026
- "grad_norm": 7.083764553070068,
2027
  "learning_rate": 9.999999999999998e-08,
2028
- "loss": 10.6712,
2029
  "step": 12900
2030
  },
2031
  {
2032
- "epoch": 124.03846153846153,
2033
  "eval_dummy": 1.0,
2034
- "eval_loss": 33.490596771240234,
2035
- "eval_runtime": 17.1907,
2036
- "eval_samples_per_second": 2.792,
2037
- "eval_steps_per_second": 0.582,
2038
  "step": 12900
2039
  },
2040
  {
2041
- "epoch": 124.03846153846153,
2042
  "step": 12900,
2043
- "total_flos": 1.8148731642810335e+19,
2044
- "train_loss": 14.019830729166667,
2045
- "train_runtime": 31415.0856,
2046
- "train_samples_per_second": 2.053,
2047
- "train_steps_per_second": 0.411
2048
  }
2049
  ],
2050
  "logging_steps": 100,
2051
  "max_steps": 12900,
2052
  "num_input_tokens_seen": 0,
2053
- "num_train_epochs": 125,
2054
  "save_steps": 500,
2055
  "stateful_callbacks": {
2056
  "TrainerControl": {
@@ -2064,8 +1839,8 @@
2064
  "attributes": {}
2065
  }
2066
  },
2067
- "total_flos": 1.8148731642810335e+19,
2068
- "train_batch_size": 5,
2069
  "trial_name": null,
2070
  "trial_params": null
2071
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 100.0,
6
  "eval_steps": 500,
7
  "global_step": 12900,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.7751937984496124,
14
+ "grad_norm": 126.4291763305664,
15
+ "learning_rate": 0.0003969,
16
+ "loss": 57.794,
17
  "step": 100
18
  },
19
  {
20
  "epoch": 1.0,
21
  "eval_dummy": 1.0,
22
+ "eval_loss": 47.50299072265625,
23
+ "eval_runtime": 18.4297,
24
+ "eval_samples_per_second": 2.604,
25
+ "eval_steps_per_second": 0.651,
26
+ "step": 129
27
  },
28
  {
29
+ "epoch": 1.550387596899225,
30
+ "grad_norm": 110.41481018066406,
31
+ "learning_rate": 0.00039380000000000003,
32
+ "loss": 45.635,
33
  "step": 200
34
  },
35
  {
36
  "epoch": 2.0,
37
  "eval_dummy": 1.0,
38
+ "eval_loss": 42.4621696472168,
39
+ "eval_runtime": 17.0743,
40
+ "eval_samples_per_second": 2.811,
41
+ "eval_steps_per_second": 0.703,
42
+ "step": 258
43
  },
44
  {
45
+ "epoch": 2.3255813953488373,
46
+ "grad_norm": 119.6634750366211,
47
+ "learning_rate": 0.0003907,
48
+ "loss": 43.6742,
49
  "step": 300
50
  },
51
  {
52
  "epoch": 3.0,
53
  "eval_dummy": 1.0,
54
+ "eval_loss": 40.13833999633789,
55
+ "eval_runtime": 18.4155,
56
+ "eval_samples_per_second": 2.606,
57
+ "eval_steps_per_second": 0.652,
58
+ "step": 387
59
  },
60
  {
61
+ "epoch": 3.10077519379845,
62
+ "grad_norm": 118.22651672363281,
63
+ "learning_rate": 0.0003876,
64
+ "loss": 40.4744,
65
  "step": 400
66
  },
67
  {
68
+ "epoch": 3.875968992248062,
69
+ "grad_norm": 134.10031127929688,
70
+ "learning_rate": 0.0003845,
71
+ "loss": 37.5286,
 
 
 
 
 
 
 
 
 
72
  "step": 500
73
  },
74
  {
75
+ "epoch": 4.0,
76
  "eval_dummy": 1.0,
77
+ "eval_loss": 41.196353912353516,
78
+ "eval_runtime": 17.0535,
79
+ "eval_samples_per_second": 2.815,
80
+ "eval_steps_per_second": 0.704,
81
+ "step": 516
82
  },
83
  {
84
+ "epoch": 4.651162790697675,
85
+ "grad_norm": 80.17464447021484,
86
+ "learning_rate": 0.00038140000000000005,
87
+ "loss": 33.7618,
88
  "step": 600
89
  },
90
  {
91
+ "epoch": 5.0,
92
  "eval_dummy": 1.0,
93
+ "eval_loss": 34.43735122680664,
94
+ "eval_runtime": 19.3245,
95
+ "eval_samples_per_second": 2.484,
96
+ "eval_steps_per_second": 0.621,
97
+ "step": 645
98
  },
99
  {
100
+ "epoch": 5.426356589147287,
101
+ "grad_norm": 372.6417236328125,
102
+ "learning_rate": 0.00037830000000000003,
103
+ "loss": 31.5899,
104
  "step": 700
105
  },
106
  {
107
+ "epoch": 6.0,
108
  "eval_dummy": 1.0,
109
+ "eval_loss": 39.824180603027344,
110
+ "eval_runtime": 17.2226,
111
+ "eval_samples_per_second": 2.787,
112
+ "eval_steps_per_second": 0.697,
113
+ "step": 774
114
  },
115
  {
116
+ "epoch": 6.2015503875969,
117
+ "grad_norm": 226.50611877441406,
118
+ "learning_rate": 0.0003752,
119
+ "loss": 31.2097,
120
  "step": 800
121
  },
122
  {
123
+ "epoch": 6.976744186046512,
124
+ "grad_norm": 99.72407531738281,
125
+ "learning_rate": 0.0003721,
126
+ "loss": 29.0727,
 
 
 
 
 
 
 
 
 
127
  "step": 900
128
  },
129
  {
130
+ "epoch": 7.0,
131
  "eval_dummy": 1.0,
132
+ "eval_loss": 33.32234191894531,
133
+ "eval_runtime": 18.5858,
134
+ "eval_samples_per_second": 2.583,
135
+ "eval_steps_per_second": 0.646,
136
+ "step": 903
137
  },
138
  {
139
+ "epoch": 7.751937984496124,
140
+ "grad_norm": 107.07185363769531,
141
+ "learning_rate": 0.000369,
142
+ "loss": 27.8483,
143
  "step": 1000
144
  },
145
  {
146
+ "epoch": 8.0,
147
  "eval_dummy": 1.0,
148
+ "eval_loss": 30.962478637695312,
149
+ "eval_runtime": 17.3408,
150
+ "eval_samples_per_second": 2.768,
151
+ "eval_steps_per_second": 0.692,
152
+ "step": 1032
153
  },
154
  {
155
+ "epoch": 8.527131782945737,
156
+ "grad_norm": 82.22681427001953,
157
+ "learning_rate": 0.0003659,
158
+ "loss": 26.0904,
159
  "step": 1100
160
  },
161
  {
162
+ "epoch": 9.0,
163
  "eval_dummy": 1.0,
164
+ "eval_loss": 31.708364486694336,
165
+ "eval_runtime": 18.5166,
166
+ "eval_samples_per_second": 2.592,
167
+ "eval_steps_per_second": 0.648,
168
+ "step": 1161
169
  },
170
  {
171
+ "epoch": 9.30232558139535,
172
+ "grad_norm": 184.18087768554688,
173
+ "learning_rate": 0.00036280000000000004,
174
+ "loss": 26.1043,
175
  "step": 1200
176
  },
177
  {
178
+ "epoch": 10.0,
179
  "eval_dummy": 1.0,
180
+ "eval_loss": 31.808786392211914,
181
+ "eval_runtime": 17.2835,
182
+ "eval_samples_per_second": 2.777,
183
+ "eval_steps_per_second": 0.694,
184
+ "step": 1290
185
  },
186
  {
187
+ "epoch": 10.077519379844961,
188
+ "grad_norm": 82.96527862548828,
189
+ "learning_rate": 0.0003597,
190
+ "loss": 26.1461,
191
  "step": 1300
192
  },
193
  {
194
+ "epoch": 10.852713178294573,
195
+ "grad_norm": 76.06727600097656,
196
+ "learning_rate": 0.00035660000000000005,
197
+ "loss": 24.3038,
 
 
 
 
 
 
 
 
 
198
  "step": 1400
199
  },
200
  {
201
+ "epoch": 11.0,
202
  "eval_dummy": 1.0,
203
+ "eval_loss": 30.336084365844727,
204
+ "eval_runtime": 18.0862,
205
+ "eval_samples_per_second": 2.654,
206
+ "eval_steps_per_second": 0.663,
207
+ "step": 1419
208
  },
209
  {
210
+ "epoch": 11.627906976744185,
211
+ "grad_norm": 57.900630950927734,
212
+ "learning_rate": 0.00035350000000000003,
213
+ "loss": 23.6493,
214
  "step": 1500
215
  },
216
  {
217
+ "epoch": 12.0,
218
  "eval_dummy": 1.0,
219
+ "eval_loss": 30.202993392944336,
220
+ "eval_runtime": 16.9497,
221
+ "eval_samples_per_second": 2.832,
222
+ "eval_steps_per_second": 0.708,
223
+ "step": 1548
224
  },
225
  {
226
+ "epoch": 12.4031007751938,
227
+ "grad_norm": 60.565650939941406,
228
+ "learning_rate": 0.0003504,
229
+ "loss": 23.9146,
230
  "step": 1600
231
  },
232
  {
233
+ "epoch": 13.0,
234
  "eval_dummy": 1.0,
235
+ "eval_loss": 31.08062744140625,
236
+ "eval_runtime": 17.9909,
237
+ "eval_samples_per_second": 2.668,
238
+ "eval_steps_per_second": 0.667,
239
+ "step": 1677
240
  },
241
  {
242
+ "epoch": 13.178294573643411,
243
+ "grad_norm": 75.52674865722656,
244
+ "learning_rate": 0.00034730000000000004,
245
+ "loss": 23.033,
246
  "step": 1700
247
  },
248
  {
249
+ "epoch": 13.953488372093023,
250
+ "grad_norm": 45.41800308227539,
251
+ "learning_rate": 0.0003442,
252
+ "loss": 21.9133,
 
 
 
 
 
 
 
 
 
253
  "step": 1800
254
  },
255
  {
256
+ "epoch": 14.0,
257
  "eval_dummy": 1.0,
258
+ "eval_loss": 31.397354125976562,
259
+ "eval_runtime": 17.0643,
260
+ "eval_samples_per_second": 2.813,
261
+ "eval_steps_per_second": 0.703,
262
+ "step": 1806
263
  },
264
  {
265
+ "epoch": 14.728682170542635,
266
+ "grad_norm": 46.21736145019531,
267
+ "learning_rate": 0.0003411,
268
+ "loss": 22.3071,
269
  "step": 1900
270
  },
271
  {
272
+ "epoch": 15.0,
273
  "eval_dummy": 1.0,
274
+ "eval_loss": 32.09249496459961,
275
+ "eval_runtime": 17.5136,
276
+ "eval_samples_per_second": 2.741,
277
+ "eval_steps_per_second": 0.685,
278
+ "step": 1935
279
  },
280
  {
281
+ "epoch": 15.503875968992247,
282
+ "grad_norm": 29.049968719482422,
283
+ "learning_rate": 0.000338,
284
+ "loss": 21.0819,
285
  "step": 2000
286
  },
287
  {
288
+ "epoch": 16.0,
289
  "eval_dummy": 1.0,
290
+ "eval_loss": 29.936742782592773,
291
+ "eval_runtime": 17.2024,
292
+ "eval_samples_per_second": 2.79,
293
+ "eval_steps_per_second": 0.698,
294
+ "step": 2064
295
  },
296
  {
297
+ "epoch": 16.27906976744186,
298
+ "grad_norm": 70.16988372802734,
299
+ "learning_rate": 0.0003349,
300
+ "loss": 21.0089,
301
  "step": 2100
302
  },
303
  {
304
+ "epoch": 17.0,
305
  "eval_dummy": 1.0,
306
+ "eval_loss": 30.042001724243164,
307
+ "eval_runtime": 17.9057,
308
+ "eval_samples_per_second": 2.681,
309
+ "eval_steps_per_second": 0.67,
310
+ "step": 2193
311
  },
312
  {
313
+ "epoch": 17.05426356589147,
314
+ "grad_norm": 77.12342834472656,
315
+ "learning_rate": 0.00033180000000000004,
316
+ "loss": 21.1193,
317
  "step": 2200
318
  },
319
  {
320
+ "epoch": 17.829457364341085,
321
+ "grad_norm": 40.200958251953125,
322
+ "learning_rate": 0.0003287,
323
+ "loss": 20.9169,
 
 
 
 
 
 
 
 
 
324
  "step": 2300
325
  },
326
  {
327
+ "epoch": 18.0,
328
  "eval_dummy": 1.0,
329
+ "eval_loss": 29.293771743774414,
330
+ "eval_runtime": 17.1083,
331
+ "eval_samples_per_second": 2.806,
332
+ "eval_steps_per_second": 0.701,
333
+ "step": 2322
334
  },
335
  {
336
+ "epoch": 18.6046511627907,
337
+ "grad_norm": 31.95384979248047,
338
+ "learning_rate": 0.0003256,
339
+ "loss": 19.7935,
340
  "step": 2400
341
  },
342
  {
343
+ "epoch": 19.0,
344
  "eval_dummy": 1.0,
345
+ "eval_loss": 31.394454956054688,
346
+ "eval_runtime": 18.6297,
347
+ "eval_samples_per_second": 2.577,
348
+ "eval_steps_per_second": 0.644,
349
+ "step": 2451
350
  },
351
  {
352
+ "epoch": 19.37984496124031,
353
+ "grad_norm": 54.4798698425293,
354
+ "learning_rate": 0.00032250000000000003,
355
+ "loss": 19.8749,
356
  "step": 2500
357
  },
358
  {
359
+ "epoch": 20.0,
360
+ "eval_dummy": 1.0,
361
+ "eval_loss": 29.845718383789062,
362
+ "eval_runtime": 17.4128,
363
+ "eval_samples_per_second": 2.757,
364
+ "eval_steps_per_second": 0.689,
365
+ "step": 2580
366
  },
367
  {
368
+ "epoch": 20.155038759689923,
369
+ "grad_norm": 61.0432243347168,
370
+ "learning_rate": 0.0003194,
371
+ "loss": 19.6959,
 
 
372
  "step": 2600
373
  },
374
  {
375
+ "epoch": 20.930232558139537,
376
+ "grad_norm": 64.08226013183594,
377
+ "learning_rate": 0.0003163,
378
+ "loss": 19.2973,
379
  "step": 2700
380
  },
381
  {
382
+ "epoch": 21.0,
383
  "eval_dummy": 1.0,
384
+ "eval_loss": 29.071313858032227,
385
+ "eval_runtime": 18.6354,
386
+ "eval_samples_per_second": 2.576,
387
+ "eval_steps_per_second": 0.644,
388
+ "step": 2709
389
  },
390
  {
391
+ "epoch": 21.705426356589147,
392
+ "grad_norm": 54.097721099853516,
393
+ "learning_rate": 0.0003132,
394
+ "loss": 18.5436,
395
  "step": 2800
396
  },
397
  {
398
+ "epoch": 22.0,
399
  "eval_dummy": 1.0,
400
+ "eval_loss": 29.084577560424805,
401
+ "eval_runtime": 17.3593,
402
+ "eval_samples_per_second": 2.765,
403
+ "eval_steps_per_second": 0.691,
404
+ "step": 2838
405
  },
406
  {
407
+ "epoch": 22.48062015503876,
408
+ "grad_norm": 37.09468460083008,
409
+ "learning_rate": 0.00031010000000000006,
410
+ "loss": 18.5996,
411
  "step": 2900
412
  },
413
  {
414
+ "epoch": 23.0,
415
  "eval_dummy": 1.0,
416
+ "eval_loss": 29.88102149963379,
417
+ "eval_runtime": 17.5696,
418
+ "eval_samples_per_second": 2.732,
419
+ "eval_steps_per_second": 0.683,
420
+ "step": 2967
421
  },
422
  {
423
+ "epoch": 23.25581395348837,
424
+ "grad_norm": 40.900291442871094,
425
+ "learning_rate": 0.00030700000000000004,
426
+ "loss": 19.1228,
427
  "step": 3000
428
  },
429
  {
430
+ "epoch": 24.0,
431
  "eval_dummy": 1.0,
432
+ "eval_loss": 29.301599502563477,
433
+ "eval_runtime": 17.3752,
434
+ "eval_samples_per_second": 2.763,
435
+ "eval_steps_per_second": 0.691,
436
+ "step": 3096
437
  },
438
  {
439
+ "epoch": 24.031007751937985,
440
+ "grad_norm": 16.258365631103516,
441
+ "learning_rate": 0.0003039,
442
+ "loss": 18.2692,
443
  "step": 3100
444
  },
445
  {
446
+ "epoch": 24.8062015503876,
447
+ "grad_norm": 53.048091888427734,
448
+ "learning_rate": 0.0003008,
449
+ "loss": 18.0519,
 
 
 
 
 
 
 
 
 
450
  "step": 3200
451
  },
452
  {
453
+ "epoch": 25.0,
454
  "eval_dummy": 1.0,
455
+ "eval_loss": 30.71547508239746,
456
+ "eval_runtime": 18.0599,
457
+ "eval_samples_per_second": 2.658,
458
+ "eval_steps_per_second": 0.664,
459
+ "step": 3225
460
  },
461
  {
462
+ "epoch": 25.58139534883721,
463
+ "grad_norm": 24.09917640686035,
464
+ "learning_rate": 0.0002977,
465
+ "loss": 17.7073,
466
  "step": 3300
467
  },
468
  {
469
+ "epoch": 26.0,
470
  "eval_dummy": 1.0,
471
+ "eval_loss": 28.716806411743164,
472
+ "eval_runtime": 17.32,
473
+ "eval_samples_per_second": 2.771,
474
+ "eval_steps_per_second": 0.693,
475
+ "step": 3354
476
  },
477
  {
478
+ "epoch": 26.356589147286822,
479
+ "grad_norm": 86.21530151367188,
480
+ "learning_rate": 0.0002946,
481
+ "loss": 17.5055,
482
  "step": 3400
483
  },
484
  {
485
+ "epoch": 27.0,
486
  "eval_dummy": 1.0,
487
+ "eval_loss": 28.989931106567383,
488
+ "eval_runtime": 17.6249,
489
+ "eval_samples_per_second": 2.723,
490
+ "eval_steps_per_second": 0.681,
491
+ "step": 3483
492
  },
493
  {
494
+ "epoch": 27.131782945736433,
495
+ "grad_norm": 20.129249572753906,
496
+ "learning_rate": 0.0002915,
497
+ "loss": 18.054,
498
  "step": 3500
499
  },
500
  {
501
+ "epoch": 27.906976744186046,
502
+ "grad_norm": 22.863677978515625,
503
+ "learning_rate": 0.0002884,
504
+ "loss": 17.4854,
 
 
 
 
 
 
 
 
 
505
  "step": 3600
506
  },
507
  {
508
+ "epoch": 28.0,
509
  "eval_dummy": 1.0,
510
+ "eval_loss": 30.19437599182129,
511
+ "eval_runtime": 17.6593,
512
+ "eval_samples_per_second": 2.718,
513
+ "eval_steps_per_second": 0.68,
514
+ "step": 3612
515
  },
516
  {
517
+ "epoch": 28.68217054263566,
518
+ "grad_norm": 13.843172073364258,
519
+ "learning_rate": 0.0002853,
520
+ "loss": 17.0048,
521
  "step": 3700
522
  },
523
  {
524
+ "epoch": 29.0,
525
  "eval_dummy": 1.0,
526
+ "eval_loss": 29.28289031982422,
527
+ "eval_runtime": 18.1574,
528
+ "eval_samples_per_second": 2.644,
529
+ "eval_steps_per_second": 0.661,
530
+ "step": 3741
531
  },
532
  {
533
+ "epoch": 29.45736434108527,
534
+ "grad_norm": 20.727048873901367,
535
+ "learning_rate": 0.0002822,
536
+ "loss": 16.8731,
537
  "step": 3800
538
  },
539
  {
540
+ "epoch": 30.0,
541
  "eval_dummy": 1.0,
542
+ "eval_loss": 30.1208438873291,
543
+ "eval_runtime": 16.9906,
544
+ "eval_samples_per_second": 2.825,
545
+ "eval_steps_per_second": 0.706,
546
+ "step": 3870
547
  },
548
  {
549
+ "epoch": 30.232558139534884,
550
+ "grad_norm": 19.721155166625977,
551
+ "learning_rate": 0.0002791,
552
+ "loss": 16.683,
553
  "step": 3900
554
  },
555
  {
556
+ "epoch": 31.0,
557
  "eval_dummy": 1.0,
558
+ "eval_loss": 30.758291244506836,
559
+ "eval_runtime": 17.7849,
560
+ "eval_samples_per_second": 2.699,
561
+ "eval_steps_per_second": 0.675,
562
+ "step": 3999
563
  },
564
  {
565
+ "epoch": 31.007751937984494,
566
+ "grad_norm": 25.496213912963867,
567
+ "learning_rate": 0.00027600000000000004,
568
+ "loss": 16.9178,
569
  "step": 4000
570
  },
571
  {
572
+ "epoch": 31.782945736434108,
573
+ "grad_norm": 26.640628814697266,
574
+ "learning_rate": 0.0002729,
575
+ "loss": 16.6109,
 
 
 
 
 
 
 
 
 
576
  "step": 4100
577
  },
578
  {
579
+ "epoch": 32.0,
580
  "eval_dummy": 1.0,
581
+ "eval_loss": 30.623199462890625,
582
+ "eval_runtime": 16.843,
583
+ "eval_samples_per_second": 2.85,
584
+ "eval_steps_per_second": 0.712,
585
+ "step": 4128
586
  },
587
  {
588
+ "epoch": 32.55813953488372,
589
+ "grad_norm": 25.067975997924805,
590
+ "learning_rate": 0.0002698,
591
+ "loss": 15.8261,
592
  "step": 4200
593
  },
594
  {
595
+ "epoch": 33.0,
596
  "eval_dummy": 1.0,
597
+ "eval_loss": 29.416189193725586,
598
+ "eval_runtime": 17.6752,
599
+ "eval_samples_per_second": 2.716,
600
+ "eval_steps_per_second": 0.679,
601
+ "step": 4257
602
  },
603
  {
604
+ "epoch": 33.333333333333336,
605
+ "grad_norm": 30.74283218383789,
606
+ "learning_rate": 0.00026670000000000003,
607
+ "loss": 16.9002,
608
  "step": 4300
609
  },
610
  {
611
+ "epoch": 34.0,
612
  "eval_dummy": 1.0,
613
+ "eval_loss": 30.438751220703125,
614
+ "eval_runtime": 16.8785,
615
+ "eval_samples_per_second": 2.844,
616
+ "eval_steps_per_second": 0.711,
617
+ "step": 4386
618
  },
619
  {
620
+ "epoch": 34.10852713178294,
621
+ "grad_norm": 72.6050033569336,
622
+ "learning_rate": 0.0002636,
623
+ "loss": 15.7742,
624
  "step": 4400
625
  },
626
  {
627
+ "epoch": 34.883720930232556,
628
+ "grad_norm": 93.86290740966797,
629
+ "learning_rate": 0.00026050000000000004,
630
+ "loss": 16.3081,
 
 
 
 
 
 
 
 
 
631
  "step": 4500
632
  },
633
  {
634
+ "epoch": 35.0,
635
  "eval_dummy": 1.0,
636
+ "eval_loss": 29.97564697265625,
637
+ "eval_runtime": 17.4518,
638
+ "eval_samples_per_second": 2.75,
639
+ "eval_steps_per_second": 0.688,
640
+ "step": 4515
641
  },
642
  {
643
+ "epoch": 35.65891472868217,
644
+ "grad_norm": 24.93766975402832,
645
+ "learning_rate": 0.0002574,
646
+ "loss": 15.4745,
647
  "step": 4600
648
  },
649
  {
650
+ "epoch": 36.0,
651
  "eval_dummy": 1.0,
652
+ "eval_loss": 28.821380615234375,
653
+ "eval_runtime": 16.9764,
654
+ "eval_samples_per_second": 2.827,
655
+ "eval_steps_per_second": 0.707,
656
+ "step": 4644
657
  },
658
  {
659
+ "epoch": 36.434108527131784,
660
+ "grad_norm": 33.70745849609375,
661
+ "learning_rate": 0.0002543,
662
+ "loss": 15.938,
663
  "step": 4700
664
  },
665
  {
666
+ "epoch": 37.0,
667
  "eval_dummy": 1.0,
668
+ "eval_loss": 29.100107192993164,
669
+ "eval_runtime": 17.5981,
670
+ "eval_samples_per_second": 2.728,
671
+ "eval_steps_per_second": 0.682,
672
+ "step": 4773
673
  },
674
  {
675
+ "epoch": 37.2093023255814,
676
+ "grad_norm": 59.88523864746094,
677
+ "learning_rate": 0.00025120000000000003,
678
+ "loss": 14.9862,
679
  "step": 4800
680
  },
681
  {
682
+ "epoch": 37.98449612403101,
683
+ "grad_norm": 20.979228973388672,
684
+ "learning_rate": 0.0002481,
685
+ "loss": 15.9947,
 
 
 
 
 
 
 
 
 
686
  "step": 4900
687
  },
688
  {
689
+ "epoch": 38.0,
690
  "eval_dummy": 1.0,
691
+ "eval_loss": 31.053319931030273,
692
+ "eval_runtime": 17.0472,
693
+ "eval_samples_per_second": 2.816,
694
+ "eval_steps_per_second": 0.704,
695
+ "step": 4902
696
  },
697
  {
698
+ "epoch": 38.75968992248062,
699
+ "grad_norm": 17.90158462524414,
700
+ "learning_rate": 0.000245,
701
+ "loss": 15.2328,
702
  "step": 5000
703
  },
704
  {
705
+ "epoch": 39.0,
706
  "eval_dummy": 1.0,
707
+ "eval_loss": 31.62113380432129,
708
+ "eval_runtime": 17.318,
709
+ "eval_samples_per_second": 2.772,
710
+ "eval_steps_per_second": 0.693,
711
+ "step": 5031
712
  },
713
  {
714
+ "epoch": 39.53488372093023,
715
+ "grad_norm": 33.11941909790039,
716
+ "learning_rate": 0.00024190000000000003,
717
+ "loss": 15.202,
718
  "step": 5100
719
  },
720
  {
721
+ "epoch": 40.0,
722
+ "eval_dummy": 1.0,
723
+ "eval_loss": 33.138301849365234,
724
+ "eval_runtime": 17.0128,
725
+ "eval_samples_per_second": 2.821,
726
+ "eval_steps_per_second": 0.705,
727
+ "step": 5160
728
  },
729
  {
730
+ "epoch": 40.310077519379846,
731
+ "grad_norm": 15.685113906860352,
732
+ "learning_rate": 0.0002388,
733
+ "loss": 15.0583,
 
 
734
  "step": 5200
735
  },
736
  {
737
+ "epoch": 41.0,
738
+ "eval_dummy": 1.0,
739
+ "eval_loss": 31.408859252929688,
740
+ "eval_runtime": 17.9066,
741
+ "eval_samples_per_second": 2.681,
742
+ "eval_steps_per_second": 0.67,
743
+ "step": 5289
744
  },
745
  {
746
+ "epoch": 41.08527131782946,
747
+ "grad_norm": 20.353235244750977,
748
+ "learning_rate": 0.00023569999999999998,
749
+ "loss": 14.7257,
750
+ "step": 5300
 
 
751
  },
752
  {
753
+ "epoch": 41.86046511627907,
754
+ "grad_norm": 22.713470458984375,
755
+ "learning_rate": 0.00023259999999999996,
756
+ "loss": 14.573,
757
  "step": 5400
758
  },
759
  {
760
+ "epoch": 42.0,
761
  "eval_dummy": 1.0,
762
+ "eval_loss": 31.568130493164062,
763
+ "eval_runtime": 17.7042,
764
+ "eval_samples_per_second": 2.711,
765
+ "eval_steps_per_second": 0.678,
766
+ "step": 5418
767
  },
768
  {
769
+ "epoch": 42.63565891472868,
770
+ "grad_norm": 24.60871696472168,
771
+ "learning_rate": 0.0002295,
772
+ "loss": 14.7401,
773
  "step": 5500
774
  },
775
  {
776
+ "epoch": 43.0,
777
  "eval_dummy": 1.0,
778
+ "eval_loss": 30.554765701293945,
779
+ "eval_runtime": 18.0539,
780
+ "eval_samples_per_second": 2.659,
781
+ "eval_steps_per_second": 0.665,
782
+ "step": 5547
783
  },
784
  {
785
+ "epoch": 43.41085271317829,
786
+ "grad_norm": 36.38352966308594,
787
+ "learning_rate": 0.0002264,
788
+ "loss": 14.6052,
789
  "step": 5600
790
  },
791
  {
792
+ "epoch": 44.0,
793
  "eval_dummy": 1.0,
794
+ "eval_loss": 31.39527702331543,
795
+ "eval_runtime": 17.3086,
796
+ "eval_samples_per_second": 2.773,
797
+ "eval_steps_per_second": 0.693,
798
+ "step": 5676
799
  },
800
  {
801
+ "epoch": 44.18604651162791,
802
+ "grad_norm": 14.869057655334473,
803
+ "learning_rate": 0.00022330000000000003,
804
+ "loss": 13.9636,
805
  "step": 5700
806
  },
807
  {
808
+ "epoch": 44.96124031007752,
809
+ "grad_norm": 12.379744529724121,
810
+ "learning_rate": 0.0002202,
811
+ "loss": 14.1299,
 
 
 
 
 
 
 
 
 
812
  "step": 5800
813
  },
814
  {
815
+ "epoch": 45.0,
816
  "eval_dummy": 1.0,
817
+ "eval_loss": 30.81528663635254,
818
+ "eval_runtime": 17.5777,
819
+ "eval_samples_per_second": 2.731,
820
+ "eval_steps_per_second": 0.683,
821
+ "step": 5805
822
  },
823
  {
824
+ "epoch": 45.736434108527135,
825
+ "grad_norm": 51.40928649902344,
826
+ "learning_rate": 0.00021710000000000005,
827
+ "loss": 13.6851,
828
  "step": 5900
829
  },
830
  {
831
+ "epoch": 46.0,
832
  "eval_dummy": 1.0,
833
+ "eval_loss": 30.969324111938477,
834
+ "eval_runtime": 17.5744,
835
+ "eval_samples_per_second": 2.731,
836
+ "eval_steps_per_second": 0.683,
837
+ "step": 5934
838
  },
839
  {
840
+ "epoch": 46.51162790697674,
841
+ "grad_norm": 13.089680671691895,
842
+ "learning_rate": 0.00021400000000000002,
843
+ "loss": 14.6677,
844
  "step": 6000
845
  },
846
  {
847
+ "epoch": 47.0,
848
  "eval_dummy": 1.0,
849
+ "eval_loss": 31.936065673828125,
850
+ "eval_runtime": 18.038,
851
+ "eval_samples_per_second": 2.661,
852
+ "eval_steps_per_second": 0.665,
853
+ "step": 6063
854
  },
855
  {
856
+ "epoch": 47.286821705426355,
857
+ "grad_norm": 15.198484420776367,
858
+ "learning_rate": 0.0002109,
859
+ "loss": 13.6493,
860
  "step": 6100
861
  },
862
  {
863
+ "epoch": 48.0,
864
  "eval_dummy": 1.0,
865
+ "eval_loss": 34.3327751159668,
866
+ "eval_runtime": 17.0231,
867
+ "eval_samples_per_second": 2.82,
868
+ "eval_steps_per_second": 0.705,
869
+ "step": 6192
870
  },
871
  {
872
+ "epoch": 48.06201550387597,
873
+ "grad_norm": 17.197790145874023,
874
+ "learning_rate": 0.00020780000000000004,
875
+ "loss": 13.7191,
876
  "step": 6200
877
  },
878
  {
879
+ "epoch": 48.83720930232558,
880
+ "grad_norm": 22.198528289794922,
881
+ "learning_rate": 0.00020470000000000002,
882
+ "loss": 14.166,
 
 
 
 
 
 
 
 
 
883
  "step": 6300
884
  },
885
  {
886
+ "epoch": 49.0,
887
  "eval_dummy": 1.0,
888
+ "eval_loss": 32.62310791015625,
889
+ "eval_runtime": 17.8418,
890
+ "eval_samples_per_second": 2.69,
891
+ "eval_steps_per_second": 0.673,
892
+ "step": 6321
893
  },
894
  {
895
+ "epoch": 49.6124031007752,
896
+ "grad_norm": 12.192609786987305,
897
+ "learning_rate": 0.00020160000000000002,
898
+ "loss": 13.7388,
899
  "step": 6400
900
  },
901
  {
902
+ "epoch": 50.0,
903
  "eval_dummy": 1.0,
904
+ "eval_loss": 33.17361831665039,
905
+ "eval_runtime": 16.9796,
906
+ "eval_samples_per_second": 2.827,
907
+ "eval_steps_per_second": 0.707,
908
+ "step": 6450
909
  },
910
  {
911
+ "epoch": 50.3875968992248,
912
+ "grad_norm": 24.24190330505371,
913
+ "learning_rate": 0.0001985,
914
+ "loss": 13.0849,
915
  "step": 6500
916
  },
917
  {
918
+ "epoch": 51.0,
919
  "eval_dummy": 1.0,
920
+ "eval_loss": 34.95216369628906,
921
+ "eval_runtime": 17.6852,
922
+ "eval_samples_per_second": 2.714,
923
+ "eval_steps_per_second": 0.679,
924
+ "step": 6579
925
  },
926
  {
927
+ "epoch": 51.16279069767442,
928
+ "grad_norm": 10.653921127319336,
929
+ "learning_rate": 0.0001954,
930
+ "loss": 13.7478,
931
  "step": 6600
932
  },
933
  {
934
+ "epoch": 51.93798449612403,
935
+ "grad_norm": 12.344590187072754,
936
+ "learning_rate": 0.00019229999999999999,
937
+ "loss": 13.2502,
 
 
 
 
 
 
 
 
 
938
  "step": 6700
939
  },
940
  {
941
+ "epoch": 52.0,
942
  "eval_dummy": 1.0,
943
+ "eval_loss": 35.79899215698242,
944
+ "eval_runtime": 17.0885,
945
+ "eval_samples_per_second": 2.809,
946
+ "eval_steps_per_second": 0.702,
947
+ "step": 6708
948
  },
949
  {
950
+ "epoch": 52.713178294573645,
951
+ "grad_norm": 11.102241516113281,
952
+ "learning_rate": 0.0001892,
953
+ "loss": 13.5116,
954
  "step": 6800
955
  },
956
  {
957
+ "epoch": 53.0,
958
  "eval_dummy": 1.0,
959
+ "eval_loss": 31.57374382019043,
960
+ "eval_runtime": 17.6271,
961
+ "eval_samples_per_second": 2.723,
962
+ "eval_steps_per_second": 0.681,
963
+ "step": 6837
964
  },
965
  {
966
+ "epoch": 53.48837209302326,
967
+ "grad_norm": 10.652983665466309,
968
+ "learning_rate": 0.00018610000000000002,
969
+ "loss": 12.6993,
970
  "step": 6900
971
  },
972
  {
973
+ "epoch": 54.0,
974
  "eval_dummy": 1.0,
975
+ "eval_loss": 33.26504898071289,
976
+ "eval_runtime": 17.4525,
977
+ "eval_samples_per_second": 2.75,
978
+ "eval_steps_per_second": 0.688,
979
+ "step": 6966
980
  },
981
  {
982
+ "epoch": 54.263565891472865,
983
+ "grad_norm": 12.211697578430176,
984
+ "learning_rate": 0.00018300000000000003,
985
+ "loss": 13.3602,
986
  "step": 7000
987
  },
988
  {
989
+ "epoch": 55.0,
990
  "eval_dummy": 1.0,
991
+ "eval_loss": 34.891380310058594,
992
+ "eval_runtime": 18.9822,
993
+ "eval_samples_per_second": 2.529,
994
+ "eval_steps_per_second": 0.632,
995
+ "step": 7095
996
  },
997
  {
998
+ "epoch": 55.03875968992248,
999
+ "grad_norm": 14.056374549865723,
1000
+ "learning_rate": 0.0001799,
1001
+ "loss": 12.9955,
1002
  "step": 7100
1003
  },
1004
  {
1005
+ "epoch": 55.81395348837209,
1006
+ "grad_norm": 10.69999885559082,
1007
+ "learning_rate": 0.00017680000000000001,
1008
+ "loss": 12.9585,
 
 
 
 
 
 
 
 
 
1009
  "step": 7200
1010
  },
1011
  {
1012
+ "epoch": 56.0,
1013
  "eval_dummy": 1.0,
1014
+ "eval_loss": 35.98616409301758,
1015
+ "eval_runtime": 17.2599,
1016
+ "eval_samples_per_second": 2.781,
1017
+ "eval_steps_per_second": 0.695,
1018
+ "step": 7224
1019
  },
1020
  {
1021
+ "epoch": 56.58914728682171,
1022
+ "grad_norm": 7.194685459136963,
1023
+ "learning_rate": 0.00017370000000000002,
1024
+ "loss": 12.7434,
1025
  "step": 7300
1026
  },
1027
  {
1028
+ "epoch": 57.0,
1029
  "eval_dummy": 1.0,
1030
+ "eval_loss": 34.91057205200195,
1031
+ "eval_runtime": 18.4913,
1032
+ "eval_samples_per_second": 2.596,
1033
+ "eval_steps_per_second": 0.649,
1034
+ "step": 7353
1035
  },
1036
  {
1037
+ "epoch": 57.36434108527132,
1038
+ "grad_norm": 9.769908905029297,
1039
+ "learning_rate": 0.0001706,
1040
+ "loss": 12.7299,
1041
  "step": 7400
1042
  },
1043
  {
1044
+ "epoch": 58.0,
1045
  "eval_dummy": 1.0,
1046
+ "eval_loss": 34.010562896728516,
1047
+ "eval_runtime": 17.0454,
1048
+ "eval_samples_per_second": 2.816,
1049
+ "eval_steps_per_second": 0.704,
1050
+ "step": 7482
1051
  },
1052
  {
1053
+ "epoch": 58.13953488372093,
1054
+ "grad_norm": 18.091665267944336,
1055
+ "learning_rate": 0.0001675,
1056
+ "loss": 12.3929,
1057
  "step": 7500
1058
  },
1059
  {
1060
+ "epoch": 58.91472868217054,
1061
+ "grad_norm": 8.427603721618652,
1062
+ "learning_rate": 0.00016439999999999998,
1063
+ "loss": 12.717,
 
 
 
 
 
 
 
 
 
1064
  "step": 7600
1065
  },
1066
  {
1067
+ "epoch": 59.0,
1068
  "eval_dummy": 1.0,
1069
+ "eval_loss": 36.35882568359375,
1070
+ "eval_runtime": 18.0781,
1071
+ "eval_samples_per_second": 2.655,
1072
+ "eval_steps_per_second": 0.664,
1073
+ "step": 7611
1074
  },
1075
  {
1076
+ "epoch": 59.689922480620154,
1077
+ "grad_norm": 15.918642044067383,
1078
+ "learning_rate": 0.00016130000000000002,
1079
+ "loss": 12.0563,
1080
  "step": 7700
1081
  },
1082
  {
1083
+ "epoch": 60.0,
1084
+ "eval_dummy": 1.0,
1085
+ "eval_loss": 35.09232711791992,
1086
+ "eval_runtime": 16.9066,
1087
+ "eval_samples_per_second": 2.839,
1088
+ "eval_steps_per_second": 0.71,
1089
+ "step": 7740
1090
+ },
1091
+ {
1092
+ "epoch": 60.46511627906977,
1093
+ "grad_norm": 13.870895385742188,
1094
+ "learning_rate": 0.00015820000000000002,
1095
+ "loss": 13.012,
1096
  "step": 7800
1097
  },
1098
  {
1099
+ "epoch": 61.0,
1100
  "eval_dummy": 1.0,
1101
+ "eval_loss": 38.73225402832031,
1102
+ "eval_runtime": 17.9846,
1103
+ "eval_samples_per_second": 2.669,
1104
+ "eval_steps_per_second": 0.667,
1105
+ "step": 7869
1106
  },
1107
  {
1108
+ "epoch": 61.24031007751938,
1109
+ "grad_norm": 7.798965930938721,
1110
+ "learning_rate": 0.00015510000000000003,
1111
+ "loss": 12.2878,
1112
  "step": 7900
1113
  },
1114
  {
1115
+ "epoch": 62.0,
1116
  "eval_dummy": 1.0,
1117
+ "eval_loss": 34.9967155456543,
1118
+ "eval_runtime": 17.0439,
1119
+ "eval_samples_per_second": 2.816,
1120
+ "eval_steps_per_second": 0.704,
1121
+ "step": 7998
1122
  },
1123
  {
1124
+ "epoch": 62.01550387596899,
1125
+ "grad_norm": 8.46688461303711,
1126
+ "learning_rate": 0.000152,
1127
+ "loss": 12.3515,
1128
  "step": 8000
1129
  },
1130
  {
1131
+ "epoch": 62.7906976744186,
1132
+ "grad_norm": 9.745466232299805,
1133
+ "learning_rate": 0.00014890000000000001,
1134
+ "loss": 12.2794,
 
 
 
 
 
 
 
 
 
1135
  "step": 8100
1136
  },
1137
  {
1138
+ "epoch": 63.0,
1139
  "eval_dummy": 1.0,
1140
+ "eval_loss": 37.55772399902344,
1141
+ "eval_runtime": 18.0451,
1142
+ "eval_samples_per_second": 2.66,
1143
+ "eval_steps_per_second": 0.665,
1144
+ "step": 8127
1145
  },
1146
  {
1147
+ "epoch": 63.565891472868216,
1148
+ "grad_norm": 7.328401565551758,
1149
+ "learning_rate": 0.0001458,
1150
+ "loss": 12.4147,
1151
  "step": 8200
1152
  },
1153
  {
1154
+ "epoch": 64.0,
1155
  "eval_dummy": 1.0,
1156
+ "eval_loss": 37.27333068847656,
1157
+ "eval_runtime": 19.2621,
1158
+ "eval_samples_per_second": 2.492,
1159
+ "eval_steps_per_second": 0.623,
1160
+ "step": 8256
1161
  },
1162
  {
1163
+ "epoch": 64.34108527131782,
1164
+ "grad_norm": 12.89833927154541,
1165
+ "learning_rate": 0.0001427,
1166
+ "loss": 12.0032,
1167
  "step": 8300
1168
  },
1169
  {
1170
+ "epoch": 65.0,
1171
  "eval_dummy": 1.0,
1172
+ "eval_loss": 35.3015022277832,
1173
+ "eval_runtime": 17.8838,
1174
+ "eval_samples_per_second": 2.684,
1175
+ "eval_steps_per_second": 0.671,
1176
+ "step": 8385
1177
  },
1178
  {
1179
+ "epoch": 65.11627906976744,
1180
+ "grad_norm": 15.308392524719238,
1181
+ "learning_rate": 0.00013959999999999998,
1182
+ "loss": 11.7392,
1183
  "step": 8400
1184
  },
1185
  {
1186
+ "epoch": 65.89147286821705,
1187
+ "grad_norm": 12.101038932800293,
1188
+ "learning_rate": 0.00013650000000000004,
1189
+ "loss": 12.2793,
 
 
 
 
 
 
 
 
 
1190
  "step": 8500
1191
  },
1192
  {
1193
+ "epoch": 66.0,
1194
  "eval_dummy": 1.0,
1195
+ "eval_loss": 35.280582427978516,
1196
+ "eval_runtime": 17.6628,
1197
+ "eval_samples_per_second": 2.718,
1198
+ "eval_steps_per_second": 0.679,
1199
+ "step": 8514
1200
  },
1201
  {
1202
+ "epoch": 66.66666666666667,
1203
+ "grad_norm": 12.754354476928711,
1204
+ "learning_rate": 0.00013340000000000002,
1205
+ "loss": 12.2309,
1206
  "step": 8600
1207
  },
1208
  {
1209
+ "epoch": 67.0,
1210
  "eval_dummy": 1.0,
1211
+ "eval_loss": 36.24875259399414,
1212
+ "eval_runtime": 17.2522,
1213
+ "eval_samples_per_second": 2.782,
1214
+ "eval_steps_per_second": 0.696,
1215
+ "step": 8643
1216
  },
1217
  {
1218
+ "epoch": 67.44186046511628,
1219
+ "grad_norm": 9.756113052368164,
1220
+ "learning_rate": 0.00013030000000000002,
1221
+ "loss": 11.7082,
1222
  "step": 8700
1223
  },
1224
  {
1225
+ "epoch": 68.0,
1226
  "eval_dummy": 1.0,
1227
+ "eval_loss": 35.66865158081055,
1228
+ "eval_runtime": 18.2695,
1229
+ "eval_samples_per_second": 2.627,
1230
+ "eval_steps_per_second": 0.657,
1231
+ "step": 8772
1232
  },
1233
  {
1234
+ "epoch": 68.21705426356588,
1235
+ "grad_norm": 9.372435569763184,
1236
+ "learning_rate": 0.0001272,
1237
+ "loss": 11.5136,
1238
  "step": 8800
1239
  },
1240
  {
1241
+ "epoch": 68.9922480620155,
1242
+ "grad_norm": 15.044282913208008,
1243
+ "learning_rate": 0.0001241,
1244
+ "loss": 11.8694,
 
 
 
 
 
 
 
 
 
1245
  "step": 8900
1246
  },
1247
  {
1248
+ "epoch": 69.0,
1249
  "eval_dummy": 1.0,
1250
+ "eval_loss": 36.04698944091797,
1251
+ "eval_runtime": 17.3888,
1252
+ "eval_samples_per_second": 2.76,
1253
+ "eval_steps_per_second": 0.69,
1254
+ "step": 8901
1255
  },
1256
  {
1257
+ "epoch": 69.76744186046511,
1258
+ "grad_norm": 9.250027656555176,
1259
+ "learning_rate": 0.000121,
1260
+ "loss": 11.782,
1261
  "step": 9000
1262
  },
1263
  {
1264
+ "epoch": 70.0,
1265
  "eval_dummy": 1.0,
1266
+ "eval_loss": 35.40549087524414,
1267
+ "eval_runtime": 18.0194,
1268
+ "eval_samples_per_second": 2.664,
1269
+ "eval_steps_per_second": 0.666,
1270
+ "step": 9030
1271
  },
1272
  {
1273
+ "epoch": 70.54263565891473,
1274
+ "grad_norm": 16.64485740661621,
1275
+ "learning_rate": 0.00011789999999999999,
1276
+ "loss": 11.6254,
1277
  "step": 9100
1278
  },
1279
  {
1280
+ "epoch": 71.0,
1281
  "eval_dummy": 1.0,
1282
+ "eval_loss": 36.70663070678711,
1283
+ "eval_runtime": 17.3613,
1284
+ "eval_samples_per_second": 2.765,
1285
+ "eval_steps_per_second": 0.691,
1286
+ "step": 9159
1287
  },
1288
  {
1289
+ "epoch": 71.31782945736434,
1290
+ "grad_norm": 15.693510055541992,
1291
+ "learning_rate": 0.00011479999999999997,
1292
+ "loss": 11.5873,
1293
  "step": 9200
1294
  },
1295
  {
1296
+ "epoch": 72.0,
1297
  "eval_dummy": 1.0,
1298
+ "eval_loss": 36.10844421386719,
1299
+ "eval_runtime": 17.5839,
1300
+ "eval_samples_per_second": 2.73,
1301
+ "eval_steps_per_second": 0.682,
1302
+ "step": 9288
1303
  },
1304
  {
1305
+ "epoch": 72.09302325581395,
1306
+ "grad_norm": 7.485771179199219,
1307
+ "learning_rate": 0.00011170000000000003,
1308
+ "loss": 11.6159,
1309
  "step": 9300
1310
  },
1311
  {
1312
+ "epoch": 72.86821705426357,
1313
+ "grad_norm": 15.41925048828125,
1314
+ "learning_rate": 0.00010860000000000004,
1315
+ "loss": 11.6251,
 
 
 
 
 
 
 
 
 
1316
  "step": 9400
1317
  },
1318
  {
1319
+ "epoch": 73.0,
1320
  "eval_dummy": 1.0,
1321
+ "eval_loss": 38.29316329956055,
1322
+ "eval_runtime": 17.0634,
1323
+ "eval_samples_per_second": 2.813,
1324
+ "eval_steps_per_second": 0.703,
1325
+ "step": 9417
1326
  },
1327
  {
1328
+ "epoch": 73.64341085271317,
1329
+ "grad_norm": 16.74988555908203,
1330
+ "learning_rate": 0.00010550000000000002,
1331
+ "loss": 11.4589,
1332
  "step": 9500
1333
  },
1334
  {
1335
+ "epoch": 74.0,
1336
  "eval_dummy": 1.0,
1337
+ "eval_loss": 36.55695724487305,
1338
+ "eval_runtime": 17.9041,
1339
+ "eval_samples_per_second": 2.681,
1340
+ "eval_steps_per_second": 0.67,
1341
+ "step": 9546
1342
  },
1343
  {
1344
+ "epoch": 74.4186046511628,
1345
+ "grad_norm": 146.4043426513672,
1346
+ "learning_rate": 0.00010240000000000001,
1347
+ "loss": 11.7378,
1348
  "step": 9600
1349
  },
1350
  {
1351
+ "epoch": 75.0,
1352
  "eval_dummy": 1.0,
1353
+ "eval_loss": 35.988651275634766,
1354
+ "eval_runtime": 17.0167,
1355
+ "eval_samples_per_second": 2.821,
1356
+ "eval_steps_per_second": 0.705,
1357
+ "step": 9675
1358
  },
1359
  {
1360
+ "epoch": 75.1937984496124,
1361
+ "grad_norm": 10.800848960876465,
1362
+ "learning_rate": 9.93e-05,
1363
+ "loss": 11.4043,
1364
  "step": 9700
1365
  },
1366
  {
1367
+ "epoch": 75.96899224806202,
1368
+ "grad_norm": 9.41010570526123,
1369
+ "learning_rate": 9.62e-05,
1370
+ "loss": 11.4933,
 
 
 
 
 
 
 
 
 
1371
  "step": 9800
1372
  },
1373
  {
1374
+ "epoch": 76.0,
1375
  "eval_dummy": 1.0,
1376
+ "eval_loss": 36.47134017944336,
1377
+ "eval_runtime": 17.8569,
1378
+ "eval_samples_per_second": 2.688,
1379
+ "eval_steps_per_second": 0.672,
1380
+ "step": 9804
1381
  },
1382
  {
1383
+ "epoch": 76.74418604651163,
1384
+ "grad_norm": 19.61960220336914,
1385
+ "learning_rate": 9.31e-05,
1386
+ "loss": 11.2566,
1387
  "step": 9900
1388
  },
1389
  {
1390
+ "epoch": 77.0,
1391
  "eval_dummy": 1.0,
1392
+ "eval_loss": 36.96221923828125,
1393
+ "eval_runtime": 17.0325,
1394
+ "eval_samples_per_second": 2.818,
1395
+ "eval_steps_per_second": 0.705,
1396
+ "step": 9933
1397
  },
1398
  {
1399
+ "epoch": 77.51937984496124,
1400
+ "grad_norm": 9.528326034545898,
1401
+ "learning_rate": 8.999999999999999e-05,
1402
+ "loss": 11.25,
1403
  "step": 10000
1404
  },
1405
  {
1406
+ "epoch": 78.0,
1407
  "eval_dummy": 1.0,
1408
+ "eval_loss": 37.10159683227539,
1409
+ "eval_runtime": 18.225,
1410
+ "eval_samples_per_second": 2.634,
1411
+ "eval_steps_per_second": 0.658,
1412
+ "step": 10062
1413
  },
1414
  {
1415
+ "epoch": 78.29457364341086,
1416
+ "grad_norm": 8.064875602722168,
1417
+ "learning_rate": 8.690000000000003e-05,
1418
+ "loss": 11.2962,
1419
  "step": 10100
1420
  },
1421
  {
1422
+ "epoch": 79.0,
1423
  "eval_dummy": 1.0,
1424
+ "eval_loss": 37.87105178833008,
1425
+ "eval_runtime": 16.9948,
1426
+ "eval_samples_per_second": 2.824,
1427
+ "eval_steps_per_second": 0.706,
1428
+ "step": 10191
1429
  },
1430
  {
1431
+ "epoch": 79.06976744186046,
1432
+ "grad_norm": 10.451505661010742,
1433
+ "learning_rate": 8.380000000000002e-05,
1434
+ "loss": 11.1642,
1435
  "step": 10200
1436
  },
1437
  {
1438
+ "epoch": 79.84496124031008,
1439
+ "grad_norm": 7.318461894989014,
1440
+ "learning_rate": 8.070000000000001e-05,
1441
+ "loss": 11.0868,
1442
+ "step": 10300
 
 
1443
  },
1444
  {
1445
+ "epoch": 80.0,
1446
+ "eval_dummy": 1.0,
1447
+ "eval_loss": 38.571414947509766,
1448
+ "eval_runtime": 18.0923,
1449
+ "eval_samples_per_second": 2.653,
1450
+ "eval_steps_per_second": 0.663,
1451
+ "step": 10320
1452
  },
1453
  {
1454
+ "epoch": 80.62015503875969,
1455
+ "grad_norm": 10.301888465881348,
1456
+ "learning_rate": 7.760000000000002e-05,
1457
+ "loss": 11.2786,
1458
  "step": 10400
1459
  },
1460
  {
1461
+ "epoch": 81.0,
1462
  "eval_dummy": 1.0,
1463
+ "eval_loss": 38.1493034362793,
1464
+ "eval_runtime": 17.0167,
1465
+ "eval_samples_per_second": 2.821,
1466
+ "eval_steps_per_second": 0.705,
1467
+ "step": 10449
1468
  },
1469
  {
1470
+ "epoch": 81.3953488372093,
1471
+ "grad_norm": 8.667201042175293,
1472
+ "learning_rate": 7.450000000000001e-05,
1473
+ "loss": 11.1528,
1474
  "step": 10500
1475
  },
1476
  {
1477
+ "epoch": 82.0,
1478
  "eval_dummy": 1.0,
1479
+ "eval_loss": 39.0099983215332,
1480
+ "eval_runtime": 17.9494,
1481
+ "eval_samples_per_second": 2.674,
1482
+ "eval_steps_per_second": 0.669,
1483
+ "step": 10578
1484
  },
1485
  {
1486
+ "epoch": 82.17054263565892,
1487
+ "grad_norm": 5.117663860321045,
1488
+ "learning_rate": 7.14e-05,
1489
+ "loss": 10.9299,
1490
  "step": 10600
1491
  },
1492
  {
1493
+ "epoch": 82.94573643410853,
1494
+ "grad_norm": 5.9621806144714355,
1495
+ "learning_rate": 6.829999999999999e-05,
1496
+ "loss": 11.089,
 
 
 
 
 
 
 
 
 
1497
  "step": 10700
1498
  },
1499
  {
1500
+ "epoch": 83.0,
1501
  "eval_dummy": 1.0,
1502
+ "eval_loss": 38.5473518371582,
1503
+ "eval_runtime": 17.0039,
1504
+ "eval_samples_per_second": 2.823,
1505
+ "eval_steps_per_second": 0.706,
1506
+ "step": 10707
1507
  },
1508
  {
1509
+ "epoch": 83.72093023255815,
1510
+ "grad_norm": 6.17501974105835,
1511
+ "learning_rate": 6.519999999999999e-05,
1512
+ "loss": 10.954,
1513
  "step": 10800
1514
  },
1515
  {
1516
+ "epoch": 84.0,
1517
  "eval_dummy": 1.0,
1518
+ "eval_loss": 38.940486907958984,
1519
+ "eval_runtime": 17.721,
1520
+ "eval_samples_per_second": 2.709,
1521
+ "eval_steps_per_second": 0.677,
1522
+ "step": 10836
1523
  },
1524
  {
1525
+ "epoch": 84.49612403100775,
1526
+ "grad_norm": 31.69377326965332,
1527
+ "learning_rate": 6.210000000000003e-05,
1528
+ "loss": 11.0157,
1529
  "step": 10900
1530
  },
1531
  {
1532
+ "epoch": 85.0,
1533
  "eval_dummy": 1.0,
1534
+ "eval_loss": 39.3872184753418,
1535
+ "eval_runtime": 16.9062,
1536
+ "eval_samples_per_second": 2.839,
1537
+ "eval_steps_per_second": 0.71,
1538
+ "step": 10965
1539
  },
1540
  {
1541
+ "epoch": 85.27131782945736,
1542
+ "grad_norm": 7.1002984046936035,
1543
+ "learning_rate": 5.9000000000000025e-05,
1544
+ "loss": 10.9849,
1545
  "step": 11000
1546
  },
1547
  {
1548
+ "epoch": 86.0,
1549
  "eval_dummy": 1.0,
1550
+ "eval_loss": 39.4875373840332,
1551
+ "eval_runtime": 17.7347,
1552
+ "eval_samples_per_second": 2.707,
1553
+ "eval_steps_per_second": 0.677,
1554
+ "step": 11094
1555
  },
1556
  {
1557
+ "epoch": 86.04651162790698,
1558
+ "grad_norm": 13.370129585266113,
1559
+ "learning_rate": 5.590000000000002e-05,
1560
+ "loss": 11.0614,
1561
  "step": 11100
1562
  },
1563
  {
1564
+ "epoch": 86.82170542635659,
1565
+ "grad_norm": 5.192051887512207,
1566
+ "learning_rate": 5.28e-05,
1567
+ "loss": 10.5423,
 
 
 
 
 
 
 
 
 
1568
  "step": 11200
1569
  },
1570
  {
1571
+ "epoch": 87.0,
1572
  "eval_dummy": 1.0,
1573
+ "eval_loss": 39.11787796020508,
1574
+ "eval_runtime": 16.9675,
1575
+ "eval_samples_per_second": 2.829,
1576
+ "eval_steps_per_second": 0.707,
1577
+ "step": 11223
1578
  },
1579
  {
1580
+ "epoch": 87.59689922480621,
1581
+ "grad_norm": 5.747579097747803,
1582
+ "learning_rate": 4.97e-05,
1583
+ "loss": 11.1968,
1584
  "step": 11300
1585
  },
1586
  {
1587
+ "epoch": 88.0,
1588
  "eval_dummy": 1.0,
1589
+ "eval_loss": 39.4084358215332,
1590
+ "eval_runtime": 17.9374,
1591
+ "eval_samples_per_second": 2.676,
1592
+ "eval_steps_per_second": 0.669,
1593
+ "step": 11352
1594
  },
1595
  {
1596
+ "epoch": 88.37209302325581,
1597
+ "grad_norm": 11.57238483428955,
1598
+ "learning_rate": 4.66e-05,
1599
+ "loss": 10.6376,
1600
  "step": 11400
1601
  },
1602
  {
1603
+ "epoch": 89.0,
1604
  "eval_dummy": 1.0,
1605
+ "eval_loss": 39.82176971435547,
1606
+ "eval_runtime": 16.9422,
1607
+ "eval_samples_per_second": 2.833,
1608
+ "eval_steps_per_second": 0.708,
1609
+ "step": 11481
1610
  },
1611
  {
1612
+ "epoch": 89.14728682170542,
1613
+ "grad_norm": 8.44890308380127,
1614
+ "learning_rate": 4.3499999999999993e-05,
1615
+ "loss": 10.8035,
1616
  "step": 11500
1617
  },
1618
  {
1619
+ "epoch": 89.92248062015504,
1620
+ "grad_norm": 7.732810974121094,
1621
+ "learning_rate": 4.0399999999999986e-05,
1622
+ "loss": 10.7131,
 
 
 
 
 
 
 
 
 
1623
  "step": 11600
1624
  },
1625
  {
1626
+ "epoch": 90.0,
1627
  "eval_dummy": 1.0,
1628
+ "eval_loss": 39.25526428222656,
1629
+ "eval_runtime": 18.1455,
1630
+ "eval_samples_per_second": 2.645,
1631
+ "eval_steps_per_second": 0.661,
1632
+ "step": 11610
1633
  },
1634
  {
1635
+ "epoch": 90.69767441860465,
1636
+ "grad_norm": 6.144818305969238,
1637
+ "learning_rate": 3.7300000000000026e-05,
1638
+ "loss": 10.8252,
1639
  "step": 11700
1640
  },
1641
  {
1642
+ "epoch": 91.0,
1643
  "eval_dummy": 1.0,
1644
+ "eval_loss": 39.136837005615234,
1645
+ "eval_runtime": 16.9741,
1646
+ "eval_samples_per_second": 2.828,
1647
+ "eval_steps_per_second": 0.707,
1648
+ "step": 11739
1649
  },
1650
  {
1651
+ "epoch": 91.47286821705427,
1652
+ "grad_norm": 4.7243475914001465,
1653
+ "learning_rate": 3.420000000000002e-05,
1654
+ "loss": 10.6456,
1655
  "step": 11800
1656
  },
1657
  {
1658
+ "epoch": 92.0,
1659
  "eval_dummy": 1.0,
1660
+ "eval_loss": 38.91936111450195,
1661
+ "eval_runtime": 17.9915,
1662
+ "eval_samples_per_second": 2.668,
1663
+ "eval_steps_per_second": 0.667,
1664
+ "step": 11868
1665
  },
1666
  {
1667
+ "epoch": 92.24806201550388,
1668
+ "grad_norm": 13.374404907226562,
1669
+ "learning_rate": 3.110000000000002e-05,
1670
+ "loss": 10.8488,
1671
  "step": 11900
1672
  },
1673
  {
1674
+ "epoch": 93.0,
1675
  "eval_dummy": 1.0,
1676
+ "eval_loss": 39.595462799072266,
1677
+ "eval_runtime": 16.9478,
1678
+ "eval_samples_per_second": 2.832,
1679
+ "eval_steps_per_second": 0.708,
1680
+ "step": 11997
1681
  },
1682
  {
1683
+ "epoch": 93.02325581395348,
1684
+ "grad_norm": 9.406952857971191,
1685
+ "learning_rate": 2.8000000000000003e-05,
1686
+ "loss": 10.5219,
1687
  "step": 12000
1688
  },
1689
  {
1690
+ "epoch": 93.7984496124031,
1691
+ "grad_norm": 5.360720634460449,
1692
+ "learning_rate": 2.49e-05,
1693
+ "loss": 10.8675,
 
 
 
 
 
 
 
 
 
1694
  "step": 12100
1695
  },
1696
  {
1697
+ "epoch": 94.0,
1698
  "eval_dummy": 1.0,
1699
+ "eval_loss": 39.47597885131836,
1700
+ "eval_runtime": 17.7005,
1701
+ "eval_samples_per_second": 2.712,
1702
+ "eval_steps_per_second": 0.678,
1703
+ "step": 12126
1704
  },
1705
  {
1706
+ "epoch": 94.57364341085271,
1707
+ "grad_norm": 3.855013132095337,
1708
+ "learning_rate": 2.1799999999999995e-05,
1709
+ "loss": 10.4757,
1710
  "step": 12200
1711
  },
1712
  {
1713
+ "epoch": 95.0,
1714
  "eval_dummy": 1.0,
1715
+ "eval_loss": 40.484397888183594,
1716
+ "eval_runtime": 16.9799,
1717
+ "eval_samples_per_second": 2.827,
1718
+ "eval_steps_per_second": 0.707,
1719
+ "step": 12255
1720
  },
1721
  {
1722
+ "epoch": 95.34883720930233,
1723
+ "grad_norm": 7.718498229980469,
1724
+ "learning_rate": 1.8699999999999987e-05,
1725
+ "loss": 10.3191,
1726
  "step": 12300
1727
  },
1728
  {
1729
+ "epoch": 96.0,
1730
  "eval_dummy": 1.0,
1731
+ "eval_loss": 39.06733322143555,
1732
+ "eval_runtime": 17.7268,
1733
+ "eval_samples_per_second": 2.708,
1734
+ "eval_steps_per_second": 0.677,
1735
+ "step": 12384
1736
  },
1737
  {
1738
+ "epoch": 96.12403100775194,
1739
+ "grad_norm": 3.9156548976898193,
1740
+ "learning_rate": 1.5599999999999983e-05,
1741
+ "loss": 10.6169,
1742
  "step": 12400
1743
  },
1744
  {
1745
+ "epoch": 96.89922480620154,
1746
+ "grad_norm": 3.464470148086548,
1747
+ "learning_rate": 1.2500000000000023e-05,
1748
+ "loss": 10.6073,
 
 
 
 
 
 
 
 
 
1749
  "step": 12500
1750
  },
1751
  {
1752
+ "epoch": 97.0,
1753
  "eval_dummy": 1.0,
1754
+ "eval_loss": 39.37672805786133,
1755
+ "eval_runtime": 16.9277,
1756
+ "eval_samples_per_second": 2.836,
1757
+ "eval_steps_per_second": 0.709,
1758
+ "step": 12513
1759
  },
1760
  {
1761
+ "epoch": 97.67441860465117,
1762
+ "grad_norm": 5.994311332702637,
1763
+ "learning_rate": 9.400000000000018e-06,
1764
+ "loss": 10.3038,
1765
  "step": 12600
1766
  },
1767
  {
1768
+ "epoch": 98.0,
1769
  "eval_dummy": 1.0,
1770
+ "eval_loss": 39.69685745239258,
1771
+ "eval_runtime": 18.0251,
1772
+ "eval_samples_per_second": 2.663,
1773
+ "eval_steps_per_second": 0.666,
1774
+ "step": 12642
1775
  },
1776
  {
1777
+ "epoch": 98.44961240310077,
1778
+ "grad_norm": 4.125715732574463,
1779
+ "learning_rate": 6.300000000000012e-06,
1780
+ "loss": 11.0709,
1781
  "step": 12700
1782
  },
1783
  {
1784
+ "epoch": 99.0,
1785
  "eval_dummy": 1.0,
1786
+ "eval_loss": 39.93254470825195,
1787
+ "eval_runtime": 17.9269,
1788
+ "eval_samples_per_second": 2.678,
1789
+ "eval_steps_per_second": 0.669,
1790
+ "step": 12771
1791
  },
1792
  {
1793
+ "epoch": 99.2248062015504,
1794
+ "grad_norm": 4.381137371063232,
1795
+ "learning_rate": 3.200000000000005e-06,
1796
+ "loss": 10.2398,
1797
  "step": 12800
1798
  },
1799
  {
1800
+ "epoch": 100.0,
1801
+ "grad_norm": 4.022866249084473,
 
 
 
 
 
 
 
 
 
1802
  "learning_rate": 9.999999999999998e-08,
1803
+ "loss": 10.5951,
1804
  "step": 12900
1805
  },
1806
  {
1807
+ "epoch": 100.0,
1808
  "eval_dummy": 1.0,
1809
+ "eval_loss": 39.87546157836914,
1810
+ "eval_runtime": 17.8005,
1811
+ "eval_samples_per_second": 2.697,
1812
+ "eval_steps_per_second": 0.674,
1813
  "step": 12900
1814
  },
1815
  {
1816
+ "epoch": 100.0,
1817
  "step": 12900,
1818
+ "total_flos": 1.4631500418239693e+19,
1819
+ "train_loss": 16.309644344832545,
1820
+ "train_runtime": 27199.2721,
1821
+ "train_samples_per_second": 1.897,
1822
+ "train_steps_per_second": 0.474
1823
  }
1824
  ],
1825
  "logging_steps": 100,
1826
  "max_steps": 12900,
1827
  "num_input_tokens_seen": 0,
1828
+ "num_train_epochs": 100,
1829
  "save_steps": 500,
1830
  "stateful_callbacks": {
1831
  "TrainerControl": {
 
1839
  "attributes": {}
1840
  }
1841
  },
1842
+ "total_flos": 1.4631500418239693e+19,
1843
+ "train_batch_size": 4,
1844
  "trial_name": null,
1845
  "trial_params": null
1846
  }