w06618pm commited on
Commit
4ab0ec0
·
1 Parent(s): 7157452

Final trained model ready for demo

Browse files
checkpoint-16000/trainer_state.json DELETED
@@ -1,1226 +0,0 @@
1
- {
2
- "best_global_step": 4000,
3
- "best_metric": 0.47247838973999023,
4
- "best_model_checkpoint": "mamba_nli_ensemble/checkpoint-4000",
5
- "epoch": 2.619515389652914,
6
- "eval_steps": 2000,
7
- "global_step": 16000,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.016371971185330715,
14
- "grad_norm": 29.85440444946289,
15
- "learning_rate": 1.6371971185330716e-06,
16
- "loss": 0.7262,
17
- "step": 100
18
- },
19
- {
20
- "epoch": 0.03274394237066143,
21
- "grad_norm": 63.15359878540039,
22
- "learning_rate": 3.2743942370661432e-06,
23
- "loss": 0.69,
24
- "step": 200
25
- },
26
- {
27
- "epoch": 0.04911591355599214,
28
- "grad_norm": 9.709274291992188,
29
- "learning_rate": 4.911591355599214e-06,
30
- "loss": 0.6864,
31
- "step": 300
32
- },
33
- {
34
- "epoch": 0.06548788474132286,
35
- "grad_norm": 12.61182689666748,
36
- "learning_rate": 6.5487884741322864e-06,
37
- "loss": 0.6999,
38
- "step": 400
39
- },
40
- {
41
- "epoch": 0.08185985592665357,
42
- "grad_norm": 8.176433563232422,
43
- "learning_rate": 8.185985592665357e-06,
44
- "loss": 0.6842,
45
- "step": 500
46
- },
47
- {
48
- "epoch": 0.09823182711198428,
49
- "grad_norm": 2.774385690689087,
50
- "learning_rate": 9.823182711198428e-06,
51
- "loss": 0.6686,
52
- "step": 600
53
- },
54
- {
55
- "epoch": 0.114603798297315,
56
- "grad_norm": 5.177670955657959,
57
- "learning_rate": 1.14603798297315e-05,
58
- "loss": 0.6835,
59
- "step": 700
60
- },
61
- {
62
- "epoch": 0.13097576948264572,
63
- "grad_norm": 3.342498302459717,
64
- "learning_rate": 1.3097576948264573e-05,
65
- "loss": 0.6631,
66
- "step": 800
67
- },
68
- {
69
- "epoch": 0.14734774066797643,
70
- "grad_norm": 4.506181716918945,
71
- "learning_rate": 1.4734774066797644e-05,
72
- "loss": 0.6665,
73
- "step": 900
74
- },
75
- {
76
- "epoch": 0.16371971185330714,
77
- "grad_norm": 6.701014995574951,
78
- "learning_rate": 1.6371971185330713e-05,
79
- "loss": 0.6644,
80
- "step": 1000
81
- },
82
- {
83
- "epoch": 0.18009168303863785,
84
- "grad_norm": 5.5269246101379395,
85
- "learning_rate": 1.8009168303863786e-05,
86
- "loss": 0.651,
87
- "step": 1100
88
- },
89
- {
90
- "epoch": 0.19646365422396855,
91
- "grad_norm": 2.700180768966675,
92
- "learning_rate": 1.9646365422396855e-05,
93
- "loss": 0.6115,
94
- "step": 1200
95
- },
96
- {
97
- "epoch": 0.2128356254092993,
98
- "grad_norm": 10.621367454528809,
99
- "learning_rate": 2.128356254092993e-05,
100
- "loss": 0.6403,
101
- "step": 1300
102
- },
103
- {
104
- "epoch": 0.22920759659463,
105
- "grad_norm": 2.2622969150543213,
106
- "learning_rate": 2.2920759659463e-05,
107
- "loss": 0.5914,
108
- "step": 1400
109
- },
110
- {
111
- "epoch": 0.2455795677799607,
112
- "grad_norm": 4.627520561218262,
113
- "learning_rate": 2.4557956777996073e-05,
114
- "loss": 0.6149,
115
- "step": 1500
116
- },
117
- {
118
- "epoch": 0.26195153896529144,
119
- "grad_norm": 5.510996341705322,
120
- "learning_rate": 2.6195153896529146e-05,
121
- "loss": 0.6149,
122
- "step": 1600
123
- },
124
- {
125
- "epoch": 0.2783235101506221,
126
- "grad_norm": 6.97050142288208,
127
- "learning_rate": 2.7832351015062215e-05,
128
- "loss": 0.6256,
129
- "step": 1700
130
- },
131
- {
132
- "epoch": 0.29469548133595286,
133
- "grad_norm": 2.5861706733703613,
134
- "learning_rate": 2.9469548133595288e-05,
135
- "loss": 0.5884,
136
- "step": 1800
137
- },
138
- {
139
- "epoch": 0.31106745252128354,
140
- "grad_norm": 12.085710525512695,
141
- "learning_rate": 3.110674525212836e-05,
142
- "loss": 0.5986,
143
- "step": 1900
144
- },
145
- {
146
- "epoch": 0.3274394237066143,
147
- "grad_norm": 9.400935173034668,
148
- "learning_rate": 3.2743942370661426e-05,
149
- "loss": 0.6357,
150
- "step": 2000
151
- },
152
- {
153
- "epoch": 0.3274394237066143,
154
- "eval_accuracy": 0.7325408618127786,
155
- "eval_loss": 0.5683333277702332,
156
- "eval_runtime": 5.8436,
157
- "eval_samples_per_second": 115.17,
158
- "eval_steps_per_second": 7.359,
159
- "step": 2000
160
- },
161
- {
162
- "epoch": 0.343811394891945,
163
- "grad_norm": 4.135758399963379,
164
- "learning_rate": 3.43811394891945e-05,
165
- "loss": 0.5574,
166
- "step": 2100
167
- },
168
- {
169
- "epoch": 0.3601833660772757,
170
- "grad_norm": 8.234903335571289,
171
- "learning_rate": 3.601833660772757e-05,
172
- "loss": 0.5793,
173
- "step": 2200
174
- },
175
- {
176
- "epoch": 0.3765553372626064,
177
- "grad_norm": 11.848236083984375,
178
- "learning_rate": 3.765553372626065e-05,
179
- "loss": 0.5812,
180
- "step": 2300
181
- },
182
- {
183
- "epoch": 0.3929273084479371,
184
- "grad_norm": 15.417450904846191,
185
- "learning_rate": 3.929273084479371e-05,
186
- "loss": 0.5165,
187
- "step": 2400
188
- },
189
- {
190
- "epoch": 0.40929927963326784,
191
- "grad_norm": 170.8882293701172,
192
- "learning_rate": 4.0929927963326786e-05,
193
- "loss": 0.6833,
194
- "step": 2500
195
- },
196
- {
197
- "epoch": 0.4256712508185986,
198
- "grad_norm": 256.7959289550781,
199
- "learning_rate": 4.256712508185986e-05,
200
- "loss": 0.6222,
201
- "step": 2600
202
- },
203
- {
204
- "epoch": 0.44204322200392926,
205
- "grad_norm": 7.283181667327881,
206
- "learning_rate": 4.4204322200392925e-05,
207
- "loss": 0.6134,
208
- "step": 2700
209
- },
210
- {
211
- "epoch": 0.45841519318926,
212
- "grad_norm": 3.3215067386627197,
213
- "learning_rate": 4.5841519318926e-05,
214
- "loss": 0.5355,
215
- "step": 2800
216
- },
217
- {
218
- "epoch": 0.4747871643745907,
219
- "grad_norm": 6.261496543884277,
220
- "learning_rate": 4.747871643745907e-05,
221
- "loss": 0.5777,
222
- "step": 2900
223
- },
224
- {
225
- "epoch": 0.4911591355599214,
226
- "grad_norm": 5.140758991241455,
227
- "learning_rate": 4.9115913555992146e-05,
228
- "loss": 0.586,
229
- "step": 3000
230
- },
231
- {
232
- "epoch": 0.5075311067452521,
233
- "grad_norm": 4.261934280395508,
234
- "learning_rate": 4.999965445760666e-05,
235
- "loss": 0.6135,
236
- "step": 3100
237
- },
238
- {
239
- "epoch": 0.5239030779305829,
240
- "grad_norm": 3.795130968093872,
241
- "learning_rate": 4.999651917405523e-05,
242
- "loss": 0.6042,
243
- "step": 3200
244
- },
245
- {
246
- "epoch": 0.5402750491159135,
247
- "grad_norm": 5.0412116050720215,
248
- "learning_rate": 4.999011837711028e-05,
249
- "loss": 0.5806,
250
- "step": 3300
251
- },
252
- {
253
- "epoch": 0.5566470203012442,
254
- "grad_norm": 10.38508415222168,
255
- "learning_rate": 4.998045290296376e-05,
256
- "loss": 0.4947,
257
- "step": 3400
258
- },
259
- {
260
- "epoch": 0.573018991486575,
261
- "grad_norm": 38.63774108886719,
262
- "learning_rate": 4.9967524014300896e-05,
263
- "loss": 0.5181,
264
- "step": 3500
265
- },
266
- {
267
- "epoch": 0.5893909626719057,
268
- "grad_norm": 26.06133460998535,
269
- "learning_rate": 4.995133340013522e-05,
270
- "loss": 0.5092,
271
- "step": 3600
272
- },
273
- {
274
- "epoch": 0.6057629338572365,
275
- "grad_norm": 17.468584060668945,
276
- "learning_rate": 4.993188317558791e-05,
277
- "loss": 0.6309,
278
- "step": 3700
279
- },
280
- {
281
- "epoch": 0.6221349050425671,
282
- "grad_norm": 2.848097562789917,
283
- "learning_rate": 4.9909175881611514e-05,
284
- "loss": 0.496,
285
- "step": 3800
286
- },
287
- {
288
- "epoch": 0.6385068762278978,
289
- "grad_norm": 13.13325309753418,
290
- "learning_rate": 4.9883214484657957e-05,
291
- "loss": 0.5044,
292
- "step": 3900
293
- },
294
- {
295
- "epoch": 0.6548788474132285,
296
- "grad_norm": 10.311493873596191,
297
- "learning_rate": 4.9854002376291046e-05,
298
- "loss": 0.6015,
299
- "step": 4000
300
- },
301
- {
302
- "epoch": 0.6548788474132285,
303
- "eval_accuracy": 0.7934621099554234,
304
- "eval_loss": 0.47247838973999023,
305
- "eval_runtime": 5.8787,
306
- "eval_samples_per_second": 114.482,
307
- "eval_steps_per_second": 7.315,
308
- "step": 4000
309
- },
310
- {
311
- "epoch": 0.6712508185985593,
312
- "grad_norm": 47.73946762084961,
313
- "learning_rate": 4.9821543372743355e-05,
314
- "loss": 0.4682,
315
- "step": 4100
316
- },
317
- {
318
- "epoch": 0.68762278978389,
319
- "grad_norm": 14.78221321105957,
320
- "learning_rate": 4.9785841714417734e-05,
321
- "loss": 0.5595,
322
- "step": 4200
323
- },
324
- {
325
- "epoch": 0.7039947609692206,
326
- "grad_norm": 11.297691345214844,
327
- "learning_rate": 4.97469020653333e-05,
328
- "loss": 0.5608,
329
- "step": 4300
330
- },
331
- {
332
- "epoch": 0.7203667321545514,
333
- "grad_norm": 29.37626075744629,
334
- "learning_rate": 4.970472951251617e-05,
335
- "loss": 0.5414,
336
- "step": 4400
337
- },
338
- {
339
- "epoch": 0.7367387033398821,
340
- "grad_norm": 11.616437911987305,
341
- "learning_rate": 4.9659329565334854e-05,
342
- "loss": 0.5016,
343
- "step": 4500
344
- },
345
- {
346
- "epoch": 0.7531106745252129,
347
- "grad_norm": 8.013294219970703,
348
- "learning_rate": 4.9610708154780585e-05,
349
- "loss": 0.5775,
350
- "step": 4600
351
- },
352
- {
353
- "epoch": 0.7694826457105436,
354
- "grad_norm": 5.327792167663574,
355
- "learning_rate": 4.955887163269243e-05,
356
- "loss": 0.5126,
357
- "step": 4700
358
- },
359
- {
360
- "epoch": 0.7858546168958742,
361
- "grad_norm": 1.6188075542449951,
362
- "learning_rate": 4.950382677092754e-05,
363
- "loss": 0.5971,
364
- "step": 4800
365
- },
366
- {
367
- "epoch": 0.802226588081205,
368
- "grad_norm": 6.731476306915283,
369
- "learning_rate": 4.944558076047649e-05,
370
- "loss": 0.5159,
371
- "step": 4900
372
- },
373
- {
374
- "epoch": 0.8185985592665357,
375
- "grad_norm": 1.4254143238067627,
376
- "learning_rate": 4.9384141210523804e-05,
377
- "loss": 0.5246,
378
- "step": 5000
379
- },
380
- {
381
- "epoch": 0.8349705304518664,
382
- "grad_norm": 44.3812370300293,
383
- "learning_rate": 4.931951614745395e-05,
384
- "loss": 0.5282,
385
- "step": 5100
386
- },
387
- {
388
- "epoch": 0.8513425016371972,
389
- "grad_norm": 2.9070582389831543,
390
- "learning_rate": 4.925171401380278e-05,
391
- "loss": 0.5517,
392
- "step": 5200
393
- },
394
- {
395
- "epoch": 0.8677144728225278,
396
- "grad_norm": 33.969303131103516,
397
- "learning_rate": 4.918074366715457e-05,
398
- "loss": 0.5576,
399
- "step": 5300
400
- },
401
- {
402
- "epoch": 0.8840864440078585,
403
- "grad_norm": 1.7158643007278442,
404
- "learning_rate": 4.910661437898493e-05,
405
- "loss": 0.6597,
406
- "step": 5400
407
- },
408
- {
409
- "epoch": 0.9004584151931893,
410
- "grad_norm": 42.425262451171875,
411
- "learning_rate": 4.902933583344954e-05,
412
- "loss": 0.5421,
413
- "step": 5500
414
- },
415
- {
416
- "epoch": 0.91683038637852,
417
- "grad_norm": 9.340117454528809,
418
- "learning_rate": 4.8948918126119056e-05,
419
- "loss": 0.5591,
420
- "step": 5600
421
- },
422
- {
423
- "epoch": 0.9332023575638507,
424
- "grad_norm": 1.2870229482650757,
425
- "learning_rate": 4.886537176266024e-05,
426
- "loss": 0.5126,
427
- "step": 5700
428
- },
429
- {
430
- "epoch": 0.9495743287491814,
431
- "grad_norm": 36.61764907836914,
432
- "learning_rate": 4.877870765746347e-05,
433
- "loss": 0.5015,
434
- "step": 5800
435
- },
436
- {
437
- "epoch": 0.9659462999345121,
438
- "grad_norm": 13.943790435791016,
439
- "learning_rate": 4.8688937132216966e-05,
440
- "loss": 0.4911,
441
- "step": 5900
442
- },
443
- {
444
- "epoch": 0.9823182711198428,
445
- "grad_norm": 18.877832412719727,
446
- "learning_rate": 4.859607191442768e-05,
447
- "loss": 0.5838,
448
- "step": 6000
449
- },
450
- {
451
- "epoch": 0.9823182711198428,
452
- "eval_accuracy": 0.787518573551263,
453
- "eval_loss": 0.5758853554725647,
454
- "eval_runtime": 5.9463,
455
- "eval_samples_per_second": 113.18,
456
- "eval_steps_per_second": 7.231,
457
- "step": 6000
458
- },
459
- {
460
- "epoch": 0.9986902423051736,
461
- "grad_norm": 24.715587615966797,
462
- "learning_rate": 4.850012413588926e-05,
463
- "loss": 0.4975,
464
- "step": 6100
465
- },
466
- {
467
- "epoch": 1.0150622134905043,
468
- "grad_norm": 13.0919828414917,
469
- "learning_rate": 4.840110633109716e-05,
470
- "loss": 0.511,
471
- "step": 6200
472
- },
473
- {
474
- "epoch": 1.031434184675835,
475
- "grad_norm": 0.9892318844795227,
476
- "learning_rate": 4.829903143561113e-05,
477
- "loss": 0.539,
478
- "step": 6300
479
- },
480
- {
481
- "epoch": 1.0478061558611658,
482
- "grad_norm": 18.864852905273438,
483
- "learning_rate": 4.819391278436539e-05,
484
- "loss": 0.4764,
485
- "step": 6400
486
- },
487
- {
488
- "epoch": 1.0641781270464965,
489
- "grad_norm": 0.43955135345458984,
490
- "learning_rate": 4.8085764109926494e-05,
491
- "loss": 0.3564,
492
- "step": 6500
493
- },
494
- {
495
- "epoch": 1.080550098231827,
496
- "grad_norm": 0.3193933963775635,
497
- "learning_rate": 4.7974599540699386e-05,
498
- "loss": 0.4985,
499
- "step": 6600
500
- },
501
- {
502
- "epoch": 1.0969220694171578,
503
- "grad_norm": 17.164764404296875,
504
- "learning_rate": 4.7860433599081654e-05,
505
- "loss": 0.4675,
506
- "step": 6700
507
- },
508
- {
509
- "epoch": 1.1132940406024885,
510
- "grad_norm": 0.31629478931427,
511
- "learning_rate": 4.774328119956633e-05,
512
- "loss": 0.5588,
513
- "step": 6800
514
- },
515
- {
516
- "epoch": 1.1296660117878192,
517
- "grad_norm": 8.383326530456543,
518
- "learning_rate": 4.762315764679353e-05,
519
- "loss": 0.4681,
520
- "step": 6900
521
- },
522
- {
523
- "epoch": 1.14603798297315,
524
- "grad_norm": 0.6111562252044678,
525
- "learning_rate": 4.750007863355102e-05,
526
- "loss": 0.4538,
527
- "step": 7000
528
- },
529
- {
530
- "epoch": 1.1624099541584807,
531
- "grad_norm": 12.500487327575684,
532
- "learning_rate": 4.737406023872416e-05,
533
- "loss": 0.445,
534
- "step": 7100
535
- },
536
- {
537
- "epoch": 1.1787819253438114,
538
- "grad_norm": 0.2397201806306839,
539
- "learning_rate": 4.7245118925195374e-05,
540
- "loss": 0.4252,
541
- "step": 7200
542
- },
543
- {
544
- "epoch": 1.1951538965291422,
545
- "grad_norm": 22.245431900024414,
546
- "learning_rate": 4.7113271537693454e-05,
547
- "loss": 0.4567,
548
- "step": 7300
549
- },
550
- {
551
- "epoch": 1.211525867714473,
552
- "grad_norm": 0.6519126892089844,
553
- "learning_rate": 4.6978535300593e-05,
554
- "loss": 0.5072,
555
- "step": 7400
556
- },
557
- {
558
- "epoch": 1.2278978388998034,
559
- "grad_norm": 0.5901318192481995,
560
- "learning_rate": 4.684092781566422e-05,
561
- "loss": 0.3924,
562
- "step": 7500
563
- },
564
- {
565
- "epoch": 1.2442698100851342,
566
- "grad_norm": 12.497309684753418,
567
- "learning_rate": 4.67004670597735e-05,
568
- "loss": 0.5874,
569
- "step": 7600
570
- },
571
- {
572
- "epoch": 1.260641781270465,
573
- "grad_norm": 2.344989538192749,
574
- "learning_rate": 4.6557171382534915e-05,
575
- "loss": 0.4327,
576
- "step": 7700
577
- },
578
- {
579
- "epoch": 1.2770137524557956,
580
- "grad_norm": 17.146575927734375,
581
- "learning_rate": 4.6411059503913e-05,
582
- "loss": 0.5129,
583
- "step": 7800
584
- },
585
- {
586
- "epoch": 1.2933857236411264,
587
- "grad_norm": 1.8164324760437012,
588
- "learning_rate": 4.62621505117773e-05,
589
- "loss": 0.5208,
590
- "step": 7900
591
- },
592
- {
593
- "epoch": 1.309757694826457,
594
- "grad_norm": 13.502025604248047,
595
- "learning_rate": 4.611046385940868e-05,
596
- "loss": 0.4884,
597
- "step": 8000
598
- },
599
- {
600
- "epoch": 1.309757694826457,
601
- "eval_accuracy": 0.7815750371471025,
602
- "eval_loss": 0.6778917908668518,
603
- "eval_runtime": 5.9221,
604
- "eval_samples_per_second": 113.642,
605
- "eval_steps_per_second": 7.261,
606
- "step": 8000
607
- },
608
- {
609
- "epoch": 1.3261296660117878,
610
- "grad_norm": 8.141022682189941,
611
- "learning_rate": 4.5956019362958006e-05,
612
- "loss": 0.4438,
613
- "step": 8100
614
- },
615
- {
616
- "epoch": 1.3425016371971186,
617
- "grad_norm": 0.91554856300354,
618
- "learning_rate": 4.5798837198857356e-05,
619
- "loss": 0.4617,
620
- "step": 8200
621
- },
622
- {
623
- "epoch": 1.3588736083824493,
624
- "grad_norm": 0.6429493427276611,
625
- "learning_rate": 4.563893790118426e-05,
626
- "loss": 0.4947,
627
- "step": 8300
628
- },
629
- {
630
- "epoch": 1.37524557956778,
631
- "grad_norm": 0.22166290879249573,
632
- "learning_rate": 4.547634235897906e-05,
633
- "loss": 0.4057,
634
- "step": 8400
635
- },
636
- {
637
- "epoch": 1.3916175507531108,
638
- "grad_norm": 9.768654823303223,
639
- "learning_rate": 4.5311071813516106e-05,
640
- "loss": 0.3861,
641
- "step": 8500
642
- },
643
- {
644
- "epoch": 1.4079895219384415,
645
- "grad_norm": 0.4799940586090088,
646
- "learning_rate": 4.514314785552871e-05,
647
- "loss": 0.4809,
648
- "step": 8600
649
- },
650
- {
651
- "epoch": 1.424361493123772,
652
- "grad_norm": 11.279892921447754,
653
- "learning_rate": 4.4972592422388634e-05,
654
- "loss": 0.4472,
655
- "step": 8700
656
- },
657
- {
658
- "epoch": 1.4407334643091028,
659
- "grad_norm": 0.24561847746372223,
660
- "learning_rate": 4.479942779524022e-05,
661
- "loss": 0.5542,
662
- "step": 8800
663
- },
664
- {
665
- "epoch": 1.4571054354944335,
666
- "grad_norm": 0.3300779163837433,
667
- "learning_rate": 4.462367659608955e-05,
668
- "loss": 0.427,
669
- "step": 8900
670
- },
671
- {
672
- "epoch": 1.4734774066797642,
673
- "grad_norm": 25.95767593383789,
674
- "learning_rate": 4.4445361784849195e-05,
675
- "loss": 0.4385,
676
- "step": 9000
677
- },
678
- {
679
- "epoch": 1.489849377865095,
680
- "grad_norm": 7.935421466827393,
681
- "learning_rate": 4.4264506656338745e-05,
682
- "loss": 0.4963,
683
- "step": 9100
684
- },
685
- {
686
- "epoch": 1.5062213490504257,
687
- "grad_norm": 0.8363397717475891,
688
- "learning_rate": 4.4081134837241585e-05,
689
- "loss": 0.4049,
690
- "step": 9200
691
- },
692
- {
693
- "epoch": 1.5225933202357562,
694
- "grad_norm": 0.5877137780189514,
695
- "learning_rate": 4.389527028301836e-05,
696
- "loss": 0.4596,
697
- "step": 9300
698
- },
699
- {
700
- "epoch": 1.538965291421087,
701
- "grad_norm": 7.968419075012207,
702
- "learning_rate": 4.370693727477745e-05,
703
- "loss": 0.5687,
704
- "step": 9400
705
- },
706
- {
707
- "epoch": 1.5553372626064177,
708
- "grad_norm": 26.651159286499023,
709
- "learning_rate": 4.351616041610292e-05,
710
- "loss": 0.5284,
711
- "step": 9500
712
- },
713
- {
714
- "epoch": 1.5717092337917484,
715
- "grad_norm": 7.4741129875183105,
716
- "learning_rate": 4.3322964629840344e-05,
717
- "loss": 0.4048,
718
- "step": 9600
719
- },
720
- {
721
- "epoch": 1.5880812049770792,
722
- "grad_norm": 0.2695929706096649,
723
- "learning_rate": 4.312737515484091e-05,
724
- "loss": 0.4803,
725
- "step": 9700
726
- },
727
- {
728
- "epoch": 1.60445317616241,
729
- "grad_norm": 17.771469116210938,
730
- "learning_rate": 4.2929417542664244e-05,
731
- "loss": 0.5075,
732
- "step": 9800
733
- },
734
- {
735
- "epoch": 1.6208251473477406,
736
- "grad_norm": 15.19389820098877,
737
- "learning_rate": 4.272911765424039e-05,
738
- "loss": 0.4512,
739
- "step": 9900
740
- },
741
- {
742
- "epoch": 1.6371971185330714,
743
- "grad_norm": 0.9662685394287109,
744
- "learning_rate": 4.2526501656491405e-05,
745
- "loss": 0.4284,
746
- "step": 10000
747
- },
748
- {
749
- "epoch": 1.6371971185330714,
750
- "eval_accuracy": 0.8053491827637445,
751
- "eval_loss": 0.6046619415283203,
752
- "eval_runtime": 5.9328,
753
- "eval_samples_per_second": 113.438,
754
- "eval_steps_per_second": 7.248,
755
- "step": 10000
756
- },
757
- {
758
- "epoch": 1.653569089718402,
759
- "grad_norm": 6.945915222167969,
760
- "learning_rate": 4.232159601891287e-05,
761
- "loss": 0.4298,
762
- "step": 10100
763
- },
764
- {
765
- "epoch": 1.6699410609037328,
766
- "grad_norm": 0.882854700088501,
767
- "learning_rate": 4.2114427510116036e-05,
768
- "loss": 0.4362,
769
- "step": 10200
770
- },
771
- {
772
- "epoch": 1.6863130320890636,
773
- "grad_norm": 19.653860092163086,
774
- "learning_rate": 4.1905023194330726e-05,
775
- "loss": 0.4404,
776
- "step": 10300
777
- },
778
- {
779
- "epoch": 1.7026850032743943,
780
- "grad_norm": 26.812660217285156,
781
- "learning_rate": 4.169341042786977e-05,
782
- "loss": 0.513,
783
- "step": 10400
784
- },
785
- {
786
- "epoch": 1.719056974459725,
787
- "grad_norm": 0.45196932554244995,
788
- "learning_rate": 4.147961685555517e-05,
789
- "loss": 0.3751,
790
- "step": 10500
791
- },
792
- {
793
- "epoch": 1.7354289456450558,
794
- "grad_norm": 14.361971855163574,
795
- "learning_rate": 4.12636704071066e-05,
796
- "loss": 0.4847,
797
- "step": 10600
798
- },
799
- {
800
- "epoch": 1.7518009168303865,
801
- "grad_norm": 1.6658482551574707,
802
- "learning_rate": 4.104559929349277e-05,
803
- "loss": 0.3794,
804
- "step": 10700
805
- },
806
- {
807
- "epoch": 1.768172888015717,
808
- "grad_norm": 37.13115692138672,
809
- "learning_rate": 4.08254320032459e-05,
810
- "loss": 0.423,
811
- "step": 10800
812
- },
813
- {
814
- "epoch": 1.7845448592010478,
815
- "grad_norm": 0.23938187956809998,
816
- "learning_rate": 4.060319729874007e-05,
817
- "loss": 0.4097,
818
- "step": 10900
819
- },
820
- {
821
- "epoch": 1.8009168303863785,
822
- "grad_norm": 4.112476825714111,
823
- "learning_rate": 4.0378924212433715e-05,
824
- "loss": 0.472,
825
- "step": 11000
826
- },
827
- {
828
- "epoch": 1.8172888015717092,
829
- "grad_norm": 14.257973670959473,
830
- "learning_rate": 4.0152642043076884e-05,
831
- "loss": 0.4984,
832
- "step": 11100
833
- },
834
- {
835
- "epoch": 1.83366077275704,
836
- "grad_norm": 13.519189834594727,
837
- "learning_rate": 3.992438035188366e-05,
838
- "loss": 0.3799,
839
- "step": 11200
840
- },
841
- {
842
- "epoch": 1.8500327439423707,
843
- "grad_norm": 3.054865837097168,
844
- "learning_rate": 3.969416895867034e-05,
845
- "loss": 0.4403,
846
- "step": 11300
847
- },
848
- {
849
- "epoch": 1.8664047151277012,
850
- "grad_norm": 2.4153125286102295,
851
- "learning_rate": 3.946203793795982e-05,
852
- "loss": 0.4938,
853
- "step": 11400
854
- },
855
- {
856
- "epoch": 1.882776686313032,
857
- "grad_norm": 0.34407490491867065,
858
- "learning_rate": 3.922801761505264e-05,
859
- "loss": 0.4565,
860
- "step": 11500
861
- },
862
- {
863
- "epoch": 1.8991486574983627,
864
- "grad_norm": 19.420068740844727,
865
- "learning_rate": 3.8992138562065415e-05,
866
- "loss": 0.4581,
867
- "step": 11600
868
- },
869
- {
870
- "epoch": 1.9155206286836934,
871
- "grad_norm": 23.287689208984375,
872
- "learning_rate": 3.875443159393689e-05,
873
- "loss": 0.3939,
874
- "step": 11700
875
- },
876
- {
877
- "epoch": 1.9318925998690242,
878
- "grad_norm": 31.722305297851562,
879
- "learning_rate": 3.8514927764402274e-05,
880
- "loss": 0.5128,
881
- "step": 11800
882
- },
883
- {
884
- "epoch": 1.948264571054355,
885
- "grad_norm": 175.8199920654297,
886
- "learning_rate": 3.8273658361936505e-05,
887
- "loss": 0.397,
888
- "step": 11900
889
- },
890
- {
891
- "epoch": 1.9646365422396856,
892
- "grad_norm": 9.348654747009277,
893
- "learning_rate": 3.803065490566667e-05,
894
- "loss": 0.4633,
895
- "step": 12000
896
- },
897
- {
898
- "epoch": 1.9646365422396856,
899
- "eval_accuracy": 0.8172362555720654,
900
- "eval_loss": 0.6007680296897888,
901
- "eval_runtime": 5.8584,
902
- "eval_samples_per_second": 114.878,
903
- "eval_steps_per_second": 7.34,
904
- "step": 12000
905
- },
906
- {
907
- "epoch": 1.9810085134250164,
908
- "grad_norm": 0.6896190047264099,
909
- "learning_rate": 3.7785949141254475e-05,
910
- "loss": 0.3702,
911
- "step": 12100
912
- },
913
- {
914
- "epoch": 1.9973804846103471,
915
- "grad_norm": 3.107114315032959,
916
- "learning_rate": 3.753957303674897e-05,
917
- "loss": 0.4142,
918
- "step": 12200
919
- },
920
- {
921
- "epoch": 2.013752455795678,
922
- "grad_norm": 0.11377547681331635,
923
- "learning_rate": 3.7291558778410314e-05,
924
- "loss": 0.2155,
925
- "step": 12300
926
- },
927
- {
928
- "epoch": 2.0301244269810086,
929
- "grad_norm": 1.9735820293426514,
930
- "learning_rate": 3.704193876650499e-05,
931
- "loss": 0.2937,
932
- "step": 12400
933
- },
934
- {
935
- "epoch": 2.0464963981663393,
936
- "grad_norm": 0.07780765742063522,
937
- "learning_rate": 3.6790745611073065e-05,
938
- "loss": 0.189,
939
- "step": 12500
940
- },
941
- {
942
- "epoch": 2.06286836935167,
943
- "grad_norm": 10.954134941101074,
944
- "learning_rate": 3.65380121276681e-05,
945
- "loss": 0.2428,
946
- "step": 12600
947
- },
948
- {
949
- "epoch": 2.079240340537001,
950
- "grad_norm": 0.05607511103153229,
951
- "learning_rate": 3.6283771333070127e-05,
952
- "loss": 0.2512,
953
- "step": 12700
954
- },
955
- {
956
- "epoch": 2.0956123117223315,
957
- "grad_norm": 16.878358840942383,
958
- "learning_rate": 3.6028056440972374e-05,
959
- "loss": 0.226,
960
- "step": 12800
961
- },
962
- {
963
- "epoch": 2.1119842829076623,
964
- "grad_norm": 1.4940121173858643,
965
- "learning_rate": 3.5770900857642306e-05,
966
- "loss": 0.1941,
967
- "step": 12900
968
- },
969
- {
970
- "epoch": 2.128356254092993,
971
- "grad_norm": 28.74452781677246,
972
- "learning_rate": 3.551233817755745e-05,
973
- "loss": 0.1594,
974
- "step": 13000
975
- },
976
- {
977
- "epoch": 2.1447282252783237,
978
- "grad_norm": 0.10467255115509033,
979
- "learning_rate": 3.525240217901665e-05,
980
- "loss": 0.3147,
981
- "step": 13100
982
- },
983
- {
984
- "epoch": 2.161100196463654,
985
- "grad_norm": 125.1854248046875,
986
- "learning_rate": 3.499112681972734e-05,
987
- "loss": 0.3013,
988
- "step": 13200
989
- },
990
- {
991
- "epoch": 2.1774721676489848,
992
- "grad_norm": 0.17910659313201904,
993
- "learning_rate": 3.4728546232369303e-05,
994
- "loss": 0.163,
995
- "step": 13300
996
- },
997
- {
998
- "epoch": 2.1938441388343155,
999
- "grad_norm": 0.015883993357419968,
1000
- "learning_rate": 3.4464694720135695e-05,
1001
- "loss": 0.258,
1002
- "step": 13400
1003
- },
1004
- {
1005
- "epoch": 2.2102161100196462,
1006
- "grad_norm": 0.01641729474067688,
1007
- "learning_rate": 3.4199606752251634e-05,
1008
- "loss": 0.138,
1009
- "step": 13500
1010
- },
1011
- {
1012
- "epoch": 2.226588081204977,
1013
- "grad_norm": 0.05746021494269371,
1014
- "learning_rate": 3.3933316959471265e-05,
1015
- "loss": 0.2195,
1016
- "step": 13600
1017
- },
1018
- {
1019
- "epoch": 2.2429600523903077,
1020
- "grad_norm": 0.017622973769903183,
1021
- "learning_rate": 3.3665860129553584e-05,
1022
- "loss": 0.1332,
1023
- "step": 13700
1024
- },
1025
- {
1026
- "epoch": 2.2593320235756384,
1027
- "grad_norm": 0.048372671008110046,
1028
- "learning_rate": 3.3397271202717834e-05,
1029
- "loss": 0.2528,
1030
- "step": 13800
1031
- },
1032
- {
1033
- "epoch": 2.275703994760969,
1034
- "grad_norm": 0.06655670702457428,
1035
- "learning_rate": 3.312758526707895e-05,
1036
- "loss": 0.2649,
1037
- "step": 13900
1038
- },
1039
- {
1040
- "epoch": 2.2920759659463,
1041
- "grad_norm": 0.11124414205551147,
1042
- "learning_rate": 3.285683755406373e-05,
1043
- "loss": 0.2191,
1044
- "step": 14000
1045
- },
1046
- {
1047
- "epoch": 2.2920759659463,
1048
- "eval_accuracy": 0.8231797919762258,
1049
- "eval_loss": 0.8087013363838196,
1050
- "eval_runtime": 5.8945,
1051
- "eval_samples_per_second": 114.174,
1052
- "eval_steps_per_second": 7.295,
1053
- "step": 14000
1054
- },
1055
- {
1056
- "epoch": 2.3084479371316307,
1057
- "grad_norm": 0.172523632645607,
1058
- "learning_rate": 3.258506343380815e-05,
1059
- "loss": 0.2064,
1060
- "step": 14100
1061
- },
1062
- {
1063
- "epoch": 2.3248199083169614,
1064
- "grad_norm": 0.041510824114084244,
1065
- "learning_rate": 3.23122984105368e-05,
1066
- "loss": 0.2191,
1067
- "step": 14200
1068
- },
1069
- {
1070
- "epoch": 2.341191879502292,
1071
- "grad_norm": 0.022055430337786674,
1072
- "learning_rate": 3.203857811792451e-05,
1073
- "loss": 0.2203,
1074
- "step": 14300
1075
- },
1076
- {
1077
- "epoch": 2.357563850687623,
1078
- "grad_norm": 0.09686886519193649,
1079
- "learning_rate": 3.176393831444131e-05,
1080
- "loss": 0.271,
1081
- "step": 14400
1082
- },
1083
- {
1084
- "epoch": 2.3739358218729536,
1085
- "grad_norm": 0.0746513158082962,
1086
- "learning_rate": 3.148841487868095e-05,
1087
- "loss": 0.2665,
1088
- "step": 14500
1089
- },
1090
- {
1091
- "epoch": 2.3903077930582843,
1092
- "grad_norm": 0.01806914061307907,
1093
- "learning_rate": 3.121204380467379e-05,
1094
- "loss": 0.1947,
1095
- "step": 14600
1096
- },
1097
- {
1098
- "epoch": 2.406679764243615,
1099
- "grad_norm": 0.018033646047115326,
1100
- "learning_rate": 3.093486119718455e-05,
1101
- "loss": 0.1888,
1102
- "step": 14700
1103
- },
1104
- {
1105
- "epoch": 2.423051735428946,
1106
- "grad_norm": 0.01413845457136631,
1107
- "learning_rate": 3.065690326699564e-05,
1108
- "loss": 0.1409,
1109
- "step": 14800
1110
- },
1111
- {
1112
- "epoch": 2.4394237066142765,
1113
- "grad_norm": 0.12208285927772522,
1114
- "learning_rate": 3.0378206326176674e-05,
1115
- "loss": 0.2763,
1116
- "step": 14900
1117
- },
1118
- {
1119
- "epoch": 2.455795677799607,
1120
- "grad_norm": 0.08330941945314407,
1121
- "learning_rate": 3.0098806783340644e-05,
1122
- "loss": 0.2277,
1123
- "step": 15000
1124
- },
1125
- {
1126
- "epoch": 2.4721676489849376,
1127
- "grad_norm": 0.10370787978172302,
1128
- "learning_rate": 2.9818741138887584e-05,
1129
- "loss": 0.2776,
1130
- "step": 15100
1131
- },
1132
- {
1133
- "epoch": 2.4885396201702683,
1134
- "grad_norm": 0.2352355420589447,
1135
- "learning_rate": 2.9538045980236194e-05,
1136
- "loss": 0.2935,
1137
- "step": 15200
1138
- },
1139
- {
1140
- "epoch": 2.504911591355599,
1141
- "grad_norm": 2.1543736457824707,
1142
- "learning_rate": 2.925675797704411e-05,
1143
- "loss": 0.2717,
1144
- "step": 15300
1145
- },
1146
- {
1147
- "epoch": 2.52128356254093,
1148
- "grad_norm": 45.37192916870117,
1149
- "learning_rate": 2.89749138764174e-05,
1150
- "loss": 0.1947,
1151
- "step": 15400
1152
- },
1153
- {
1154
- "epoch": 2.5376555337262605,
1155
- "grad_norm": 0.01007983461022377,
1156
- "learning_rate": 2.8692550498110017e-05,
1157
- "loss": 0.1994,
1158
- "step": 15500
1159
- },
1160
- {
1161
- "epoch": 2.5540275049115913,
1162
- "grad_norm": 45.349517822265625,
1163
- "learning_rate": 2.8409704729713694e-05,
1164
- "loss": 0.3004,
1165
- "step": 15600
1166
- },
1167
- {
1168
- "epoch": 2.570399476096922,
1169
- "grad_norm": 0.0555860809981823,
1170
- "learning_rate": 2.812641352183897e-05,
1171
- "loss": 0.235,
1172
- "step": 15700
1173
- },
1174
- {
1175
- "epoch": 2.5867714472822527,
1176
- "grad_norm": 19.971233367919922,
1177
- "learning_rate": 2.784271388328802e-05,
1178
- "loss": 0.2062,
1179
- "step": 15800
1180
- },
1181
- {
1182
- "epoch": 2.6031434184675835,
1183
- "grad_norm": 0.2190929502248764,
1184
- "learning_rate": 2.755864287621992e-05,
1185
- "loss": 0.324,
1186
- "step": 15900
1187
- },
1188
- {
1189
- "epoch": 2.619515389652914,
1190
- "grad_norm": 0.01323526818305254,
1191
- "learning_rate": 2.7274237611308816e-05,
1192
- "loss": 0.2028,
1193
- "step": 16000
1194
- },
1195
- {
1196
- "epoch": 2.619515389652914,
1197
- "eval_accuracy": 0.8231797919762258,
1198
- "eval_loss": 0.8805943727493286,
1199
- "eval_runtime": 5.9036,
1200
- "eval_samples_per_second": 113.998,
1201
- "eval_steps_per_second": 7.284,
1202
- "step": 16000
1203
- }
1204
- ],
1205
- "logging_steps": 100,
1206
- "max_steps": 30540,
1207
- "num_input_tokens_seen": 0,
1208
- "num_train_epochs": 5,
1209
- "save_steps": 2000,
1210
- "stateful_callbacks": {
1211
- "TrainerControl": {
1212
- "args": {
1213
- "should_epoch_stop": false,
1214
- "should_evaluate": false,
1215
- "should_log": false,
1216
- "should_save": true,
1217
- "should_training_stop": false
1218
- },
1219
- "attributes": {}
1220
- }
1221
- },
1222
- "total_flos": 0.0,
1223
- "train_batch_size": 4,
1224
- "trial_name": null,
1225
- "trial_params": null
1226
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{checkpoint-16000 → checkpoint-6108}/config.json RENAMED
File without changes
{checkpoint-16000 → checkpoint-6108}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7bea97b55325c43b8f3dad204e399582a0cbb3ad5099895d9357b02b15c266c9
3
  size 1033299706
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d2cd402905fd4cf0e1e74e162118f4ea8a49452dd227cea5bc85c2145aa212c
3
  size 1033299706
{checkpoint-16000 → checkpoint-6108}/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:829b58ddaceb0fa2c29313fe2d068d2e14ad9c0149c357ae8eb72674d2eddc91
3
  size 516640282
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15c7d2912e1ad4ca2d24cd07a09f2ab74df17e26cb1433aa7f6a074520fb3d73
3
  size 516640282
{checkpoint-16000 → checkpoint-6108}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1a97db8e41139aa1239ba7fb79ddeb0af5998c6305a440c1fe182e6ad02f2f5
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b66e3cc7c452b707ddac5caf0aa17618afb9bc1a0333600a22c4afb353f3165
3
  size 14244
{checkpoint-16000 → checkpoint-6108}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2e07fd269933d93775fcbd2b65b42a65580350ef79eda4f62b5567059bd0986
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e14171307974b46a1fb7cc9993cd3cfe809a4cc5438c6f02b936d5ff7f40b725
3
  size 1064
{checkpoint-16000 → checkpoint-6108}/special_tokens_map.json RENAMED
File without changes
{checkpoint-16000 → checkpoint-6108}/tokenizer.json RENAMED
File without changes
{checkpoint-16000 → checkpoint-6108}/tokenizer_config.json RENAMED
File without changes
checkpoint-6108/trainer_state.json ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 6108,
3
+ "best_metric": 0.6058866381645203,
4
+ "best_model_checkpoint": "mamba_nli_ensemble/checkpoint-6108",
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 6108,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016371971185330715,
14
+ "grad_norm": 15.273209571838379,
15
+ "learning_rate": 1.6371971185330716e-06,
16
+ "loss": 0.7553,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.03274394237066143,
21
+ "grad_norm": 17.171979904174805,
22
+ "learning_rate": 3.2743942370661432e-06,
23
+ "loss": 0.7151,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.04911591355599214,
28
+ "grad_norm": 5.574963569641113,
29
+ "learning_rate": 4.911591355599214e-06,
30
+ "loss": 0.7065,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.06548788474132286,
35
+ "grad_norm": 7.3369059562683105,
36
+ "learning_rate": 6.5487884741322864e-06,
37
+ "loss": 0.6996,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.08185985592665357,
42
+ "grad_norm": 7.506777286529541,
43
+ "learning_rate": 8.185985592665357e-06,
44
+ "loss": 0.7071,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.09823182711198428,
49
+ "grad_norm": 6.177253723144531,
50
+ "learning_rate": 9.823182711198428e-06,
51
+ "loss": 0.6926,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.114603798297315,
56
+ "grad_norm": 5.600251197814941,
57
+ "learning_rate": 1.14603798297315e-05,
58
+ "loss": 0.7145,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.13097576948264572,
63
+ "grad_norm": 2.5524227619171143,
64
+ "learning_rate": 1.3097576948264573e-05,
65
+ "loss": 0.6919,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.14734774066797643,
70
+ "grad_norm": 8.479023933410645,
71
+ "learning_rate": 1.4734774066797644e-05,
72
+ "loss": 0.6819,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.16371971185330714,
77
+ "grad_norm": 5.893022060394287,
78
+ "learning_rate": 1.6371971185330713e-05,
79
+ "loss": 0.6944,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.18009168303863785,
84
+ "grad_norm": 4.31400728225708,
85
+ "learning_rate": 1.8009168303863786e-05,
86
+ "loss": 0.6787,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.19646365422396855,
91
+ "grad_norm": 1.9604185819625854,
92
+ "learning_rate": 1.9646365422396855e-05,
93
+ "loss": 0.6804,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.2128356254092993,
98
+ "grad_norm": 14.416400909423828,
99
+ "learning_rate": 2.128356254092993e-05,
100
+ "loss": 0.6979,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.22920759659463,
105
+ "grad_norm": 2.0943357944488525,
106
+ "learning_rate": 2.2920759659463e-05,
107
+ "loss": 0.6689,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.2455795677799607,
112
+ "grad_norm": 4.136998176574707,
113
+ "learning_rate": 2.4557956777996073e-05,
114
+ "loss": 0.6694,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 0.26195153896529144,
119
+ "grad_norm": 3.5071325302124023,
120
+ "learning_rate": 2.6195153896529146e-05,
121
+ "loss": 0.6168,
122
+ "step": 1600
123
+ },
124
+ {
125
+ "epoch": 0.2783235101506221,
126
+ "grad_norm": 7.638752460479736,
127
+ "learning_rate": 2.7832351015062215e-05,
128
+ "loss": 0.6237,
129
+ "step": 1700
130
+ },
131
+ {
132
+ "epoch": 0.29469548133595286,
133
+ "grad_norm": 1.9127601385116577,
134
+ "learning_rate": 2.9469548133595288e-05,
135
+ "loss": 0.6163,
136
+ "step": 1800
137
+ },
138
+ {
139
+ "epoch": 0.31106745252128354,
140
+ "grad_norm": 66.4927749633789,
141
+ "learning_rate": 3.110674525212836e-05,
142
+ "loss": 0.6006,
143
+ "step": 1900
144
+ },
145
+ {
146
+ "epoch": 0.3274394237066143,
147
+ "grad_norm": 52.92075729370117,
148
+ "learning_rate": 3.2743942370661426e-05,
149
+ "loss": 0.6598,
150
+ "step": 2000
151
+ },
152
+ {
153
+ "epoch": 0.343811394891945,
154
+ "grad_norm": 11.339043617248535,
155
+ "learning_rate": 3.43811394891945e-05,
156
+ "loss": 0.5987,
157
+ "step": 2100
158
+ },
159
+ {
160
+ "epoch": 0.3601833660772757,
161
+ "grad_norm": 28.995885848999023,
162
+ "learning_rate": 3.601833660772757e-05,
163
+ "loss": 0.6509,
164
+ "step": 2200
165
+ },
166
+ {
167
+ "epoch": 0.3765553372626064,
168
+ "grad_norm": 23.708646774291992,
169
+ "learning_rate": 3.765553372626065e-05,
170
+ "loss": 0.6729,
171
+ "step": 2300
172
+ },
173
+ {
174
+ "epoch": 0.3929273084479371,
175
+ "grad_norm": 3.438246726989746,
176
+ "learning_rate": 3.929273084479371e-05,
177
+ "loss": 0.5537,
178
+ "step": 2400
179
+ },
180
+ {
181
+ "epoch": 0.40929927963326784,
182
+ "grad_norm": 10.562445640563965,
183
+ "learning_rate": 4.0929927963326786e-05,
184
+ "loss": 0.6228,
185
+ "step": 2500
186
+ },
187
+ {
188
+ "epoch": 0.4256712508185986,
189
+ "grad_norm": 9.508832931518555,
190
+ "learning_rate": 4.256712508185986e-05,
191
+ "loss": 0.5776,
192
+ "step": 2600
193
+ },
194
+ {
195
+ "epoch": 0.44204322200392926,
196
+ "grad_norm": 12.658103942871094,
197
+ "learning_rate": 4.4204322200392925e-05,
198
+ "loss": 0.5455,
199
+ "step": 2700
200
+ },
201
+ {
202
+ "epoch": 0.45841519318926,
203
+ "grad_norm": 8.46078109741211,
204
+ "learning_rate": 4.5841519318926e-05,
205
+ "loss": 0.5583,
206
+ "step": 2800
207
+ },
208
+ {
209
+ "epoch": 0.4747871643745907,
210
+ "grad_norm": 5.642892360687256,
211
+ "learning_rate": 4.747871643745907e-05,
212
+ "loss": 0.5556,
213
+ "step": 2900
214
+ },
215
+ {
216
+ "epoch": 0.4911591355599214,
217
+ "grad_norm": 3.8212382793426514,
218
+ "learning_rate": 4.9115913555992146e-05,
219
+ "loss": 0.5552,
220
+ "step": 3000
221
+ },
222
+ {
223
+ "epoch": 0.5075311067452521,
224
+ "grad_norm": 8.145768165588379,
225
+ "learning_rate": 4.999965445760666e-05,
226
+ "loss": 0.5488,
227
+ "step": 3100
228
+ },
229
+ {
230
+ "epoch": 0.5239030779305829,
231
+ "grad_norm": 12.39121150970459,
232
+ "learning_rate": 4.999651917405523e-05,
233
+ "loss": 0.5595,
234
+ "step": 3200
235
+ },
236
+ {
237
+ "epoch": 0.5402750491159135,
238
+ "grad_norm": 6.998423099517822,
239
+ "learning_rate": 4.999011837711028e-05,
240
+ "loss": 0.5111,
241
+ "step": 3300
242
+ },
243
+ {
244
+ "epoch": 0.5566470203012442,
245
+ "grad_norm": 31.633630752563477,
246
+ "learning_rate": 4.998045290296376e-05,
247
+ "loss": 0.553,
248
+ "step": 3400
249
+ },
250
+ {
251
+ "epoch": 0.573018991486575,
252
+ "grad_norm": 56.126251220703125,
253
+ "learning_rate": 4.9967524014300896e-05,
254
+ "loss": 0.5713,
255
+ "step": 3500
256
+ },
257
+ {
258
+ "epoch": 0.5893909626719057,
259
+ "grad_norm": 6.04685640335083,
260
+ "learning_rate": 4.995133340013522e-05,
261
+ "loss": 0.526,
262
+ "step": 3600
263
+ },
264
+ {
265
+ "epoch": 0.6057629338572365,
266
+ "grad_norm": 9.806577682495117,
267
+ "learning_rate": 4.993188317558791e-05,
268
+ "loss": 0.6185,
269
+ "step": 3700
270
+ },
271
+ {
272
+ "epoch": 0.6221349050425671,
273
+ "grad_norm": 3.9068918228149414,
274
+ "learning_rate": 4.9909175881611514e-05,
275
+ "loss": 0.5086,
276
+ "step": 3800
277
+ },
278
+ {
279
+ "epoch": 0.6385068762278978,
280
+ "grad_norm": 19.12666130065918,
281
+ "learning_rate": 4.9883214484657957e-05,
282
+ "loss": 0.515,
283
+ "step": 3900
284
+ },
285
+ {
286
+ "epoch": 0.6548788474132285,
287
+ "grad_norm": 6.140756607055664,
288
+ "learning_rate": 4.9854002376291046e-05,
289
+ "loss": 0.5581,
290
+ "step": 4000
291
+ },
292
+ {
293
+ "epoch": 0.6712508185985593,
294
+ "grad_norm": 12.521078109741211,
295
+ "learning_rate": 4.9821543372743355e-05,
296
+ "loss": 0.5192,
297
+ "step": 4100
298
+ },
299
+ {
300
+ "epoch": 0.68762278978389,
301
+ "grad_norm": 18.783933639526367,
302
+ "learning_rate": 4.9785841714417734e-05,
303
+ "loss": 0.5334,
304
+ "step": 4200
305
+ },
306
+ {
307
+ "epoch": 0.7039947609692206,
308
+ "grad_norm": 7.139877796173096,
309
+ "learning_rate": 4.97469020653333e-05,
310
+ "loss": 0.5334,
311
+ "step": 4300
312
+ },
313
+ {
314
+ "epoch": 0.7203667321545514,
315
+ "grad_norm": 7.0137834548950195,
316
+ "learning_rate": 4.970472951251617e-05,
317
+ "loss": 0.5019,
318
+ "step": 4400
319
+ },
320
+ {
321
+ "epoch": 0.7367387033398821,
322
+ "grad_norm": 51.292449951171875,
323
+ "learning_rate": 4.9659329565334854e-05,
324
+ "loss": 0.4813,
325
+ "step": 4500
326
+ },
327
+ {
328
+ "epoch": 0.7531106745252129,
329
+ "grad_norm": 7.053626537322998,
330
+ "learning_rate": 4.9610708154780585e-05,
331
+ "loss": 0.6834,
332
+ "step": 4600
333
+ },
334
+ {
335
+ "epoch": 0.7694826457105436,
336
+ "grad_norm": 0.5209086537361145,
337
+ "learning_rate": 4.955887163269243e-05,
338
+ "loss": 0.4802,
339
+ "step": 4700
340
+ },
341
+ {
342
+ "epoch": 0.7858546168958742,
343
+ "grad_norm": 0.9966021776199341,
344
+ "learning_rate": 4.950382677092754e-05,
345
+ "loss": 0.5673,
346
+ "step": 4800
347
+ },
348
+ {
349
+ "epoch": 0.802226588081205,
350
+ "grad_norm": 1.4829602241516113,
351
+ "learning_rate": 4.944558076047649e-05,
352
+ "loss": 0.4976,
353
+ "step": 4900
354
+ },
355
+ {
356
+ "epoch": 0.8185985592665357,
357
+ "grad_norm": 63.20207214355469,
358
+ "learning_rate": 4.9384141210523804e-05,
359
+ "loss": 0.5398,
360
+ "step": 5000
361
+ },
362
+ {
363
+ "epoch": 0.8349705304518664,
364
+ "grad_norm": 9.008106231689453,
365
+ "learning_rate": 4.931951614745395e-05,
366
+ "loss": 0.5906,
367
+ "step": 5100
368
+ },
369
+ {
370
+ "epoch": 0.8513425016371972,
371
+ "grad_norm": 9.714171409606934,
372
+ "learning_rate": 4.925171401380278e-05,
373
+ "loss": 0.4833,
374
+ "step": 5200
375
+ },
376
+ {
377
+ "epoch": 0.8677144728225278,
378
+ "grad_norm": 1.0515024662017822,
379
+ "learning_rate": 4.918074366715457e-05,
380
+ "loss": 0.5046,
381
+ "step": 5300
382
+ },
383
+ {
384
+ "epoch": 0.8840864440078585,
385
+ "grad_norm": 0.32931941747665405,
386
+ "learning_rate": 4.910661437898493e-05,
387
+ "loss": 0.6394,
388
+ "step": 5400
389
+ },
390
+ {
391
+ "epoch": 0.9004584151931893,
392
+ "grad_norm": 19.654884338378906,
393
+ "learning_rate": 4.902933583344954e-05,
394
+ "loss": 0.5572,
395
+ "step": 5500
396
+ },
397
+ {
398
+ "epoch": 0.91683038637852,
399
+ "grad_norm": 6.547713279724121,
400
+ "learning_rate": 4.8948918126119056e-05,
401
+ "loss": 0.5898,
402
+ "step": 5600
403
+ },
404
+ {
405
+ "epoch": 0.9332023575638507,
406
+ "grad_norm": 0.6314940452575684,
407
+ "learning_rate": 4.886537176266024e-05,
408
+ "loss": 0.4681,
409
+ "step": 5700
410
+ },
411
+ {
412
+ "epoch": 0.9495743287491814,
413
+ "grad_norm": 9.13287353515625,
414
+ "learning_rate": 4.877870765746347e-05,
415
+ "loss": 0.4678,
416
+ "step": 5800
417
+ },
418
+ {
419
+ "epoch": 0.9659462999345121,
420
+ "grad_norm": 8.16297721862793,
421
+ "learning_rate": 4.8688937132216966e-05,
422
+ "loss": 0.5657,
423
+ "step": 5900
424
+ },
425
+ {
426
+ "epoch": 0.9823182711198428,
427
+ "grad_norm": 19.567949295043945,
428
+ "learning_rate": 4.859607191442768e-05,
429
+ "loss": 0.5164,
430
+ "step": 6000
431
+ },
432
+ {
433
+ "epoch": 0.9986902423051736,
434
+ "grad_norm": 20.028736114501953,
435
+ "learning_rate": 4.850012413588926e-05,
436
+ "loss": 0.4769,
437
+ "step": 6100
438
+ },
439
+ {
440
+ "epoch": 1.0,
441
+ "eval_accuracy": 0.8083209509658247,
442
+ "eval_loss": 0.6058866381645203,
443
+ "eval_mcc": 0.6161859428296993,
444
+ "eval_runtime": 5.3649,
445
+ "eval_samples_per_second": 125.446,
446
+ "eval_steps_per_second": 8.015,
447
+ "step": 6108
448
+ }
449
+ ],
450
+ "logging_steps": 100,
451
+ "max_steps": 30540,
452
+ "num_input_tokens_seen": 0,
453
+ "num_train_epochs": 5,
454
+ "save_steps": 500,
455
+ "stateful_callbacks": {
456
+ "TrainerControl": {
457
+ "args": {
458
+ "should_epoch_stop": false,
459
+ "should_evaluate": false,
460
+ "should_log": false,
461
+ "should_save": true,
462
+ "should_training_stop": false
463
+ },
464
+ "attributes": {}
465
+ }
466
+ },
467
+ "total_flos": 0.0,
468
+ "train_batch_size": 4,
469
+ "trial_name": null,
470
+ "trial_params": null
471
+ }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c239fef244a712098b8e4629d9d994c594ef095b2f967efec482942b3f225373
3
  size 516640282
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15c7d2912e1ad4ca2d24cd07a09f2ab74df17e26cb1433aa7f6a074520fb3d73
3
  size 516640282