File size: 25,052 Bytes
5acd714
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.6837606837606838,
  "eval_steps": 40,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017094017094017096,
      "grad_norm": 35.038580788061665,
      "learning_rate": 5e-07,
      "logits/chosen": -2.7457876205444336,
      "logits/rejected": -2.7444841861724854,
      "logps/chosen": -164.26461791992188,
      "logps/rejected": -170.55870056152344,
      "loss": 0.6935,
      "rewards/accuracies": 0.26875001192092896,
      "rewards/chosen": 0.003455913159996271,
      "rewards/margins": -0.0019886991940438747,
      "rewards/rejected": 0.0054446132853627205,
      "step": 5
    },
    {
      "epoch": 0.03418803418803419,
      "grad_norm": 36.203903910498276,
      "learning_rate": 1e-06,
      "logits/chosen": -2.7106502056121826,
      "logits/rejected": -2.716397523880005,
      "logps/chosen": -171.80043029785156,
      "logps/rejected": -165.20602416992188,
      "loss": 0.6875,
      "rewards/accuracies": 0.5062500238418579,
      "rewards/chosen": 0.012000308372080326,
      "rewards/margins": 0.0025437879376113415,
      "rewards/rejected": 0.009456520900130272,
      "step": 10
    },
    {
      "epoch": 0.05128205128205128,
      "grad_norm": 33.9576577784673,
      "learning_rate": 9.999177507263144e-07,
      "logits/chosen": -2.651571750640869,
      "logits/rejected": -2.629457473754883,
      "logps/chosen": -174.04080200195312,
      "logps/rejected": -174.0542755126953,
      "loss": 0.6698,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": 0.23909731209278107,
      "rewards/margins": 0.10868903249502182,
      "rewards/rejected": 0.13040827214717865,
      "step": 15
    },
    {
      "epoch": 0.06837606837606838,
      "grad_norm": 34.33646066636181,
      "learning_rate": 9.996710299650301e-07,
      "logits/chosen": -2.476440668106079,
      "logits/rejected": -2.450225353240967,
      "logps/chosen": -158.1311798095703,
      "logps/rejected": -158.0066680908203,
      "loss": 0.6613,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 0.4318675100803375,
      "rewards/margins": 0.14549395442008972,
      "rewards/rejected": 0.2863735556602478,
      "step": 20
    },
    {
      "epoch": 0.08547008547008547,
      "grad_norm": 33.16430522723429,
      "learning_rate": 9.992599188865604e-07,
      "logits/chosen": -2.3086318969726562,
      "logits/rejected": -2.3104796409606934,
      "logps/chosen": -150.59771728515625,
      "logps/rejected": -156.85037231445312,
      "loss": 0.6494,
      "rewards/accuracies": 0.581250011920929,
      "rewards/chosen": 0.5047669410705566,
      "rewards/margins": 0.16554531455039978,
      "rewards/rejected": 0.33922165632247925,
      "step": 25
    },
    {
      "epoch": 0.10256410256410256,
      "grad_norm": 34.52861424862365,
      "learning_rate": 9.98684552745256e-07,
      "logits/chosen": -2.217874050140381,
      "logits/rejected": -2.2254481315612793,
      "logps/chosen": -161.29412841796875,
      "logps/rejected": -161.40841674804688,
      "loss": 0.6295,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.4176379144191742,
      "rewards/margins": 0.26531916856765747,
      "rewards/rejected": 0.15231874585151672,
      "step": 30
    },
    {
      "epoch": 0.11965811965811966,
      "grad_norm": 31.455117829218544,
      "learning_rate": 9.979451208349055e-07,
      "logits/chosen": -2.2608728408813477,
      "logits/rejected": -2.246007204055786,
      "logps/chosen": -171.71456909179688,
      "logps/rejected": -174.46578979492188,
      "loss": 0.6305,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": -0.01912705972790718,
      "rewards/margins": 0.31441593170166016,
      "rewards/rejected": -0.33354294300079346,
      "step": 35
    },
    {
      "epoch": 0.13675213675213677,
      "grad_norm": 31.67318837058587,
      "learning_rate": 9.970418664264595e-07,
      "logits/chosen": -2.345672130584717,
      "logits/rejected": -2.331491470336914,
      "logps/chosen": -171.24766540527344,
      "logps/rejected": -176.8189697265625,
      "loss": 0.5989,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.27867692708969116,
      "rewards/margins": 0.5290472507476807,
      "rewards/rejected": -0.8077241778373718,
      "step": 40
    },
    {
      "epoch": 0.13675213675213677,
      "eval_logits/chosen": -2.4102065563201904,
      "eval_logits/rejected": -2.401230573654175,
      "eval_logps/chosen": -162.36439514160156,
      "eval_logps/rejected": -167.4954071044922,
      "eval_loss": 0.6069236993789673,
      "eval_rewards/accuracies": 0.6365384459495544,
      "eval_rewards/chosen": -0.388705849647522,
      "eval_rewards/margins": 0.47280558943748474,
      "eval_rewards/rejected": -0.8615114688873291,
      "eval_runtime": 509.918,
      "eval_samples_per_second": 16.305,
      "eval_steps_per_second": 0.255,
      "step": 40
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 36.18313806223269,
      "learning_rate": 9.95975086687994e-07,
      "logits/chosen": -2.44050669670105,
      "logits/rejected": -2.4460220336914062,
      "logps/chosen": -163.82875061035156,
      "logps/rejected": -167.35989379882812,
      "loss": 0.6146,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -0.31098368763923645,
      "rewards/margins": 0.46269193291664124,
      "rewards/rejected": -0.7736755609512329,
      "step": 45
    },
    {
      "epoch": 0.17094017094017094,
      "grad_norm": 31.13412274683678,
      "learning_rate": 9.947451325869439e-07,
      "logits/chosen": -2.501091718673706,
      "logits/rejected": -2.4991250038146973,
      "logps/chosen": -172.09686279296875,
      "logps/rejected": -177.7747802734375,
      "loss": 0.577,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.212348073720932,
      "rewards/margins": 0.6062799692153931,
      "rewards/rejected": -0.8186280131340027,
      "step": 50
    },
    {
      "epoch": 0.18803418803418803,
      "grad_norm": 31.508672436862835,
      "learning_rate": 9.933524087746347e-07,
      "logits/chosen": -2.437525510787964,
      "logits/rejected": -2.4285693168640137,
      "logps/chosen": -168.1316375732422,
      "logps/rejected": -175.23193359375,
      "loss": 0.571,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.513076901435852,
      "rewards/margins": 0.7702310681343079,
      "rewards/rejected": -1.2833080291748047,
      "step": 55
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 30.148068867306787,
      "learning_rate": 9.917973734531549e-07,
      "logits/chosen": -2.431530475616455,
      "logits/rejected": -2.431729793548584,
      "logps/chosen": -159.38168334960938,
      "logps/rejected": -170.52500915527344,
      "loss": 0.5762,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.34855490922927856,
      "rewards/margins": 0.5969334244728088,
      "rewards/rejected": -0.9454883337020874,
      "step": 60
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 32.03814968183332,
      "learning_rate": 9.90080538224607e-07,
      "logits/chosen": -2.533193588256836,
      "logits/rejected": -2.5252978801727295,
      "logps/chosen": -157.30966186523438,
      "logps/rejected": -166.26011657714844,
      "loss": 0.5643,
      "rewards/accuracies": 0.706250011920929,
      "rewards/chosen": -0.007600936107337475,
      "rewards/margins": 0.5010749697685242,
      "rewards/rejected": -0.5086758732795715,
      "step": 65
    },
    {
      "epoch": 0.23931623931623933,
      "grad_norm": 29.16308768569833,
      "learning_rate": 9.882024679227938e-07,
      "logits/chosen": -2.5899624824523926,
      "logits/rejected": -2.5779967308044434,
      "logps/chosen": -178.4553985595703,
      "logps/rejected": -179.71542358398438,
      "loss": 0.5464,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.47189587354660034,
      "rewards/margins": 0.8304598927497864,
      "rewards/rejected": -1.3023556470870972,
      "step": 70
    },
    {
      "epoch": 0.2564102564102564,
      "grad_norm": 28.918531347661485,
      "learning_rate": 9.861637804273881e-07,
      "logits/chosen": -2.578892469406128,
      "logits/rejected": -2.5758416652679443,
      "logps/chosen": -162.60537719726562,
      "logps/rejected": -170.6789093017578,
      "loss": 0.5553,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.45147842168807983,
      "rewards/margins": 0.6994724273681641,
      "rewards/rejected": -1.1509509086608887,
      "step": 75
    },
    {
      "epoch": 0.27350427350427353,
      "grad_norm": 26.98866754941649,
      "learning_rate": 9.83965146460653e-07,
      "logits/chosen": -2.54936146736145,
      "logits/rejected": -2.5406956672668457,
      "logps/chosen": -168.81484985351562,
      "logps/rejected": -179.770751953125,
      "loss": 0.5452,
      "rewards/accuracies": 0.6812499761581421,
      "rewards/chosen": -0.6899678111076355,
      "rewards/margins": 0.8549306988716125,
      "rewards/rejected": -1.544898509979248,
      "step": 80
    },
    {
      "epoch": 0.27350427350427353,
      "eval_logits/chosen": -2.53336238861084,
      "eval_logits/rejected": -2.517695665359497,
      "eval_logps/chosen": -167.28964233398438,
      "eval_logps/rejected": -177.21824645996094,
      "eval_loss": 0.5331124663352966,
      "eval_rewards/accuracies": 0.7134615182876587,
      "eval_rewards/chosen": -0.8812309503555298,
      "eval_rewards/margins": 0.9525622725486755,
      "eval_rewards/rejected": -1.8337931632995605,
      "eval_runtime": 510.0922,
      "eval_samples_per_second": 16.299,
      "eval_steps_per_second": 0.255,
      "step": 80
    },
    {
      "epoch": 0.2905982905982906,
      "grad_norm": 34.783908892421536,
      "learning_rate": 9.816072893667758e-07,
      "logits/chosen": -2.5432825088500977,
      "logits/rejected": -2.5159504413604736,
      "logps/chosen": -174.62197875976562,
      "logps/rejected": -185.89413452148438,
      "loss": 0.5581,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": -1.0434839725494385,
      "rewards/margins": 1.0283188819885254,
      "rewards/rejected": -2.0718026161193848,
      "step": 85
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 26.697686805838906,
      "learning_rate": 9.790909848738904e-07,
      "logits/chosen": -2.5102508068084717,
      "logits/rejected": -2.5222485065460205,
      "logps/chosen": -175.47544860839844,
      "logps/rejected": -183.92678833007812,
      "loss": 0.5208,
      "rewards/accuracies": 0.637499988079071,
      "rewards/chosen": -0.9199908971786499,
      "rewards/margins": 0.8521744608879089,
      "rewards/rejected": -1.7721655368804932,
      "step": 90
    },
    {
      "epoch": 0.3247863247863248,
      "grad_norm": 30.125094604814798,
      "learning_rate": 9.764170608388647e-07,
      "logits/chosen": -2.514260768890381,
      "logits/rejected": -2.4829812049865723,
      "logps/chosen": -167.62655639648438,
      "logps/rejected": -174.2395477294922,
      "loss": 0.5242,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": -0.6241778135299683,
      "rewards/margins": 1.0742968320846558,
      "rewards/rejected": -1.6984745264053345,
      "step": 95
    },
    {
      "epoch": 0.3418803418803419,
      "grad_norm": 27.550843374580296,
      "learning_rate": 9.735863969749371e-07,
      "logits/chosen": -2.4171032905578613,
      "logits/rejected": -2.381608486175537,
      "logps/chosen": -177.05935668945312,
      "logps/rejected": -188.4621124267578,
      "loss": 0.5002,
      "rewards/accuracies": 0.706250011920929,
      "rewards/chosen": -0.7831762433052063,
      "rewards/margins": 1.0672458410263062,
      "rewards/rejected": -1.8504221439361572,
      "step": 100
    },
    {
      "epoch": 0.358974358974359,
      "grad_norm": 30.39392617500016,
      "learning_rate": 9.705999245622956e-07,
      "logits/chosen": -2.3619236946105957,
      "logits/rejected": -2.3391060829162598,
      "logps/chosen": -170.48300170898438,
      "logps/rejected": -183.28384399414062,
      "loss": 0.5026,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -0.8889726400375366,
      "rewards/margins": 0.9097515940666199,
      "rewards/rejected": -1.7987244129180908,
      "step": 105
    },
    {
      "epoch": 0.37606837606837606,
      "grad_norm": 26.741945030347612,
      "learning_rate": 9.674586261416873e-07,
      "logits/chosen": -2.2946972846984863,
      "logits/rejected": -2.2440435886383057,
      "logps/chosen": -179.06390380859375,
      "logps/rejected": -188.00010681152344,
      "loss": 0.5206,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.6539386510848999,
      "rewards/margins": 1.0372655391693115,
      "rewards/rejected": -1.691204309463501,
      "step": 110
    },
    {
      "epoch": 0.39316239316239315,
      "grad_norm": 33.116742735027486,
      "learning_rate": 9.641635351911664e-07,
      "logits/chosen": -2.218276262283325,
      "logits/rejected": -2.18500018119812,
      "logps/chosen": -171.17381286621094,
      "logps/rejected": -183.25845336914062,
      "loss": 0.4801,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": -0.9279203414916992,
      "rewards/margins": 1.2727015018463135,
      "rewards/rejected": -2.200622081756592,
      "step": 115
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 27.185641229760538,
      "learning_rate": 9.607157357860821e-07,
      "logits/chosen": -2.124584436416626,
      "logits/rejected": -2.0961549282073975,
      "logps/chosen": -189.48277282714844,
      "logps/rejected": -203.43951416015625,
      "loss": 0.5026,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -1.2869656085968018,
      "rewards/margins": 1.3039339780807495,
      "rewards/rejected": -2.5908992290496826,
      "step": 120
    },
    {
      "epoch": 0.41025641025641024,
      "eval_logits/chosen": -2.0268211364746094,
      "eval_logits/rejected": -1.9764775037765503,
      "eval_logps/chosen": -172.888671875,
      "eval_logps/rejected": -185.58355712890625,
      "eval_loss": 0.49246644973754883,
      "eval_rewards/accuracies": 0.7442307472229004,
      "eval_rewards/chosen": -1.441135048866272,
      "eval_rewards/margins": 1.2291908264160156,
      "eval_rewards/rejected": -2.670325756072998,
      "eval_runtime": 510.1247,
      "eval_samples_per_second": 16.298,
      "eval_steps_per_second": 0.255,
      "step": 120
    },
    {
      "epoch": 0.42735042735042733,
      "grad_norm": 31.03461706328688,
      "learning_rate": 9.571163622424225e-07,
      "logits/chosen": -1.944964051246643,
      "logits/rejected": -1.9178746938705444,
      "logps/chosen": -175.3327178955078,
      "logps/rejected": -188.2616729736328,
      "loss": 0.5017,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -1.579502820968628,
      "rewards/margins": 1.2485122680664062,
      "rewards/rejected": -2.828014850616455,
      "step": 125
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 29.080520770184428,
      "learning_rate": 9.533665987436261e-07,
      "logits/chosen": -1.8825464248657227,
      "logits/rejected": -1.8078832626342773,
      "logps/chosen": -178.3484649658203,
      "logps/rejected": -197.55380249023438,
      "loss": 0.4983,
      "rewards/accuracies": 0.6937500238418579,
      "rewards/chosen": -1.5868518352508545,
      "rewards/margins": 1.2471343278884888,
      "rewards/rejected": -2.8339860439300537,
      "step": 130
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 28.903021536294002,
      "learning_rate": 9.494676789509899e-07,
      "logits/chosen": -1.8585374355316162,
      "logits/rejected": -1.8128669261932373,
      "logps/chosen": -178.5911407470703,
      "logps/rejected": -195.90933227539062,
      "loss": 0.492,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -1.2965319156646729,
      "rewards/margins": 1.4173026084899902,
      "rewards/rejected": -2.713834285736084,
      "step": 135
    },
    {
      "epoch": 0.47863247863247865,
      "grad_norm": 27.5476391641307,
      "learning_rate": 9.454208855977985e-07,
      "logits/chosen": -1.920654296875,
      "logits/rejected": -1.8412939310073853,
      "logps/chosen": -179.1053924560547,
      "logps/rejected": -196.11526489257812,
      "loss": 0.4753,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.5140180587768555,
      "rewards/margins": 1.5388453006744385,
      "rewards/rejected": -3.052863121032715,
      "step": 140
    },
    {
      "epoch": 0.49572649572649574,
      "grad_norm": 30.03317842923354,
      "learning_rate": 9.41227550067308e-07,
      "logits/chosen": -1.9514515399932861,
      "logits/rejected": -1.949883222579956,
      "logps/chosen": -178.63250732421875,
      "logps/rejected": -191.42721557617188,
      "loss": 0.4803,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -1.513338327407837,
      "rewards/margins": 1.4887291193008423,
      "rewards/rejected": -3.0020670890808105,
      "step": 145
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 30.28469957381902,
      "learning_rate": 9.36889051954725e-07,
      "logits/chosen": -2.0093894004821777,
      "logits/rejected": -1.9657704830169678,
      "logps/chosen": -180.35043334960938,
      "logps/rejected": -197.2502899169922,
      "loss": 0.4895,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -1.602224588394165,
      "rewards/margins": 1.6883083581924438,
      "rewards/rejected": -3.2905325889587402,
      "step": 150
    },
    {
      "epoch": 0.5299145299145299,
      "grad_norm": 28.420242591686232,
      "learning_rate": 9.324068186133245e-07,
      "logits/chosen": -1.9976894855499268,
      "logits/rejected": -1.9886022806167603,
      "logps/chosen": -171.70602416992188,
      "logps/rejected": -185.99795532226562,
      "loss": 0.4608,
      "rewards/accuracies": 0.793749988079071,
      "rewards/chosen": -1.2952425479888916,
      "rewards/margins": 1.7483227252960205,
      "rewards/rejected": -3.043565034866333,
      "step": 155
    },
    {
      "epoch": 0.5470085470085471,
      "grad_norm": 26.601543429998234,
      "learning_rate": 9.277823246848536e-07,
      "logits/chosen": -2.056879758834839,
      "logits/rejected": -1.9998328685760498,
      "logps/chosen": -186.3706817626953,
      "logps/rejected": -196.63290405273438,
      "loss": 0.4511,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -1.2312135696411133,
      "rewards/margins": 1.352858304977417,
      "rewards/rejected": -2.5840718746185303,
      "step": 160
    },
    {
      "epoch": 0.5470085470085471,
      "eval_logits/chosen": -2.070892095565796,
      "eval_logits/rejected": -2.0279953479766846,
      "eval_logps/chosen": -171.76034545898438,
      "eval_logps/rejected": -189.1643829345703,
      "eval_loss": 0.4683005213737488,
      "eval_rewards/accuracies": 0.762499988079071,
      "eval_rewards/chosen": -1.328302264213562,
      "eval_rewards/margins": 1.70010507106781,
      "eval_rewards/rejected": -3.028407096862793,
      "eval_runtime": 509.9565,
      "eval_samples_per_second": 16.303,
      "eval_steps_per_second": 0.255,
      "step": 160
    },
    {
      "epoch": 0.5641025641025641,
      "grad_norm": 41.76296476638838,
      "learning_rate": 9.230170916143793e-07,
      "logits/chosen": -2.1190731525421143,
      "logits/rejected": -2.083359956741333,
      "logps/chosen": -176.87539672851562,
      "logps/rejected": -198.44384765625,
      "loss": 0.4944,
      "rewards/accuracies": 0.7562500238418579,
      "rewards/chosen": -1.2975060939788818,
      "rewards/margins": 1.6890850067138672,
      "rewards/rejected": -2.98659086227417,
      "step": 165
    },
    {
      "epoch": 0.5811965811965812,
      "grad_norm": 28.83194976337172,
      "learning_rate": 9.181126871497378e-07,
      "logits/chosen": -2.175851583480835,
      "logits/rejected": -2.1391243934631348,
      "logps/chosen": -178.2881317138672,
      "logps/rejected": -197.88473510742188,
      "loss": 0.4813,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": -1.2544641494750977,
      "rewards/margins": 1.7747846841812134,
      "rewards/rejected": -3.0292489528656006,
      "step": 170
    },
    {
      "epoch": 0.5982905982905983,
      "grad_norm": 30.93659066586097,
      "learning_rate": 9.130707248257491e-07,
      "logits/chosen": -2.313814640045166,
      "logits/rejected": -2.2677135467529297,
      "logps/chosen": -170.06781005859375,
      "logps/rejected": -177.8175811767578,
      "loss": 0.4863,
      "rewards/accuracies": 0.768750011920929,
      "rewards/chosen": -1.0524061918258667,
      "rewards/margins": 1.3644572496414185,
      "rewards/rejected": -2.416863441467285,
      "step": 175
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 25.018999438635433,
      "learning_rate": 9.078928634333698e-07,
      "logits/chosen": -2.302171230316162,
      "logits/rejected": -2.2788572311401367,
      "logps/chosen": -179.72390747070312,
      "logps/rejected": -197.12283325195312,
      "loss": 0.4553,
      "rewards/accuracies": 0.8062499761581421,
      "rewards/chosen": -0.6731274724006653,
      "rewards/margins": 1.6728944778442383,
      "rewards/rejected": -2.346021890640259,
      "step": 180
    },
    {
      "epoch": 0.6324786324786325,
      "grad_norm": 28.576400660174777,
      "learning_rate": 9.025808064739549e-07,
      "logits/chosen": -2.2794651985168457,
      "logits/rejected": -2.2391860485076904,
      "logps/chosen": -175.87045288085938,
      "logps/rejected": -189.4848175048828,
      "loss": 0.4854,
      "rewards/accuracies": 0.7562500238418579,
      "rewards/chosen": -0.8901998400688171,
      "rewards/margins": 1.4675487279891968,
      "rewards/rejected": -2.357748508453369,
      "step": 185
    },
    {
      "epoch": 0.6495726495726496,
      "grad_norm": 25.73471562251865,
      "learning_rate": 8.971363015988113e-07,
      "logits/chosen": -2.1966824531555176,
      "logits/rejected": -2.1603925228118896,
      "logps/chosen": -172.0600128173828,
      "logps/rejected": -191.96176147460938,
      "loss": 0.4681,
      "rewards/accuracies": 0.7562500238418579,
      "rewards/chosen": -0.9620615243911743,
      "rewards/margins": 1.4954371452331543,
      "rewards/rejected": -2.457498550415039,
      "step": 190
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 34.912982133976655,
      "learning_rate": 8.91561140034225e-07,
      "logits/chosen": -2.1389029026031494,
      "logits/rejected": -2.0825791358947754,
      "logps/chosen": -174.3153839111328,
      "logps/rejected": -194.2677459716797,
      "loss": 0.4935,
      "rewards/accuracies": 0.7437499761581421,
      "rewards/chosen": -1.4726169109344482,
      "rewards/margins": 1.4599871635437012,
      "rewards/rejected": -2.9326040744781494,
      "step": 195
    },
    {
      "epoch": 0.6837606837606838,
      "grad_norm": 25.756167591259292,
      "learning_rate": 8.858571559921537e-07,
      "logits/chosen": -2.135298013687134,
      "logits/rejected": -2.067862033843994,
      "logps/chosen": -178.73361206054688,
      "logps/rejected": -193.21209716796875,
      "loss": 0.4562,
      "rewards/accuracies": 0.7562500238418579,
      "rewards/chosen": -1.452704668045044,
      "rewards/margins": 1.6391651630401611,
      "rewards/rejected": -3.091869831085205,
      "step": 200
    },
    {
      "epoch": 0.6837606837606838,
      "eval_logits/chosen": -2.1462392807006836,
      "eval_logits/rejected": -2.1028637886047363,
      "eval_logps/chosen": -173.41998291015625,
      "eval_logps/rejected": -191.55532836914062,
      "eval_loss": 0.4528014361858368,
      "eval_rewards/accuracies": 0.7567307949066162,
      "eval_rewards/chosen": -1.4942626953125,
      "eval_rewards/margins": 1.7732419967651367,
      "eval_rewards/rejected": -3.2675046920776367,
      "eval_runtime": 510.9487,
      "eval_samples_per_second": 16.272,
      "eval_steps_per_second": 0.254,
      "step": 200
    }
  ],
  "logging_steps": 5,
  "max_steps": 876,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 40,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2358113407598592.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}