File size: 40,884 Bytes
0decd61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9876543209876543,
  "eval_steps": 100,
  "global_step": 363,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0411522633744856,
      "grad_norm": 71.36946521074697,
      "learning_rate": 5e-07,
      "logits/chosen": -2.7249937057495117,
      "logits/rejected": -2.7219715118408203,
      "logps/chosen": -289.096435546875,
      "logps/rejected": -212.59097290039062,
      "loss": 0.6888,
      "rewards/accuracies": 0.35624998807907104,
      "rewards/chosen": 0.027115171775221825,
      "rewards/margins": 0.011037254706025124,
      "rewards/rejected": 0.01607791893184185,
      "step": 5
    },
    {
      "epoch": 0.0823045267489712,
      "grad_norm": 68.75739482144014,
      "learning_rate": 1e-06,
      "logits/chosen": -2.673173666000366,
      "logits/rejected": -2.6852009296417236,
      "logps/chosen": -258.5091857910156,
      "logps/rejected": -228.7921905517578,
      "loss": 0.6519,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": 0.6803622841835022,
      "rewards/margins": 0.2561650276184082,
      "rewards/rejected": 0.424197256565094,
      "step": 10
    },
    {
      "epoch": 0.12345679012345678,
      "grad_norm": 43.9449007096878,
      "learning_rate": 9.995050530093366e-07,
      "logits/chosen": -2.5606446266174316,
      "logits/rejected": -2.555354595184326,
      "logps/chosen": -258.5283508300781,
      "logps/rejected": -217.637939453125,
      "loss": 0.5873,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": 1.7179749011993408,
      "rewards/margins": 0.7173956036567688,
      "rewards/rejected": 1.0005793571472168,
      "step": 15
    },
    {
      "epoch": 0.1646090534979424,
      "grad_norm": 49.90030149803026,
      "learning_rate": 9.980211919274406e-07,
      "logits/chosen": -2.334833860397339,
      "logits/rejected": -2.3182854652404785,
      "logps/chosen": -234.5125732421875,
      "logps/rejected": -194.8851318359375,
      "loss": 0.6125,
      "rewards/accuracies": 0.706250011920929,
      "rewards/chosen": 1.7243343591690063,
      "rewards/margins": 0.8316472172737122,
      "rewards/rejected": 0.8926870226860046,
      "step": 20
    },
    {
      "epoch": 0.205761316872428,
      "grad_norm": 43.554349506398026,
      "learning_rate": 9.955513544846204e-07,
      "logits/chosen": -2.12056303024292,
      "logits/rejected": -2.095937728881836,
      "logps/chosen": -284.00323486328125,
      "logps/rejected": -210.3358154296875,
      "loss": 0.5749,
      "rewards/accuracies": 0.8125,
      "rewards/chosen": 2.1191883087158203,
      "rewards/margins": 1.4548943042755127,
      "rewards/rejected": 0.6642940044403076,
      "step": 25
    },
    {
      "epoch": 0.24691358024691357,
      "grad_norm": 45.05915140113881,
      "learning_rate": 9.921004304353147e-07,
      "logits/chosen": -2.04213547706604,
      "logits/rejected": -2.0172839164733887,
      "logps/chosen": -232.2016143798828,
      "logps/rejected": -217.5736846923828,
      "loss": 0.5989,
      "rewards/accuracies": 0.7562500238418579,
      "rewards/chosen": 1.9440408945083618,
      "rewards/margins": 1.5185799598693848,
      "rewards/rejected": 0.4254608750343323,
      "step": 30
    },
    {
      "epoch": 0.2880658436213992,
      "grad_norm": 40.00728614202134,
      "learning_rate": 9.876752518774164e-07,
      "logits/chosen": -2.0041847229003906,
      "logits/rejected": -1.9888496398925781,
      "logps/chosen": -255.5012969970703,
      "logps/rejected": -238.2528839111328,
      "loss": 0.6076,
      "rewards/accuracies": 0.71875,
      "rewards/chosen": 1.3912312984466553,
      "rewards/margins": 1.0289623737335205,
      "rewards/rejected": 0.36226886510849,
      "step": 35
    },
    {
      "epoch": 0.3292181069958848,
      "grad_norm": 42.181862044805364,
      "learning_rate": 9.822845797261675e-07,
      "logits/chosen": -2.024127244949341,
      "logits/rejected": -2.020592451095581,
      "logps/chosen": -249.13394165039062,
      "logps/rejected": -199.90975952148438,
      "loss": 0.5846,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": 1.3734517097473145,
      "rewards/margins": 0.8223851919174194,
      "rewards/rejected": 0.5510665774345398,
      "step": 40
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 41.15847921708812,
      "learning_rate": 9.759390863694029e-07,
      "logits/chosen": -2.0532474517822266,
      "logits/rejected": -1.9978084564208984,
      "logps/chosen": -256.01446533203125,
      "logps/rejected": -206.8267059326172,
      "loss": 0.5481,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": 1.589166283607483,
      "rewards/margins": 1.2907274961471558,
      "rewards/rejected": 0.2984387278556824,
      "step": 45
    },
    {
      "epoch": 0.411522633744856,
      "grad_norm": 42.63680924826028,
      "learning_rate": 9.68651334538488e-07,
      "logits/chosen": -2.034133195877075,
      "logits/rejected": -2.0025076866149902,
      "logps/chosen": -259.46942138671875,
      "logps/rejected": -229.2208251953125,
      "loss": 0.5652,
      "rewards/accuracies": 0.706250011920929,
      "rewards/chosen": 1.4298592805862427,
      "rewards/margins": 1.076907992362976,
      "rewards/rejected": 0.35295119881629944,
      "step": 50
    },
    {
      "epoch": 0.45267489711934156,
      "grad_norm": 36.48668334468458,
      "learning_rate": 9.604357524367722e-07,
      "logits/chosen": -2.0932247638702393,
      "logits/rejected": -2.0437166690826416,
      "logps/chosen": -281.03289794921875,
      "logps/rejected": -227.46109008789062,
      "loss": 0.5437,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 1.40286123752594,
      "rewards/margins": 1.028884768486023,
      "rewards/rejected": 0.3739764094352722,
      "step": 55
    },
    {
      "epoch": 0.49382716049382713,
      "grad_norm": 35.36330599361053,
      "learning_rate": 9.513086051748067e-07,
      "logits/chosen": -2.1159732341766357,
      "logits/rejected": -2.078249931335449,
      "logps/chosen": -265.8070373535156,
      "logps/rejected": -214.79428100585938,
      "loss": 0.5166,
      "rewards/accuracies": 0.8187500238418579,
      "rewards/chosen": 1.7003364562988281,
      "rewards/margins": 1.7438255548477173,
      "rewards/rejected": -0.0434890016913414,
      "step": 60
    },
    {
      "epoch": 0.5349794238683128,
      "grad_norm": 31.36432376366485,
      "learning_rate": 9.412879625688742e-07,
      "logits/chosen": -2.183833599090576,
      "logits/rejected": -2.1385440826416016,
      "logps/chosen": -270.89263916015625,
      "logps/rejected": -203.67922973632812,
      "loss": 0.4848,
      "rewards/accuracies": 0.793749988079071,
      "rewards/chosen": 1.6083428859710693,
      "rewards/margins": 1.5229980945587158,
      "rewards/rejected": 0.08534489572048187,
      "step": 65
    },
    {
      "epoch": 0.5761316872427984,
      "grad_norm": 35.68216693219843,
      "learning_rate": 9.303936633665839e-07,
      "logits/chosen": -2.3082363605499268,
      "logits/rejected": -2.2824604511260986,
      "logps/chosen": -255.9834747314453,
      "logps/rejected": -194.7764892578125,
      "loss": 0.5289,
      "rewards/accuracies": 0.7562500238418579,
      "rewards/chosen": 1.1171067953109741,
      "rewards/margins": 1.4306641817092896,
      "rewards/rejected": -0.3135572373867035,
      "step": 70
    },
    {
      "epoch": 0.6172839506172839,
      "grad_norm": 38.51565362073314,
      "learning_rate": 9.186472759703578e-07,
      "logits/chosen": -2.3410449028015137,
      "logits/rejected": -2.3213045597076416,
      "logps/chosen": -275.8757019042969,
      "logps/rejected": -213.70693969726562,
      "loss": 0.5387,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": 0.7660292387008667,
      "rewards/margins": 1.4290556907653809,
      "rewards/rejected": -0.6630264520645142,
      "step": 75
    },
    {
      "epoch": 0.6584362139917695,
      "grad_norm": 30.908945588893605,
      "learning_rate": 9.060720557365682e-07,
      "logits/chosen": -2.3798623085021973,
      "logits/rejected": -2.378147602081299,
      "logps/chosen": -277.94622802734375,
      "logps/rejected": -228.6498565673828,
      "loss": 0.524,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": 0.9844567179679871,
      "rewards/margins": 1.5679962635040283,
      "rewards/rejected": -0.5835394859313965,
      "step": 80
    },
    {
      "epoch": 0.6995884773662552,
      "grad_norm": 36.12667290276971,
      "learning_rate": 8.926928989348611e-07,
      "logits/chosen": -2.438974618911743,
      "logits/rejected": -2.4293782711029053,
      "logps/chosen": -264.4499816894531,
      "logps/rejected": -233.60958862304688,
      "loss": 0.5124,
      "rewards/accuracies": 0.8062499761581421,
      "rewards/chosen": 0.7334972023963928,
      "rewards/margins": 1.825126051902771,
      "rewards/rejected": -1.091629147529602,
      "step": 85
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 31.43710612772888,
      "learning_rate": 8.785362934588233e-07,
      "logits/chosen": -2.4581363201141357,
      "logits/rejected": -2.4250473976135254,
      "logps/chosen": -272.42498779296875,
      "logps/rejected": -206.20614624023438,
      "loss": 0.5073,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": 1.6099742650985718,
      "rewards/margins": 1.7208999395370483,
      "rewards/rejected": -0.11092579364776611,
      "step": 90
    },
    {
      "epoch": 0.7818930041152263,
      "grad_norm": 35.922757319188804,
      "learning_rate": 8.636302663855681e-07,
      "logits/chosen": -2.368760585784912,
      "logits/rejected": -2.3825132846832275,
      "logps/chosen": -247.90396118164062,
      "logps/rejected": -212.88232421875,
      "loss": 0.4971,
      "rewards/accuracies": 0.8187500238418579,
      "rewards/chosen": 1.2823846340179443,
      "rewards/margins": 1.8866965770721436,
      "rewards/rejected": -0.6043121814727783,
      "step": 95
    },
    {
      "epoch": 0.823045267489712,
      "grad_norm": 27.664598721354345,
      "learning_rate": 8.480043284880664e-07,
      "logits/chosen": -2.346686601638794,
      "logits/rejected": -2.317147970199585,
      "logps/chosen": -269.21417236328125,
      "logps/rejected": -233.6097412109375,
      "loss": 0.4674,
      "rewards/accuracies": 0.8374999761581421,
      "rewards/chosen": 0.5551630258560181,
      "rewards/margins": 2.1603965759277344,
      "rewards/rejected": -1.6052335500717163,
      "step": 100
    },
    {
      "epoch": 0.823045267489712,
      "eval_logits/chosen": -2.2803401947021484,
      "eval_logits/rejected": -2.256579875946045,
      "eval_logps/chosen": -257.0998229980469,
      "eval_logps/rejected": -231.74539184570312,
      "eval_loss": 0.4985389709472656,
      "eval_rewards/accuracies": 0.7939814925193787,
      "eval_rewards/chosen": 0.17793893814086914,
      "eval_rewards/margins": 1.7997103929519653,
      "eval_rewards/rejected": -1.6217713356018066,
      "eval_runtime": 230.2785,
      "eval_samples_per_second": 15.008,
      "eval_steps_per_second": 0.234,
      "step": 100
    },
    {
      "epoch": 0.8641975308641975,
      "grad_norm": 33.43388986335041,
      "learning_rate": 8.316894158100727e-07,
      "logits/chosen": -2.238370895385742,
      "logits/rejected": -2.205950975418091,
      "logps/chosen": -270.1739807128906,
      "logps/rejected": -237.7426300048828,
      "loss": 0.5036,
      "rewards/accuracies": 0.824999988079071,
      "rewards/chosen": 0.2168927639722824,
      "rewards/margins": 2.0453083515167236,
      "rewards/rejected": -1.8284155130386353,
      "step": 105
    },
    {
      "epoch": 0.9053497942386831,
      "grad_norm": 41.06626958250484,
      "learning_rate": 8.147178284193184e-07,
      "logits/chosen": -1.9968522787094116,
      "logits/rejected": -1.9477859735488892,
      "logps/chosen": -271.5672912597656,
      "logps/rejected": -244.5254364013672,
      "loss": 0.523,
      "rewards/accuracies": 0.8374999761581421,
      "rewards/chosen": -0.18483969569206238,
      "rewards/margins": 2.090688467025757,
      "rewards/rejected": -2.2755284309387207,
      "step": 110
    },
    {
      "epoch": 0.9465020576131687,
      "grad_norm": 33.45568853055463,
      "learning_rate": 7.971231664602271e-07,
      "logits/chosen": -1.8657859563827515,
      "logits/rejected": -1.7577025890350342,
      "logps/chosen": -255.1681365966797,
      "logps/rejected": -235.93856811523438,
      "loss": 0.4781,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": 0.08642071485519409,
      "rewards/margins": 2.032249689102173,
      "rewards/rejected": -1.9458287954330444,
      "step": 115
    },
    {
      "epoch": 0.9876543209876543,
      "grad_norm": 31.32834367464404,
      "learning_rate": 7.789402636327525e-07,
      "logits/chosen": -1.7241904735565186,
      "logits/rejected": -1.6637340784072876,
      "logps/chosen": -269.67364501953125,
      "logps/rejected": -239.79965209960938,
      "loss": 0.4614,
      "rewards/accuracies": 0.8187500238418579,
      "rewards/chosen": 0.0743473693728447,
      "rewards/margins": 2.101712942123413,
      "rewards/rejected": -2.0273656845092773,
      "step": 120
    },
    {
      "epoch": 1.02880658436214,
      "grad_norm": 18.313357022047114,
      "learning_rate": 7.602051182290381e-07,
      "logits/chosen": -1.5669622421264648,
      "logits/rejected": -1.4961906671524048,
      "logps/chosen": -270.39056396484375,
      "logps/rejected": -223.95706176757812,
      "loss": 0.321,
      "rewards/accuracies": 0.9125000238418579,
      "rewards/chosen": 0.7599193453788757,
      "rewards/margins": 2.714322805404663,
      "rewards/rejected": -1.9544035196304321,
      "step": 125
    },
    {
      "epoch": 1.0699588477366255,
      "grad_norm": 18.77066721006591,
      "learning_rate": 7.409548218644331e-07,
      "logits/chosen": -1.4371721744537354,
      "logits/rejected": -1.3102617263793945,
      "logps/chosen": -257.923095703125,
      "logps/rejected": -222.04959106445312,
      "loss": 0.1777,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 1.8628284931182861,
      "rewards/margins": 3.623333692550659,
      "rewards/rejected": -1.7605053186416626,
      "step": 130
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 14.482571733068447,
      "learning_rate": 7.212274860439576e-07,
      "logits/chosen": -1.4088728427886963,
      "logits/rejected": -1.3359241485595703,
      "logps/chosen": -252.8369140625,
      "logps/rejected": -247.0041046142578,
      "loss": 0.2246,
      "rewards/accuracies": 0.8999999761581421,
      "rewards/chosen": 1.8369052410125732,
      "rewards/margins": 3.677825450897217,
      "rewards/rejected": -1.840920090675354,
      "step": 135
    },
    {
      "epoch": 1.1522633744855968,
      "grad_norm": 19.475514209975124,
      "learning_rate": 7.010621667096041e-07,
      "logits/chosen": -1.5916813611984253,
      "logits/rejected": -1.479448676109314,
      "logps/chosen": -254.99136352539062,
      "logps/rejected": -218.8384246826172,
      "loss": 0.2218,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 1.6720364093780518,
      "rewards/margins": 3.2526676654815674,
      "rewards/rejected": -1.5806310176849365,
      "step": 140
    },
    {
      "epoch": 1.1934156378600824,
      "grad_norm": 24.87312122824749,
      "learning_rate": 6.804987869178539e-07,
      "logits/chosen": -1.7563555240631104,
      "logits/rejected": -1.6887686252593994,
      "logps/chosen": -241.65676879882812,
      "logps/rejected": -225.277099609375,
      "loss": 0.2373,
      "rewards/accuracies": 0.9437500238418579,
      "rewards/chosen": 1.7571462392807007,
      "rewards/margins": 3.4047298431396484,
      "rewards/rejected": -1.6475833654403687,
      "step": 145
    },
    {
      "epoch": 1.2345679012345678,
      "grad_norm": 22.485449779074028,
      "learning_rate": 6.5957805780049e-07,
      "logits/chosen": -1.889991044998169,
      "logits/rejected": -1.8203752040863037,
      "logps/chosen": -250.935302734375,
      "logps/rejected": -223.52401733398438,
      "loss": 0.2196,
      "rewards/accuracies": 0.887499988079071,
      "rewards/chosen": 2.23984432220459,
      "rewards/margins": 3.5551295280456543,
      "rewards/rejected": -1.3152849674224854,
      "step": 150
    },
    {
      "epoch": 1.2757201646090535,
      "grad_norm": 20.0119744226792,
      "learning_rate": 6.383413979651893e-07,
      "logits/chosen": -1.9477765560150146,
      "logits/rejected": -1.8932664394378662,
      "logps/chosen": -242.27685546875,
      "logps/rejected": -231.18991088867188,
      "loss": 0.2229,
      "rewards/accuracies": 0.918749988079071,
      "rewards/chosen": 1.800172209739685,
      "rewards/margins": 3.45011568069458,
      "rewards/rejected": -1.6499433517456055,
      "step": 155
    },
    {
      "epoch": 1.316872427983539,
      "grad_norm": 20.63931604768156,
      "learning_rate": 6.168308514954602e-07,
      "logits/chosen": -1.973009705543518,
      "logits/rejected": -1.8899316787719727,
      "logps/chosen": -261.8257141113281,
      "logps/rejected": -258.97515869140625,
      "loss": 0.2121,
      "rewards/accuracies": 0.956250011920929,
      "rewards/chosen": 1.9451110363006592,
      "rewards/margins": 4.489598274230957,
      "rewards/rejected": -2.544487237930298,
      "step": 160
    },
    {
      "epoch": 1.3580246913580247,
      "grad_norm": 23.779662167366467,
      "learning_rate": 5.950890047122741e-07,
      "logits/chosen": -1.9724878072738647,
      "logits/rejected": -1.9425151348114014,
      "logps/chosen": -260.43084716796875,
      "logps/rejected": -236.8948211669922,
      "loss": 0.2464,
      "rewards/accuracies": 0.887499988079071,
      "rewards/chosen": 1.4944422245025635,
      "rewards/margins": 3.678725481033325,
      "rewards/rejected": -2.18428373336792,
      "step": 165
    },
    {
      "epoch": 1.3991769547325104,
      "grad_norm": 15.224094688709425,
      "learning_rate": 5.731589018621776e-07,
      "logits/chosen": -1.9535115957260132,
      "logits/rejected": -1.8948615789413452,
      "logps/chosen": -252.6552276611328,
      "logps/rejected": -226.4263916015625,
      "loss": 0.2351,
      "rewards/accuracies": 0.9437500238418579,
      "rewards/chosen": 1.834676742553711,
      "rewards/margins": 4.032426357269287,
      "rewards/rejected": -2.1977500915527344,
      "step": 170
    },
    {
      "epoch": 1.4403292181069958,
      "grad_norm": 20.636053561561848,
      "learning_rate": 5.510839598988136e-07,
      "logits/chosen": -1.8348503112792969,
      "logits/rejected": -1.7934105396270752,
      "logps/chosen": -255.14895629882812,
      "logps/rejected": -232.3575897216797,
      "loss": 0.2069,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 2.0120339393615723,
      "rewards/margins": 3.713160276412964,
      "rewards/rejected": -1.7011263370513916,
      "step": 175
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 21.755357371160876,
      "learning_rate": 5.289078825265572e-07,
      "logits/chosen": -1.7341606616973877,
      "logits/rejected": -1.6741468906402588,
      "logps/chosen": -237.35433959960938,
      "logps/rejected": -228.7030487060547,
      "loss": 0.234,
      "rewards/accuracies": 0.893750011920929,
      "rewards/chosen": 1.63128662109375,
      "rewards/margins": 3.659700393676758,
      "rewards/rejected": -2.028413772583008,
      "step": 180
    },
    {
      "epoch": 1.522633744855967,
      "grad_norm": 23.810123453795516,
      "learning_rate": 5.066745736764489e-07,
      "logits/chosen": -1.635679841041565,
      "logits/rejected": -1.5873550176620483,
      "logps/chosen": -248.98135375976562,
      "logps/rejected": -240.08987426757812,
      "loss": 0.2576,
      "rewards/accuracies": 0.90625,
      "rewards/chosen": 1.4315288066864014,
      "rewards/margins": 3.4555141925811768,
      "rewards/rejected": -2.0239853858947754,
      "step": 185
    },
    {
      "epoch": 1.5637860082304527,
      "grad_norm": 22.3759752093868,
      "learning_rate": 4.844280505857202e-07,
      "logits/chosen": -1.5894463062286377,
      "logits/rejected": -1.5013604164123535,
      "logps/chosen": -239.4411163330078,
      "logps/rejected": -219.7681121826172,
      "loss": 0.2732,
      "rewards/accuracies": 0.925000011920929,
      "rewards/chosen": 1.9483649730682373,
      "rewards/margins": 3.714170455932617,
      "rewards/rejected": -1.7658058404922485,
      "step": 190
    },
    {
      "epoch": 1.6049382716049383,
      "grad_norm": 21.666055935350588,
      "learning_rate": 4.6221235665299684e-07,
      "logits/chosen": -1.6968196630477905,
      "logits/rejected": -1.6124290227890015,
      "logps/chosen": -246.6077117919922,
      "logps/rejected": -233.7628631591797,
      "loss": 0.2689,
      "rewards/accuracies": 0.90625,
      "rewards/chosen": 2.1885757446289062,
      "rewards/margins": 3.544438600540161,
      "rewards/rejected": -1.3558627367019653,
      "step": 195
    },
    {
      "epoch": 1.646090534979424,
      "grad_norm": 23.567423393969673,
      "learning_rate": 4.400714742417091e-07,
      "logits/chosen": -1.7539150714874268,
      "logits/rejected": -1.6715869903564453,
      "logps/chosen": -289.3243713378906,
      "logps/rejected": -238.78271484375,
      "loss": 0.2463,
      "rewards/accuracies": 0.887499988079071,
      "rewards/chosen": 2.3782408237457275,
      "rewards/margins": 3.8166255950927734,
      "rewards/rejected": -1.438385248184204,
      "step": 200
    },
    {
      "epoch": 1.646090534979424,
      "eval_logits/chosen": -1.7062827348709106,
      "eval_logits/rejected": -1.629170298576355,
      "eval_logps/chosen": -247.42041015625,
      "eval_logps/rejected": -227.5958709716797,
      "eval_loss": 0.5190241932868958,
      "eval_rewards/accuracies": 0.7962962985038757,
      "eval_rewards/chosen": 1.1458828449249268,
      "eval_rewards/margins": 2.3527021408081055,
      "eval_rewards/rejected": -1.2068192958831787,
      "eval_runtime": 228.0783,
      "eval_samples_per_second": 15.153,
      "eval_steps_per_second": 0.237,
      "step": 200
    },
    {
      "epoch": 1.6872427983539096,
      "grad_norm": 20.05042831109418,
      "learning_rate": 4.180492376043371e-07,
      "logits/chosen": -1.7294807434082031,
      "logits/rejected": -1.6129295825958252,
      "logps/chosen": -239.91696166992188,
      "logps/rejected": -241.2155303955078,
      "loss": 0.2475,
      "rewards/accuracies": 0.8999999761581421,
      "rewards/chosen": 1.699279546737671,
      "rewards/margins": 3.706660747528076,
      "rewards/rejected": -2.007380962371826,
      "step": 205
    },
    {
      "epoch": 1.7283950617283952,
      "grad_norm": 17.373566601078217,
      "learning_rate": 3.961892460998862e-07,
      "logits/chosen": -1.7376630306243896,
      "logits/rejected": -1.672767996788025,
      "logps/chosen": -259.5295104980469,
      "logps/rejected": -219.8362274169922,
      "loss": 0.2275,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 1.8958297967910767,
      "rewards/margins": 3.636307954788208,
      "rewards/rejected": -1.7404781579971313,
      "step": 210
    },
    {
      "epoch": 1.7695473251028808,
      "grad_norm": 20.32259020536467,
      "learning_rate": 3.7453477787640077e-07,
      "logits/chosen": -1.6703641414642334,
      "logits/rejected": -1.6055065393447876,
      "logps/chosen": -259.04559326171875,
      "logps/rejected": -238.02713012695312,
      "loss": 0.2558,
      "rewards/accuracies": 0.893750011920929,
      "rewards/chosen": 1.8848392963409424,
      "rewards/margins": 3.780524492263794,
      "rewards/rejected": -1.8956845998764038,
      "step": 215
    },
    {
      "epoch": 1.8106995884773662,
      "grad_norm": 20.716775450731596,
      "learning_rate": 3.531287041894075e-07,
      "logits/chosen": -1.636228322982788,
      "logits/rejected": -1.5927408933639526,
      "logps/chosen": -259.4163513183594,
      "logps/rejected": -262.77691650390625,
      "loss": 0.2641,
      "rewards/accuracies": 0.9312499761581421,
      "rewards/chosen": 1.8492801189422607,
      "rewards/margins": 3.8553290367126465,
      "rewards/rejected": -2.0060486793518066,
      "step": 220
    },
    {
      "epoch": 1.8518518518518519,
      "grad_norm": 20.26085395927115,
      "learning_rate": 3.320134045259192e-07,
      "logits/chosen": -1.6199842691421509,
      "logits/rejected": -1.5809019804000854,
      "logps/chosen": -261.5071716308594,
      "logps/rejected": -244.0452117919922,
      "loss": 0.2836,
      "rewards/accuracies": 0.887499988079071,
      "rewards/chosen": 1.7676365375518799,
      "rewards/margins": 3.8491673469543457,
      "rewards/rejected": -2.081530809402466,
      "step": 225
    },
    {
      "epoch": 1.8930041152263375,
      "grad_norm": 19.9900109721012,
      "learning_rate": 3.112306827020377e-07,
      "logits/chosen": -1.6224733591079712,
      "logits/rejected": -1.5683706998825073,
      "logps/chosen": -246.66726684570312,
      "logps/rejected": -252.150634765625,
      "loss": 0.2967,
      "rewards/accuracies": 0.8999999761581421,
      "rewards/chosen": 1.2352790832519531,
      "rewards/margins": 3.3191657066345215,
      "rewards/rejected": -2.0838871002197266,
      "step": 230
    },
    {
      "epoch": 1.934156378600823,
      "grad_norm": 20.679234729146177,
      "learning_rate": 2.90821684100261e-07,
      "logits/chosen": -1.665122628211975,
      "logits/rejected": -1.585533857345581,
      "logps/chosen": -258.1650390625,
      "logps/rejected": -238.0010223388672,
      "loss": 0.2521,
      "rewards/accuracies": 0.9437500238418579,
      "rewards/chosen": 1.45806884765625,
      "rewards/margins": 3.821526288986206,
      "rewards/rejected": -2.363457202911377,
      "step": 235
    },
    {
      "epoch": 1.9753086419753085,
      "grad_norm": 17.897922449348748,
      "learning_rate": 2.708268142103509e-07,
      "logits/chosen": -1.6568527221679688,
      "logits/rejected": -1.594029426574707,
      "logps/chosen": -249.9292449951172,
      "logps/rejected": -217.1236114501953,
      "loss": 0.2458,
      "rewards/accuracies": 0.9125000238418579,
      "rewards/chosen": 1.140490174293518,
      "rewards/margins": 3.4049384593963623,
      "rewards/rejected": -2.264448404312134,
      "step": 240
    },
    {
      "epoch": 2.016460905349794,
      "grad_norm": 15.579483343495324,
      "learning_rate": 2.5128565863503e-07,
      "logits/chosen": -1.7464730739593506,
      "logits/rejected": -1.64523446559906,
      "logps/chosen": -269.5633544921875,
      "logps/rejected": -218.4349365234375,
      "loss": 0.1875,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 1.345157504081726,
      "rewards/margins": 3.8898367881774902,
      "rewards/rejected": -2.5446791648864746,
      "step": 245
    },
    {
      "epoch": 2.05761316872428,
      "grad_norm": 15.642770624996952,
      "learning_rate": 2.3223690471888286e-07,
      "logits/chosen": -1.7972164154052734,
      "logits/rejected": -1.6923631429672241,
      "logps/chosen": -276.4811706542969,
      "logps/rejected": -239.2648468017578,
      "loss": 0.1218,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 1.7780349254608154,
      "rewards/margins": 4.118841171264648,
      "rewards/rejected": -2.3408069610595703,
      "step": 250
    },
    {
      "epoch": 2.0987654320987654,
      "grad_norm": 13.364305072324674,
      "learning_rate": 2.1371826495561613e-07,
      "logits/chosen": -1.8449236154556274,
      "logits/rejected": -1.7506535053253174,
      "logps/chosen": -255.83792114257812,
      "logps/rejected": -221.6796875,
      "loss": 0.146,
      "rewards/accuracies": 0.9312499761581421,
      "rewards/chosen": 1.8460966348648071,
      "rewards/margins": 3.9246277809143066,
      "rewards/rejected": -2.078531265258789,
      "step": 255
    },
    {
      "epoch": 2.139917695473251,
      "grad_norm": 13.06395689210594,
      "learning_rate": 1.9576640232531784e-07,
      "logits/chosen": -1.8692007064819336,
      "logits/rejected": -1.8045275211334229,
      "logps/chosen": -248.9095916748047,
      "logps/rejected": -250.84481811523438,
      "loss": 0.1171,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 1.9673175811767578,
      "rewards/margins": 4.323936462402344,
      "rewards/rejected": -2.356618642807007,
      "step": 260
    },
    {
      "epoch": 2.1810699588477367,
      "grad_norm": 15.133332987736472,
      "learning_rate": 1.784168577095307e-07,
      "logits/chosen": -1.9296722412109375,
      "logits/rejected": -1.8828375339508057,
      "logps/chosen": -250.7962646484375,
      "logps/rejected": -228.93923950195312,
      "loss": 0.1322,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 2.0834712982177734,
      "rewards/margins": 3.928879499435425,
      "rewards/rejected": -1.8454080820083618,
      "step": 265
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 13.466085492542144,
      "learning_rate": 1.6170397952784248e-07,
      "logits/chosen": -1.9489628076553345,
      "logits/rejected": -1.8797670602798462,
      "logps/chosen": -270.56427001953125,
      "logps/rejected": -242.9454803466797,
      "loss": 0.1229,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 2.427950143814087,
      "rewards/margins": 4.682461261749268,
      "rewards/rejected": -2.2545108795166016,
      "step": 270
    },
    {
      "epoch": 2.263374485596708,
      "grad_norm": 14.794346267314218,
      "learning_rate": 1.4566085573529874e-07,
      "logits/chosen": -1.9156001806259155,
      "logits/rejected": -1.8757755756378174,
      "logps/chosen": -258.8504333496094,
      "logps/rejected": -229.5829315185547,
      "loss": 0.1305,
      "rewards/accuracies": 0.956250011920929,
      "rewards/chosen": 2.124898910522461,
      "rewards/margins": 4.520539283752441,
      "rewards/rejected": -2.3956406116485596,
      "step": 275
    },
    {
      "epoch": 2.3045267489711936,
      "grad_norm": 14.6085524255932,
      "learning_rate": 1.3031924831526737e-07,
      "logits/chosen": -1.918760895729065,
      "logits/rejected": -1.8703607320785522,
      "logps/chosen": -261.5938415527344,
      "logps/rejected": -230.3494415283203,
      "loss": 0.1162,
      "rewards/accuracies": 0.956250011920929,
      "rewards/chosen": 1.9034366607666016,
      "rewards/margins": 4.663661003112793,
      "rewards/rejected": -2.7602241039276123,
      "step": 280
    },
    {
      "epoch": 2.3456790123456788,
      "grad_norm": 16.362862237175147,
      "learning_rate": 1.1570953039744591e-07,
      "logits/chosen": -1.9305750131607056,
      "logits/rejected": -1.8696216344833374,
      "logps/chosen": -266.16680908203125,
      "logps/rejected": -258.2370910644531,
      "loss": 0.1186,
      "rewards/accuracies": 0.956250011920929,
      "rewards/chosen": 2.204184055328369,
      "rewards/margins": 4.997335433959961,
      "rewards/rejected": -2.7931509017944336,
      "step": 285
    },
    {
      "epoch": 2.386831275720165,
      "grad_norm": 13.275572612341923,
      "learning_rate": 1.0186062612550616e-07,
      "logits/chosen": -1.9214690923690796,
      "logits/rejected": -1.8716766834259033,
      "logps/chosen": -252.57180786132812,
      "logps/rejected": -259.24224853515625,
      "loss": 0.12,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 1.912581205368042,
      "rewards/margins": 4.5273051261901855,
      "rewards/rejected": -2.6147236824035645,
      "step": 290
    },
    {
      "epoch": 2.42798353909465,
      "grad_norm": 14.003480945619684,
      "learning_rate": 8.879995339342167e-08,
      "logits/chosen": -1.914181113243103,
      "logits/rejected": -1.8485758304595947,
      "logps/chosen": -248.25320434570312,
      "logps/rejected": -228.18118286132812,
      "loss": 0.1167,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 1.6209943294525146,
      "rewards/margins": 4.539933204650879,
      "rewards/rejected": -2.918938398361206,
      "step": 295
    },
    {
      "epoch": 2.4691358024691357,
      "grad_norm": 13.39746651643324,
      "learning_rate": 7.655336956385155e-08,
      "logits/chosen": -1.936248540878296,
      "logits/rejected": -1.8758357763290405,
      "logps/chosen": -251.0574951171875,
      "logps/rejected": -252.95425415039062,
      "loss": 0.1311,
      "rewards/accuracies": 0.981249988079071,
      "rewards/chosen": 1.768341302871704,
      "rewards/margins": 4.591066360473633,
      "rewards/rejected": -2.8227250576019287,
      "step": 300
    },
    {
      "epoch": 2.4691358024691357,
      "eval_logits/chosen": -1.9243203401565552,
      "eval_logits/rejected": -1.8631280660629272,
      "eval_logps/chosen": -251.93479919433594,
      "eval_logps/rejected": -234.54112243652344,
      "eval_loss": 0.5211819410324097,
      "eval_rewards/accuracies": 0.8194444179534912,
      "eval_rewards/chosen": 0.6944435238838196,
      "eval_rewards/margins": 2.59578800201416,
      "eval_rewards/rejected": -1.9013442993164062,
      "eval_runtime": 228.1654,
      "eval_samples_per_second": 15.147,
      "eval_steps_per_second": 0.237,
      "step": 300
    },
    {
      "epoch": 2.5102880658436213,
      "grad_norm": 15.074747119995138,
      "learning_rate": 6.514512027604508e-08,
      "logits/chosen": -1.9279800653457642,
      "logits/rejected": -1.8792842626571655,
      "logps/chosen": -232.16232299804688,
      "logps/rejected": -224.8663330078125,
      "loss": 0.1173,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 1.4570283889770508,
      "rewards/margins": 4.1241984367370605,
      "rewards/rejected": -2.667170286178589,
      "step": 305
    },
    {
      "epoch": 2.551440329218107,
      "grad_norm": 15.943407922179238,
      "learning_rate": 5.459779144461712e-08,
      "logits/chosen": -1.967230200767517,
      "logits/rejected": -1.8994722366333008,
      "logps/chosen": -251.5553436279297,
      "logps/rejected": -234.64218139648438,
      "loss": 0.132,
      "rewards/accuracies": 0.925000011920929,
      "rewards/chosen": 1.8404948711395264,
      "rewards/margins": 4.55427885055542,
      "rewards/rejected": -2.7137837409973145,
      "step": 310
    },
    {
      "epoch": 2.5925925925925926,
      "grad_norm": 15.791999145358414,
      "learning_rate": 4.49322645442266e-08,
      "logits/chosen": -1.9726388454437256,
      "logits/rejected": -1.9029220342636108,
      "logps/chosen": -226.0243377685547,
      "logps/rejected": -245.57943725585938,
      "loss": 0.1327,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 1.4504220485687256,
      "rewards/margins": 4.483643531799316,
      "rewards/rejected": -3.033221483230591,
      "step": 315
    },
    {
      "epoch": 2.633744855967078,
      "grad_norm": 14.527344025713838,
      "learning_rate": 3.616767526868353e-08,
      "logits/chosen": -1.9656314849853516,
      "logits/rejected": -1.898186445236206,
      "logps/chosen": -268.8167419433594,
      "logps/rejected": -251.64340209960938,
      "loss": 0.1062,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 2.3940348625183105,
      "rewards/margins": 5.15994930267334,
      "rewards/rejected": -2.7659144401550293,
      "step": 320
    },
    {
      "epoch": 2.674897119341564,
      "grad_norm": 14.467899944932638,
      "learning_rate": 2.8321375646333023e-08,
      "logits/chosen": -1.984684944152832,
      "logits/rejected": -1.905601143836975,
      "logps/chosen": -226.098876953125,
      "logps/rejected": -269.22723388671875,
      "loss": 0.1209,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 1.9057655334472656,
      "rewards/margins": 4.68411922454834,
      "rewards/rejected": -2.778353691101074,
      "step": 325
    },
    {
      "epoch": 2.7160493827160495,
      "grad_norm": 14.107526535593529,
      "learning_rate": 2.1408899686718996e-08,
      "logits/chosen": -1.996860146522522,
      "logits/rejected": -1.8913567066192627,
      "logps/chosen": -248.2650909423828,
      "logps/rejected": -243.4825439453125,
      "loss": 0.1195,
      "rewards/accuracies": 0.956250011920929,
      "rewards/chosen": 1.7427318096160889,
      "rewards/margins": 4.723761558532715,
      "rewards/rejected": -2.981029987335205,
      "step": 330
    },
    {
      "epoch": 2.757201646090535,
      "grad_norm": 16.668582895840217,
      "learning_rate": 1.5443932626538314e-08,
      "logits/chosen": -1.9676933288574219,
      "logits/rejected": -1.910146713256836,
      "logps/chosen": -238.7953338623047,
      "logps/rejected": -224.4933319091797,
      "loss": 0.15,
      "rewards/accuracies": 0.9312499761581421,
      "rewards/chosen": 1.859580636024475,
      "rewards/margins": 4.216904640197754,
      "rewards/rejected": -2.3573238849639893,
      "step": 335
    },
    {
      "epoch": 2.7983539094650207,
      "grad_norm": 13.029805689567587,
      "learning_rate": 1.0438283835774387e-08,
      "logits/chosen": -1.9859317541122437,
      "logits/rejected": -1.8881919384002686,
      "logps/chosen": -242.4602508544922,
      "logps/rejected": -228.0737762451172,
      "loss": 0.1257,
      "rewards/accuracies": 0.925000011920929,
      "rewards/chosen": 1.7638639211654663,
      "rewards/margins": 4.470877170562744,
      "rewards/rejected": -2.7070131301879883,
      "step": 340
    },
    {
      "epoch": 2.8395061728395063,
      "grad_norm": 13.50245071791209,
      "learning_rate": 6.401863437648481e-09,
      "logits/chosen": -1.9783008098602295,
      "logits/rejected": -1.8936630487442017,
      "logps/chosen": -262.051025390625,
      "logps/rejected": -244.21853637695312,
      "loss": 0.1265,
      "rewards/accuracies": 0.9437500238418579,
      "rewards/chosen": 1.8696222305297852,
      "rewards/margins": 4.679049491882324,
      "rewards/rejected": -2.809427261352539,
      "step": 345
    },
    {
      "epoch": 2.8806584362139915,
      "grad_norm": 19.77488068943749,
      "learning_rate": 3.3426626886769448e-09,
      "logits/chosen": -1.9724162817001343,
      "logits/rejected": -1.9013561010360718,
      "logps/chosen": -265.6155700683594,
      "logps/rejected": -258.1453552246094,
      "loss": 0.1582,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 2.233098030090332,
      "rewards/margins": 4.871306419372559,
      "rewards/rejected": -2.6382088661193848,
      "step": 350
    },
    {
      "epoch": 2.9218106995884776,
      "grad_norm": 16.657133866108477,
      "learning_rate": 1.2667381576779712e-09,
      "logits/chosen": -1.9556434154510498,
      "logits/rejected": -1.890546202659607,
      "logps/chosen": -237.84500122070312,
      "logps/rejected": -261.2818298339844,
      "loss": 0.1363,
      "rewards/accuracies": 0.9437500238418579,
      "rewards/chosen": 1.9853665828704834,
      "rewards/margins": 5.180100440979004,
      "rewards/rejected": -3.194733142852783,
      "step": 355
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 12.623822906293494,
      "learning_rate": 1.7819973504940023e-10,
      "logits/chosen": -1.9709722995758057,
      "logits/rejected": -1.8710010051727295,
      "logps/chosen": -241.50997924804688,
      "logps/rejected": -266.9458923339844,
      "loss": 0.1258,
      "rewards/accuracies": 0.9437500238418579,
      "rewards/chosen": 1.947824478149414,
      "rewards/margins": 4.476650238037109,
      "rewards/rejected": -2.5288257598876953,
      "step": 360
    },
    {
      "epoch": 2.9876543209876543,
      "step": 363,
      "total_flos": 4280357159436288.0,
      "train_loss": 0.30565077164941584,
      "train_runtime": 13036.7565,
      "train_samples_per_second": 7.158,
      "train_steps_per_second": 0.028
    }
  ],
  "logging_steps": 5,
  "max_steps": 363,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4280357159436288.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}