{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1872, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016025641025641025, "grad_norm": 81.23951893705919, "learning_rate": 2.6595744680851063e-09, "logits/chosen": 0.1357421875, "logits/rejected": -0.2060546875, "logps/chosen": -336.0, "logps/rejected": -346.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.016025641025641024, "grad_norm": 86.32419393322078, "learning_rate": 2.6595744680851062e-08, "logits/chosen": -0.16015625, "logits/rejected": -0.474609375, "logps/chosen": -364.0, "logps/rejected": -312.0, "loss": 0.702, "rewards/accuracies": 0.1527777761220932, "rewards/chosen": -0.009033203125, "rewards/margins": -0.01806640625, "rewards/rejected": 0.009033203125, "step": 10 }, { "epoch": 0.03205128205128205, "grad_norm": 100.77964496481692, "learning_rate": 5.3191489361702123e-08, "logits/chosen": -0.16796875, "logits/rejected": -0.466796875, "logps/chosen": -376.0, "logps/rejected": -310.0, "loss": 0.6939, "rewards/accuracies": 0.21250000596046448, "rewards/chosen": -0.006561279296875, "rewards/margins": -0.0021820068359375, "rewards/rejected": -0.00439453125, "step": 20 }, { "epoch": 0.04807692307692308, "grad_norm": 98.51227524666062, "learning_rate": 7.978723404255319e-08, "logits/chosen": -0.22265625, "logits/rejected": -0.5234375, "logps/chosen": -390.0, "logps/rejected": -332.0, "loss": 0.6834, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": 0.0419921875, "rewards/margins": 0.03515625, "rewards/rejected": 0.00689697265625, "step": 30 }, { "epoch": 0.0641025641025641, "grad_norm": 73.06116455998657, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -0.23828125, "logits/rejected": -0.4296875, "logps/chosen": -370.0, "logps/rejected": -304.0, "loss": 0.6555, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.142578125, "rewards/margins": 0.09521484375, "rewards/rejected": 0.046875, "step": 40 }, { "epoch": 0.08012820512820513, "grad_norm": 76.17217165247452, "learning_rate": 1.329787234042553e-07, "logits/chosen": -0.2421875, "logits/rejected": -0.46875, "logps/chosen": -382.0, "logps/rejected": -302.0, "loss": 0.6393, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2578125, "rewards/margins": 0.177734375, "rewards/rejected": 0.07958984375, "step": 50 }, { "epoch": 0.09615384615384616, "grad_norm": 61.759030001028826, "learning_rate": 1.5957446808510638e-07, "logits/chosen": -0.0164794921875, "logits/rejected": -0.291015625, "logps/chosen": -358.0, "logps/rejected": -306.0, "loss": 0.6168, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.376953125, "rewards/margins": 0.2255859375, "rewards/rejected": 0.15234375, "step": 60 }, { "epoch": 0.11217948717948718, "grad_norm": 64.71686861478489, "learning_rate": 1.8617021276595742e-07, "logits/chosen": -0.283203125, "logits/rejected": -0.50390625, "logps/chosen": -396.0, "logps/rejected": -292.0, "loss": 0.5847, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.58203125, "rewards/margins": 0.37890625, "rewards/rejected": 0.201171875, "step": 70 }, { "epoch": 0.1282051282051282, "grad_norm": 52.52316366702727, "learning_rate": 2.127659574468085e-07, "logits/chosen": -0.35546875, "logits/rejected": -0.66796875, "logps/chosen": -368.0, "logps/rejected": -302.0, "loss": 0.5555, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.6640625, "rewards/margins": 0.484375, "rewards/rejected": 0.1787109375, "step": 80 }, { "epoch": 0.14423076923076922, "grad_norm": 59.670100120011845, "learning_rate": 2.393617021276596e-07, "logits/chosen": -0.251953125, "logits/rejected": -0.55078125, "logps/chosen": -388.0, "logps/rejected": -312.0, "loss": 0.539, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.8046875, "rewards/margins": 0.5703125, "rewards/rejected": 0.2353515625, "step": 90 }, { "epoch": 0.16025641025641027, "grad_norm": 66.87233629909264, "learning_rate": 2.659574468085106e-07, "logits/chosen": -0.314453125, "logits/rejected": -0.478515625, "logps/chosen": -346.0, "logps/rejected": -324.0, "loss": 0.4916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.85546875, "rewards/margins": 0.48046875, "rewards/rejected": 0.373046875, "step": 100 }, { "epoch": 0.1762820512820513, "grad_norm": 53.961384431262026, "learning_rate": 2.925531914893617e-07, "logits/chosen": -0.294921875, "logits/rejected": -0.65625, "logps/chosen": -386.0, "logps/rejected": -306.0, "loss": 0.5006, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.9140625, "rewards/margins": 0.640625, "rewards/rejected": 0.275390625, "step": 110 }, { "epoch": 0.19230769230769232, "grad_norm": 49.6732649258676, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.298828125, "logits/rejected": -0.40234375, "logps/chosen": -388.0, "logps/rejected": -308.0, "loss": 0.4499, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.1171875, "rewards/margins": 0.82421875, "rewards/rejected": 0.29296875, "step": 120 }, { "epoch": 0.20833333333333334, "grad_norm": 52.819440458631206, "learning_rate": 3.457446808510638e-07, "logits/chosen": -0.1591796875, "logits/rejected": -0.28515625, "logps/chosen": -374.0, "logps/rejected": -280.0, "loss": 0.4087, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.3671875, "rewards/margins": 1.203125, "rewards/rejected": 0.16015625, "step": 130 }, { "epoch": 0.22435897435897437, "grad_norm": 45.055792147900604, "learning_rate": 3.7234042553191484e-07, "logits/chosen": -0.2177734375, "logits/rejected": -0.380859375, "logps/chosen": -364.0, "logps/rejected": -300.0, "loss": 0.4263, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 1.328125, "rewards/margins": 1.1640625, "rewards/rejected": 0.1650390625, "step": 140 }, { "epoch": 0.2403846153846154, "grad_norm": 37.20442727443192, "learning_rate": 3.989361702127659e-07, "logits/chosen": -0.208984375, "logits/rejected": -0.421875, "logps/chosen": -392.0, "logps/rejected": -310.0, "loss": 0.4273, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.3671875, "rewards/margins": 1.265625, "rewards/rejected": 0.10205078125, "step": 150 }, { "epoch": 0.2564102564102564, "grad_norm": 67.23258640755056, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.1416015625, "logits/rejected": -0.365234375, "logps/chosen": -380.0, "logps/rejected": -322.0, "loss": 0.3995, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 1.1484375, "rewards/margins": 1.1953125, "rewards/rejected": -0.047119140625, "step": 160 }, { "epoch": 0.2724358974358974, "grad_norm": 76.82700510564892, "learning_rate": 4.5212765957446806e-07, "logits/chosen": -0.1376953125, "logits/rejected": -0.42578125, "logps/chosen": -360.0, "logps/rejected": -312.0, "loss": 0.4255, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1171875, "rewards/margins": 1.125, "rewards/rejected": -0.0093994140625, "step": 170 }, { "epoch": 0.28846153846153844, "grad_norm": 57.73247757369804, "learning_rate": 4.787234042553192e-07, "logits/chosen": -0.083984375, "logits/rejected": -0.1953125, "logps/chosen": -368.0, "logps/rejected": -316.0, "loss": 0.3897, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3125, "rewards/margins": 1.328125, "rewards/rejected": -0.01129150390625, "step": 180 }, { "epoch": 0.30448717948717946, "grad_norm": 93.3245464788157, "learning_rate": 4.994061757719714e-07, "logits/chosen": -0.158203125, "logits/rejected": -0.408203125, "logps/chosen": -344.0, "logps/rejected": -318.0, "loss": 0.4427, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.3828125, "rewards/margins": 1.515625, "rewards/rejected": -0.130859375, "step": 190 }, { "epoch": 0.32051282051282054, "grad_norm": 31.0057130908168, "learning_rate": 4.96437054631829e-07, "logits/chosen": -0.185546875, "logits/rejected": -0.333984375, "logps/chosen": -398.0, "logps/rejected": -276.0, "loss": 0.3853, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2578125, "rewards/margins": 1.5546875, "rewards/rejected": -0.298828125, "step": 200 }, { "epoch": 0.33653846153846156, "grad_norm": 31.866164526406457, "learning_rate": 4.934679334916864e-07, "logits/chosen": -0.134765625, "logits/rejected": -0.427734375, "logps/chosen": -372.0, "logps/rejected": -334.0, "loss": 0.2608, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.4375, "rewards/margins": 1.703125, "rewards/rejected": -0.267578125, "step": 210 }, { "epoch": 0.3525641025641026, "grad_norm": 26.376971522466825, "learning_rate": 4.904988123515439e-07, "logits/chosen": -0.0673828125, "logits/rejected": -0.150390625, "logps/chosen": -392.0, "logps/rejected": -294.0, "loss": 0.313, "rewards/accuracies": 0.875, "rewards/chosen": 1.6796875, "rewards/margins": 1.921875, "rewards/rejected": -0.2451171875, "step": 220 }, { "epoch": 0.3685897435897436, "grad_norm": 19.856366106787778, "learning_rate": 4.875296912114014e-07, "logits/chosen": -0.068359375, "logits/rejected": -0.373046875, "logps/chosen": -342.0, "logps/rejected": -298.0, "loss": 0.2838, "rewards/accuracies": 0.875, "rewards/chosen": 1.46875, "rewards/margins": 2.125, "rewards/rejected": -0.65625, "step": 230 }, { "epoch": 0.38461538461538464, "grad_norm": 55.84167138753407, "learning_rate": 4.845605700712589e-07, "logits/chosen": -0.12890625, "logits/rejected": -0.443359375, "logps/chosen": -368.0, "logps/rejected": -322.0, "loss": 0.2748, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 1.5546875, "rewards/margins": 2.5, "rewards/rejected": -0.953125, "step": 240 }, { "epoch": 0.40064102564102566, "grad_norm": 37.23188688423797, "learning_rate": 4.815914489311164e-07, "logits/chosen": -0.1357421875, "logits/rejected": -0.427734375, "logps/chosen": -364.0, "logps/rejected": -324.0, "loss": 0.2543, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6640625, "rewards/margins": 2.734375, "rewards/rejected": -1.0703125, "step": 250 }, { "epoch": 0.4166666666666667, "grad_norm": 35.660892697083206, "learning_rate": 4.786223277909738e-07, "logits/chosen": -0.123046875, "logits/rejected": -0.470703125, "logps/chosen": -366.0, "logps/rejected": -310.0, "loss": 0.2583, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 1.1484375, "rewards/margins": 2.375, "rewards/rejected": -1.234375, "step": 260 }, { "epoch": 0.4326923076923077, "grad_norm": 46.54757231137472, "learning_rate": 4.756532066508313e-07, "logits/chosen": -0.2021484375, "logits/rejected": -0.328125, "logps/chosen": -374.0, "logps/rejected": -340.0, "loss": 0.4125, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.1015625, "rewards/margins": 1.9765625, "rewards/rejected": -0.875, "step": 270 }, { "epoch": 0.44871794871794873, "grad_norm": 37.544511359682254, "learning_rate": 4.7268408551068883e-07, "logits/chosen": 0.03857421875, "logits/rejected": -0.54296875, "logps/chosen": -384.0, "logps/rejected": -342.0, "loss": 0.2471, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 1.1796875, "rewards/margins": 2.5, "rewards/rejected": -1.3203125, "step": 280 }, { "epoch": 0.46474358974358976, "grad_norm": 56.789270954370565, "learning_rate": 4.697149643705463e-07, "logits/chosen": -0.07958984375, "logits/rejected": -0.29296875, "logps/chosen": -348.0, "logps/rejected": -328.0, "loss": 0.297, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.4140625, "rewards/margins": 2.46875, "rewards/rejected": -1.0546875, "step": 290 }, { "epoch": 0.4807692307692308, "grad_norm": 14.22992572113077, "learning_rate": 4.667458432304038e-07, "logits/chosen": -0.2021484375, "logits/rejected": -0.388671875, "logps/chosen": -372.0, "logps/rejected": -322.0, "loss": 0.2564, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.6796875, "rewards/margins": 2.765625, "rewards/rejected": -1.0859375, "step": 300 }, { "epoch": 0.4967948717948718, "grad_norm": 36.87971897536812, "learning_rate": 4.6377672209026127e-07, "logits/chosen": -0.10888671875, "logits/rejected": -0.412109375, "logps/chosen": -390.0, "logps/rejected": -330.0, "loss": 0.1529, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.109375, "rewards/margins": 3.328125, "rewards/rejected": -1.2109375, "step": 310 }, { "epoch": 0.5128205128205128, "grad_norm": 31.81729836940978, "learning_rate": 4.6080760095011875e-07, "logits/chosen": -0.154296875, "logits/rejected": -0.35546875, "logps/chosen": -374.0, "logps/rejected": -304.0, "loss": 0.26, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.859375, "rewards/margins": 2.875, "rewards/rejected": -1.0234375, "step": 320 }, { "epoch": 0.5288461538461539, "grad_norm": 27.609497368983103, "learning_rate": 4.578384798099763e-07, "logits/chosen": -0.0849609375, "logits/rejected": -0.515625, "logps/chosen": -338.0, "logps/rejected": -328.0, "loss": 0.2903, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.6953125, "rewards/margins": 2.859375, "rewards/rejected": -1.171875, "step": 330 }, { "epoch": 0.5448717948717948, "grad_norm": 65.37858577012295, "learning_rate": 4.548693586698337e-07, "logits/chosen": -0.11083984375, "logits/rejected": -0.28125, "logps/chosen": -382.0, "logps/rejected": -338.0, "loss": 0.3046, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 1.7421875, "rewards/margins": 3.15625, "rewards/rejected": -1.4140625, "step": 340 }, { "epoch": 0.5608974358974359, "grad_norm": 53.122076726933024, "learning_rate": 4.519002375296912e-07, "logits/chosen": -0.1123046875, "logits/rejected": -0.46875, "logps/chosen": -370.0, "logps/rejected": -358.0, "loss": 0.3242, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.390625, "rewards/margins": 2.765625, "rewards/rejected": -1.375, "step": 350 }, { "epoch": 0.5769230769230769, "grad_norm": 29.215237884321418, "learning_rate": 4.4893111638954866e-07, "logits/chosen": -0.267578125, "logits/rejected": -0.3515625, "logps/chosen": -380.0, "logps/rejected": -324.0, "loss": 0.1967, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5390625, "rewards/margins": 3.296875, "rewards/rejected": -1.7578125, "step": 360 }, { "epoch": 0.592948717948718, "grad_norm": 23.99951739667445, "learning_rate": 4.4596199524940614e-07, "logits/chosen": -0.24609375, "logits/rejected": -0.376953125, "logps/chosen": -392.0, "logps/rejected": -328.0, "loss": 0.2886, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.0234375, "rewards/margins": 2.515625, "rewards/rejected": -1.5, "step": 370 }, { "epoch": 0.6089743589743589, "grad_norm": 73.48089183856715, "learning_rate": 4.429928741092636e-07, "logits/chosen": -0.1640625, "logits/rejected": -0.443359375, "logps/chosen": -378.0, "logps/rejected": -346.0, "loss": 0.2243, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.53125, "rewards/margins": 3.125, "rewards/rejected": -1.59375, "step": 380 }, { "epoch": 0.625, "grad_norm": 34.2482977900771, "learning_rate": 4.4002375296912115e-07, "logits/chosen": -0.109375, "logits/rejected": -0.5625, "logps/chosen": -382.0, "logps/rejected": -344.0, "loss": 0.1972, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.46875, "rewards/margins": 3.09375, "rewards/rejected": -1.6328125, "step": 390 }, { "epoch": 0.6410256410256411, "grad_norm": 46.4216688916151, "learning_rate": 4.3705463182897863e-07, "logits/chosen": -0.1435546875, "logits/rejected": -0.3359375, "logps/chosen": -372.0, "logps/rejected": -340.0, "loss": 0.2644, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.265625, "rewards/margins": 3.796875, "rewards/rejected": -1.53125, "step": 400 }, { "epoch": 0.657051282051282, "grad_norm": 44.3943999019337, "learning_rate": 4.340855106888361e-07, "logits/chosen": -0.1494140625, "logits/rejected": -0.53515625, "logps/chosen": -368.0, "logps/rejected": -334.0, "loss": 0.3043, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6640625, "rewards/margins": 3.265625, "rewards/rejected": -1.59375, "step": 410 }, { "epoch": 0.6730769230769231, "grad_norm": 49.38538485619709, "learning_rate": 4.311163895486936e-07, "logits/chosen": -0.2451171875, "logits/rejected": -0.353515625, "logps/chosen": -374.0, "logps/rejected": -332.0, "loss": 0.2839, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 1.5546875, "rewards/margins": 2.96875, "rewards/rejected": -1.4140625, "step": 420 }, { "epoch": 0.6891025641025641, "grad_norm": 52.14918805667079, "learning_rate": 4.28147268408551e-07, "logits/chosen": -0.142578125, "logits/rejected": -0.265625, "logps/chosen": -392.0, "logps/rejected": -308.0, "loss": 0.2573, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.71875, "rewards/margins": 3.546875, "rewards/rejected": -1.8359375, "step": 430 }, { "epoch": 0.7051282051282052, "grad_norm": 41.67427154859993, "learning_rate": 4.251781472684085e-07, "logits/chosen": -0.2041015625, "logits/rejected": -0.19140625, "logps/chosen": -382.0, "logps/rejected": -302.0, "loss": 0.2207, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9609375, "rewards/margins": 2.921875, "rewards/rejected": -0.96484375, "step": 440 }, { "epoch": 0.7211538461538461, "grad_norm": 49.967389161188926, "learning_rate": 4.22209026128266e-07, "logits/chosen": -0.216796875, "logits/rejected": -0.33984375, "logps/chosen": -368.0, "logps/rejected": -316.0, "loss": 0.3459, "rewards/accuracies": 0.875, "rewards/chosen": 1.46875, "rewards/margins": 2.75, "rewards/rejected": -1.28125, "step": 450 }, { "epoch": 0.7371794871794872, "grad_norm": 45.29394404835159, "learning_rate": 4.192399049881235e-07, "logits/chosen": -0.259765625, "logits/rejected": -0.462890625, "logps/chosen": -392.0, "logps/rejected": -334.0, "loss": 0.2416, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.765625, "rewards/margins": 3.296875, "rewards/rejected": -1.53125, "step": 460 }, { "epoch": 0.7532051282051282, "grad_norm": 49.65359950033634, "learning_rate": 4.16270783847981e-07, "logits/chosen": -0.09814453125, "logits/rejected": -0.384765625, "logps/chosen": -340.0, "logps/rejected": -312.0, "loss": 0.2871, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.25, "rewards/margins": 3.34375, "rewards/rejected": -1.09375, "step": 470 }, { "epoch": 0.7692307692307693, "grad_norm": 34.79083088842419, "learning_rate": 4.1330166270783846e-07, "logits/chosen": -0.1796875, "logits/rejected": -0.423828125, "logps/chosen": -358.0, "logps/rejected": -344.0, "loss": 0.2121, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.453125, "rewards/margins": 3.25, "rewards/rejected": -1.8046875, "step": 480 }, { "epoch": 0.7852564102564102, "grad_norm": 24.646692522257364, "learning_rate": 4.1033254156769594e-07, "logits/chosen": -0.1435546875, "logits/rejected": -0.390625, "logps/chosen": -360.0, "logps/rejected": -332.0, "loss": 0.2359, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.6640625, "rewards/margins": 3.421875, "rewards/rejected": -1.75, "step": 490 }, { "epoch": 0.8012820512820513, "grad_norm": 32.88069299074813, "learning_rate": 4.0736342042755347e-07, "logits/chosen": -0.216796875, "logits/rejected": -0.5390625, "logps/chosen": -380.0, "logps/rejected": -328.0, "loss": 0.2243, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9609375, "rewards/margins": 2.90625, "rewards/rejected": -1.9453125, "step": 500 }, { "epoch": 0.8173076923076923, "grad_norm": 35.40990218888171, "learning_rate": 4.0439429928741095e-07, "logits/chosen": -0.173828125, "logits/rejected": -0.302734375, "logps/chosen": -388.0, "logps/rejected": -320.0, "loss": 0.2403, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.421875, "rewards/margins": 3.203125, "rewards/rejected": -1.78125, "step": 510 }, { "epoch": 0.8333333333333334, "grad_norm": 45.40116394651564, "learning_rate": 4.0142517814726837e-07, "logits/chosen": -0.220703125, "logits/rejected": -0.37109375, "logps/chosen": -398.0, "logps/rejected": -340.0, "loss": 0.2562, "rewards/accuracies": 0.875, "rewards/chosen": 1.4375, "rewards/margins": 3.0, "rewards/rejected": -1.5625, "step": 520 }, { "epoch": 0.8493589743589743, "grad_norm": 38.39116264620735, "learning_rate": 3.9845605700712585e-07, "logits/chosen": -0.1953125, "logits/rejected": -0.244140625, "logps/chosen": -374.0, "logps/rejected": -288.0, "loss": 0.2727, "rewards/accuracies": 0.875, "rewards/chosen": 1.3671875, "rewards/margins": 3.046875, "rewards/rejected": -1.6875, "step": 530 }, { "epoch": 0.8653846153846154, "grad_norm": 47.38780960867294, "learning_rate": 3.9548693586698333e-07, "logits/chosen": -0.24609375, "logits/rejected": -0.4140625, "logps/chosen": -372.0, "logps/rejected": -318.0, "loss": 0.2017, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.9296875, "rewards/margins": 3.421875, "rewards/rejected": -1.4921875, "step": 540 }, { "epoch": 0.8814102564102564, "grad_norm": 53.98621268772635, "learning_rate": 3.925178147268408e-07, "logits/chosen": -0.322265625, "logits/rejected": -0.283203125, "logps/chosen": -358.0, "logps/rejected": -314.0, "loss": 0.2358, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.078125, "rewards/margins": 3.234375, "rewards/rejected": -1.1640625, "step": 550 }, { "epoch": 0.8974358974358975, "grad_norm": 57.737496610523515, "learning_rate": 3.8954869358669834e-07, "logits/chosen": -0.1767578125, "logits/rejected": -0.271484375, "logps/chosen": -384.0, "logps/rejected": -332.0, "loss": 0.2313, "rewards/accuracies": 0.875, "rewards/chosen": 1.8125, "rewards/margins": 3.203125, "rewards/rejected": -1.3828125, "step": 560 }, { "epoch": 0.9134615384615384, "grad_norm": 29.929989102081784, "learning_rate": 3.865795724465558e-07, "logits/chosen": -0.26171875, "logits/rejected": -0.466796875, "logps/chosen": -366.0, "logps/rejected": -302.0, "loss": 0.2186, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.0, "rewards/margins": 3.78125, "rewards/rejected": -1.78125, "step": 570 }, { "epoch": 0.9294871794871795, "grad_norm": 14.744590527736637, "learning_rate": 3.836104513064133e-07, "logits/chosen": -0.30078125, "logits/rejected": -0.46875, "logps/chosen": -364.0, "logps/rejected": -324.0, "loss": 0.1895, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.84375, "rewards/margins": 3.75, "rewards/rejected": -1.9140625, "step": 580 }, { "epoch": 0.9455128205128205, "grad_norm": 43.4367923218673, "learning_rate": 3.806413301662708e-07, "logits/chosen": -0.154296875, "logits/rejected": -0.5, "logps/chosen": -366.0, "logps/rejected": -348.0, "loss": 0.2121, "rewards/accuracies": 0.875, "rewards/chosen": 1.625, "rewards/margins": 3.546875, "rewards/rejected": -1.9140625, "step": 590 }, { "epoch": 0.9615384615384616, "grad_norm": 14.598190302159233, "learning_rate": 3.7767220902612825e-07, "logits/chosen": -0.21875, "logits/rejected": -0.35546875, "logps/chosen": -374.0, "logps/rejected": -328.0, "loss": 0.2134, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4609375, "rewards/margins": 3.59375, "rewards/rejected": -2.140625, "step": 600 }, { "epoch": 0.9775641025641025, "grad_norm": 65.50233470122603, "learning_rate": 3.747030878859858e-07, "logits/chosen": -0.09375, "logits/rejected": -0.36328125, "logps/chosen": -348.0, "logps/rejected": -332.0, "loss": 0.3465, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.484375, "rewards/margins": 3.65625, "rewards/rejected": -2.171875, "step": 610 }, { "epoch": 0.9935897435897436, "grad_norm": 45.476213003555294, "learning_rate": 3.717339667458432e-07, "logits/chosen": -0.158203125, "logits/rejected": -0.439453125, "logps/chosen": -364.0, "logps/rejected": -330.0, "loss": 0.2557, "rewards/accuracies": 0.875, "rewards/chosen": 1.1796875, "rewards/margins": 3.140625, "rewards/rejected": -1.953125, "step": 620 }, { "epoch": 1.0, "eval_logits/chosen": -0.2333984375, "eval_logits/rejected": -0.328125, "eval_logps/chosen": -366.0, "eval_logps/rejected": -340.0, "eval_loss": 0.20789062976837158, "eval_rewards/accuracies": 0.8942307829856873, "eval_rewards/chosen": 1.5859375, "eval_rewards/margins": 3.375, "eval_rewards/rejected": -1.7890625, "eval_runtime": 28.0659, "eval_samples_per_second": 7.126, "eval_steps_per_second": 0.463, "step": 624 }, { "epoch": 1.0096153846153846, "grad_norm": 6.092991588272169, "learning_rate": 3.687648456057007e-07, "logits/chosen": -0.2734375, "logits/rejected": -0.388671875, "logps/chosen": -350.0, "logps/rejected": -322.0, "loss": 0.1514, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.7734375, "rewards/margins": 3.96875, "rewards/rejected": -2.203125, "step": 630 }, { "epoch": 1.0256410256410255, "grad_norm": 15.11610385237531, "learning_rate": 3.6579572446555817e-07, "logits/chosen": -0.21875, "logits/rejected": -0.46484375, "logps/chosen": -356.0, "logps/rejected": -374.0, "loss": 0.0783, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.921875, "rewards/margins": 4.8125, "rewards/rejected": -2.890625, "step": 640 }, { "epoch": 1.0416666666666667, "grad_norm": 8.924081971380067, "learning_rate": 3.6282660332541565e-07, "logits/chosen": -0.36328125, "logits/rejected": -0.5, "logps/chosen": -378.0, "logps/rejected": -348.0, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 1.7421875, "rewards/margins": 4.59375, "rewards/rejected": -2.84375, "step": 650 }, { "epoch": 1.0576923076923077, "grad_norm": 8.969543433658254, "learning_rate": 3.598574821852731e-07, "logits/chosen": -0.1572265625, "logits/rejected": -0.2890625, "logps/chosen": -364.0, "logps/rejected": -344.0, "loss": 0.1064, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.109375, "rewards/margins": 4.1875, "rewards/rejected": -2.078125, "step": 660 }, { "epoch": 1.0737179487179487, "grad_norm": 20.194350083453877, "learning_rate": 3.5688836104513066e-07, "logits/chosen": -0.1884765625, "logits/rejected": -0.427734375, "logps/chosen": -376.0, "logps/rejected": -326.0, "loss": 0.0802, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.109375, "rewards/margins": 4.3125, "rewards/rejected": -2.1875, "step": 670 }, { "epoch": 1.0897435897435896, "grad_norm": 9.472914566833554, "learning_rate": 3.5391923990498813e-07, "logits/chosen": -0.1494140625, "logits/rejected": -0.39453125, "logps/chosen": -354.0, "logps/rejected": -330.0, "loss": 0.0748, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.671875, "rewards/margins": 5.0625, "rewards/rejected": -2.390625, "step": 680 }, { "epoch": 1.1057692307692308, "grad_norm": 27.091485921309573, "learning_rate": 3.509501187648456e-07, "logits/chosen": -0.2578125, "logits/rejected": -0.359375, "logps/chosen": -368.0, "logps/rejected": -324.0, "loss": 0.0862, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.140625, "rewards/margins": 4.71875, "rewards/rejected": -2.5625, "step": 690 }, { "epoch": 1.1217948717948718, "grad_norm": 23.55129242956029, "learning_rate": 3.479809976247031e-07, "logits/chosen": -0.1875, "logits/rejected": -0.25, "logps/chosen": -352.0, "logps/rejected": -314.0, "loss": 0.0667, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.21875, "rewards/margins": 4.65625, "rewards/rejected": -2.4375, "step": 700 }, { "epoch": 1.1378205128205128, "grad_norm": 16.191916223045457, "learning_rate": 3.450118764845605e-07, "logits/chosen": -0.2431640625, "logits/rejected": -0.4453125, "logps/chosen": -368.0, "logps/rejected": -352.0, "loss": 0.1058, "rewards/accuracies": 0.9375, "rewards/chosen": 1.953125, "rewards/margins": 4.84375, "rewards/rejected": -2.890625, "step": 710 }, { "epoch": 1.1538461538461537, "grad_norm": 6.197082349513418, "learning_rate": 3.42042755344418e-07, "logits/chosen": -0.193359375, "logits/rejected": -0.298828125, "logps/chosen": -420.0, "logps/rejected": -352.0, "loss": 0.1152, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.140625, "rewards/margins": 4.8125, "rewards/rejected": -3.65625, "step": 720 }, { "epoch": 1.169871794871795, "grad_norm": 20.175263771423996, "learning_rate": 3.390736342042755e-07, "logits/chosen": -0.0849609375, "logits/rejected": -0.30859375, "logps/chosen": -368.0, "logps/rejected": -340.0, "loss": 0.0717, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.8203125, "rewards/margins": 5.125, "rewards/rejected": -3.296875, "step": 730 }, { "epoch": 1.185897435897436, "grad_norm": 14.053624440757009, "learning_rate": 3.36104513064133e-07, "logits/chosen": -0.1591796875, "logits/rejected": -0.373046875, "logps/chosen": -368.0, "logps/rejected": -342.0, "loss": 0.0763, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0, "rewards/margins": 5.53125, "rewards/rejected": -3.515625, "step": 740 }, { "epoch": 1.2019230769230769, "grad_norm": 19.07797922710506, "learning_rate": 3.331353919239905e-07, "logits/chosen": -0.134765625, "logits/rejected": -0.37109375, "logps/chosen": -374.0, "logps/rejected": -346.0, "loss": 0.0837, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.859375, "rewards/margins": 4.9375, "rewards/rejected": -3.078125, "step": 750 }, { "epoch": 1.217948717948718, "grad_norm": 3.621341208921815, "learning_rate": 3.3016627078384796e-07, "logits/chosen": -0.1337890625, "logits/rejected": -0.447265625, "logps/chosen": -382.0, "logps/rejected": -362.0, "loss": 0.0574, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7734375, "rewards/margins": 5.0625, "rewards/rejected": -3.28125, "step": 760 }, { "epoch": 1.233974358974359, "grad_norm": 32.55125289195706, "learning_rate": 3.2719714964370544e-07, "logits/chosen": -0.2099609375, "logits/rejected": -0.408203125, "logps/chosen": -354.0, "logps/rejected": -330.0, "loss": 0.0801, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.203125, "rewards/margins": 5.6875, "rewards/rejected": -3.46875, "step": 770 }, { "epoch": 1.25, "grad_norm": 22.82618457101383, "learning_rate": 3.2422802850356297e-07, "logits/chosen": -0.265625, "logits/rejected": -0.486328125, "logps/chosen": -362.0, "logps/rejected": -334.0, "loss": 0.0937, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.03125, "rewards/margins": 5.0625, "rewards/rejected": -3.046875, "step": 780 }, { "epoch": 1.266025641025641, "grad_norm": 49.835975518914886, "learning_rate": 3.2125890736342045e-07, "logits/chosen": -0.24609375, "logits/rejected": -0.447265625, "logps/chosen": -360.0, "logps/rejected": -352.0, "loss": 0.0787, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.140625, "rewards/margins": 5.5, "rewards/rejected": -3.375, "step": 790 }, { "epoch": 1.282051282051282, "grad_norm": 3.1313003868772107, "learning_rate": 3.182897862232779e-07, "logits/chosen": -0.283203125, "logits/rejected": -0.287109375, "logps/chosen": -372.0, "logps/rejected": -326.0, "loss": 0.1086, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.6171875, "rewards/margins": 4.78125, "rewards/rejected": -3.1875, "step": 800 }, { "epoch": 1.2980769230769231, "grad_norm": 13.950684325273025, "learning_rate": 3.1532066508313535e-07, "logits/chosen": -0.234375, "logits/rejected": -0.640625, "logps/chosen": -344.0, "logps/rejected": -344.0, "loss": 0.0843, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.140625, "rewards/margins": 5.0625, "rewards/rejected": -2.90625, "step": 810 }, { "epoch": 1.314102564102564, "grad_norm": 6.930473305635045, "learning_rate": 3.1235154394299283e-07, "logits/chosen": -0.341796875, "logits/rejected": -0.326171875, "logps/chosen": -362.0, "logps/rejected": -344.0, "loss": 0.0667, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.984375, "rewards/margins": 5.21875, "rewards/rejected": -3.234375, "step": 820 }, { "epoch": 1.330128205128205, "grad_norm": 21.31500643130334, "learning_rate": 3.093824228028503e-07, "logits/chosen": -0.1748046875, "logits/rejected": -0.443359375, "logps/chosen": -366.0, "logps/rejected": -342.0, "loss": 0.0595, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.984375, "rewards/margins": 5.5625, "rewards/rejected": -3.5625, "step": 830 }, { "epoch": 1.3461538461538463, "grad_norm": 32.89803571469975, "learning_rate": 3.0641330166270784e-07, "logits/chosen": -0.234375, "logits/rejected": -0.34375, "logps/chosen": -330.0, "logps/rejected": -374.0, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": 2.28125, "rewards/margins": 6.0625, "rewards/rejected": -3.8125, "step": 840 }, { "epoch": 1.3621794871794872, "grad_norm": 32.22922106524605, "learning_rate": 3.034441805225653e-07, "logits/chosen": -0.279296875, "logits/rejected": -0.32421875, "logps/chosen": -358.0, "logps/rejected": -358.0, "loss": 0.0406, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.609375, "rewards/margins": 6.21875, "rewards/rejected": -3.59375, "step": 850 }, { "epoch": 1.3782051282051282, "grad_norm": 28.46125519366018, "learning_rate": 3.004750593824228e-07, "logits/chosen": -0.2294921875, "logits/rejected": -0.19140625, "logps/chosen": -348.0, "logps/rejected": -330.0, "loss": 0.0549, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.109375, "rewards/margins": 6.15625, "rewards/rejected": -4.0625, "step": 860 }, { "epoch": 1.3942307692307692, "grad_norm": 14.047872703813512, "learning_rate": 2.975059382422803e-07, "logits/chosen": -0.2490234375, "logits/rejected": -0.359375, "logps/chosen": -366.0, "logps/rejected": -316.0, "loss": 0.0797, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.296875, "rewards/margins": 6.375, "rewards/rejected": -4.09375, "step": 870 }, { "epoch": 1.4102564102564101, "grad_norm": 11.52369059409131, "learning_rate": 2.9453681710213776e-07, "logits/chosen": -0.26953125, "logits/rejected": -0.359375, "logps/chosen": -400.0, "logps/rejected": -348.0, "loss": 0.052, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.046875, "rewards/margins": 5.90625, "rewards/rejected": -3.84375, "step": 880 }, { "epoch": 1.4262820512820513, "grad_norm": 15.524036946329323, "learning_rate": 2.915676959619953e-07, "logits/chosen": -0.1923828125, "logits/rejected": -0.283203125, "logps/chosen": -364.0, "logps/rejected": -358.0, "loss": 0.0819, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.125, "rewards/margins": 6.1875, "rewards/rejected": -4.0625, "step": 890 }, { "epoch": 1.4423076923076923, "grad_norm": 56.41338438001929, "learning_rate": 2.885985748218527e-07, "logits/chosen": -0.2314453125, "logits/rejected": -0.427734375, "logps/chosen": -374.0, "logps/rejected": -352.0, "loss": 0.1086, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.546875, "rewards/margins": 5.65625, "rewards/rejected": -4.125, "step": 900 }, { "epoch": 1.4583333333333333, "grad_norm": 35.802404269657636, "learning_rate": 2.856294536817102e-07, "logits/chosen": -0.169921875, "logits/rejected": -0.296875, "logps/chosen": -366.0, "logps/rejected": -352.0, "loss": 0.0569, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.09375, "rewards/margins": 5.8125, "rewards/rejected": -3.71875, "step": 910 }, { "epoch": 1.4743589743589745, "grad_norm": 3.0990206807529574, "learning_rate": 2.8266033254156767e-07, "logits/chosen": -0.310546875, "logits/rejected": -0.16015625, "logps/chosen": -364.0, "logps/rejected": -358.0, "loss": 0.0532, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.359375, "rewards/margins": 5.75, "rewards/rejected": -3.390625, "step": 920 }, { "epoch": 1.4903846153846154, "grad_norm": 6.630507067800798, "learning_rate": 2.7969121140142515e-07, "logits/chosen": -0.2099609375, "logits/rejected": -0.384765625, "logps/chosen": -352.0, "logps/rejected": -344.0, "loss": 0.1117, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.78125, "rewards/margins": 5.9375, "rewards/rejected": -3.140625, "step": 930 }, { "epoch": 1.5064102564102564, "grad_norm": 3.9527473386787992, "learning_rate": 2.7672209026128263e-07, "logits/chosen": -0.150390625, "logits/rejected": -0.353515625, "logps/chosen": -364.0, "logps/rejected": -326.0, "loss": 0.1115, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.71875, "rewards/margins": 5.375, "rewards/rejected": -2.65625, "step": 940 }, { "epoch": 1.5224358974358974, "grad_norm": 21.74505432942635, "learning_rate": 2.7375296912114016e-07, "logits/chosen": -0.224609375, "logits/rejected": -0.3203125, "logps/chosen": -388.0, "logps/rejected": -364.0, "loss": 0.0695, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.765625, "rewards/margins": 5.5, "rewards/rejected": -2.75, "step": 950 }, { "epoch": 1.5384615384615383, "grad_norm": 34.5728416770996, "learning_rate": 2.7078384798099764e-07, "logits/chosen": -0.2197265625, "logits/rejected": -0.412109375, "logps/chosen": -356.0, "logps/rejected": -328.0, "loss": 0.0797, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.9375, "rewards/margins": 6.40625, "rewards/rejected": -3.46875, "step": 960 }, { "epoch": 1.5544871794871795, "grad_norm": 10.226160428385189, "learning_rate": 2.678147268408551e-07, "logits/chosen": -0.1787109375, "logits/rejected": -0.4140625, "logps/chosen": -372.0, "logps/rejected": -330.0, "loss": 0.0415, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.40625, "rewards/margins": 6.40625, "rewards/rejected": -3.984375, "step": 970 }, { "epoch": 1.5705128205128205, "grad_norm": 2.887418080486524, "learning_rate": 2.648456057007126e-07, "logits/chosen": -0.1357421875, "logits/rejected": -0.318359375, "logps/chosen": -388.0, "logps/rejected": -320.0, "loss": 0.0622, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.65625, "rewards/margins": 6.40625, "rewards/rejected": -3.734375, "step": 980 }, { "epoch": 1.5865384615384617, "grad_norm": 25.20155218665509, "learning_rate": 2.6187648456057e-07, "logits/chosen": -0.1044921875, "logits/rejected": -0.37890625, "logps/chosen": -330.0, "logps/rejected": -326.0, "loss": 0.0672, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.921875, "rewards/margins": 6.75, "rewards/rejected": -3.828125, "step": 990 }, { "epoch": 1.6025641025641026, "grad_norm": 2.9326020885487107, "learning_rate": 2.589073634204275e-07, "logits/chosen": -0.244140625, "logits/rejected": -0.4609375, "logps/chosen": -370.0, "logps/rejected": -336.0, "loss": 0.0686, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.4375, "rewards/margins": 6.375, "rewards/rejected": -3.9375, "step": 1000 }, { "epoch": 1.6185897435897436, "grad_norm": 2.6087260544675948, "learning_rate": 2.5593824228028503e-07, "logits/chosen": -0.330078125, "logits/rejected": -0.59765625, "logps/chosen": -366.0, "logps/rejected": -384.0, "loss": 0.0506, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.390625, "rewards/margins": 6.4375, "rewards/rejected": -4.0625, "step": 1010 }, { "epoch": 1.6346153846153846, "grad_norm": 46.202092572333505, "learning_rate": 2.529691211401425e-07, "logits/chosen": -0.10791015625, "logits/rejected": -0.40234375, "logps/chosen": -370.0, "logps/rejected": -368.0, "loss": 0.1479, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9609375, "rewards/margins": 5.6875, "rewards/rejected": -3.71875, "step": 1020 }, { "epoch": 1.6506410256410255, "grad_norm": 11.039931814429327, "learning_rate": 2.5e-07, "logits/chosen": -0.1875, "logits/rejected": -0.357421875, "logps/chosen": -398.0, "logps/rejected": -348.0, "loss": 0.0695, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6875, "rewards/margins": 5.59375, "rewards/rejected": -3.90625, "step": 1030 }, { "epoch": 1.6666666666666665, "grad_norm": 19.29266338720831, "learning_rate": 2.4703087885985747e-07, "logits/chosen": -0.2353515625, "logits/rejected": -0.4609375, "logps/chosen": -364.0, "logps/rejected": -350.0, "loss": 0.0767, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5078125, "rewards/margins": 6.15625, "rewards/rejected": -4.65625, "step": 1040 }, { "epoch": 1.6826923076923077, "grad_norm": 42.04473353572841, "learning_rate": 2.4406175771971495e-07, "logits/chosen": -0.375, "logits/rejected": -0.515625, "logps/chosen": -374.0, "logps/rejected": -356.0, "loss": 0.0784, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8046875, "rewards/margins": 5.84375, "rewards/rejected": -4.0625, "step": 1050 }, { "epoch": 1.6987179487179487, "grad_norm": 48.05824905529893, "learning_rate": 2.410926365795724e-07, "logits/chosen": -0.2109375, "logits/rejected": -0.353515625, "logps/chosen": -380.0, "logps/rejected": -342.0, "loss": 0.0879, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.7109375, "rewards/margins": 5.40625, "rewards/rejected": -3.703125, "step": 1060 }, { "epoch": 1.7147435897435899, "grad_norm": 9.491221178753944, "learning_rate": 2.381235154394299e-07, "logits/chosen": -0.2138671875, "logits/rejected": -0.4375, "logps/chosen": -372.0, "logps/rejected": -358.0, "loss": 0.0497, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.296875, "rewards/margins": 6.4375, "rewards/rejected": -4.125, "step": 1070 }, { "epoch": 1.7307692307692308, "grad_norm": 40.31044273857979, "learning_rate": 2.351543942992874e-07, "logits/chosen": -0.248046875, "logits/rejected": -0.51953125, "logps/chosen": -360.0, "logps/rejected": -350.0, "loss": 0.1188, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.515625, "rewards/margins": 6.34375, "rewards/rejected": -3.8125, "step": 1080 }, { "epoch": 1.7467948717948718, "grad_norm": 1.4266357038353779, "learning_rate": 2.3218527315914489e-07, "logits/chosen": -0.287109375, "logits/rejected": -0.283203125, "logps/chosen": -356.0, "logps/rejected": -328.0, "loss": 0.1075, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.265625, "rewards/margins": 5.59375, "rewards/rejected": -3.328125, "step": 1090 }, { "epoch": 1.7628205128205128, "grad_norm": 21.16673032403483, "learning_rate": 2.2921615201900234e-07, "logits/chosen": -0.1572265625, "logits/rejected": -0.408203125, "logps/chosen": -392.0, "logps/rejected": -322.0, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": 2.046875, "rewards/margins": 6.4375, "rewards/rejected": -4.40625, "step": 1100 }, { "epoch": 1.7788461538461537, "grad_norm": 32.08142550351567, "learning_rate": 2.2624703087885984e-07, "logits/chosen": -0.17578125, "logits/rejected": -0.373046875, "logps/chosen": -356.0, "logps/rejected": -348.0, "loss": 0.0622, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.0625, "rewards/margins": 6.3125, "rewards/rejected": -4.25, "step": 1110 }, { "epoch": 1.7948717948717947, "grad_norm": 10.94873802284618, "learning_rate": 2.2327790973871732e-07, "logits/chosen": -0.296875, "logits/rejected": -0.474609375, "logps/chosen": -372.0, "logps/rejected": -368.0, "loss": 0.0631, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.015625, "rewards/margins": 6.15625, "rewards/rejected": -4.125, "step": 1120 }, { "epoch": 1.810897435897436, "grad_norm": 4.164714441310206, "learning_rate": 2.2030878859857483e-07, "logits/chosen": -0.251953125, "logits/rejected": -0.330078125, "logps/chosen": -362.0, "logps/rejected": -346.0, "loss": 0.0571, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.015625, "rewards/margins": 6.21875, "rewards/rejected": -4.1875, "step": 1130 }, { "epoch": 1.8269230769230769, "grad_norm": 6.828795713228818, "learning_rate": 2.173396674584323e-07, "logits/chosen": -0.234375, "logits/rejected": -0.33984375, "logps/chosen": -376.0, "logps/rejected": -352.0, "loss": 0.1227, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.234375, "rewards/margins": 5.5, "rewards/rejected": -3.234375, "step": 1140 }, { "epoch": 1.842948717948718, "grad_norm": 34.413780149566165, "learning_rate": 2.1437054631828976e-07, "logits/chosen": -0.224609375, "logits/rejected": -0.296875, "logps/chosen": -390.0, "logps/rejected": -346.0, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 1.640625, "rewards/margins": 6.1875, "rewards/rejected": -4.5625, "step": 1150 }, { "epoch": 1.858974358974359, "grad_norm": 3.332529440650818, "learning_rate": 2.1140142517814726e-07, "logits/chosen": -0.1669921875, "logits/rejected": -0.376953125, "logps/chosen": -352.0, "logps/rejected": -370.0, "loss": 0.0768, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.875, "rewards/margins": 6.34375, "rewards/rejected": -4.46875, "step": 1160 }, { "epoch": 1.875, "grad_norm": 8.41589370404434, "learning_rate": 2.0843230403800474e-07, "logits/chosen": -0.349609375, "logits/rejected": -0.31640625, "logps/chosen": -374.0, "logps/rejected": -350.0, "loss": 0.0811, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.7578125, "rewards/margins": 5.5, "rewards/rejected": -3.75, "step": 1170 }, { "epoch": 1.891025641025641, "grad_norm": 40.47797504678685, "learning_rate": 2.0546318289786222e-07, "logits/chosen": -0.2041015625, "logits/rejected": -0.40234375, "logps/chosen": -354.0, "logps/rejected": -366.0, "loss": 0.1088, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5703125, "rewards/margins": 5.53125, "rewards/rejected": -3.953125, "step": 1180 }, { "epoch": 1.907051282051282, "grad_norm": 8.055421977487784, "learning_rate": 2.0249406175771972e-07, "logits/chosen": -0.298828125, "logits/rejected": -0.482421875, "logps/chosen": -380.0, "logps/rejected": -354.0, "loss": 0.0963, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.046875, "rewards/margins": 6.0625, "rewards/rejected": -4.0, "step": 1190 }, { "epoch": 1.9230769230769231, "grad_norm": 1.0123224706730725, "learning_rate": 1.9952494061757718e-07, "logits/chosen": -0.259765625, "logits/rejected": -0.10888671875, "logps/chosen": -348.0, "logps/rejected": -356.0, "loss": 0.0319, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.71875, "rewards/margins": 7.03125, "rewards/rejected": -4.3125, "step": 1200 }, { "epoch": 1.939102564102564, "grad_norm": 44.45376114208627, "learning_rate": 1.9655581947743466e-07, "logits/chosen": -0.21875, "logits/rejected": -0.3671875, "logps/chosen": -372.0, "logps/rejected": -376.0, "loss": 0.0746, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.7578125, "rewards/margins": 6.59375, "rewards/rejected": -4.8125, "step": 1210 }, { "epoch": 1.9551282051282053, "grad_norm": 15.603585643576269, "learning_rate": 1.9358669833729216e-07, "logits/chosen": -0.15234375, "logits/rejected": -0.373046875, "logps/chosen": -380.0, "logps/rejected": -352.0, "loss": 0.0608, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.734375, "rewards/margins": 6.0625, "rewards/rejected": -4.3125, "step": 1220 }, { "epoch": 1.9711538461538463, "grad_norm": 13.268798051575008, "learning_rate": 1.9061757719714964e-07, "logits/chosen": -0.380859375, "logits/rejected": -0.45703125, "logps/chosen": -396.0, "logps/rejected": -362.0, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": 2.328125, "rewards/margins": 6.65625, "rewards/rejected": -4.3125, "step": 1230 }, { "epoch": 1.9871794871794872, "grad_norm": 18.53274384798147, "learning_rate": 1.876484560570071e-07, "logits/chosen": -0.1357421875, "logits/rejected": -0.251953125, "logps/chosen": -390.0, "logps/rejected": -342.0, "loss": 0.0639, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.53125, "rewards/margins": 6.78125, "rewards/rejected": -4.25, "step": 1240 }, { "epoch": 2.0, "eval_logits/chosen": -0.25, "eval_logits/rejected": -0.30078125, "eval_logps/chosen": -362.0, "eval_logps/rejected": -358.0, "eval_loss": 0.2055511474609375, "eval_rewards/accuracies": 0.9038461446762085, "eval_rewards/chosen": 1.859375, "eval_rewards/margins": 5.4375, "eval_rewards/rejected": -3.5625, "eval_runtime": 29.021, "eval_samples_per_second": 6.892, "eval_steps_per_second": 0.448, "step": 1248 }, { "epoch": 2.003205128205128, "grad_norm": 2.3679308142140885, "learning_rate": 1.846793349168646e-07, "logits/chosen": -0.341796875, "logits/rejected": -0.46484375, "logps/chosen": -366.0, "logps/rejected": -352.0, "loss": 0.0315, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.609375, "rewards/margins": 6.9375, "rewards/rejected": -4.3125, "step": 1250 }, { "epoch": 2.019230769230769, "grad_norm": 2.590719502204235, "learning_rate": 1.8171021377672207e-07, "logits/chosen": -0.25, "logits/rejected": -0.404296875, "logps/chosen": -378.0, "logps/rejected": -364.0, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 2.015625, "rewards/margins": 6.90625, "rewards/rejected": -4.875, "step": 1260 }, { "epoch": 2.03525641025641, "grad_norm": 22.175507349451703, "learning_rate": 1.7874109263657958e-07, "logits/chosen": -0.2275390625, "logits/rejected": -0.37890625, "logps/chosen": -364.0, "logps/rejected": -384.0, "loss": 0.0156, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0625, "rewards/margins": 7.375, "rewards/rejected": -5.3125, "step": 1270 }, { "epoch": 2.051282051282051, "grad_norm": 4.568251724068892, "learning_rate": 1.7577197149643706e-07, "logits/chosen": -0.25, "logits/rejected": -0.482421875, "logps/chosen": -380.0, "logps/rejected": -370.0, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": 1.953125, "rewards/margins": 7.46875, "rewards/rejected": -5.53125, "step": 1280 }, { "epoch": 2.0673076923076925, "grad_norm": 4.904685892165479, "learning_rate": 1.728028503562945e-07, "logits/chosen": -0.22265625, "logits/rejected": -0.466796875, "logps/chosen": -376.0, "logps/rejected": -366.0, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 2.515625, "rewards/margins": 7.3125, "rewards/rejected": -4.78125, "step": 1290 }, { "epoch": 2.0833333333333335, "grad_norm": 0.6187256370987634, "learning_rate": 1.6983372921615202e-07, "logits/chosen": -0.2138671875, "logits/rejected": -0.458984375, "logps/chosen": -362.0, "logps/rejected": -370.0, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 2.265625, "rewards/margins": 7.625, "rewards/rejected": -5.375, "step": 1300 }, { "epoch": 2.0993589743589745, "grad_norm": 0.7331904803685036, "learning_rate": 1.668646080760095e-07, "logits/chosen": -0.15234375, "logits/rejected": -0.40625, "logps/chosen": -354.0, "logps/rejected": -356.0, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 2.59375, "rewards/margins": 7.65625, "rewards/rejected": -5.0625, "step": 1310 }, { "epoch": 2.1153846153846154, "grad_norm": 2.0296944258203036, "learning_rate": 1.6389548693586697e-07, "logits/chosen": -0.2412109375, "logits/rejected": -0.1748046875, "logps/chosen": -348.0, "logps/rejected": -348.0, "loss": 0.0231, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.484375, "rewards/margins": 7.125, "rewards/rejected": -4.625, "step": 1320 }, { "epoch": 2.1314102564102564, "grad_norm": 23.71632295557207, "learning_rate": 1.6092636579572448e-07, "logits/chosen": -0.2275390625, "logits/rejected": -0.3125, "logps/chosen": -358.0, "logps/rejected": -342.0, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 2.78125, "rewards/margins": 7.90625, "rewards/rejected": -5.125, "step": 1330 }, { "epoch": 2.1474358974358974, "grad_norm": 4.284093803737362, "learning_rate": 1.5795724465558193e-07, "logits/chosen": -0.2255859375, "logits/rejected": -0.5546875, "logps/chosen": -364.0, "logps/rejected": -384.0, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 2.484375, "rewards/margins": 7.4375, "rewards/rejected": -4.9375, "step": 1340 }, { "epoch": 2.1634615384615383, "grad_norm": 0.7862983762455222, "learning_rate": 1.549881235154394e-07, "logits/chosen": -0.1435546875, "logits/rejected": -0.337890625, "logps/chosen": -376.0, "logps/rejected": -354.0, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 2.828125, "rewards/margins": 7.40625, "rewards/rejected": -4.59375, "step": 1350 }, { "epoch": 2.1794871794871793, "grad_norm": 1.8750426567792726, "learning_rate": 1.520190023752969e-07, "logits/chosen": -0.259765625, "logits/rejected": -0.59765625, "logps/chosen": -364.0, "logps/rejected": -330.0, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 3.015625, "rewards/margins": 8.1875, "rewards/rejected": -5.15625, "step": 1360 }, { "epoch": 2.1955128205128207, "grad_norm": 7.170950559094129, "learning_rate": 1.490498812351544e-07, "logits/chosen": -0.251953125, "logits/rejected": -0.36328125, "logps/chosen": -362.0, "logps/rejected": -356.0, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 3.0, "rewards/margins": 8.1875, "rewards/rejected": -5.1875, "step": 1370 }, { "epoch": 2.2115384615384617, "grad_norm": 21.23713813309802, "learning_rate": 1.4608076009501184e-07, "logits/chosen": -0.2431640625, "logits/rejected": -0.2353515625, "logps/chosen": -366.0, "logps/rejected": -356.0, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 2.6875, "rewards/margins": 7.8125, "rewards/rejected": -5.125, "step": 1380 }, { "epoch": 2.2275641025641026, "grad_norm": 7.169471258104354, "learning_rate": 1.4311163895486935e-07, "logits/chosen": -0.18359375, "logits/rejected": -0.29296875, "logps/chosen": -348.0, "logps/rejected": -364.0, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": 2.90625, "rewards/margins": 8.1875, "rewards/rejected": -5.25, "step": 1390 }, { "epoch": 2.2435897435897436, "grad_norm": 4.891546899421327, "learning_rate": 1.4014251781472683e-07, "logits/chosen": -0.22265625, "logits/rejected": -0.55078125, "logps/chosen": -344.0, "logps/rejected": -374.0, "loss": 0.0165, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.71875, "rewards/margins": 7.4375, "rewards/rejected": -4.71875, "step": 1400 }, { "epoch": 2.2596153846153846, "grad_norm": 14.0088561337579, "learning_rate": 1.3717339667458433e-07, "logits/chosen": -0.171875, "logits/rejected": -0.375, "logps/chosen": -336.0, "logps/rejected": -380.0, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": 2.328125, "rewards/margins": 8.125, "rewards/rejected": -5.8125, "step": 1410 }, { "epoch": 2.2756410256410255, "grad_norm": 5.478498841262275, "learning_rate": 1.342042755344418e-07, "logits/chosen": -0.244140625, "logits/rejected": -0.60546875, "logps/chosen": -340.0, "logps/rejected": -368.0, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 2.078125, "rewards/margins": 7.125, "rewards/rejected": -5.0625, "step": 1420 }, { "epoch": 2.2916666666666665, "grad_norm": 4.08870815918569, "learning_rate": 1.3123515439429926e-07, "logits/chosen": -0.216796875, "logits/rejected": -0.5, "logps/chosen": -348.0, "logps/rejected": -380.0, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": 2.40625, "rewards/margins": 7.875, "rewards/rejected": -5.46875, "step": 1430 }, { "epoch": 2.3076923076923075, "grad_norm": 4.0828939935609885, "learning_rate": 1.2826603325415677e-07, "logits/chosen": -0.310546875, "logits/rejected": -0.3515625, "logps/chosen": -362.0, "logps/rejected": -372.0, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 1.9296875, "rewards/margins": 7.90625, "rewards/rejected": -5.96875, "step": 1440 }, { "epoch": 2.323717948717949, "grad_norm": 2.2611774310220714, "learning_rate": 1.2529691211401425e-07, "logits/chosen": -0.2890625, "logits/rejected": -0.3671875, "logps/chosen": -350.0, "logps/rejected": -380.0, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 2.359375, "rewards/margins": 8.1875, "rewards/rejected": -5.84375, "step": 1450 }, { "epoch": 2.33974358974359, "grad_norm": 35.51723177781465, "learning_rate": 1.2232779097387173e-07, "logits/chosen": -0.244140625, "logits/rejected": -0.36328125, "logps/chosen": -378.0, "logps/rejected": -368.0, "loss": 0.0773, "rewards/accuracies": 1.0, "rewards/chosen": 2.078125, "rewards/margins": 8.125, "rewards/rejected": -6.0, "step": 1460 }, { "epoch": 2.355769230769231, "grad_norm": 5.534009913713299, "learning_rate": 1.193586698337292e-07, "logits/chosen": -0.1767578125, "logits/rejected": -0.5390625, "logps/chosen": -368.0, "logps/rejected": -358.0, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 2.484375, "rewards/margins": 8.0625, "rewards/rejected": -5.59375, "step": 1470 }, { "epoch": 2.371794871794872, "grad_norm": 18.31189968205467, "learning_rate": 1.163895486935867e-07, "logits/chosen": -0.25390625, "logits/rejected": -0.353515625, "logps/chosen": -368.0, "logps/rejected": -356.0, "loss": 0.0359, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.3125, "rewards/margins": 7.78125, "rewards/rejected": -5.46875, "step": 1480 }, { "epoch": 2.3878205128205128, "grad_norm": 14.190561755907082, "learning_rate": 1.1342042755344417e-07, "logits/chosen": -0.1484375, "logits/rejected": -0.35546875, "logps/chosen": -376.0, "logps/rejected": -370.0, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 2.734375, "rewards/margins": 8.4375, "rewards/rejected": -5.6875, "step": 1490 }, { "epoch": 2.4038461538461537, "grad_norm": 57.38633989383038, "learning_rate": 1.1045130641330165e-07, "logits/chosen": -0.2177734375, "logits/rejected": -0.24609375, "logps/chosen": -356.0, "logps/rejected": -376.0, "loss": 0.0762, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.25, "rewards/margins": 7.25, "rewards/rejected": -6.0, "step": 1500 }, { "epoch": 2.4198717948717947, "grad_norm": 13.945337786807688, "learning_rate": 1.0748218527315913e-07, "logits/chosen": -0.2353515625, "logits/rejected": -0.486328125, "logps/chosen": -374.0, "logps/rejected": -386.0, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": 2.28125, "rewards/margins": 8.625, "rewards/rejected": -6.34375, "step": 1510 }, { "epoch": 2.435897435897436, "grad_norm": 2.025562206086781, "learning_rate": 1.0451306413301662e-07, "logits/chosen": -0.2890625, "logits/rejected": -0.404296875, "logps/chosen": -362.0, "logps/rejected": -360.0, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 2.96875, "rewards/margins": 8.875, "rewards/rejected": -5.90625, "step": 1520 }, { "epoch": 2.451923076923077, "grad_norm": 2.5818458781278997, "learning_rate": 1.0154394299287411e-07, "logits/chosen": -0.20703125, "logits/rejected": -0.380859375, "logps/chosen": -356.0, "logps/rejected": -378.0, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 2.75, "rewards/margins": 8.375, "rewards/rejected": -5.59375, "step": 1530 }, { "epoch": 2.467948717948718, "grad_norm": 0.5147935469188503, "learning_rate": 9.857482185273158e-08, "logits/chosen": -0.240234375, "logits/rejected": -0.47265625, "logps/chosen": -348.0, "logps/rejected": -354.0, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 2.609375, "rewards/margins": 8.125, "rewards/rejected": -5.46875, "step": 1540 }, { "epoch": 2.483974358974359, "grad_norm": 2.5527476910216764, "learning_rate": 9.560570071258907e-08, "logits/chosen": -0.1748046875, "logits/rejected": -0.392578125, "logps/chosen": -378.0, "logps/rejected": -336.0, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": 2.34375, "rewards/margins": 8.125, "rewards/rejected": -5.8125, "step": 1550 }, { "epoch": 2.5, "grad_norm": 2.6084669807533807, "learning_rate": 9.263657957244655e-08, "logits/chosen": -0.28125, "logits/rejected": -0.40234375, "logps/chosen": -340.0, "logps/rejected": -386.0, "loss": 0.0112, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.78125, "rewards/margins": 8.4375, "rewards/rejected": -5.6875, "step": 1560 }, { "epoch": 2.516025641025641, "grad_norm": 0.5142214115866551, "learning_rate": 8.966745843230403e-08, "logits/chosen": -0.26953125, "logits/rejected": -0.34765625, "logps/chosen": -332.0, "logps/rejected": -338.0, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 2.625, "rewards/margins": 8.25, "rewards/rejected": -5.625, "step": 1570 }, { "epoch": 2.532051282051282, "grad_norm": 0.4686133876015494, "learning_rate": 8.669833729216151e-08, "logits/chosen": -0.2080078125, "logits/rejected": -0.36328125, "logps/chosen": -374.0, "logps/rejected": -382.0, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 2.0625, "rewards/margins": 7.84375, "rewards/rejected": -5.78125, "step": 1580 }, { "epoch": 2.5480769230769234, "grad_norm": 4.377458752115824, "learning_rate": 8.3729216152019e-08, "logits/chosen": -0.240234375, "logits/rejected": -0.50390625, "logps/chosen": -358.0, "logps/rejected": -370.0, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 2.375, "rewards/margins": 8.4375, "rewards/rejected": -6.03125, "step": 1590 }, { "epoch": 2.564102564102564, "grad_norm": 8.575422939319253, "learning_rate": 8.076009501187649e-08, "logits/chosen": -0.2490234375, "logits/rejected": -0.337890625, "logps/chosen": -348.0, "logps/rejected": -370.0, "loss": 0.0241, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.484375, "rewards/margins": 7.5, "rewards/rejected": -5.0, "step": 1600 }, { "epoch": 2.5801282051282053, "grad_norm": 2.551052943978534, "learning_rate": 7.779097387173396e-08, "logits/chosen": -0.2734375, "logits/rejected": -0.375, "logps/chosen": -342.0, "logps/rejected": -360.0, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 2.8125, "rewards/margins": 7.8125, "rewards/rejected": -4.96875, "step": 1610 }, { "epoch": 2.5961538461538463, "grad_norm": 6.000172766868698, "learning_rate": 7.482185273159145e-08, "logits/chosen": -0.263671875, "logits/rejected": -0.451171875, "logps/chosen": -368.0, "logps/rejected": -378.0, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": 2.46875, "rewards/margins": 7.96875, "rewards/rejected": -5.5, "step": 1620 }, { "epoch": 2.6121794871794872, "grad_norm": 19.722033841376266, "learning_rate": 7.185273159144893e-08, "logits/chosen": -0.294921875, "logits/rejected": -0.392578125, "logps/chosen": -366.0, "logps/rejected": -398.0, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 2.71875, "rewards/margins": 8.25, "rewards/rejected": -5.53125, "step": 1630 }, { "epoch": 2.628205128205128, "grad_norm": 11.30540085696599, "learning_rate": 6.88836104513064e-08, "logits/chosen": -0.18359375, "logits/rejected": -0.314453125, "logps/chosen": -380.0, "logps/rejected": -354.0, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 2.734375, "rewards/margins": 8.6875, "rewards/rejected": -5.96875, "step": 1640 }, { "epoch": 2.644230769230769, "grad_norm": 1.1773482574998353, "learning_rate": 6.591448931116388e-08, "logits/chosen": -0.162109375, "logits/rejected": -0.4609375, "logps/chosen": -378.0, "logps/rejected": -374.0, "loss": 0.0175, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.125, "rewards/margins": 8.0625, "rewards/rejected": -5.9375, "step": 1650 }, { "epoch": 2.66025641025641, "grad_norm": 52.67450880620409, "learning_rate": 6.294536817102138e-08, "logits/chosen": -0.294921875, "logits/rejected": -0.58203125, "logps/chosen": -386.0, "logps/rejected": -376.0, "loss": 0.0316, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.828125, "rewards/margins": 7.53125, "rewards/rejected": -5.6875, "step": 1660 }, { "epoch": 2.676282051282051, "grad_norm": 2.444649208747581, "learning_rate": 5.997624703087885e-08, "logits/chosen": -0.2734375, "logits/rejected": -0.30859375, "logps/chosen": -348.0, "logps/rejected": -330.0, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 2.34375, "rewards/margins": 7.96875, "rewards/rejected": -5.625, "step": 1670 }, { "epoch": 2.6923076923076925, "grad_norm": 1.8309710467577502, "learning_rate": 5.700712589073634e-08, "logits/chosen": -0.33203125, "logits/rejected": -0.5546875, "logps/chosen": -368.0, "logps/rejected": -372.0, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 2.578125, "rewards/margins": 8.3125, "rewards/rejected": -5.75, "step": 1680 }, { "epoch": 2.7083333333333335, "grad_norm": 0.974183927512936, "learning_rate": 5.4038004750593824e-08, "logits/chosen": -0.212890625, "logits/rejected": -0.384765625, "logps/chosen": -370.0, "logps/rejected": -366.0, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 2.4375, "rewards/margins": 8.625, "rewards/rejected": -6.21875, "step": 1690 }, { "epoch": 2.7243589743589745, "grad_norm": 8.112271810670284, "learning_rate": 5.10688836104513e-08, "logits/chosen": -0.287109375, "logits/rejected": -0.34375, "logps/chosen": -364.0, "logps/rejected": -384.0, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 2.53125, "rewards/margins": 8.625, "rewards/rejected": -6.09375, "step": 1700 }, { "epoch": 2.7403846153846154, "grad_norm": 48.323303487774965, "learning_rate": 4.809976247030879e-08, "logits/chosen": -0.2734375, "logits/rejected": -0.39453125, "logps/chosen": -378.0, "logps/rejected": -368.0, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": 2.28125, "rewards/margins": 7.9375, "rewards/rejected": -5.65625, "step": 1710 }, { "epoch": 2.7564102564102564, "grad_norm": 0.6216279700601488, "learning_rate": 4.5130641330166267e-08, "logits/chosen": -0.2470703125, "logits/rejected": -0.3828125, "logps/chosen": -340.0, "logps/rejected": -372.0, "loss": 0.0305, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.25, "rewards/margins": 7.6875, "rewards/rejected": -5.4375, "step": 1720 }, { "epoch": 2.7724358974358974, "grad_norm": 8.386988307728698, "learning_rate": 4.216152019002375e-08, "logits/chosen": -0.2060546875, "logits/rejected": -0.349609375, "logps/chosen": -360.0, "logps/rejected": -364.0, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 2.734375, "rewards/margins": 8.4375, "rewards/rejected": -5.6875, "step": 1730 }, { "epoch": 2.7884615384615383, "grad_norm": 2.6310329285858587, "learning_rate": 3.919239904988123e-08, "logits/chosen": -0.1826171875, "logits/rejected": -0.333984375, "logps/chosen": -358.0, "logps/rejected": -390.0, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 2.28125, "rewards/margins": 7.875, "rewards/rejected": -5.59375, "step": 1740 }, { "epoch": 2.8044871794871797, "grad_norm": 3.066349322026837, "learning_rate": 3.6223277909738715e-08, "logits/chosen": -0.2236328125, "logits/rejected": -0.357421875, "logps/chosen": -350.0, "logps/rejected": -368.0, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 2.40625, "rewards/margins": 7.96875, "rewards/rejected": -5.5625, "step": 1750 }, { "epoch": 2.8205128205128203, "grad_norm": 8.079242737705172, "learning_rate": 3.32541567695962e-08, "logits/chosen": -0.0966796875, "logits/rejected": -0.2431640625, "logps/chosen": -342.0, "logps/rejected": -360.0, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 2.421875, "rewards/margins": 8.4375, "rewards/rejected": -6.03125, "step": 1760 }, { "epoch": 2.8365384615384617, "grad_norm": 36.85905904660706, "learning_rate": 3.028503562945368e-08, "logits/chosen": -0.1962890625, "logits/rejected": -0.1435546875, "logps/chosen": -376.0, "logps/rejected": -366.0, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 3.015625, "rewards/margins": 8.75, "rewards/rejected": -5.71875, "step": 1770 }, { "epoch": 2.8525641025641026, "grad_norm": 1.3700769518291278, "learning_rate": 2.7315914489311164e-08, "logits/chosen": -0.177734375, "logits/rejected": -0.29296875, "logps/chosen": -354.0, "logps/rejected": -366.0, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 2.1875, "rewards/margins": 8.375, "rewards/rejected": -6.15625, "step": 1780 }, { "epoch": 2.8685897435897436, "grad_norm": 2.5772413830882437, "learning_rate": 2.4346793349168646e-08, "logits/chosen": -0.3515625, "logits/rejected": -0.40625, "logps/chosen": -356.0, "logps/rejected": -380.0, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 2.265625, "rewards/margins": 8.25, "rewards/rejected": -5.96875, "step": 1790 }, { "epoch": 2.8846153846153846, "grad_norm": 0.2522421688634616, "learning_rate": 2.1377672209026125e-08, "logits/chosen": -0.1884765625, "logits/rejected": -0.296875, "logps/chosen": -354.0, "logps/rejected": -362.0, "loss": 0.0155, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.609375, "rewards/margins": 8.625, "rewards/rejected": -5.96875, "step": 1800 }, { "epoch": 2.9006410256410255, "grad_norm": 2.0378946111088347, "learning_rate": 1.840855106888361e-08, "logits/chosen": -0.298828125, "logits/rejected": -0.4765625, "logps/chosen": -346.0, "logps/rejected": -370.0, "loss": 0.0427, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.828125, "rewards/margins": 7.90625, "rewards/rejected": -6.0625, "step": 1810 }, { "epoch": 2.9166666666666665, "grad_norm": 2.2386092025677784, "learning_rate": 1.5439429928741092e-08, "logits/chosen": -0.21875, "logits/rejected": -0.2138671875, "logps/chosen": -380.0, "logps/rejected": -372.0, "loss": 0.0179, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.84375, "rewards/margins": 8.5, "rewards/rejected": -6.65625, "step": 1820 }, { "epoch": 2.9326923076923075, "grad_norm": 5.606669828245322, "learning_rate": 1.2470308788598574e-08, "logits/chosen": -0.26953125, "logits/rejected": -0.224609375, "logps/chosen": -364.0, "logps/rejected": -346.0, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": 2.5625, "rewards/margins": 8.5625, "rewards/rejected": -5.96875, "step": 1830 }, { "epoch": 2.948717948717949, "grad_norm": 7.459897331240726, "learning_rate": 9.501187648456057e-09, "logits/chosen": -0.166015625, "logits/rejected": -0.27734375, "logps/chosen": -338.0, "logps/rejected": -372.0, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 1.9453125, "rewards/margins": 8.375, "rewards/rejected": -6.40625, "step": 1840 }, { "epoch": 2.96474358974359, "grad_norm": 46.309709716339626, "learning_rate": 6.532066508313539e-09, "logits/chosen": -0.265625, "logits/rejected": -0.3046875, "logps/chosen": -368.0, "logps/rejected": -392.0, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 2.125, "rewards/margins": 8.5, "rewards/rejected": -6.375, "step": 1850 }, { "epoch": 2.980769230769231, "grad_norm": 2.757504775523463, "learning_rate": 3.562945368171021e-09, "logits/chosen": -0.2314453125, "logits/rejected": -0.3515625, "logps/chosen": -366.0, "logps/rejected": -382.0, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 2.3125, "rewards/margins": 8.125, "rewards/rejected": -5.84375, "step": 1860 }, { "epoch": 2.996794871794872, "grad_norm": 0.7690916064350508, "learning_rate": 5.938242280285036e-10, "logits/chosen": -0.146484375, "logits/rejected": -0.361328125, "logps/chosen": -352.0, "logps/rejected": -378.0, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 2.9375, "rewards/margins": 8.75, "rewards/rejected": -5.84375, "step": 1870 }, { "epoch": 3.0, "eval_logits/chosen": -0.2392578125, "eval_logits/rejected": -0.2734375, "eval_logps/chosen": -364.0, "eval_logps/rejected": -370.0, "eval_loss": 0.21664032340049744, "eval_rewards/accuracies": 0.9134615659713745, "eval_rewards/chosen": 1.7734375, "eval_rewards/margins": 6.65625, "eval_rewards/rejected": -4.875, "eval_runtime": 26.3085, "eval_samples_per_second": 7.602, "eval_steps_per_second": 0.494, "step": 1872 }, { "epoch": 3.0, "step": 1872, "total_flos": 0.0, "train_loss": 0.14497800362415802, "train_runtime": 10901.8033, "train_samples_per_second": 2.746, "train_steps_per_second": 0.172 } ], "logging_steps": 10, "max_steps": 1872, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }