{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9667896678966788, "eval_steps": 40, "global_step": 201, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07380073800738007, "grad_norm": 80.02477445251274, "learning_rate": 5e-07, "logits/chosen": -2.7569785118103027, "logits/rejected": -2.715679883956909, "logps/chosen": -343.655517578125, "logps/rejected": -244.0912628173828, "loss": 0.687, "rewards/accuracies": 0.34375, "rewards/chosen": 0.02633141539990902, "rewards/margins": 0.006850541569292545, "rewards/rejected": 0.0194808728992939, "step": 5 }, { "epoch": 0.14760147601476015, "grad_norm": 65.98268514011825, "learning_rate": 1e-06, "logits/chosen": -2.676809787750244, "logits/rejected": -2.666592836380005, "logps/chosen": -296.428955078125, "logps/rejected": -247.4902801513672, "loss": 0.6147, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8011910319328308, "rewards/margins": 0.2567104995250702, "rewards/rejected": 0.5444804430007935, "step": 10 }, { "epoch": 0.22140221402214022, "grad_norm": 48.66427015180346, "learning_rate": 9.983100718730718e-07, "logits/chosen": -2.416226863861084, "logits/rejected": -2.3806653022766113, "logps/chosen": -316.8359069824219, "logps/rejected": -258.2687683105469, "loss": 0.6095, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.1471664905548096, "rewards/margins": 0.7412694692611694, "rewards/rejected": 1.4058969020843506, "step": 15 }, { "epoch": 0.2952029520295203, "grad_norm": 50.05057195236849, "learning_rate": 9.932517109205849e-07, "logits/chosen": -2.1923749446868896, "logits/rejected": -2.1478309631347656, "logps/chosen": -294.5142517089844, "logps/rejected": -243.7734375, "loss": 0.556, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.3299460411071777, "rewards/margins": 1.3834998607635498, "rewards/rejected": 0.9464457631111145, "step": 20 }, { "epoch": 0.36900369003690037, "grad_norm": 41.37624373189553, "learning_rate": 9.848591102083375e-07, "logits/chosen": -2.0363731384277344, "logits/rejected": -2.030383348464966, "logps/chosen": -282.7300720214844, "logps/rejected": -221.184326171875, "loss": 0.4963, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 2.7908506393432617, "rewards/margins": 1.624943494796753, "rewards/rejected": 1.1659072637557983, "step": 25 }, { "epoch": 0.44280442804428044, "grad_norm": 43.83501071765918, "learning_rate": 9.731890013043367e-07, "logits/chosen": -2.0403037071228027, "logits/rejected": -1.9934555292129517, "logps/chosen": -325.14227294921875, "logps/rejected": -214.34542846679688, "loss": 0.4972, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 2.984839916229248, "rewards/margins": 1.5722445249557495, "rewards/rejected": 1.412595510482788, "step": 30 }, { "epoch": 0.5166051660516605, "grad_norm": 47.665657648113644, "learning_rate": 9.583202707897073e-07, "logits/chosen": -2.0699315071105957, "logits/rejected": -2.042548418045044, "logps/chosen": -318.35357666015625, "logps/rejected": -221.4462432861328, "loss": 0.5431, "rewards/accuracies": 0.8125, "rewards/chosen": 3.141150951385498, "rewards/margins": 1.8329731225967407, "rewards/rejected": 1.3081778287887573, "step": 35 }, { "epoch": 0.5904059040590406, "grad_norm": 42.1852532770112, "learning_rate": 9.403534270080829e-07, "logits/chosen": -2.1574552059173584, "logits/rejected": -2.105395555496216, "logps/chosen": -282.8706359863281, "logps/rejected": -239.42562866210938, "loss": 0.563, "rewards/accuracies": 0.78125, "rewards/chosen": 2.571629524230957, "rewards/margins": 1.7009865045547485, "rewards/rejected": 0.8706433176994324, "step": 40 }, { "epoch": 0.5904059040590406, "eval_logits/chosen": -2.179224967956543, "eval_logits/rejected": -2.15881085395813, "eval_logps/chosen": -304.3701171875, "eval_logps/rejected": -235.69309997558594, "eval_loss": 0.4594477713108063, "eval_rewards/accuracies": 0.8185483813285828, "eval_rewards/chosen": 2.485563278198242, "eval_rewards/margins": 1.8135225772857666, "eval_rewards/rejected": 0.6720407009124756, "eval_runtime": 131.0305, "eval_samples_per_second": 14.661, "eval_steps_per_second": 0.237, "step": 40 }, { "epoch": 0.6642066420664207, "grad_norm": 38.916777514219696, "learning_rate": 9.19409920658098e-07, "logits/chosen": -2.225562572479248, "logits/rejected": -2.181002378463745, "logps/chosen": -276.44537353515625, "logps/rejected": -232.626220703125, "loss": 0.5076, "rewards/accuracies": 0.78125, "rewards/chosen": 2.278400182723999, "rewards/margins": 1.4811707735061646, "rewards/rejected": 0.7972294092178345, "step": 45 }, { "epoch": 0.7380073800738007, "grad_norm": 50.710250280321, "learning_rate": 8.956313238215823e-07, "logits/chosen": -2.2307848930358887, "logits/rejected": -2.1967437267303467, "logps/chosen": -313.6961364746094, "logps/rejected": -241.0548858642578, "loss": 0.5239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.592728853225708, "rewards/margins": 1.948052167892456, "rewards/rejected": 0.6446765661239624, "step": 50 }, { "epoch": 0.8118081180811808, "grad_norm": 39.063704669645155, "learning_rate": 8.691783729769873e-07, "logits/chosen": -2.139880895614624, "logits/rejected": -2.139148712158203, "logps/chosen": -299.7575988769531, "logps/rejected": -245.935546875, "loss": 0.5018, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 2.354003429412842, "rewards/margins": 1.81247878074646, "rewards/rejected": 0.5415242910385132, "step": 55 }, { "epoch": 0.8856088560885609, "grad_norm": 33.2796085112328, "learning_rate": 8.402298824670029e-07, "logits/chosen": -2.0772578716278076, "logits/rejected": -2.054955005645752, "logps/chosen": -295.5028991699219, "logps/rejected": -244.0660858154297, "loss": 0.4817, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 2.2515780925750732, "rewards/margins": 1.5721994638442993, "rewards/rejected": 0.6793786883354187, "step": 60 }, { "epoch": 0.959409594095941, "grad_norm": 36.14638757212613, "learning_rate": 8.089815357650089e-07, "logits/chosen": -2.0140891075134277, "logits/rejected": -1.9471585750579834, "logps/chosen": -302.58148193359375, "logps/rejected": -237.9540252685547, "loss": 0.4943, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 2.2041029930114746, "rewards/margins": 2.0399723052978516, "rewards/rejected": 0.16413061320781708, "step": 65 }, { "epoch": 1.033210332103321, "grad_norm": 21.302121663013374, "learning_rate": 7.756445627110522e-07, "logits/chosen": -2.040945053100586, "logits/rejected": -2.0241832733154297, "logps/chosen": -312.1359558105469, "logps/rejected": -239.3393096923828, "loss": 0.3303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.603544235229492, "rewards/margins": 2.4756617546081543, "rewards/rejected": 0.12788262963294983, "step": 70 }, { "epoch": 1.1070110701107012, "grad_norm": 20.556094388092646, "learning_rate": 7.404443116588547e-07, "logits/chosen": -2.104165554046631, "logits/rejected": -2.059689521789551, "logps/chosen": -294.634765625, "logps/rejected": -238.32437133789062, "loss": 0.129, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.189236879348755, "rewards/margins": 3.7732062339782715, "rewards/rejected": -0.5839694142341614, "step": 75 }, { "epoch": 1.1808118081180812, "grad_norm": 20.50931148538785, "learning_rate": 7.036187261857288e-07, "logits/chosen": -2.146726608276367, "logits/rejected": -2.1075119972229004, "logps/chosen": -297.4272155761719, "logps/rejected": -262.4473876953125, "loss": 0.154, "rewards/accuracies": 0.96875, "rewards/chosen": 3.2519805431365967, "rewards/margins": 3.6943678855895996, "rewards/rejected": -0.44238725304603577, "step": 80 }, { "epoch": 1.1808118081180812, "eval_logits/chosen": -2.13566517829895, "eval_logits/rejected": -2.110398054122925, "eval_logps/chosen": -301.3644104003906, "eval_logps/rejected": -238.48484802246094, "eval_loss": 0.46015238761901855, "eval_rewards/accuracies": 0.8427419066429138, "eval_rewards/chosen": 2.7861340045928955, "eval_rewards/margins": 2.3932666778564453, "eval_rewards/rejected": 0.3928670585155487, "eval_runtime": 129.5743, "eval_samples_per_second": 14.825, "eval_steps_per_second": 0.239, "step": 80 }, { "epoch": 1.2546125461254611, "grad_norm": 24.33309810818949, "learning_rate": 6.654167366624008e-07, "logits/chosen": -2.142047882080078, "logits/rejected": -2.1115987300872803, "logps/chosen": -289.6197204589844, "logps/rejected": -245.8259735107422, "loss": 0.1699, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 3.555595874786377, "rewards/margins": 4.105128288269043, "rewards/rejected": -0.5495321750640869, "step": 85 }, { "epoch": 1.3284132841328413, "grad_norm": 23.507286919588484, "learning_rate": 6.260965775552713e-07, "logits/chosen": -2.1702046394348145, "logits/rejected": -2.1256089210510254, "logps/chosen": -299.5054626464844, "logps/rejected": -242.0937042236328, "loss": 0.159, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 3.9076132774353027, "rewards/margins": 4.560946464538574, "rewards/rejected": -0.6533328890800476, "step": 90 }, { "epoch": 1.4022140221402215, "grad_norm": 15.516195820704533, "learning_rate": 5.859240418356614e-07, "logits/chosen": -2.1203560829162598, "logits/rejected": -2.07737398147583, "logps/chosen": -270.5323791503906, "logps/rejected": -282.30242919921875, "loss": 0.1745, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.4920401573181152, "rewards/margins": 4.871523380279541, "rewards/rejected": -1.3794825077056885, "step": 95 }, { "epoch": 1.4760147601476015, "grad_norm": 15.962268006534465, "learning_rate": 5.451706842957421e-07, "logits/chosen": -2.0756678581237793, "logits/rejected": -2.0366768836975098, "logps/chosen": -285.35400390625, "logps/rejected": -261.02069091796875, "loss": 0.1518, "rewards/accuracies": 0.96875, "rewards/chosen": 3.5108916759490967, "rewards/margins": 4.940871715545654, "rewards/rejected": -1.4299800395965576, "step": 100 }, { "epoch": 1.5498154981549814, "grad_norm": 25.320702801914457, "learning_rate": 5.041119859162068e-07, "logits/chosen": -2.1494388580322266, "logits/rejected": -2.1103031635284424, "logps/chosen": -291.79193115234375, "logps/rejected": -242.1620635986328, "loss": 0.1927, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.214129686355591, "rewards/margins": 4.194614410400391, "rewards/rejected": -0.980484664440155, "step": 105 }, { "epoch": 1.6236162361623616, "grad_norm": 24.127332932431226, "learning_rate": 4.630254916940423e-07, "logits/chosen": -2.174290180206299, "logits/rejected": -2.179755926132202, "logps/chosen": -279.0810546875, "logps/rejected": -252.66488647460938, "loss": 0.1829, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 3.157012701034546, "rewards/margins": 4.443808078765869, "rewards/rejected": -1.2867956161499023, "step": 110 }, { "epoch": 1.6974169741697418, "grad_norm": 20.11391135642748, "learning_rate": 4.2218893451814e-07, "logits/chosen": -2.2010812759399414, "logits/rejected": -2.164829730987549, "logps/chosen": -289.4188232421875, "logps/rejected": -246.65945434570312, "loss": 0.1934, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.433326244354248, "rewards/margins": 4.391345977783203, "rewards/rejected": -0.9580191373825073, "step": 115 }, { "epoch": 1.7712177121771218, "grad_norm": 20.706343509306766, "learning_rate": 3.8187835777481375e-07, "logits/chosen": -2.176086187362671, "logits/rejected": -2.1578235626220703, "logps/chosen": -281.7149353027344, "logps/rejected": -265.0261535644531, "loss": 0.2027, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.2026546001434326, "rewards/margins": 4.289515495300293, "rewards/rejected": -1.0868606567382812, "step": 120 }, { "epoch": 1.7712177121771218, "eval_logits/chosen": -2.1726152896881104, "eval_logits/rejected": -2.146054983139038, "eval_logps/chosen": -304.15960693359375, "eval_logps/rejected": -246.97988891601562, "eval_loss": 0.48685166239738464, "eval_rewards/accuracies": 0.8548387289047241, "eval_rewards/chosen": 2.5066120624542236, "eval_rewards/margins": 2.9632484912872314, "eval_rewards/rejected": -0.4566364884376526, "eval_runtime": 129.7757, "eval_samples_per_second": 14.802, "eval_steps_per_second": 0.239, "step": 120 }, { "epoch": 1.8450184501845017, "grad_norm": 20.871306894670933, "learning_rate": 3.423662493738687e-07, "logits/chosen": -2.180792808532715, "logits/rejected": -2.159304141998291, "logps/chosen": -301.1511535644531, "logps/rejected": -255.13919067382812, "loss": 0.1609, "rewards/accuracies": 0.9375, "rewards/chosen": 3.2595107555389404, "rewards/margins": 4.297440528869629, "rewards/rejected": -1.037929654121399, "step": 125 }, { "epoch": 1.918819188191882, "grad_norm": 27.947861559843737, "learning_rate": 3.039196998086687e-07, "logits/chosen": -2.136273145675659, "logits/rejected": -2.1014552116394043, "logps/chosen": -286.9736022949219, "logps/rejected": -244.7154083251953, "loss": 0.1847, "rewards/accuracies": 0.9375, "rewards/chosen": 3.395556926727295, "rewards/margins": 4.3099188804626465, "rewards/rejected": -0.9143617749214172, "step": 130 }, { "epoch": 1.992619926199262, "grad_norm": 20.821197239752305, "learning_rate": 2.667985967011878e-07, "logits/chosen": -2.1088356971740723, "logits/rejected": -2.0703465938568115, "logps/chosen": -286.96917724609375, "logps/rejected": -256.48016357421875, "loss": 0.1724, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 3.350585460662842, "rewards/margins": 4.244786262512207, "rewards/rejected": -0.8942006826400757, "step": 135 }, { "epoch": 2.066420664206642, "grad_norm": 8.245558323252546, "learning_rate": 2.3125386803640183e-07, "logits/chosen": -2.1218690872192383, "logits/rejected": -2.0660667419433594, "logps/chosen": -284.4044494628906, "logps/rejected": -270.7417907714844, "loss": 0.0938, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.351060152053833, "rewards/margins": 4.853818416595459, "rewards/rejected": -1.5027587413787842, "step": 140 }, { "epoch": 2.140221402214022, "grad_norm": 14.140599014287302, "learning_rate": 1.9752578596124952e-07, "logits/chosen": -2.093632936477661, "logits/rejected": -2.0502517223358154, "logps/chosen": -288.5584716796875, "logps/rejected": -256.74652099609375, "loss": 0.0775, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 3.4150993824005127, "rewards/margins": 4.966043949127197, "rewards/rejected": -1.5509445667266846, "step": 145 }, { "epoch": 2.2140221402214024, "grad_norm": 7.605905759499919, "learning_rate": 1.6584234261399532e-07, "logits/chosen": -2.0875797271728516, "logits/rejected": -2.0646932125091553, "logps/chosen": -295.5018310546875, "logps/rejected": -290.001708984375, "loss": 0.0579, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 3.694823741912842, "rewards/margins": 5.430555820465088, "rewards/rejected": -1.7357313632965088, "step": 150 }, { "epoch": 2.2878228782287824, "grad_norm": 20.873090027101682, "learning_rate": 1.3641770896292082e-07, "logits/chosen": -2.0764248371124268, "logits/rejected": -2.060342311859131, "logps/chosen": -278.5547790527344, "logps/rejected": -249.08203125, "loss": 0.0718, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.3755805492401123, "rewards/margins": 5.271130084991455, "rewards/rejected": -1.8955495357513428, "step": 155 }, { "epoch": 2.3616236162361623, "grad_norm": 12.7807011486128, "learning_rate": 1.0945078707215221e-07, "logits/chosen": -2.073279857635498, "logits/rejected": -2.0515029430389404, "logps/chosen": -279.70892333984375, "logps/rejected": -263.677734375, "loss": 0.0725, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.862575054168701, "rewards/margins": 5.486065864562988, "rewards/rejected": -1.623490571975708, "step": 160 }, { "epoch": 2.3616236162361623, "eval_logits/chosen": -2.0765814781188965, "eval_logits/rejected": -2.042445182800293, "eval_logps/chosen": -301.5458984375, "eval_logps/rejected": -246.53857421875, "eval_loss": 0.48189839720726013, "eval_rewards/accuracies": 0.8629032373428345, "eval_rewards/chosen": 2.7679829597473145, "eval_rewards/margins": 3.1804890632629395, "eval_rewards/rejected": -0.412506103515625, "eval_runtime": 129.9118, "eval_samples_per_second": 14.787, "eval_steps_per_second": 0.239, "step": 160 }, { "epoch": 2.4354243542435423, "grad_norm": 16.62994387557585, "learning_rate": 8.512386558088919e-08, "logits/chosen": -2.0903940200805664, "logits/rejected": -2.0252914428710938, "logps/chosen": -286.7425842285156, "logps/rejected": -249.64614868164062, "loss": 0.0707, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.8004047870635986, "rewards/margins": 5.124575614929199, "rewards/rejected": -1.3241703510284424, "step": 165 }, { "epoch": 2.5092250922509223, "grad_norm": 13.149398258549308, "learning_rate": 6.360138748461013e-08, "logits/chosen": -2.078819751739502, "logits/rejected": -2.0325751304626465, "logps/chosen": -279.3172912597656, "logps/rejected": -262.2966003417969, "loss": 0.0712, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.741664409637451, "rewards/margins": 5.292626857757568, "rewards/rejected": -1.5509625673294067, "step": 170 }, { "epoch": 2.5830258302583027, "grad_norm": 15.477600906013183, "learning_rate": 4.5028838547699346e-08, "logits/chosen": -2.058854818344116, "logits/rejected": -2.045734167098999, "logps/chosen": -293.87738037109375, "logps/rejected": -277.49139404296875, "loss": 0.0756, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.888404369354248, "rewards/margins": 5.4004316329956055, "rewards/rejected": -1.512027382850647, "step": 175 }, { "epoch": 2.6568265682656826, "grad_norm": 10.486814550692278, "learning_rate": 2.9531763861505964e-08, "logits/chosen": -2.057389497756958, "logits/rejected": -2.0072054862976074, "logps/chosen": -284.025634765625, "logps/rejected": -249.7481231689453, "loss": 0.0701, "rewards/accuracies": 0.96875, "rewards/chosen": 3.7061257362365723, "rewards/margins": 5.206698417663574, "rewards/rejected": -1.500572919845581, "step": 180 }, { "epoch": 2.7306273062730626, "grad_norm": 16.604175060639175, "learning_rate": 1.7214919195619125e-08, "logits/chosen": -2.0375514030456543, "logits/rejected": -2.0372228622436523, "logps/chosen": -293.4367980957031, "logps/rejected": -243.2362823486328, "loss": 0.0833, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.817143201828003, "rewards/margins": 5.346969127655029, "rewards/rejected": -1.5298258066177368, "step": 185 }, { "epoch": 2.804428044280443, "grad_norm": 11.656202399163227, "learning_rate": 8.161562878982398e-09, "logits/chosen": -2.064812183380127, "logits/rejected": -2.0154833793640137, "logps/chosen": -295.53033447265625, "logps/rejected": -259.0420837402344, "loss": 0.0933, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.9195189476013184, "rewards/margins": 5.322437286376953, "rewards/rejected": -1.4029181003570557, "step": 190 }, { "epoch": 2.878228782287823, "grad_norm": 12.21325697905649, "learning_rate": 2.432892997526026e-09, "logits/chosen": -2.0528626441955566, "logits/rejected": -2.0427441596984863, "logps/chosen": -290.7054443359375, "logps/rejected": -244.73696899414062, "loss": 0.0959, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 3.4299838542938232, "rewards/margins": 5.134265899658203, "rewards/rejected": -1.7042820453643799, "step": 195 }, { "epoch": 2.952029520295203, "grad_norm": 13.794636154783172, "learning_rate": 6.763371270035457e-11, "logits/chosen": -2.0266225337982178, "logits/rejected": -2.011596441268921, "logps/chosen": -275.36798095703125, "logps/rejected": -242.58694458007812, "loss": 0.0505, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.6280651092529297, "rewards/margins": 5.206905364990234, "rewards/rejected": -1.5788400173187256, "step": 200 }, { "epoch": 2.952029520295203, "eval_logits/chosen": -2.054385185241699, "eval_logits/rejected": -2.0193707942962646, "eval_logps/chosen": -301.7057800292969, "eval_logps/rejected": -247.90260314941406, "eval_loss": 0.48475462198257446, "eval_rewards/accuracies": 0.8548387289047241, "eval_rewards/chosen": 2.75199556350708, "eval_rewards/margins": 3.300902843475342, "eval_rewards/rejected": -0.5489078760147095, "eval_runtime": 129.5144, "eval_samples_per_second": 14.832, "eval_steps_per_second": 0.239, "step": 200 }, { "epoch": 2.9667896678966788, "step": 201, "total_flos": 2369906314051584.0, "train_loss": 0.26609369445202957, "train_runtime": 7643.0309, "train_samples_per_second": 6.784, "train_steps_per_second": 0.026 } ], "logging_steps": 5, "max_steps": 201, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2369906314051584.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }