{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 324, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.046296296296296294, "grad_norm": 60.684144460841296, "learning_rate": 5e-07, "logits/chosen": -2.7229340076446533, "logits/rejected": -2.708962917327881, "logps/chosen": -284.58026123046875, "logps/rejected": -236.6112823486328, "loss": 0.6902, "rewards/accuracies": 0.34375, "rewards/chosen": 0.019139662384986877, "rewards/margins": 0.0077867708168923855, "rewards/rejected": 0.011352891102433205, "step": 5 }, { "epoch": 0.09259259259259259, "grad_norm": 58.154423145201655, "learning_rate": 1e-06, "logits/chosen": -2.6737208366394043, "logits/rejected": -2.683973789215088, "logps/chosen": -277.359619140625, "logps/rejected": -221.55029296875, "loss": 0.6271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.614923357963562, "rewards/margins": 0.23358741402626038, "rewards/rejected": 0.3813360035419464, "step": 10 }, { "epoch": 0.1388888888888889, "grad_norm": 42.31468213125409, "learning_rate": 9.99374496282885e-07, "logits/chosen": -2.4994168281555176, "logits/rejected": -2.450195789337158, "logps/chosen": -262.51397705078125, "logps/rejected": -197.8154296875, "loss": 0.5716, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.9596316814422607, "rewards/margins": 0.8402416110038757, "rewards/rejected": 1.1193901300430298, "step": 15 }, { "epoch": 0.18518518518518517, "grad_norm": 48.223216933084814, "learning_rate": 9.974995501511404e-07, "logits/chosen": -2.2974586486816406, "logits/rejected": -2.2951722145080566, "logps/chosen": -245.7628936767578, "logps/rejected": -194.25070190429688, "loss": 0.5987, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.8363304138183594, "rewards/margins": 0.6894850730895996, "rewards/rejected": 1.1468452215194702, "step": 20 }, { "epoch": 0.23148148148148148, "grad_norm": 37.278982280588096, "learning_rate": 9.94379852747865e-07, "logits/chosen": -2.147829055786133, "logits/rejected": -2.1413960456848145, "logps/chosen": -262.19586181640625, "logps/rejected": -201.21205139160156, "loss": 0.521, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 2.0674498081207275, "rewards/margins": 1.227130651473999, "rewards/rejected": 0.840319037437439, "step": 25 }, { "epoch": 0.2777777777777778, "grad_norm": 40.42547622604402, "learning_rate": 9.900232096023476e-07, "logits/chosen": -2.008653163909912, "logits/rejected": -1.9756759405136108, "logps/chosen": -254.8358612060547, "logps/rejected": -205.0883026123047, "loss": 0.531, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 2.0206902027130127, "rewards/margins": 1.3330821990966797, "rewards/rejected": 0.6876081228256226, "step": 30 }, { "epoch": 0.32407407407407407, "grad_norm": 36.172789672007866, "learning_rate": 9.844405211005144e-07, "logits/chosen": -1.9186718463897705, "logits/rejected": -1.8931595087051392, "logps/chosen": -254.8049774169922, "logps/rejected": -235.9902801513672, "loss": 0.5109, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 1.925907850265503, "rewards/margins": 1.3726422786712646, "rewards/rejected": 0.5532655119895935, "step": 35 }, { "epoch": 0.37037037037037035, "grad_norm": 42.11552973137223, "learning_rate": 9.776457552120033e-07, "logits/chosen": -1.9418878555297852, "logits/rejected": -1.9285017251968384, "logps/chosen": -271.52093505859375, "logps/rejected": -207.9265899658203, "loss": 0.5478, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 1.6729097366333008, "rewards/margins": 1.3872096538543701, "rewards/rejected": 0.28570008277893066, "step": 40 }, { "epoch": 0.4166666666666667, "grad_norm": 37.7632765834023, "learning_rate": 9.696559125420947e-07, "logits/chosen": -2.0984578132629395, "logits/rejected": -2.0816712379455566, "logps/chosen": -265.9540710449219, "logps/rejected": -209.5546112060547, "loss": 0.5079, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 2.1132609844207764, "rewards/margins": 1.7824798822402954, "rewards/rejected": 0.33078116178512573, "step": 45 }, { "epoch": 0.46296296296296297, "grad_norm": 39.63733367911293, "learning_rate": 9.604909837959454e-07, "logits/chosen": -2.116713762283325, "logits/rejected": -2.0851855278015137, "logps/chosen": -259.4333190917969, "logps/rejected": -226.8323516845703, "loss": 0.5413, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 2.012805938720703, "rewards/margins": 1.6606166362762451, "rewards/rejected": 0.35218924283981323, "step": 50 }, { "epoch": 0.5092592592592593, "grad_norm": 37.60669919034035, "learning_rate": 9.501738997615469e-07, "logits/chosen": -2.129849672317505, "logits/rejected": -2.118218183517456, "logps/chosen": -269.1800537109375, "logps/rejected": -219.0711669921875, "loss": 0.5543, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 1.9002279043197632, "rewards/margins": 1.5283737182617188, "rewards/rejected": 0.37185433506965637, "step": 55 }, { "epoch": 0.5555555555555556, "grad_norm": 33.29835445177269, "learning_rate": 9.387304739365523e-07, "logits/chosen": -2.159351110458374, "logits/rejected": -2.130619525909424, "logps/chosen": -252.5172882080078, "logps/rejected": -228.6378936767578, "loss": 0.5365, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 1.741283655166626, "rewards/margins": 1.711003065109253, "rewards/rejected": 0.030280273407697678, "step": 60 }, { "epoch": 0.6018518518518519, "grad_norm": 30.257759350833947, "learning_rate": 9.261893379425217e-07, "logits/chosen": -2.1917521953582764, "logits/rejected": -2.1474714279174805, "logps/chosen": -268.36163330078125, "logps/rejected": -205.3818817138672, "loss": 0.4985, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 1.57059645652771, "rewards/margins": 1.7522954940795898, "rewards/rejected": -0.18169905245304108, "step": 65 }, { "epoch": 0.6481481481481481, "grad_norm": 40.25398245444873, "learning_rate": 9.125818698881797e-07, "logits/chosen": -2.2085392475128174, "logits/rejected": -2.21848201751709, "logps/chosen": -271.4682312011719, "logps/rejected": -214.6433563232422, "loss": 0.5248, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 1.0917803049087524, "rewards/margins": 1.8838036060333252, "rewards/rejected": -0.7920231819152832, "step": 70 }, { "epoch": 0.6944444444444444, "grad_norm": 37.0974929070999, "learning_rate": 8.979421158609205e-07, "logits/chosen": -2.2309927940368652, "logits/rejected": -2.200138807296753, "logps/chosen": -266.31951904296875, "logps/rejected": -201.2650909423828, "loss": 0.5172, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 1.1627981662750244, "rewards/margins": 1.913628339767456, "rewards/rejected": -0.7508302330970764, "step": 75 }, { "epoch": 0.7407407407407407, "grad_norm": 36.09183980430684, "learning_rate": 8.823067047429906e-07, "logits/chosen": -2.2019572257995605, "logits/rejected": -2.1673662662506104, "logps/chosen": -278.7079162597656, "logps/rejected": -234.3495330810547, "loss": 0.5542, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.2150039672851562, "rewards/margins": 1.7666345834732056, "rewards/rejected": -0.551630437374115, "step": 80 }, { "epoch": 0.7870370370370371, "grad_norm": 38.47115055753171, "learning_rate": 8.657147565654818e-07, "logits/chosen": -2.1636500358581543, "logits/rejected": -2.146523952484131, "logps/chosen": -275.8933410644531, "logps/rejected": -237.3660888671875, "loss": 0.4855, "rewards/accuracies": 0.78125, "rewards/chosen": 1.741151213645935, "rewards/margins": 1.9285080432891846, "rewards/rejected": -0.1873568296432495, "step": 85 }, { "epoch": 0.8333333333333334, "grad_norm": 31.938894285721222, "learning_rate": 8.482077846294308e-07, "logits/chosen": -2.167405605316162, "logits/rejected": -2.1467373371124268, "logps/chosen": -275.11895751953125, "logps/rejected": -217.98818969726562, "loss": 0.4561, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.2301520109176636, "rewards/margins": 2.0726938247680664, "rewards/rejected": -0.8425418138504028, "step": 90 }, { "epoch": 0.8796296296296297, "grad_norm": 35.431684887112006, "learning_rate": 8.298295916389233e-07, "logits/chosen": -2.1900432109832764, "logits/rejected": -2.1787796020507812, "logps/chosen": -264.6141662597656, "logps/rejected": -242.1894989013672, "loss": 0.442, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3056919574737549, "rewards/margins": 2.1279501914978027, "rewards/rejected": -0.822258472442627, "step": 95 }, { "epoch": 0.9259259259259259, "grad_norm": 33.888318187677584, "learning_rate": 8.106261601060772e-07, "logits/chosen": -2.1956303119659424, "logits/rejected": -2.1856637001037598, "logps/chosen": -297.5550231933594, "logps/rejected": -255.35263061523438, "loss": 0.5761, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 1.0619537830352783, "rewards/margins": 1.9099775552749634, "rewards/rejected": -0.8480235934257507, "step": 100 }, { "epoch": 0.9259259259259259, "eval_logits/chosen": -2.1676995754241943, "eval_logits/rejected": -2.146118402481079, "eval_logps/chosen": -279.2745056152344, "eval_logps/rejected": -233.75474548339844, "eval_loss": 0.46477359533309937, "eval_rewards/accuracies": 0.8061224222183228, "eval_rewards/chosen": 1.101247787475586, "eval_rewards/margins": 2.194356679916382, "eval_rewards/rejected": -1.0931090116500854, "eval_runtime": 208.6661, "eval_samples_per_second": 14.727, "eval_steps_per_second": 0.235, "step": 100 }, { "epoch": 0.9722222222222222, "grad_norm": 33.67711763185909, "learning_rate": 7.906455373021128e-07, "logits/chosen": -2.124380588531494, "logits/rejected": -2.1406655311584473, "logps/chosen": -263.89453125, "logps/rejected": -228.3328094482422, "loss": 0.4873, "rewards/accuracies": 0.84375, "rewards/chosen": 0.895250141620636, "rewards/margins": 2.0895395278930664, "rewards/rejected": -1.1942893266677856, "step": 105 }, { "epoch": 1.0185185185185186, "grad_norm": 21.244154790846327, "learning_rate": 7.699377150423672e-07, "logits/chosen": -2.1806797981262207, "logits/rejected": -2.1463980674743652, "logps/chosen": -263.21453857421875, "logps/rejected": -247.73080444335938, "loss": 0.3339, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.9122228622436523, "rewards/margins": 2.487616777420044, "rewards/rejected": -1.5753939151763916, "step": 110 }, { "epoch": 1.0648148148148149, "grad_norm": 16.299205121924633, "learning_rate": 7.485545046060271e-07, "logits/chosen": -2.2146365642547607, "logits/rejected": -2.222085475921631, "logps/chosen": -280.36639404296875, "logps/rejected": -227.9482421875, "loss": 0.1998, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 2.063042163848877, "rewards/margins": 3.6736202239990234, "rewards/rejected": -1.610577940940857, "step": 115 }, { "epoch": 1.1111111111111112, "grad_norm": 16.73718376613808, "learning_rate": 7.265494071035401e-07, "logits/chosen": -2.2634971141815186, "logits/rejected": -2.216996192932129, "logps/chosen": -259.3006896972656, "logps/rejected": -246.74990844726562, "loss": 0.203, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.0323493480682373, "rewards/margins": 3.455160140991211, "rewards/rejected": -1.4228107929229736, "step": 120 }, { "epoch": 1.1574074074074074, "grad_norm": 18.487444498570227, "learning_rate": 7.03977479616039e-07, "logits/chosen": -2.20058012008667, "logits/rejected": -2.2042083740234375, "logps/chosen": -257.96331787109375, "logps/rejected": -254.6935577392578, "loss": 0.1666, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.385958433151245, "rewards/margins": 3.8731255531311035, "rewards/rejected": -1.4871671199798584, "step": 125 }, { "epoch": 1.2037037037037037, "grad_norm": 22.810846773089985, "learning_rate": 6.808951974417076e-07, "logits/chosen": -2.1897432804107666, "logits/rejected": -2.141242027282715, "logps/chosen": -255.61087036132812, "logps/rejected": -223.12802124023438, "loss": 0.2098, "rewards/accuracies": 0.9375, "rewards/chosen": 2.2871758937835693, "rewards/margins": 3.679692506790161, "rewards/rejected": -1.3925166130065918, "step": 130 }, { "epoch": 1.25, "grad_norm": 20.35798660753584, "learning_rate": 6.573603127937442e-07, "logits/chosen": -2.2258079051971436, "logits/rejected": -2.189937114715576, "logps/chosen": -264.5538024902344, "logps/rejected": -231.6312255859375, "loss": 0.2095, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.0883944034576416, "rewards/margins": 3.591538906097412, "rewards/rejected": -1.5031449794769287, "step": 135 }, { "epoch": 1.2962962962962963, "grad_norm": 23.04895829278538, "learning_rate": 6.334317103034652e-07, "logits/chosen": -2.2638206481933594, "logits/rejected": -2.228890895843506, "logps/chosen": -257.40338134765625, "logps/rejected": -240.62997436523438, "loss": 0.2394, "rewards/accuracies": 0.9375, "rewards/chosen": 2.0762557983398438, "rewards/margins": 3.8573691844940186, "rewards/rejected": -1.7811139822006226, "step": 140 }, { "epoch": 1.3425925925925926, "grad_norm": 17.62661068504451, "learning_rate": 6.091692596900827e-07, "logits/chosen": -2.235206127166748, "logits/rejected": -2.2013344764709473, "logps/chosen": -262.8838195800781, "logps/rejected": -244.3470916748047, "loss": 0.2448, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 2.0590713024139404, "rewards/margins": 3.7045459747314453, "rewards/rejected": -1.6454746723175049, "step": 145 }, { "epoch": 1.3888888888888888, "grad_norm": 17.569013714484907, "learning_rate": 5.84633665965777e-07, "logits/chosen": -2.194655418395996, "logits/rejected": -2.168844699859619, "logps/chosen": -251.3390350341797, "logps/rejected": -280.6144104003906, "loss": 0.2189, "rewards/accuracies": 0.9375, "rewards/chosen": 1.981199026107788, "rewards/margins": 4.223908424377441, "rewards/rejected": -2.2427096366882324, "step": 150 }, { "epoch": 1.4351851851851851, "grad_norm": 22.596638318143317, "learning_rate": 5.598863175508526e-07, "logits/chosen": -2.153757333755493, "logits/rejected": -2.102107524871826, "logps/chosen": -247.99124145507812, "logps/rejected": -237.85702514648438, "loss": 0.2229, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9839226007461548, "rewards/margins": 3.839322328567505, "rewards/rejected": -1.85539972782135, "step": 155 }, { "epoch": 1.4814814814814814, "grad_norm": 20.676833262098377, "learning_rate": 5.349891326789986e-07, "logits/chosen": -2.069032669067383, "logits/rejected": -2.0706586837768555, "logps/chosen": -255.1598358154297, "logps/rejected": -253.1689910888672, "loss": 0.252, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 2.097238063812256, "rewards/margins": 4.1082868576049805, "rewards/rejected": -2.011049270629883, "step": 160 }, { "epoch": 1.5277777777777777, "grad_norm": 20.783062875793597, "learning_rate": 5.100044044769472e-07, "logits/chosen": -2.0675559043884277, "logits/rejected": -2.051846981048584, "logps/chosen": -283.98687744140625, "logps/rejected": -251.8580780029297, "loss": 0.2301, "rewards/accuracies": 0.9375, "rewards/chosen": 2.353699207305908, "rewards/margins": 3.9306633472442627, "rewards/rejected": -1.5769641399383545, "step": 165 }, { "epoch": 1.574074074074074, "grad_norm": 23.57908475198222, "learning_rate": 4.849946451061443e-07, "logits/chosen": -1.9997777938842773, "logits/rejected": -1.967475175857544, "logps/chosen": -279.0697937011719, "logps/rejected": -233.81790161132812, "loss": 0.2143, "rewards/accuracies": 0.90625, "rewards/chosen": 2.201470375061035, "rewards/margins": 3.6621429920196533, "rewards/rejected": -1.4606724977493286, "step": 170 }, { "epoch": 1.6203703703703702, "grad_norm": 24.506892070167254, "learning_rate": 4.6002242935639254e-07, "logits/chosen": -1.9627641439437866, "logits/rejected": -1.9285609722137451, "logps/chosen": -253.19888305664062, "logps/rejected": -248.28207397460938, "loss": 0.2712, "rewards/accuracies": 0.90625, "rewards/chosen": 2.0500733852386475, "rewards/margins": 3.925525188446045, "rewards/rejected": -1.875451683998108, "step": 175 }, { "epoch": 1.6666666666666665, "grad_norm": 21.133393742613016, "learning_rate": 4.351502380827958e-07, "logits/chosen": -1.9648786783218384, "logits/rejected": -1.9206435680389404, "logps/chosen": -272.1482238769531, "logps/rejected": -215.24819946289062, "loss": 0.2378, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.675158143043518, "rewards/margins": 3.6289196014404297, "rewards/rejected": -1.9537616968154907, "step": 180 }, { "epoch": 1.7129629629629628, "grad_norm": 21.130284789458205, "learning_rate": 4.104403018777323e-07, "logits/chosen": -2.0080366134643555, "logits/rejected": -1.9320275783538818, "logps/chosen": -249.93130493164062, "logps/rejected": -250.1505584716797, "loss": 0.2425, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.9229265451431274, "rewards/margins": 4.077498912811279, "rewards/rejected": -2.1545729637145996, "step": 185 }, { "epoch": 1.7592592592592593, "grad_norm": 18.108471689412447, "learning_rate": 3.8595444536898525e-07, "logits/chosen": -2.001293659210205, "logits/rejected": -1.9521024227142334, "logps/chosen": -265.4033203125, "logps/rejected": -215.8245086669922, "loss": 0.2355, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.1630921363830566, "rewards/margins": 3.895209550857544, "rewards/rejected": -1.7321174144744873, "step": 190 }, { "epoch": 1.8055555555555556, "grad_norm": 20.623886123317074, "learning_rate": 3.61753932533607e-07, "logits/chosen": -2.024538040161133, "logits/rejected": -1.9703800678253174, "logps/chosen": -271.60858154296875, "logps/rejected": -246.217529296875, "loss": 0.2361, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.9751393795013428, "rewards/margins": 3.928147554397583, "rewards/rejected": -1.9530079364776611, "step": 195 }, { "epoch": 1.8518518518518519, "grad_norm": 22.55367614082194, "learning_rate": 3.3789931341453557e-07, "logits/chosen": -2.030125856399536, "logits/rejected": -1.9982750415802002, "logps/chosen": -266.38250732421875, "logps/rejected": -241.37753295898438, "loss": 0.2547, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 2.0332014560699463, "rewards/margins": 3.87813138961792, "rewards/rejected": -1.8449299335479736, "step": 200 }, { "epoch": 1.8518518518518519, "eval_logits/chosen": -2.0550973415374756, "eval_logits/rejected": -2.0096685886383057, "eval_logps/chosen": -274.60986328125, "eval_logps/rejected": -237.3907470703125, "eval_loss": 0.46807482838630676, "eval_rewards/accuracies": 0.8367347121238708, "eval_rewards/chosen": 1.5677103996276855, "eval_rewards/margins": 3.0244193077087402, "eval_rewards/rejected": -1.4567087888717651, "eval_runtime": 206.9715, "eval_samples_per_second": 14.847, "eval_steps_per_second": 0.237, "step": 200 }, { "epoch": 1.8981481481481481, "grad_norm": 17.819204918874263, "learning_rate": 3.144502726234889e-07, "logits/chosen": -2.0324487686157227, "logits/rejected": -2.0080246925354004, "logps/chosen": -288.62127685546875, "logps/rejected": -250.75411987304688, "loss": 0.2296, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.1828694343566895, "rewards/margins": 3.992137908935547, "rewards/rejected": -1.8092679977416992, "step": 205 }, { "epoch": 1.9444444444444444, "grad_norm": 19.148987139894267, "learning_rate": 2.9146548000917677e-07, "logits/chosen": -2.0495505332946777, "logits/rejected": -2.0217370986938477, "logps/chosen": -242.3642120361328, "logps/rejected": -242.4574737548828, "loss": 0.2326, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.7121883630752563, "rewards/margins": 3.8212177753448486, "rewards/rejected": -2.1090292930603027, "step": 210 }, { "epoch": 1.9907407407407407, "grad_norm": 22.69196711859144, "learning_rate": 2.69002443864469e-07, "logits/chosen": -2.080535650253296, "logits/rejected": -2.0337400436401367, "logps/chosen": -270.77313232421875, "logps/rejected": -239.3463592529297, "loss": 0.2649, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7927030324935913, "rewards/margins": 3.772850513458252, "rewards/rejected": -1.9801477193832397, "step": 215 }, { "epoch": 2.037037037037037, "grad_norm": 13.943990718217357, "learning_rate": 2.4711736703979015e-07, "logits/chosen": -2.103257656097412, "logits/rejected": -2.0635571479797363, "logps/chosen": -284.7521057128906, "logps/rejected": -256.8929138183594, "loss": 0.1477, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.748323678970337, "rewards/margins": 4.446930885314941, "rewards/rejected": -2.6986069679260254, "step": 220 }, { "epoch": 2.0833333333333335, "grad_norm": 15.45294738008002, "learning_rate": 2.258650063227533e-07, "logits/chosen": -2.1084115505218506, "logits/rejected": -2.056737184524536, "logps/chosen": -259.67041015625, "logps/rejected": -259.40728759765625, "loss": 0.1268, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.458908200263977, "rewards/margins": 4.31704044342041, "rewards/rejected": -2.858132839202881, "step": 225 }, { "epoch": 2.1296296296296298, "grad_norm": 12.88270101235706, "learning_rate": 2.0529853543586216e-07, "logits/chosen": -2.1149027347564697, "logits/rejected": -2.0866215229034424, "logps/chosen": -272.2378234863281, "logps/rejected": -256.72943115234375, "loss": 0.1228, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.90532648563385, "rewards/margins": 4.682445526123047, "rewards/rejected": -2.777118682861328, "step": 230 }, { "epoch": 2.175925925925926, "grad_norm": 12.32826597177878, "learning_rate": 1.854694119950675e-07, "logits/chosen": -2.113334894180298, "logits/rejected": -2.05454683303833, "logps/chosen": -263.45074462890625, "logps/rejected": -255.12893676757812, "loss": 0.0999, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.9264167547225952, "rewards/margins": 4.30756139755249, "rewards/rejected": -2.3811442852020264, "step": 235 }, { "epoch": 2.2222222222222223, "grad_norm": 15.15405460227757, "learning_rate": 1.6642724876204657e-07, "logits/chosen": -2.133556842803955, "logits/rejected": -2.087902784347534, "logps/chosen": -270.2631530761719, "logps/rejected": -253.8778839111328, "loss": 0.1237, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 2.2957730293273926, "rewards/margins": 4.74481725692749, "rewards/rejected": -2.4490439891815186, "step": 240 }, { "epoch": 2.2685185185185186, "grad_norm": 16.649173808427264, "learning_rate": 1.4821968951233637e-07, "logits/chosen": -2.107185125350952, "logits/rejected": -2.0924763679504395, "logps/chosen": -258.2135314941406, "logps/rejected": -233.6188507080078, "loss": 0.1055, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.095768928527832, "rewards/margins": 4.622010231018066, "rewards/rejected": -2.5262415409088135, "step": 245 }, { "epoch": 2.314814814814815, "grad_norm": 14.609854928382536, "learning_rate": 1.308922898298977e-07, "logits/chosen": -2.115427255630493, "logits/rejected": -2.0896975994110107, "logps/chosen": -263.09588623046875, "logps/rejected": -243.6695556640625, "loss": 0.1124, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 2.3170924186706543, "rewards/margins": 4.6176228523254395, "rewards/rejected": -2.300530195236206, "step": 250 }, { "epoch": 2.361111111111111, "grad_norm": 14.676533311284945, "learning_rate": 1.144884031263681e-07, "logits/chosen": -2.1398823261260986, "logits/rejected": -2.0688061714172363, "logps/chosen": -266.96026611328125, "logps/rejected": -252.38302612304688, "loss": 0.1168, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.9593225717544556, "rewards/margins": 4.533578872680664, "rewards/rejected": -2.574256420135498, "step": 255 }, { "epoch": 2.4074074074074074, "grad_norm": 11.628764039431521, "learning_rate": 9.904907217018e-08, "logits/chosen": -2.1351375579833984, "logits/rejected": -2.067322254180908, "logps/chosen": -256.94305419921875, "logps/rejected": -237.3994140625, "loss": 0.0995, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1635353565216064, "rewards/margins": 4.803465843200684, "rewards/rejected": -2.6399312019348145, "step": 260 }, { "epoch": 2.4537037037037037, "grad_norm": 18.178584418637232, "learning_rate": 8.461292639694517e-08, "logits/chosen": -2.0998477935791016, "logits/rejected": -2.060464382171631, "logps/chosen": -250.79953002929688, "logps/rejected": -249.4027557373047, "loss": 0.1143, "rewards/accuracies": 0.96875, "rewards/chosen": 2.085954189300537, "rewards/margins": 4.921433448791504, "rewards/rejected": -2.8354785442352295, "step": 265 }, { "epoch": 2.5, "grad_norm": 16.4110633269618, "learning_rate": 7.12160852580314e-08, "logits/chosen": -2.116481065750122, "logits/rejected": -2.049785852432251, "logps/chosen": -253.52597045898438, "logps/rejected": -239.3027801513672, "loss": 0.1219, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.850622534751892, "rewards/margins": 4.572412014007568, "rewards/rejected": -2.7217891216278076, "step": 270 }, { "epoch": 2.5462962962962963, "grad_norm": 15.514051763273304, "learning_rate": 5.889206784915862e-08, "logits/chosen": -2.1012206077575684, "logits/rejected": -2.0580103397369385, "logps/chosen": -258.37774658203125, "logps/rejected": -258.62432861328125, "loss": 0.1092, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.9426629543304443, "rewards/margins": 5.028421878814697, "rewards/rejected": -3.085759401321411, "step": 275 }, { "epoch": 2.5925925925925926, "grad_norm": 11.456484271697569, "learning_rate": 4.767170904512291e-08, "logits/chosen": -2.0715463161468506, "logits/rejected": -2.0747976303100586, "logps/chosen": -238.94155883789062, "logps/rejected": -255.44912719726562, "loss": 0.1131, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.1153740882873535, "rewards/margins": 4.905180931091309, "rewards/rejected": -2.789807081222534, "step": 280 }, { "epoch": 2.638888888888889, "grad_norm": 13.870849160723635, "learning_rate": 3.7583082350481573e-08, "logits/chosen": -2.0784220695495605, "logits/rejected": -2.052750587463379, "logps/chosen": -263.8325500488281, "logps/rejected": -234.6880340576172, "loss": 0.1039, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.34096097946167, "rewards/margins": 4.445528507232666, "rewards/rejected": -2.104567289352417, "step": 285 }, { "epoch": 2.685185185185185, "grad_norm": 17.181608986758075, "learning_rate": 2.86514296592269e-08, "logits/chosen": -2.086383819580078, "logits/rejected": -2.040381669998169, "logps/chosen": -280.8565673828125, "logps/rejected": -260.22607421875, "loss": 0.1165, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.262991428375244, "rewards/margins": 4.693873405456543, "rewards/rejected": -2.430881977081299, "step": 290 }, { "epoch": 2.7314814814814814, "grad_norm": 13.52180668792329, "learning_rate": 2.089909809919227e-08, "logits/chosen": -2.069549083709717, "logits/rejected": -2.030317544937134, "logps/chosen": -254.6141357421875, "logps/rejected": -262.8475341796875, "loss": 0.1297, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.047020435333252, "rewards/margins": 5.004242420196533, "rewards/rejected": -2.9572222232818604, "step": 295 }, { "epoch": 2.7777777777777777, "grad_norm": 16.042004959202647, "learning_rate": 1.434548411920622e-08, "logits/chosen": -2.058128833770752, "logits/rejected": -2.031694173812866, "logps/chosen": -252.9358673095703, "logps/rejected": -270.03570556640625, "loss": 0.1018, "rewards/accuracies": 0.96875, "rewards/chosen": 2.2302279472351074, "rewards/margins": 4.993742942810059, "rewards/rejected": -2.7635152339935303, "step": 300 }, { "epoch": 2.7777777777777777, "eval_logits/chosen": -2.0879852771759033, "eval_logits/rejected": -2.0407848358154297, "eval_logps/chosen": -276.32293701171875, "eval_logps/rejected": -243.11355590820312, "eval_loss": 0.46450331807136536, "eval_rewards/accuracies": 0.8494898080825806, "eval_rewards/chosen": 1.3964064121246338, "eval_rewards/margins": 3.4253976345062256, "eval_rewards/rejected": -2.028991460800171, "eval_runtime": 206.9293, "eval_samples_per_second": 14.85, "eval_steps_per_second": 0.237, "step": 300 }, { "epoch": 2.824074074074074, "grad_norm": 19.205673267409885, "learning_rate": 9.00698495888874e-09, "logits/chosen": -2.068077564239502, "logits/rejected": -2.027005672454834, "logps/chosen": -248.1720428466797, "logps/rejected": -249.2681121826172, "loss": 0.1352, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.7687886953353882, "rewards/margins": 4.742621898651123, "rewards/rejected": -2.973832845687866, "step": 305 }, { "epoch": 2.8703703703703702, "grad_norm": 18.56397553393815, "learning_rate": 4.8969576225142975e-09, "logits/chosen": -2.074991464614868, "logits/rejected": -2.0308446884155273, "logps/chosen": -258.87982177734375, "logps/rejected": -249.52542114257812, "loss": 0.1173, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 2.297821521759033, "rewards/margins": 4.811751365661621, "rewards/rejected": -2.513930082321167, "step": 310 }, { "epoch": 2.9166666666666665, "grad_norm": 19.703259917873964, "learning_rate": 2.0256854595881446e-09, "logits/chosen": -2.0802104473114014, "logits/rejected": -2.04276704788208, "logps/chosen": -257.5400390625, "logps/rejected": -241.93057250976562, "loss": 0.1309, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.9402424097061157, "rewards/margins": 4.416214942932129, "rewards/rejected": -2.4759726524353027, "step": 315 }, { "epoch": 2.962962962962963, "grad_norm": 15.35409361746528, "learning_rate": 4.0035243575342604e-10, "logits/chosen": -2.100257396697998, "logits/rejected": -2.023787260055542, "logps/chosen": -279.096923828125, "logps/rejected": -257.52032470703125, "loss": 0.1095, "rewards/accuracies": 0.96875, "rewards/chosen": 2.575255870819092, "rewards/margins": 5.259024620056152, "rewards/rejected": -2.6837692260742188, "step": 320 }, { "epoch": 3.0, "step": 324, "total_flos": 3820433807769600.0, "train_loss": 0.29292676165515996, "train_runtime": 11688.2747, "train_samples_per_second": 7.096, "train_steps_per_second": 0.028 } ], "logging_steps": 5, "max_steps": 324, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3820433807769600.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }