{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.983177570093458, "eval_steps": 50, "global_step": 399, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.037383177570093455, "grad_norm": 53.10315006693553, "learning_rate": 5e-07, "logits/chosen": -2.7264351844787598, "logits/rejected": -2.7314915657043457, "logps/chosen": -233.46450805664062, "logps/rejected": -215.2651824951172, "loss": 0.6911, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": 0.011523213237524033, "rewards/margins": 0.00106804131064564, "rewards/rejected": 0.010455173440277576, "step": 5 }, { "epoch": 0.07476635514018691, "grad_norm": 47.36575434115236, "learning_rate": 1e-06, "logits/chosen": -2.7007861137390137, "logits/rejected": -2.6771092414855957, "logps/chosen": -243.54736328125, "logps/rejected": -216.7264404296875, "loss": 0.6571, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.35093382000923157, "rewards/margins": 0.08091190457344055, "rewards/rejected": 0.270021915435791, "step": 10 }, { "epoch": 0.11214953271028037, "grad_norm": 47.16978215311432, "learning_rate": 9.995924118521016e-07, "logits/chosen": -2.430677652359009, "logits/rejected": -2.3965601921081543, "logps/chosen": -245.1031951904297, "logps/rejected": -206.2293701171875, "loss": 0.6246, "rewards/accuracies": 0.6875, "rewards/chosen": 1.259174108505249, "rewards/margins": 0.5380627512931824, "rewards/rejected": 0.7211112380027771, "step": 15 }, { "epoch": 0.14953271028037382, "grad_norm": 43.635723418114395, "learning_rate": 9.983703119207998e-07, "logits/chosen": -2.1696972846984863, "logits/rejected": -2.1355605125427246, "logps/chosen": -241.0774383544922, "logps/rejected": -203.36190795898438, "loss": 0.6793, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.169782280921936, "rewards/margins": 0.6064848899841309, "rewards/rejected": 0.5632972717285156, "step": 20 }, { "epoch": 0.18691588785046728, "grad_norm": 48.55163726023584, "learning_rate": 9.963356926598848e-07, "logits/chosen": -2.0636093616485596, "logits/rejected": -2.068882942199707, "logps/chosen": -245.859130859375, "logps/rejected": -225.98214721679688, "loss": 0.6303, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.4211041927337646, "rewards/margins": 0.8779617547988892, "rewards/rejected": 0.5431426167488098, "step": 25 }, { "epoch": 0.22429906542056074, "grad_norm": 42.28573315120618, "learning_rate": 9.934918712161414e-07, "logits/chosen": -2.1142563819885254, "logits/rejected": -2.0855462551116943, "logps/chosen": -239.36471557617188, "logps/rejected": -208.6727294921875, "loss": 0.5921, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.9205316305160522, "rewards/margins": 0.793667733669281, "rewards/rejected": 0.12686386704444885, "step": 30 }, { "epoch": 0.2616822429906542, "grad_norm": 43.204924277825036, "learning_rate": 9.898434840212305e-07, "logits/chosen": -2.1356260776519775, "logits/rejected": -2.0982658863067627, "logps/chosen": -249.84848022460938, "logps/rejected": -232.4295196533203, "loss": 0.5949, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.6800674200057983, "rewards/margins": 0.8049441576004028, "rewards/rejected": -0.12487666308879852, "step": 35 }, { "epoch": 0.29906542056074764, "grad_norm": 38.34647398013497, "learning_rate": 9.853964792326704e-07, "logits/chosen": -2.1388490200042725, "logits/rejected": -2.106875419616699, "logps/chosen": -231.1354217529297, "logps/rejected": -210.77197265625, "loss": 0.6446, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6789754033088684, "rewards/margins": 1.0355161428451538, "rewards/rejected": -0.35654082894325256, "step": 40 }, { "epoch": 0.3364485981308411, "grad_norm": 44.46300001215897, "learning_rate": 9.80158107036243e-07, "logits/chosen": -2.182988405227661, "logits/rejected": -2.139224052429199, "logps/chosen": -253.672119140625, "logps/rejected": -198.7143096923828, "loss": 0.5967, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.711867094039917, "rewards/margins": 0.8050382733345032, "rewards/rejected": -0.093171127140522, "step": 45 }, { "epoch": 0.37383177570093457, "grad_norm": 36.368588934194, "learning_rate": 9.741369078256344e-07, "logits/chosen": -2.1803622245788574, "logits/rejected": -2.1714465618133545, "logps/chosen": -229.3830108642578, "logps/rejected": -214.208251953125, "loss": 0.577, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7632301449775696, "rewards/margins": 0.9728155136108398, "rewards/rejected": -0.20958539843559265, "step": 50 }, { "epoch": 0.37383177570093457, "eval_logits/chosen": -2.168900966644287, "eval_logits/rejected": -2.154597759246826, "eval_logps/chosen": -240.57708740234375, "eval_logps/rejected": -220.16871643066406, "eval_loss": 0.578223705291748, "eval_rewards/accuracies": 0.7250000238418579, "eval_rewards/chosen": 0.7959616780281067, "eval_rewards/margins": 0.9759488701820374, "eval_rewards/rejected": -0.17998719215393066, "eval_runtime": 252.8699, "eval_samples_per_second": 15.024, "eval_steps_per_second": 0.237, "step": 50 }, { "epoch": 0.411214953271028, "grad_norm": 36.85921784944549, "learning_rate": 9.673426982785825e-07, "logits/chosen": -2.1165783405303955, "logits/rejected": -2.133802890777588, "logps/chosen": -227.85147094726562, "logps/rejected": -229.4573516845703, "loss": 0.5854, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.668209969997406, "rewards/margins": 0.9655311703681946, "rewards/rejected": -0.29732123017311096, "step": 55 }, { "epoch": 0.4485981308411215, "grad_norm": 37.099348027626476, "learning_rate": 9.597865553522297e-07, "logits/chosen": -2.1299071311950684, "logits/rejected": -2.1265180110931396, "logps/chosen": -246.6769256591797, "logps/rejected": -218.6841583251953, "loss": 0.5468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3689742088317871, "rewards/margins": 1.112032175064087, "rewards/rejected": -0.7430580258369446, "step": 60 }, { "epoch": 0.48598130841121495, "grad_norm": 35.927665101781095, "learning_rate": 9.514807982237785e-07, "logits/chosen": -2.298119068145752, "logits/rejected": -2.2940573692321777, "logps/chosen": -265.78155517578125, "logps/rejected": -213.521240234375, "loss": 0.5918, "rewards/accuracies": 0.75, "rewards/chosen": 0.9371916055679321, "rewards/margins": 1.3884761333465576, "rewards/rejected": -0.45128464698791504, "step": 65 }, { "epoch": 0.5233644859813084, "grad_norm": 39.11873955408514, "learning_rate": 9.424389682058886e-07, "logits/chosen": -2.3393406867980957, "logits/rejected": -2.309872627258301, "logps/chosen": -218.0382080078125, "logps/rejected": -194.01332092285156, "loss": 0.5295, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.9512729644775391, "rewards/margins": 1.144668698310852, "rewards/rejected": -0.19339559972286224, "step": 70 }, { "epoch": 0.5607476635514018, "grad_norm": 37.634486315960864, "learning_rate": 9.326758066695624e-07, "logits/chosen": -2.34443736076355, "logits/rejected": -2.325118064880371, "logps/chosen": -259.53143310546875, "logps/rejected": -198.3692626953125, "loss": 0.5487, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.9949586987495422, "rewards/margins": 1.7472797632217407, "rewards/rejected": -0.7523208856582642, "step": 75 }, { "epoch": 0.5981308411214953, "grad_norm": 35.50580413675745, "learning_rate": 9.222072310105126e-07, "logits/chosen": -2.3364174365997314, "logits/rejected": -2.312894105911255, "logps/chosen": -236.32666015625, "logps/rejected": -261.32843017578125, "loss": 0.6152, "rewards/accuracies": 0.71875, "rewards/chosen": 0.37070125341415405, "rewards/margins": 1.0360088348388672, "rewards/rejected": -0.6653076410293579, "step": 80 }, { "epoch": 0.6355140186915887, "grad_norm": 33.7374435363093, "learning_rate": 9.110503086981955e-07, "logits/chosen": -2.2803781032562256, "logits/rejected": -2.267977237701416, "logps/chosen": -255.5703887939453, "logps/rejected": -206.46066284179688, "loss": 0.553, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.18914642930030823, "rewards/margins": 1.2585302591323853, "rewards/rejected": -1.0693838596343994, "step": 85 }, { "epoch": 0.6728971962616822, "grad_norm": 39.897529811340966, "learning_rate": 8.992232294498169e-07, "logits/chosen": -2.1736109256744385, "logits/rejected": -2.1623623371124268, "logps/chosen": -255.2686309814453, "logps/rejected": -225.6684112548828, "loss": 0.5359, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.11371274292469025, "rewards/margins": 1.4066712856292725, "rewards/rejected": -1.2929584980010986, "step": 90 }, { "epoch": 0.7102803738317757, "grad_norm": 36.88770484599986, "learning_rate": 8.867452755746805e-07, "logits/chosen": -2.1795907020568848, "logits/rejected": -2.1622931957244873, "logps/chosen": -267.36358642578125, "logps/rejected": -237.27359008789062, "loss": 0.4889, "rewards/accuracies": 0.75, "rewards/chosen": -0.24791303277015686, "rewards/margins": 1.365724802017212, "rewards/rejected": -1.6136376857757568, "step": 95 }, { "epoch": 0.7476635514018691, "grad_norm": 29.575103655541206, "learning_rate": 8.736367905372246e-07, "logits/chosen": -2.1824848651885986, "logits/rejected": -2.164578914642334, "logps/chosen": -262.33575439453125, "logps/rejected": -242.2086639404297, "loss": 0.5388, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11618832498788834, "rewards/margins": 1.8072330951690674, "rewards/rejected": -1.9234212636947632, "step": 100 }, { "epoch": 0.7476635514018691, "eval_logits/chosen": -2.199050188064575, "eval_logits/rejected": -2.174029588699341, "eval_logps/chosen": -252.93431091308594, "eval_logps/rejected": -238.5013885498047, "eval_loss": 0.5390673875808716, "eval_rewards/accuracies": 0.7479166388511658, "eval_rewards/chosen": -0.4397614300251007, "eval_rewards/margins": 1.5734889507293701, "eval_rewards/rejected": -2.0132501125335693, "eval_runtime": 252.4297, "eval_samples_per_second": 15.05, "eval_steps_per_second": 0.238, "step": 100 }, { "epoch": 0.7850467289719626, "grad_norm": 33.93716964639979, "learning_rate": 8.599191457900016e-07, "logits/chosen": -2.2423758506774902, "logits/rejected": -2.215951919555664, "logps/chosen": -248.74136352539062, "logps/rejected": -238.6445770263672, "loss": 0.5277, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.2006298005580902, "rewards/margins": 1.6982009410858154, "rewards/rejected": -1.898830771446228, "step": 105 }, { "epoch": 0.822429906542056, "grad_norm": 34.93462881131966, "learning_rate": 8.456147059306757e-07, "logits/chosen": -2.3176093101501465, "logits/rejected": -2.312403440475464, "logps/chosen": -260.4902648925781, "logps/rejected": -202.03927612304688, "loss": 0.5658, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.13997402787208557, "rewards/margins": 1.6242597103118896, "rewards/rejected": -1.7642338275909424, "step": 110 }, { "epoch": 0.8598130841121495, "grad_norm": 33.1410955281443, "learning_rate": 8.307467922398432e-07, "logits/chosen": -2.4065065383911133, "logits/rejected": -2.4101977348327637, "logps/chosen": -249.76541137695312, "logps/rejected": -247.932861328125, "loss": 0.5152, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.18386869132518768, "rewards/margins": 1.8899492025375366, "rewards/rejected": -2.073817729949951, "step": 115 }, { "epoch": 0.897196261682243, "grad_norm": 31.607304609592937, "learning_rate": 8.15339644659121e-07, "logits/chosen": -2.4259490966796875, "logits/rejected": -2.41045880317688, "logps/chosen": -258.97723388671875, "logps/rejected": -221.7887420654297, "loss": 0.5595, "rewards/accuracies": 0.78125, "rewards/chosen": -0.14788323640823364, "rewards/margins": 1.9169807434082031, "rewards/rejected": -2.064863681793213, "step": 120 }, { "epoch": 0.9345794392523364, "grad_norm": 32.67833778299361, "learning_rate": 7.994183822714968e-07, "logits/chosen": -2.4193129539489746, "logits/rejected": -2.4144270420074463, "logps/chosen": -262.0440368652344, "logps/rejected": -223.1062469482422, "loss": 0.4636, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.24569320678710938, "rewards/margins": 2.0063912868499756, "rewards/rejected": -2.252084970474243, "step": 125 }, { "epoch": 0.9719626168224299, "grad_norm": 28.586935737564, "learning_rate": 7.830089623483656e-07, "logits/chosen": -2.431533098220825, "logits/rejected": -2.414226531982422, "logps/chosen": -250.75613403320312, "logps/rejected": -233.8451385498047, "loss": 0.4432, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.2264728844165802, "rewards/margins": 2.210639476776123, "rewards/rejected": -2.4371120929718018, "step": 130 }, { "epoch": 1.0093457943925233, "grad_norm": 21.434099176546553, "learning_rate": 7.661381380300253e-07, "logits/chosen": -2.442919969558716, "logits/rejected": -2.422956705093384, "logps/chosen": -274.38555908203125, "logps/rejected": -250.0774688720703, "loss": 0.4499, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.05850023776292801, "rewards/margins": 2.6394431591033936, "rewards/rejected": -2.5809426307678223, "step": 135 }, { "epoch": 1.0467289719626167, "grad_norm": 18.591470106307337, "learning_rate": 7.488334147086263e-07, "logits/chosen": -2.431138515472412, "logits/rejected": -2.4030303955078125, "logps/chosen": -250.7612762451172, "logps/rejected": -224.02401733398438, "loss": 0.2627, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.4705049395561218, "rewards/margins": 2.965265989303589, "rewards/rejected": -2.494760751724243, "step": 140 }, { "epoch": 1.0841121495327102, "grad_norm": 26.220775795974554, "learning_rate": 7.311230051846819e-07, "logits/chosen": -2.4043233394622803, "logits/rejected": -2.3778297901153564, "logps/chosen": -225.55380249023438, "logps/rejected": -237.52963256835938, "loss": 0.2753, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.8029943704605103, "rewards/margins": 3.111466646194458, "rewards/rejected": -2.3084726333618164, "step": 145 }, { "epoch": 1.1214953271028036, "grad_norm": 20.647616721664097, "learning_rate": 7.130357836702577e-07, "logits/chosen": -2.3966336250305176, "logits/rejected": -2.3554282188415527, "logps/chosen": -260.1661376953125, "logps/rejected": -240.3435821533203, "loss": 0.2653, "rewards/accuracies": 0.90625, "rewards/chosen": 1.162467360496521, "rewards/margins": 3.1894516944885254, "rewards/rejected": -2.026984214782715, "step": 150 }, { "epoch": 1.1214953271028036, "eval_logits/chosen": -2.3485267162323, "eval_logits/rejected": -2.32659649848938, "eval_logps/chosen": -245.67454528808594, "eval_logps/rejected": -235.21473693847656, "eval_loss": 0.5246723890304565, "eval_rewards/accuracies": 0.7645833492279053, "eval_rewards/chosen": 0.2862185537815094, "eval_rewards/margins": 1.9708030223846436, "eval_rewards/rejected": -1.6845842599868774, "eval_runtime": 253.0283, "eval_samples_per_second": 15.014, "eval_steps_per_second": 0.237, "step": 150 }, { "epoch": 1.158878504672897, "grad_norm": 23.50919746826828, "learning_rate": 6.946012387138247e-07, "logits/chosen": -2.3319590091705322, "logits/rejected": -2.305656909942627, "logps/chosen": -240.41323852539062, "logps/rejected": -236.7122802734375, "loss": 0.2499, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9150522947311401, "rewards/margins": 3.2040953636169434, "rewards/rejected": -2.2890431880950928, "step": 155 }, { "epoch": 1.1962616822429906, "grad_norm": 16.10335481559186, "learning_rate": 6.758494251235274e-07, "logits/chosen": -2.288930654525757, "logits/rejected": -2.2800326347351074, "logps/chosen": -252.2129364013672, "logps/rejected": -242.2833251953125, "loss": 0.2442, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.06269371509552, "rewards/margins": 3.869558334350586, "rewards/rejected": -2.8068645000457764, "step": 160 }, { "epoch": 1.233644859813084, "grad_norm": 18.48336670058505, "learning_rate": 6.568109149672496e-07, "logits/chosen": -2.2865896224975586, "logits/rejected": -2.2526893615722656, "logps/chosen": -253.0467529296875, "logps/rejected": -230.2500762939453, "loss": 0.2703, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.3167855739593506, "rewards/margins": 3.570039749145508, "rewards/rejected": -2.2532541751861572, "step": 165 }, { "epoch": 1.2710280373831775, "grad_norm": 25.61953738965752, "learning_rate": 6.375167477293648e-07, "logits/chosen": -2.304429292678833, "logits/rejected": -2.293348789215088, "logps/chosen": -239.5231170654297, "logps/rejected": -229.12246704101562, "loss": 0.2725, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3146635293960571, "rewards/margins": 3.4736862182617188, "rewards/rejected": -2.159022808074951, "step": 170 }, { "epoch": 1.308411214953271, "grad_norm": 25.42815054266923, "learning_rate": 6.179983797054321e-07, "logits/chosen": -2.3991851806640625, "logits/rejected": -2.3383266925811768, "logps/chosen": -236.6082000732422, "logps/rejected": -269.2547607421875, "loss": 0.2646, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.4701949954032898, "rewards/margins": 4.1486945152282715, "rewards/rejected": -3.6785004138946533, "step": 175 }, { "epoch": 1.3457943925233644, "grad_norm": 25.323140928512764, "learning_rate": 5.982876327173427e-07, "logits/chosen": -2.4732565879821777, "logits/rejected": -2.43239164352417, "logps/chosen": -274.3072814941406, "logps/rejected": -239.09585571289062, "loss": 0.3069, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.0698857456445694, "rewards/margins": 3.551178455352783, "rewards/rejected": -3.481292247772217, "step": 180 }, { "epoch": 1.3831775700934579, "grad_norm": 20.834596330130136, "learning_rate": 5.78416642232531e-07, "logits/chosen": -2.4427692890167236, "logits/rejected": -2.4098360538482666, "logps/chosen": -258.0860900878906, "logps/rejected": -243.33206176757812, "loss": 0.2444, "rewards/accuracies": 0.875, "rewards/chosen": 0.3224690556526184, "rewards/margins": 3.5159637928009033, "rewards/rejected": -3.1934947967529297, "step": 185 }, { "epoch": 1.4205607476635513, "grad_norm": 18.851032795980213, "learning_rate": 5.584178049718314e-07, "logits/chosen": -2.4080841541290283, "logits/rejected": -2.3888661861419678, "logps/chosen": -242.3834686279297, "logps/rejected": -247.5576629638672, "loss": 0.2604, "rewards/accuracies": 0.875, "rewards/chosen": 0.38662514090538025, "rewards/margins": 3.3747589588165283, "rewards/rejected": -2.988133668899536, "step": 190 }, { "epoch": 1.4579439252336448, "grad_norm": 24.324007820476073, "learning_rate": 5.38323726091401e-07, "logits/chosen": -2.436607837677002, "logits/rejected": -2.391335964202881, "logps/chosen": -250.54647827148438, "logps/rejected": -239.52590942382812, "loss": 0.3042, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.005544149782508612, "rewards/margins": 3.300248384475708, "rewards/rejected": -3.3057925701141357, "step": 195 }, { "epoch": 1.4953271028037383, "grad_norm": 21.752837131187086, "learning_rate": 5.181671660248178e-07, "logits/chosen": -2.4886248111724854, "logits/rejected": -2.4703927040100098, "logps/chosen": -239.51376342773438, "logps/rejected": -247.77114868164062, "loss": 0.2571, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.03603418171405792, "rewards/margins": 3.4513862133026123, "rewards/rejected": -3.4874203205108643, "step": 200 }, { "epoch": 1.4953271028037383, "eval_logits/chosen": -2.5015809535980225, "eval_logits/rejected": -2.475208282470703, "eval_logps/chosen": -254.51597595214844, "eval_logps/rejected": -249.1765594482422, "eval_loss": 0.5108085870742798, "eval_rewards/accuracies": 0.7791666388511658, "eval_rewards/chosen": -0.5979260802268982, "eval_rewards/margins": 2.482844591140747, "eval_rewards/rejected": -3.080770254135132, "eval_runtime": 252.5989, "eval_samples_per_second": 15.04, "eval_steps_per_second": 0.238, "step": 200 }, { "epoch": 1.5327102803738317, "grad_norm": 22.788597725922575, "learning_rate": 4.979809870720242e-07, "logits/chosen": -2.5105209350585938, "logits/rejected": -2.48913836479187, "logps/chosen": -252.13818359375, "logps/rejected": -243.4073944091797, "loss": 0.2725, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.011826371774077415, "rewards/margins": 3.6809163093566895, "rewards/rejected": -3.6927428245544434, "step": 205 }, { "epoch": 1.5700934579439252, "grad_norm": 22.065080050736203, "learning_rate": 4.777980998221901e-07, "logits/chosen": -2.4797415733337402, "logits/rejected": -2.4482269287109375, "logps/chosen": -224.90585327148438, "logps/rejected": -238.3304443359375, "loss": 0.283, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.0005477949744090438, "rewards/margins": 3.547239303588867, "rewards/rejected": -3.5477871894836426, "step": 210 }, { "epoch": 1.6074766355140186, "grad_norm": 24.51095857605576, "learning_rate": 4.5765140949784923e-07, "logits/chosen": -2.4635329246520996, "logits/rejected": -2.4240822792053223, "logps/chosen": -256.9695739746094, "logps/rejected": -243.4046630859375, "loss": 0.3058, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.43238821625709534, "rewards/margins": 3.786862850189209, "rewards/rejected": -3.3544750213623047, "step": 215 }, { "epoch": 1.644859813084112, "grad_norm": 20.419017482003486, "learning_rate": 4.3757376230778383e-07, "logits/chosen": -2.4464869499206543, "logits/rejected": -2.394662618637085, "logps/chosen": -249.76754760742188, "logps/rejected": -256.5289306640625, "loss": 0.2327, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8897647857666016, "rewards/margins": 4.213340759277344, "rewards/rejected": -3.3235764503479004, "step": 220 }, { "epoch": 1.6822429906542056, "grad_norm": 53.385379065169666, "learning_rate": 4.1759789189612333e-07, "logits/chosen": -2.394674777984619, "logits/rejected": -2.3603250980377197, "logps/chosen": -253.1683349609375, "logps/rejected": -259.30975341796875, "loss": 0.2888, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.5363051891326904, "rewards/margins": 3.8216965198516846, "rewards/rejected": -3.285390853881836, "step": 225 }, { "epoch": 1.719626168224299, "grad_norm": 19.461317653850365, "learning_rate": 3.9775636597496285e-07, "logits/chosen": -2.3545804023742676, "logits/rejected": -2.356778621673584, "logps/chosen": -246.68734741210938, "logps/rejected": -241.13998413085938, "loss": 0.2588, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.10117466747760773, "rewards/margins": 3.4182426929473877, "rewards/rejected": -3.317068099975586, "step": 230 }, { "epoch": 1.7570093457943925, "grad_norm": 22.752036394982305, "learning_rate": 3.7808153322750893e-07, "logits/chosen": -2.379742383956909, "logits/rejected": -2.3354058265686035, "logps/chosen": -243.09249877929688, "logps/rejected": -249.0592498779297, "loss": 0.2849, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.22115659713745117, "rewards/margins": 3.8574492931365967, "rewards/rejected": -3.6362926959991455, "step": 235 }, { "epoch": 1.794392523364486, "grad_norm": 25.661272927251265, "learning_rate": 3.586054705683208e-07, "logits/chosen": -2.387585163116455, "logits/rejected": -2.333857774734497, "logps/chosen": -273.67315673828125, "logps/rejected": -271.0914306640625, "loss": 0.2909, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.6078623533248901, "rewards/margins": 3.6751747131347656, "rewards/rejected": -3.0673117637634277, "step": 240 }, { "epoch": 1.8317757009345794, "grad_norm": 25.180396150597772, "learning_rate": 3.393599308466285e-07, "logits/chosen": -2.401310920715332, "logits/rejected": -2.336974620819092, "logps/chosen": -256.4820251464844, "logps/rejected": -246.9813232421875, "loss": 0.282, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.10372234880924225, "rewards/margins": 3.3294150829315186, "rewards/rejected": -3.2256927490234375, "step": 245 }, { "epoch": 1.8691588785046729, "grad_norm": 18.724955408080817, "learning_rate": 3.203762910779944e-07, "logits/chosen": -2.404759645462036, "logits/rejected": -2.3678829669952393, "logps/chosen": -253.46896362304688, "logps/rejected": -234.8633575439453, "loss": 0.2803, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.2244938164949417, "rewards/margins": 3.192117214202881, "rewards/rejected": -2.967623233795166, "step": 250 }, { "epoch": 1.8691588785046729, "eval_logits/chosen": -2.4107203483581543, "eval_logits/rejected": -2.3852570056915283, "eval_logps/chosen": -251.44602966308594, "eval_logps/rejected": -245.23483276367188, "eval_loss": 0.48170700669288635, "eval_rewards/accuracies": 0.7854166626930237, "eval_rewards/chosen": -0.2909303605556488, "eval_rewards/margins": 2.3956663608551025, "eval_rewards/rejected": -2.686596632003784, "eval_runtime": 252.2818, "eval_samples_per_second": 15.059, "eval_steps_per_second": 0.238, "step": 250 }, { "epoch": 1.9065420560747663, "grad_norm": 29.924010084948176, "learning_rate": 3.0168550128871264e-07, "logits/chosen": -2.4268598556518555, "logits/rejected": -2.3924014568328857, "logps/chosen": -237.61965942382812, "logps/rejected": -245.6161346435547, "loss": 0.2684, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.2175990641117096, "rewards/margins": 3.640658140182495, "rewards/rejected": -3.4230587482452393, "step": 255 }, { "epoch": 1.9439252336448598, "grad_norm": 26.71214593633519, "learning_rate": 2.833180340563554e-07, "logits/chosen": -2.425642728805542, "logits/rejected": -2.399965763092041, "logps/chosen": -231.1629180908203, "logps/rejected": -250.3664093017578, "loss": 0.2788, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.05841640383005142, "rewards/margins": 3.254362106323242, "rewards/rejected": -3.1959457397460938, "step": 260 }, { "epoch": 1.9813084112149533, "grad_norm": 23.089942965590083, "learning_rate": 2.653038348287261e-07, "logits/chosen": -2.4249393939971924, "logits/rejected": -2.3924624919891357, "logps/chosen": -267.17547607421875, "logps/rejected": -267.37066650390625, "loss": 0.2624, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.4873844087123871, "rewards/margins": 3.554990291595459, "rewards/rejected": -3.0676064491271973, "step": 265 }, { "epoch": 2.0186915887850465, "grad_norm": 24.317059270551702, "learning_rate": 2.476722731022207e-07, "logits/chosen": -2.397399663925171, "logits/rejected": -2.3522555828094482, "logps/chosen": -270.63812255859375, "logps/rejected": -237.6252899169922, "loss": 0.2462, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7779295444488525, "rewards/margins": 4.320176601409912, "rewards/rejected": -3.5422470569610596, "step": 270 }, { "epoch": 2.05607476635514, "grad_norm": 24.99157167857996, "learning_rate": 2.3045209453919407e-07, "logits/chosen": -2.3702712059020996, "logits/rejected": -2.362605571746826, "logps/chosen": -238.41159057617188, "logps/rejected": -242.57275390625, "loss": 0.1566, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.2242518663406372, "rewards/margins": 3.7089030742645264, "rewards/rejected": -3.4846510887145996, "step": 275 }, { "epoch": 2.0934579439252334, "grad_norm": 19.242571139212618, "learning_rate": 2.13671374102394e-07, "logits/chosen": -2.362250804901123, "logits/rejected": -2.343247175216675, "logps/chosen": -276.24969482421875, "logps/rejected": -272.6140441894531, "loss": 0.1735, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.516792893409729, "rewards/margins": 4.325055122375488, "rewards/rejected": -3.808262348175049, "step": 280 }, { "epoch": 2.130841121495327, "grad_norm": 15.533308886613643, "learning_rate": 1.9735747028287342e-07, "logits/chosen": -2.3508124351501465, "logits/rejected": -2.330045700073242, "logps/chosen": -239.631103515625, "logps/rejected": -263.2986145019531, "loss": 0.1593, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.07615267485380173, "rewards/margins": 4.113415718078613, "rewards/rejected": -4.037262916564941, "step": 285 }, { "epoch": 2.1682242990654204, "grad_norm": 15.699144927979777, "learning_rate": 1.815369804960034e-07, "logits/chosen": -2.35709547996521, "logits/rejected": -2.3328442573547363, "logps/chosen": -244.9105987548828, "logps/rejected": -255.9078826904297, "loss": 0.1619, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3098369538784027, "rewards/margins": 4.1518449783325195, "rewards/rejected": -3.842008113861084, "step": 290 }, { "epoch": 2.205607476635514, "grad_norm": 20.58346646707518, "learning_rate": 1.6623569771830852e-07, "logits/chosen": -2.3732151985168457, "logits/rejected": -2.3346047401428223, "logps/chosen": -247.153564453125, "logps/rejected": -250.949951171875, "loss": 0.1841, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.4144899249076843, "rewards/margins": 3.991154909133911, "rewards/rejected": -3.5766654014587402, "step": 295 }, { "epoch": 2.2429906542056073, "grad_norm": 12.509918099598698, "learning_rate": 1.5147856843582002e-07, "logits/chosen": -2.349177360534668, "logits/rejected": -2.3184986114501953, "logps/chosen": -242.14108276367188, "logps/rejected": -242.0617218017578, "loss": 0.1739, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6371382474899292, "rewards/margins": 4.000860214233398, "rewards/rejected": -3.3637218475341797, "step": 300 }, { "epoch": 2.2429906542056073, "eval_logits/chosen": -2.355950355529785, "eval_logits/rejected": -2.3281238079071045, "eval_logps/chosen": -252.35202026367188, "eval_logps/rejected": -246.845947265625, "eval_loss": 0.49124717712402344, "eval_rewards/accuracies": 0.7916666865348816, "eval_rewards/chosen": -0.381531685590744, "eval_rewards/margins": 2.466177225112915, "eval_rewards/rejected": -2.8477089405059814, "eval_runtime": 252.5764, "eval_samples_per_second": 15.041, "eval_steps_per_second": 0.238, "step": 300 }, { "epoch": 2.2803738317757007, "grad_norm": 18.288756117670946, "learning_rate": 1.3728965197250781e-07, "logits/chosen": -2.35316801071167, "logits/rejected": -2.318678379058838, "logps/chosen": -256.4324645996094, "logps/rejected": -242.2520751953125, "loss": 0.1359, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5630406141281128, "rewards/margins": 4.296410083770752, "rewards/rejected": -3.7333693504333496, "step": 305 }, { "epoch": 2.317757009345794, "grad_norm": 16.35995569615145, "learning_rate": 1.236920812651003e-07, "logits/chosen": -2.347511053085327, "logits/rejected": -2.3156919479370117, "logps/chosen": -234.4215545654297, "logps/rejected": -240.92129516601562, "loss": 0.1667, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.38125258684158325, "rewards/margins": 4.210916042327881, "rewards/rejected": -3.8296637535095215, "step": 310 }, { "epoch": 2.3551401869158877, "grad_norm": 15.341078879044524, "learning_rate": 1.1070802514823913e-07, "logits/chosen": -2.3151564598083496, "logits/rejected": -2.291027307510376, "logps/chosen": -245.5782928466797, "logps/rejected": -246.9814453125, "loss": 0.1652, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.24629132449626923, "rewards/margins": 4.182533264160156, "rewards/rejected": -3.9362423419952393, "step": 315 }, { "epoch": 2.392523364485981, "grad_norm": 23.7922647773062, "learning_rate": 9.835865221146389e-08, "logits/chosen": -2.301703691482544, "logits/rejected": -2.290492296218872, "logps/chosen": -223.250732421875, "logps/rejected": -263.16534423828125, "loss": 0.1591, "rewards/accuracies": 0.96875, "rewards/chosen": 0.41100868582725525, "rewards/margins": 4.717347145080566, "rewards/rejected": -4.306338310241699, "step": 320 }, { "epoch": 2.4299065420560746, "grad_norm": 16.439486015212957, "learning_rate": 8.666409628694693e-08, "logits/chosen": -2.319634199142456, "logits/rejected": -2.2700839042663574, "logps/chosen": -260.8030700683594, "logps/rejected": -255.59646606445312, "loss": 0.1595, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.4477875232696533, "rewards/margins": 4.375200271606445, "rewards/rejected": -3.927412748336792, "step": 325 }, { "epoch": 2.467289719626168, "grad_norm": 14.726547277918746, "learning_rate": 7.564342362424713e-08, "logits/chosen": -2.313271999359131, "logits/rejected": -2.2773654460906982, "logps/chosen": -262.1775817871094, "logps/rejected": -245.81216430664062, "loss": 0.1524, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.36651816964149475, "rewards/margins": 4.084885597229004, "rewards/rejected": -3.7183678150177, "step": 330 }, { "epoch": 2.5046728971962615, "grad_norm": 11.804281276567055, "learning_rate": 6.53146018056011e-08, "logits/chosen": -2.328040599822998, "logits/rejected": -2.275287389755249, "logps/chosen": -258.7254638671875, "logps/rejected": -244.4686737060547, "loss": 0.1497, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21398362517356873, "rewards/margins": 4.004579067230225, "rewards/rejected": -3.790595531463623, "step": 335 }, { "epoch": 2.542056074766355, "grad_norm": 17.23503560362695, "learning_rate": 5.569447045242931e-08, "logits/chosen": -2.288806676864624, "logits/rejected": -2.2881455421447754, "logps/chosen": -238.40414428710938, "logps/rejected": -256.07183837890625, "loss": 0.1615, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1942141056060791, "rewards/margins": 3.9404773712158203, "rewards/rejected": -3.7462635040283203, "step": 340 }, { "epoch": 2.5794392523364484, "grad_norm": 24.74669297879705, "learning_rate": 4.6798713770814625e-08, "logits/chosen": -2.3001134395599365, "logits/rejected": -2.2864573001861572, "logps/chosen": -238.6791534423828, "logps/rejected": -238.4395294189453, "loss": 0.1534, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.4987949728965759, "rewards/margins": 4.1639580726623535, "rewards/rejected": -3.665163040161133, "step": 345 }, { "epoch": 2.616822429906542, "grad_norm": 18.596924543370314, "learning_rate": 3.864183498071699e-08, "logits/chosen": -2.3193628787994385, "logits/rejected": -2.2843828201293945, "logps/chosen": -253.653076171875, "logps/rejected": -278.3705139160156, "loss": 0.1631, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.44818204641342163, "rewards/margins": 4.706423759460449, "rewards/rejected": -4.258241176605225, "step": 350 }, { "epoch": 2.616822429906542, "eval_logits/chosen": -2.3091881275177, "eval_logits/rejected": -2.2783515453338623, "eval_logps/chosen": -252.63780212402344, "eval_logps/rejected": -248.45175170898438, "eval_loss": 0.49646082520484924, "eval_rewards/accuracies": 0.7895833253860474, "eval_rewards/chosen": -0.4101119339466095, "eval_rewards/margins": 2.5981762409210205, "eval_rewards/rejected": -3.0082881450653076, "eval_runtime": 251.9742, "eval_samples_per_second": 15.077, "eval_steps_per_second": 0.238, "step": 350 }, { "epoch": 2.6542056074766354, "grad_norm": 24.678306348653066, "learning_rate": 3.1237132670611455e-08, "logits/chosen": -2.3217742443084717, "logits/rejected": -2.285179853439331, "logps/chosen": -245.7552032470703, "logps/rejected": -277.97393798828125, "loss": 0.1623, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.5618477463722229, "rewards/margins": 4.592735290527344, "rewards/rejected": -4.030888080596924, "step": 355 }, { "epoch": 2.691588785046729, "grad_norm": 18.396082804478937, "learning_rate": 2.4596679116099083e-08, "logits/chosen": -2.304999828338623, "logits/rejected": -2.295154094696045, "logps/chosen": -240.2224884033203, "logps/rejected": -248.5629425048828, "loss": 0.1735, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.357688844203949, "rewards/margins": 3.9913132190704346, "rewards/rejected": -3.63362455368042, "step": 360 }, { "epoch": 2.7289719626168223, "grad_norm": 18.24918146544934, "learning_rate": 1.8731300597841837e-08, "logits/chosen": -2.304487943649292, "logits/rejected": -2.260864496231079, "logps/chosen": -234.8140106201172, "logps/rejected": -260.767578125, "loss": 0.161, "rewards/accuracies": 0.9375, "rewards/chosen": 0.49128788709640503, "rewards/margins": 4.2879767417907715, "rewards/rejected": -3.7966887950897217, "step": 365 }, { "epoch": 2.7663551401869158, "grad_norm": 26.438822654562358, "learning_rate": 1.365055975090773e-08, "logits/chosen": -2.3118393421173096, "logits/rejected": -2.2817904949188232, "logps/chosen": -252.44638061523438, "logps/rejected": -268.03570556640625, "loss": 0.1686, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5989893078804016, "rewards/margins": 4.2095770835876465, "rewards/rejected": -3.6105875968933105, "step": 370 }, { "epoch": 2.803738317757009, "grad_norm": 16.398293030996605, "learning_rate": 9.362739974303757e-09, "logits/chosen": -2.335395097732544, "logits/rejected": -2.2768070697784424, "logps/chosen": -243.970703125, "logps/rejected": -267.5694580078125, "loss": 0.1565, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.38310879468917847, "rewards/margins": 4.402381896972656, "rewards/rejected": -4.019272804260254, "step": 375 }, { "epoch": 2.8411214953271027, "grad_norm": 18.034442959036298, "learning_rate": 5.874831926114931e-09, "logits/chosen": -2.3112030029296875, "logits/rejected": -2.281367778778076, "logps/chosen": -240.7181854248047, "logps/rejected": -251.8282012939453, "loss": 0.1504, "rewards/accuracies": 0.9375, "rewards/chosen": 0.41617345809936523, "rewards/margins": 4.074304580688477, "rewards/rejected": -3.6581311225891113, "step": 380 }, { "epoch": 2.878504672897196, "grad_norm": 17.543650487814592, "learning_rate": 3.192522126266861e-09, "logits/chosen": -2.3081631660461426, "logits/rejected": -2.264885902404785, "logps/chosen": -244.57107543945312, "logps/rejected": -248.1013946533203, "loss": 0.1641, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.41191452741622925, "rewards/margins": 4.37111759185791, "rewards/rejected": -3.9592037200927734, "step": 385 }, { "epoch": 2.9158878504672896, "grad_norm": 15.080570821768335, "learning_rate": 1.3201836854931924e-09, "logits/chosen": -2.3092455863952637, "logits/rejected": -2.2830567359924316, "logps/chosen": -241.13278198242188, "logps/rejected": -266.1209716796875, "loss": 0.1715, "rewards/accuracies": 0.96875, "rewards/chosen": 0.32842570543289185, "rewards/margins": 4.37562370300293, "rewards/rejected": -4.047197341918945, "step": 390 }, { "epoch": 2.953271028037383, "grad_norm": 20.03660661880128, "learning_rate": 2.6086917562317957e-10, "logits/chosen": -2.3180956840515137, "logits/rejected": -2.277705669403076, "logps/chosen": -252.68295288085938, "logps/rejected": -274.5094909667969, "loss": 0.1565, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.44486260414123535, "rewards/margins": 4.4007062911987305, "rewards/rejected": -3.955843687057495, "step": 395 }, { "epoch": 2.983177570093458, "step": 399, "total_flos": 4704901791744000.0, "train_loss": 0.3356622438084213, "train_runtime": 15341.8197, "train_samples_per_second": 6.686, "train_steps_per_second": 0.026 } ], "logging_steps": 5, "max_steps": 399, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4704901791744000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }