{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6837606837606838, "eval_steps": 40, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017094017094017096, "grad_norm": 35.038580788061665, "learning_rate": 5e-07, "logits/chosen": -2.7457876205444336, "logits/rejected": -2.7444841861724854, "logps/chosen": -164.26461791992188, "logps/rejected": -170.55870056152344, "loss": 0.6935, "rewards/accuracies": 0.26875001192092896, "rewards/chosen": 0.003455913159996271, "rewards/margins": -0.0019886991940438747, "rewards/rejected": 0.0054446132853627205, "step": 5 }, { "epoch": 0.03418803418803419, "grad_norm": 36.203903910498276, "learning_rate": 1e-06, "logits/chosen": -2.7106502056121826, "logits/rejected": -2.716397523880005, "logps/chosen": -171.80043029785156, "logps/rejected": -165.20602416992188, "loss": 0.6875, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.012000308372080326, "rewards/margins": 0.0025437879376113415, "rewards/rejected": 0.009456520900130272, "step": 10 }, { "epoch": 0.05128205128205128, "grad_norm": 33.9576577784673, "learning_rate": 9.999177507263144e-07, "logits/chosen": -2.651571750640869, "logits/rejected": -2.629457473754883, "logps/chosen": -174.04080200195312, "logps/rejected": -174.0542755126953, "loss": 0.6698, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.23909731209278107, "rewards/margins": 0.10868903249502182, "rewards/rejected": 0.13040827214717865, "step": 15 }, { "epoch": 0.06837606837606838, "grad_norm": 34.33646066636181, "learning_rate": 9.996710299650301e-07, "logits/chosen": -2.476440668106079, "logits/rejected": -2.450225353240967, "logps/chosen": -158.1311798095703, "logps/rejected": -158.0066680908203, "loss": 0.6613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.4318675100803375, "rewards/margins": 0.14549395442008972, "rewards/rejected": 0.2863735556602478, "step": 20 }, { "epoch": 0.08547008547008547, "grad_norm": 33.16430522723429, "learning_rate": 9.992599188865604e-07, "logits/chosen": -2.3086318969726562, "logits/rejected": -2.3104796409606934, "logps/chosen": -150.59771728515625, "logps/rejected": -156.85037231445312, "loss": 0.6494, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.5047669410705566, "rewards/margins": 0.16554531455039978, "rewards/rejected": 0.33922165632247925, "step": 25 }, { "epoch": 0.10256410256410256, "grad_norm": 34.52861424862365, "learning_rate": 9.98684552745256e-07, "logits/chosen": -2.217874050140381, "logits/rejected": -2.2254481315612793, "logps/chosen": -161.29412841796875, "logps/rejected": -161.40841674804688, "loss": 0.6295, "rewards/accuracies": 0.625, "rewards/chosen": 0.4176379144191742, "rewards/margins": 0.26531916856765747, "rewards/rejected": 0.15231874585151672, "step": 30 }, { "epoch": 0.11965811965811966, "grad_norm": 31.455117829218544, "learning_rate": 9.979451208349055e-07, "logits/chosen": -2.2608728408813477, "logits/rejected": -2.246007204055786, "logps/chosen": -171.71456909179688, "logps/rejected": -174.46578979492188, "loss": 0.6305, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01912705972790718, "rewards/margins": 0.31441593170166016, "rewards/rejected": -0.33354294300079346, "step": 35 }, { "epoch": 0.13675213675213677, "grad_norm": 31.67318837058587, "learning_rate": 9.970418664264595e-07, "logits/chosen": -2.345672130584717, "logits/rejected": -2.331491470336914, "logps/chosen": -171.24766540527344, "logps/rejected": -176.8189697265625, "loss": 0.5989, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.27867692708969116, "rewards/margins": 0.5290472507476807, "rewards/rejected": -0.8077241778373718, "step": 40 }, { "epoch": 0.13675213675213677, "eval_logits/chosen": -2.4102065563201904, "eval_logits/rejected": -2.401230573654175, "eval_logps/chosen": -162.36439514160156, "eval_logps/rejected": -167.4954071044922, "eval_loss": 0.6069236993789673, "eval_rewards/accuracies": 0.6365384459495544, "eval_rewards/chosen": -0.388705849647522, "eval_rewards/margins": 0.47280558943748474, "eval_rewards/rejected": -0.8615114688873291, "eval_runtime": 509.918, "eval_samples_per_second": 16.305, "eval_steps_per_second": 0.255, "step": 40 }, { "epoch": 0.15384615384615385, "grad_norm": 36.18313806223269, "learning_rate": 9.95975086687994e-07, "logits/chosen": -2.44050669670105, "logits/rejected": -2.4460220336914062, "logps/chosen": -163.82875061035156, "logps/rejected": -167.35989379882812, "loss": 0.6146, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.31098368763923645, "rewards/margins": 0.46269193291664124, "rewards/rejected": -0.7736755609512329, "step": 45 }, { "epoch": 0.17094017094017094, "grad_norm": 31.13412274683678, "learning_rate": 9.947451325869439e-07, "logits/chosen": -2.501091718673706, "logits/rejected": -2.4991250038146973, "logps/chosen": -172.09686279296875, "logps/rejected": -177.7747802734375, "loss": 0.577, "rewards/accuracies": 0.75, "rewards/chosen": -0.212348073720932, "rewards/margins": 0.6062799692153931, "rewards/rejected": -0.8186280131340027, "step": 50 }, { "epoch": 0.18803418803418803, "grad_norm": 31.508672436862835, "learning_rate": 9.933524087746347e-07, "logits/chosen": -2.437525510787964, "logits/rejected": -2.4285693168640137, "logps/chosen": -168.1316375732422, "logps/rejected": -175.23193359375, "loss": 0.571, "rewards/accuracies": 0.75, "rewards/chosen": -0.513076901435852, "rewards/margins": 0.7702310681343079, "rewards/rejected": -1.2833080291748047, "step": 55 }, { "epoch": 0.20512820512820512, "grad_norm": 30.148068867306787, "learning_rate": 9.917973734531549e-07, "logits/chosen": -2.431530475616455, "logits/rejected": -2.431729793548584, "logps/chosen": -159.38168334960938, "logps/rejected": -170.52500915527344, "loss": 0.5762, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.34855490922927856, "rewards/margins": 0.5969334244728088, "rewards/rejected": -0.9454883337020874, "step": 60 }, { "epoch": 0.2222222222222222, "grad_norm": 32.03814968183332, "learning_rate": 9.90080538224607e-07, "logits/chosen": -2.533193588256836, "logits/rejected": -2.5252978801727295, "logps/chosen": -157.30966186523438, "logps/rejected": -166.26011657714844, "loss": 0.5643, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.007600936107337475, "rewards/margins": 0.5010749697685242, "rewards/rejected": -0.5086758732795715, "step": 65 }, { "epoch": 0.23931623931623933, "grad_norm": 29.16308768569833, "learning_rate": 9.882024679227938e-07, "logits/chosen": -2.5899624824523926, "logits/rejected": -2.5779967308044434, "logps/chosen": -178.4553985595703, "logps/rejected": -179.71542358398438, "loss": 0.5464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.47189587354660034, "rewards/margins": 0.8304598927497864, "rewards/rejected": -1.3023556470870972, "step": 70 }, { "epoch": 0.2564102564102564, "grad_norm": 28.918531347661485, "learning_rate": 9.861637804273881e-07, "logits/chosen": -2.578892469406128, "logits/rejected": -2.5758416652679443, "logps/chosen": -162.60537719726562, "logps/rejected": -170.6789093017578, "loss": 0.5553, "rewards/accuracies": 0.6875, "rewards/chosen": -0.45147842168807983, "rewards/margins": 0.6994724273681641, "rewards/rejected": -1.1509509086608887, "step": 75 }, { "epoch": 0.27350427350427353, "grad_norm": 26.98866754941649, "learning_rate": 9.83965146460653e-07, "logits/chosen": -2.54936146736145, "logits/rejected": -2.5406956672668457, "logps/chosen": -168.81484985351562, "logps/rejected": -179.770751953125, "loss": 0.5452, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6899678111076355, "rewards/margins": 0.8549306988716125, "rewards/rejected": -1.544898509979248, "step": 80 }, { "epoch": 0.27350427350427353, "eval_logits/chosen": -2.53336238861084, "eval_logits/rejected": -2.517695665359497, "eval_logps/chosen": -167.28964233398438, "eval_logps/rejected": -177.21824645996094, "eval_loss": 0.5331124663352966, "eval_rewards/accuracies": 0.7134615182876587, "eval_rewards/chosen": -0.8812309503555298, "eval_rewards/margins": 0.9525622725486755, "eval_rewards/rejected": -1.8337931632995605, "eval_runtime": 510.0922, "eval_samples_per_second": 16.299, "eval_steps_per_second": 0.255, "step": 80 }, { "epoch": 0.2905982905982906, "grad_norm": 34.783908892421536, "learning_rate": 9.816072893667758e-07, "logits/chosen": -2.5432825088500977, "logits/rejected": -2.5159504413604736, "logps/chosen": -174.62197875976562, "logps/rejected": -185.89413452148438, "loss": 0.5581, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0434839725494385, "rewards/margins": 1.0283188819885254, "rewards/rejected": -2.0718026161193848, "step": 85 }, { "epoch": 0.3076923076923077, "grad_norm": 26.697686805838906, "learning_rate": 9.790909848738904e-07, "logits/chosen": -2.5102508068084717, "logits/rejected": -2.5222485065460205, "logps/chosen": -175.47544860839844, "logps/rejected": -183.92678833007812, "loss": 0.5208, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9199908971786499, "rewards/margins": 0.8521744608879089, "rewards/rejected": -1.7721655368804932, "step": 90 }, { "epoch": 0.3247863247863248, "grad_norm": 30.125094604814798, "learning_rate": 9.764170608388647e-07, "logits/chosen": -2.514260768890381, "logits/rejected": -2.4829812049865723, "logps/chosen": -167.62655639648438, "logps/rejected": -174.2395477294922, "loss": 0.5242, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6241778135299683, "rewards/margins": 1.0742968320846558, "rewards/rejected": -1.6984745264053345, "step": 95 }, { "epoch": 0.3418803418803419, "grad_norm": 27.550843374580296, "learning_rate": 9.735863969749371e-07, "logits/chosen": -2.4171032905578613, "logits/rejected": -2.381608486175537, "logps/chosen": -177.05935668945312, "logps/rejected": -188.4621124267578, "loss": 0.5002, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7831762433052063, "rewards/margins": 1.0672458410263062, "rewards/rejected": -1.8504221439361572, "step": 100 }, { "epoch": 0.358974358974359, "grad_norm": 30.39392617500016, "learning_rate": 9.705999245622956e-07, "logits/chosen": -2.3619236946105957, "logits/rejected": -2.3391060829162598, "logps/chosen": -170.48300170898438, "logps/rejected": -183.28384399414062, "loss": 0.5026, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8889726400375366, "rewards/margins": 0.9097515940666199, "rewards/rejected": -1.7987244129180908, "step": 105 }, { "epoch": 0.37606837606837606, "grad_norm": 26.741945030347612, "learning_rate": 9.674586261416873e-07, "logits/chosen": -2.2946972846984863, "logits/rejected": -2.2440435886383057, "logps/chosen": -179.06390380859375, "logps/rejected": -188.00010681152344, "loss": 0.5206, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6539386510848999, "rewards/margins": 1.0372655391693115, "rewards/rejected": -1.691204309463501, "step": 110 }, { "epoch": 0.39316239316239315, "grad_norm": 33.116742735027486, "learning_rate": 9.641635351911664e-07, "logits/chosen": -2.218276262283325, "logits/rejected": -2.18500018119812, "logps/chosen": -171.17381286621094, "logps/rejected": -183.25845336914062, "loss": 0.4801, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9279203414916992, "rewards/margins": 1.2727015018463135, "rewards/rejected": -2.200622081756592, "step": 115 }, { "epoch": 0.41025641025641024, "grad_norm": 27.185641229760538, "learning_rate": 9.607157357860821e-07, "logits/chosen": -2.124584436416626, "logits/rejected": -2.0961549282073975, "logps/chosen": -189.48277282714844, "logps/rejected": -203.43951416015625, "loss": 0.5026, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2869656085968018, "rewards/margins": 1.3039339780807495, "rewards/rejected": -2.5908992290496826, "step": 120 }, { "epoch": 0.41025641025641024, "eval_logits/chosen": -2.0268211364746094, "eval_logits/rejected": -1.9764775037765503, "eval_logps/chosen": -172.888671875, "eval_logps/rejected": -185.58355712890625, "eval_loss": 0.49246644973754883, "eval_rewards/accuracies": 0.7442307472229004, "eval_rewards/chosen": -1.441135048866272, "eval_rewards/margins": 1.2291908264160156, "eval_rewards/rejected": -2.670325756072998, "eval_runtime": 510.1247, "eval_samples_per_second": 16.298, "eval_steps_per_second": 0.255, "step": 120 }, { "epoch": 0.42735042735042733, "grad_norm": 31.03461706328688, "learning_rate": 9.571163622424225e-07, "logits/chosen": -1.944964051246643, "logits/rejected": -1.9178746938705444, "logps/chosen": -175.3327178955078, "logps/rejected": -188.2616729736328, "loss": 0.5017, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.579502820968628, "rewards/margins": 1.2485122680664062, "rewards/rejected": -2.828014850616455, "step": 125 }, { "epoch": 0.4444444444444444, "grad_norm": 29.080520770184428, "learning_rate": 9.533665987436261e-07, "logits/chosen": -1.8825464248657227, "logits/rejected": -1.8078832626342773, "logps/chosen": -178.3484649658203, "logps/rejected": -197.55380249023438, "loss": 0.4983, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5868518352508545, "rewards/margins": 1.2471343278884888, "rewards/rejected": -2.8339860439300537, "step": 130 }, { "epoch": 0.46153846153846156, "grad_norm": 28.903021536294002, "learning_rate": 9.494676789509899e-07, "logits/chosen": -1.8585374355316162, "logits/rejected": -1.8128669261932373, "logps/chosen": -178.5911407470703, "logps/rejected": -195.90933227539062, "loss": 0.492, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2965319156646729, "rewards/margins": 1.4173026084899902, "rewards/rejected": -2.713834285736084, "step": 135 }, { "epoch": 0.47863247863247865, "grad_norm": 27.5476391641307, "learning_rate": 9.454208855977985e-07, "logits/chosen": -1.920654296875, "logits/rejected": -1.8412939310073853, "logps/chosen": -179.1053924560547, "logps/rejected": -196.11526489257812, "loss": 0.4753, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5140180587768555, "rewards/margins": 1.5388453006744385, "rewards/rejected": -3.052863121032715, "step": 140 }, { "epoch": 0.49572649572649574, "grad_norm": 30.03317842923354, "learning_rate": 9.41227550067308e-07, "logits/chosen": -1.9514515399932861, "logits/rejected": -1.949883222579956, "logps/chosen": -178.63250732421875, "logps/rejected": -191.42721557617188, "loss": 0.4803, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.513338327407837, "rewards/margins": 1.4887291193008423, "rewards/rejected": -3.0020670890808105, "step": 145 }, { "epoch": 0.5128205128205128, "grad_norm": 30.28469957381902, "learning_rate": 9.36889051954725e-07, "logits/chosen": -2.0093894004821777, "logits/rejected": -1.9657704830169678, "logps/chosen": -180.35043334960938, "logps/rejected": -197.2502899169922, "loss": 0.4895, "rewards/accuracies": 0.78125, "rewards/chosen": -1.602224588394165, "rewards/margins": 1.6883083581924438, "rewards/rejected": -3.2905325889587402, "step": 150 }, { "epoch": 0.5299145299145299, "grad_norm": 28.420242591686232, "learning_rate": 9.324068186133245e-07, "logits/chosen": -1.9976894855499268, "logits/rejected": -1.9886022806167603, "logps/chosen": -171.70602416992188, "logps/rejected": -185.99795532226562, "loss": 0.4608, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.2952425479888916, "rewards/margins": 1.7483227252960205, "rewards/rejected": -3.043565034866333, "step": 155 }, { "epoch": 0.5470085470085471, "grad_norm": 26.601543429998234, "learning_rate": 9.277823246848536e-07, "logits/chosen": -2.056879758834839, "logits/rejected": -1.9998328685760498, "logps/chosen": -186.3706817626953, "logps/rejected": -196.63290405273438, "loss": 0.4511, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2312135696411133, "rewards/margins": 1.352858304977417, "rewards/rejected": -2.5840718746185303, "step": 160 }, { "epoch": 0.5470085470085471, "eval_logits/chosen": -2.070892095565796, "eval_logits/rejected": -2.0279953479766846, "eval_logps/chosen": -171.76034545898438, "eval_logps/rejected": -189.1643829345703, "eval_loss": 0.4683005213737488, "eval_rewards/accuracies": 0.762499988079071, "eval_rewards/chosen": -1.328302264213562, "eval_rewards/margins": 1.70010507106781, "eval_rewards/rejected": -3.028407096862793, "eval_runtime": 509.9565, "eval_samples_per_second": 16.303, "eval_steps_per_second": 0.255, "step": 160 }, { "epoch": 0.5641025641025641, "grad_norm": 41.76296476638838, "learning_rate": 9.230170916143793e-07, "logits/chosen": -2.1190731525421143, "logits/rejected": -2.083359956741333, "logps/chosen": -176.87539672851562, "logps/rejected": -198.44384765625, "loss": 0.4944, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2975060939788818, "rewards/margins": 1.6890850067138672, "rewards/rejected": -2.98659086227417, "step": 165 }, { "epoch": 0.5811965811965812, "grad_norm": 28.83194976337172, "learning_rate": 9.181126871497378e-07, "logits/chosen": -2.175851583480835, "logits/rejected": -2.1391243934631348, "logps/chosen": -178.2881317138672, "logps/rejected": -197.88473510742188, "loss": 0.4813, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2544641494750977, "rewards/margins": 1.7747846841812134, "rewards/rejected": -3.0292489528656006, "step": 170 }, { "epoch": 0.5982905982905983, "grad_norm": 30.93659066586097, "learning_rate": 9.130707248257491e-07, "logits/chosen": -2.313814640045166, "logits/rejected": -2.2677135467529297, "logps/chosen": -170.06781005859375, "logps/rejected": -177.8175811767578, "loss": 0.4863, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0524061918258667, "rewards/margins": 1.3644572496414185, "rewards/rejected": -2.416863441467285, "step": 175 }, { "epoch": 0.6153846153846154, "grad_norm": 25.018999438635433, "learning_rate": 9.078928634333698e-07, "logits/chosen": -2.302171230316162, "logits/rejected": -2.2788572311401367, "logps/chosen": -179.72390747070312, "logps/rejected": -197.12283325195312, "loss": 0.4553, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.6731274724006653, "rewards/margins": 1.6728944778442383, "rewards/rejected": -2.346021890640259, "step": 180 }, { "epoch": 0.6324786324786325, "grad_norm": 28.576400660174777, "learning_rate": 9.025808064739549e-07, "logits/chosen": -2.2794651985168457, "logits/rejected": -2.2391860485076904, "logps/chosen": -175.87045288085938, "logps/rejected": -189.4848175048828, "loss": 0.4854, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8901998400688171, "rewards/margins": 1.4675487279891968, "rewards/rejected": -2.357748508453369, "step": 185 }, { "epoch": 0.6495726495726496, "grad_norm": 25.73471562251865, "learning_rate": 8.971363015988113e-07, "logits/chosen": -2.1966824531555176, "logits/rejected": -2.1603925228118896, "logps/chosen": -172.0600128173828, "logps/rejected": -191.96176147460938, "loss": 0.4681, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9620615243911743, "rewards/margins": 1.4954371452331543, "rewards/rejected": -2.457498550415039, "step": 190 }, { "epoch": 0.6666666666666666, "grad_norm": 34.912982133976655, "learning_rate": 8.91561140034225e-07, "logits/chosen": -2.1389029026031494, "logits/rejected": -2.0825791358947754, "logps/chosen": -174.3153839111328, "logps/rejected": -194.2677459716797, "loss": 0.4935, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4726169109344482, "rewards/margins": 1.4599871635437012, "rewards/rejected": -2.9326040744781494, "step": 195 }, { "epoch": 0.6837606837606838, "grad_norm": 25.756167591259292, "learning_rate": 8.858571559921537e-07, "logits/chosen": -2.135298013687134, "logits/rejected": -2.067862033843994, "logps/chosen": -178.73361206054688, "logps/rejected": -193.21209716796875, "loss": 0.4562, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.452704668045044, "rewards/margins": 1.6391651630401611, "rewards/rejected": -3.091869831085205, "step": 200 }, { "epoch": 0.6837606837606838, "eval_logits/chosen": -2.1462392807006836, "eval_logits/rejected": -2.1028637886047363, "eval_logps/chosen": -173.41998291015625, "eval_logps/rejected": -191.55532836914062, "eval_loss": 0.4528014361858368, "eval_rewards/accuracies": 0.7567307949066162, "eval_rewards/chosen": -1.4942626953125, "eval_rewards/margins": 1.7732419967651367, "eval_rewards/rejected": -3.2675046920776367, "eval_runtime": 510.9487, "eval_samples_per_second": 16.272, "eval_steps_per_second": 0.254, "step": 200 } ], "logging_steps": 5, "max_steps": 876, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2358113407598592.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }