diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3003 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.994871794871795, + "eval_steps": 40, + "global_step": 876, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.017094017094017096, + "grad_norm": 35.038580788061665, + "learning_rate": 5e-07, + "logits/chosen": -2.7457876205444336, + "logits/rejected": -2.7444841861724854, + "logps/chosen": -164.26461791992188, + "logps/rejected": -170.55870056152344, + "loss": 0.6935, + "rewards/accuracies": 0.26875001192092896, + "rewards/chosen": 0.003455913159996271, + "rewards/margins": -0.0019886991940438747, + "rewards/rejected": 0.0054446132853627205, + "step": 5 + }, + { + "epoch": 0.03418803418803419, + "grad_norm": 36.203903910498276, + "learning_rate": 1e-06, + "logits/chosen": -2.7106502056121826, + "logits/rejected": -2.716397523880005, + "logps/chosen": -171.80043029785156, + "logps/rejected": -165.20602416992188, + "loss": 0.6875, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 0.012000308372080326, + "rewards/margins": 0.0025437879376113415, + "rewards/rejected": 0.009456520900130272, + "step": 10 + }, + { + "epoch": 0.05128205128205128, + "grad_norm": 33.9576577784673, + "learning_rate": 9.999177507263144e-07, + "logits/chosen": -2.651571750640869, + "logits/rejected": -2.629457473754883, + "logps/chosen": -174.04080200195312, + "logps/rejected": -174.0542755126953, + "loss": 0.6698, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.23909731209278107, + "rewards/margins": 0.10868903249502182, + "rewards/rejected": 0.13040827214717865, + "step": 15 + }, + { + "epoch": 0.06837606837606838, + "grad_norm": 34.33646066636181, + "learning_rate": 9.996710299650301e-07, + "logits/chosen": -2.476440668106079, + "logits/rejected": -2.450225353240967, + "logps/chosen": -158.1311798095703, + "logps/rejected": -158.0066680908203, + "loss": 0.6613, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.4318675100803375, + "rewards/margins": 0.14549395442008972, + "rewards/rejected": 0.2863735556602478, + "step": 20 + }, + { + "epoch": 0.08547008547008547, + "grad_norm": 33.16430522723429, + "learning_rate": 9.992599188865604e-07, + "logits/chosen": -2.3086318969726562, + "logits/rejected": -2.3104796409606934, + "logps/chosen": -150.59771728515625, + "logps/rejected": -156.85037231445312, + "loss": 0.6494, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.5047669410705566, + "rewards/margins": 0.16554531455039978, + "rewards/rejected": 0.33922165632247925, + "step": 25 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 34.52861424862365, + "learning_rate": 9.98684552745256e-07, + "logits/chosen": -2.217874050140381, + "logits/rejected": -2.2254481315612793, + "logps/chosen": -161.29412841796875, + "logps/rejected": -161.40841674804688, + "loss": 0.6295, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4176379144191742, + "rewards/margins": 0.26531916856765747, + "rewards/rejected": 0.15231874585151672, + "step": 30 + }, + { + "epoch": 0.11965811965811966, + "grad_norm": 31.455117829218544, + "learning_rate": 9.979451208349055e-07, + "logits/chosen": -2.2608728408813477, + "logits/rejected": -2.246007204055786, + "logps/chosen": -171.71456909179688, + "logps/rejected": -174.46578979492188, + "loss": 0.6305, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01912705972790718, + "rewards/margins": 0.31441593170166016, + "rewards/rejected": -0.33354294300079346, + "step": 35 + }, + { + "epoch": 0.13675213675213677, + "grad_norm": 31.67318837058587, + "learning_rate": 9.970418664264595e-07, + "logits/chosen": -2.345672130584717, + "logits/rejected": -2.331491470336914, + "logps/chosen": -171.24766540527344, + "logps/rejected": -176.8189697265625, + "loss": 0.5989, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.27867692708969116, + "rewards/margins": 0.5290472507476807, + "rewards/rejected": -0.8077241778373718, + "step": 40 + }, + { + "epoch": 0.13675213675213677, + "eval_logits/chosen": -2.4102065563201904, + "eval_logits/rejected": -2.401230573654175, + "eval_logps/chosen": -162.36439514160156, + "eval_logps/rejected": -167.4954071044922, + "eval_loss": 0.6069236993789673, + "eval_rewards/accuracies": 0.6365384459495544, + "eval_rewards/chosen": -0.388705849647522, + "eval_rewards/margins": 0.47280558943748474, + "eval_rewards/rejected": -0.8615114688873291, + "eval_runtime": 509.918, + "eval_samples_per_second": 16.305, + "eval_steps_per_second": 0.255, + "step": 40 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 36.18313806223269, + "learning_rate": 9.95975086687994e-07, + "logits/chosen": -2.44050669670105, + "logits/rejected": -2.4460220336914062, + "logps/chosen": -163.82875061035156, + "logps/rejected": -167.35989379882812, + "loss": 0.6146, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.31098368763923645, + "rewards/margins": 0.46269193291664124, + "rewards/rejected": -0.7736755609512329, + "step": 45 + }, + { + "epoch": 0.17094017094017094, + "grad_norm": 31.13412274683678, + "learning_rate": 9.947451325869439e-07, + "logits/chosen": -2.501091718673706, + "logits/rejected": -2.4991250038146973, + "logps/chosen": -172.09686279296875, + "logps/rejected": -177.7747802734375, + "loss": 0.577, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.212348073720932, + "rewards/margins": 0.6062799692153931, + "rewards/rejected": -0.8186280131340027, + "step": 50 + }, + { + "epoch": 0.18803418803418803, + "grad_norm": 31.508672436862835, + "learning_rate": 9.933524087746347e-07, + "logits/chosen": -2.437525510787964, + "logits/rejected": -2.4285693168640137, + "logps/chosen": -168.1316375732422, + "logps/rejected": -175.23193359375, + "loss": 0.571, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.513076901435852, + "rewards/margins": 0.7702310681343079, + "rewards/rejected": -1.2833080291748047, + "step": 55 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 30.148068867306787, + "learning_rate": 9.917973734531549e-07, + "logits/chosen": -2.431530475616455, + "logits/rejected": -2.431729793548584, + "logps/chosen": -159.38168334960938, + "logps/rejected": -170.52500915527344, + "loss": 0.5762, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.34855490922927856, + "rewards/margins": 0.5969334244728088, + "rewards/rejected": -0.9454883337020874, + "step": 60 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 32.03814968183332, + "learning_rate": 9.90080538224607e-07, + "logits/chosen": -2.533193588256836, + "logits/rejected": -2.5252978801727295, + "logps/chosen": -157.30966186523438, + "logps/rejected": -166.26011657714844, + "loss": 0.5643, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.007600936107337475, + "rewards/margins": 0.5010749697685242, + "rewards/rejected": -0.5086758732795715, + "step": 65 + }, + { + "epoch": 0.23931623931623933, + "grad_norm": 29.16308768569833, + "learning_rate": 9.882024679227938e-07, + "logits/chosen": -2.5899624824523926, + "logits/rejected": -2.5779967308044434, + "logps/chosen": -178.4553985595703, + "logps/rejected": -179.71542358398438, + "loss": 0.5464, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.47189587354660034, + "rewards/margins": 0.8304598927497864, + "rewards/rejected": -1.3023556470870972, + "step": 70 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 28.918531347661485, + "learning_rate": 9.861637804273881e-07, + "logits/chosen": -2.578892469406128, + "logits/rejected": -2.5758416652679443, + "logps/chosen": -162.60537719726562, + "logps/rejected": -170.6789093017578, + "loss": 0.5553, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.45147842168807983, + "rewards/margins": 0.6994724273681641, + "rewards/rejected": -1.1509509086608887, + "step": 75 + }, + { + "epoch": 0.27350427350427353, + "grad_norm": 26.98866754941649, + "learning_rate": 9.83965146460653e-07, + "logits/chosen": -2.54936146736145, + "logits/rejected": -2.5406956672668457, + "logps/chosen": -168.81484985351562, + "logps/rejected": -179.770751953125, + "loss": 0.5452, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6899678111076355, + "rewards/margins": 0.8549306988716125, + "rewards/rejected": -1.544898509979248, + "step": 80 + }, + { + "epoch": 0.27350427350427353, + "eval_logits/chosen": -2.53336238861084, + "eval_logits/rejected": -2.517695665359497, + "eval_logps/chosen": -167.28964233398438, + "eval_logps/rejected": -177.21824645996094, + "eval_loss": 0.5331124663352966, + "eval_rewards/accuracies": 0.7134615182876587, + "eval_rewards/chosen": -0.8812309503555298, + "eval_rewards/margins": 0.9525622725486755, + "eval_rewards/rejected": -1.8337931632995605, + "eval_runtime": 510.0922, + "eval_samples_per_second": 16.299, + "eval_steps_per_second": 0.255, + "step": 80 + }, + { + "epoch": 0.2905982905982906, + "grad_norm": 34.783908892421536, + "learning_rate": 9.816072893667758e-07, + "logits/chosen": -2.5432825088500977, + "logits/rejected": -2.5159504413604736, + "logps/chosen": -174.62197875976562, + "logps/rejected": -185.89413452148438, + "loss": 0.5581, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0434839725494385, + "rewards/margins": 1.0283188819885254, + "rewards/rejected": -2.0718026161193848, + "step": 85 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 26.697686805838906, + "learning_rate": 9.790909848738904e-07, + "logits/chosen": -2.5102508068084717, + "logits/rejected": -2.5222485065460205, + "logps/chosen": -175.47544860839844, + "logps/rejected": -183.92678833007812, + "loss": 0.5208, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9199908971786499, + "rewards/margins": 0.8521744608879089, + "rewards/rejected": -1.7721655368804932, + "step": 90 + }, + { + "epoch": 0.3247863247863248, + "grad_norm": 30.125094604814798, + "learning_rate": 9.764170608388647e-07, + "logits/chosen": -2.514260768890381, + "logits/rejected": -2.4829812049865723, + "logps/chosen": -167.62655639648438, + "logps/rejected": -174.2395477294922, + "loss": 0.5242, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6241778135299683, + "rewards/margins": 1.0742968320846558, + "rewards/rejected": -1.6984745264053345, + "step": 95 + }, + { + "epoch": 0.3418803418803419, + "grad_norm": 27.550843374580296, + "learning_rate": 9.735863969749371e-07, + "logits/chosen": -2.4171032905578613, + "logits/rejected": -2.381608486175537, + "logps/chosen": -177.05935668945312, + "logps/rejected": -188.4621124267578, + "loss": 0.5002, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7831762433052063, + "rewards/margins": 1.0672458410263062, + "rewards/rejected": -1.8504221439361572, + "step": 100 + }, + { + "epoch": 0.358974358974359, + "grad_norm": 30.39392617500016, + "learning_rate": 9.705999245622956e-07, + "logits/chosen": -2.3619236946105957, + "logits/rejected": -2.3391060829162598, + "logps/chosen": -170.48300170898438, + "logps/rejected": -183.28384399414062, + "loss": 0.5026, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8889726400375366, + "rewards/margins": 0.9097515940666199, + "rewards/rejected": -1.7987244129180908, + "step": 105 + }, + { + "epoch": 0.37606837606837606, + "grad_norm": 26.741945030347612, + "learning_rate": 9.674586261416873e-07, + "logits/chosen": -2.2946972846984863, + "logits/rejected": -2.2440435886383057, + "logps/chosen": -179.06390380859375, + "logps/rejected": -188.00010681152344, + "loss": 0.5206, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6539386510848999, + "rewards/margins": 1.0372655391693115, + "rewards/rejected": -1.691204309463501, + "step": 110 + }, + { + "epoch": 0.39316239316239315, + "grad_norm": 33.116742735027486, + "learning_rate": 9.641635351911664e-07, + "logits/chosen": -2.218276262283325, + "logits/rejected": -2.18500018119812, + "logps/chosen": -171.17381286621094, + "logps/rejected": -183.25845336914062, + "loss": 0.4801, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.9279203414916992, + "rewards/margins": 1.2727015018463135, + "rewards/rejected": -2.200622081756592, + "step": 115 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 27.185641229760538, + "learning_rate": 9.607157357860821e-07, + "logits/chosen": -2.124584436416626, + "logits/rejected": -2.0961549282073975, + "logps/chosen": -189.48277282714844, + "logps/rejected": -203.43951416015625, + "loss": 0.5026, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2869656085968018, + "rewards/margins": 1.3039339780807495, + "rewards/rejected": -2.5908992290496826, + "step": 120 + }, + { + "epoch": 0.41025641025641024, + "eval_logits/chosen": -2.0268211364746094, + "eval_logits/rejected": -1.9764775037765503, + "eval_logps/chosen": -172.888671875, + "eval_logps/rejected": -185.58355712890625, + "eval_loss": 0.49246644973754883, + "eval_rewards/accuracies": 0.7442307472229004, + "eval_rewards/chosen": -1.441135048866272, + "eval_rewards/margins": 1.2291908264160156, + "eval_rewards/rejected": -2.670325756072998, + "eval_runtime": 510.1247, + "eval_samples_per_second": 16.298, + "eval_steps_per_second": 0.255, + "step": 120 + }, + { + "epoch": 0.42735042735042733, + "grad_norm": 31.03461706328688, + "learning_rate": 9.571163622424225e-07, + "logits/chosen": -1.944964051246643, + "logits/rejected": -1.9178746938705444, + "logps/chosen": -175.3327178955078, + "logps/rejected": -188.2616729736328, + "loss": 0.5017, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.579502820968628, + "rewards/margins": 1.2485122680664062, + "rewards/rejected": -2.828014850616455, + "step": 125 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 29.080520770184428, + "learning_rate": 9.533665987436261e-07, + "logits/chosen": -1.8825464248657227, + "logits/rejected": -1.8078832626342773, + "logps/chosen": -178.3484649658203, + "logps/rejected": -197.55380249023438, + "loss": 0.4983, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5868518352508545, + "rewards/margins": 1.2471343278884888, + "rewards/rejected": -2.8339860439300537, + "step": 130 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 28.903021536294002, + "learning_rate": 9.494676789509899e-07, + "logits/chosen": -1.8585374355316162, + "logits/rejected": -1.8128669261932373, + "logps/chosen": -178.5911407470703, + "logps/rejected": -195.90933227539062, + "loss": 0.492, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2965319156646729, + "rewards/margins": 1.4173026084899902, + "rewards/rejected": -2.713834285736084, + "step": 135 + }, + { + "epoch": 0.47863247863247865, + "grad_norm": 27.5476391641307, + "learning_rate": 9.454208855977985e-07, + "logits/chosen": -1.920654296875, + "logits/rejected": -1.8412939310073853, + "logps/chosen": -179.1053924560547, + "logps/rejected": -196.11526489257812, + "loss": 0.4753, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5140180587768555, + "rewards/margins": 1.5388453006744385, + "rewards/rejected": -3.052863121032715, + "step": 140 + }, + { + "epoch": 0.49572649572649574, + "grad_norm": 30.03317842923354, + "learning_rate": 9.41227550067308e-07, + "logits/chosen": -1.9514515399932861, + "logits/rejected": -1.949883222579956, + "logps/chosen": -178.63250732421875, + "logps/rejected": -191.42721557617188, + "loss": 0.4803, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.513338327407837, + "rewards/margins": 1.4887291193008423, + "rewards/rejected": -3.0020670890808105, + "step": 145 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 30.28469957381902, + "learning_rate": 9.36889051954725e-07, + "logits/chosen": -2.0093894004821777, + "logits/rejected": -1.9657704830169678, + "logps/chosen": -180.35043334960938, + "logps/rejected": -197.2502899169922, + "loss": 0.4895, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.602224588394165, + "rewards/margins": 1.6883083581924438, + "rewards/rejected": -3.2905325889587402, + "step": 150 + }, + { + "epoch": 0.5299145299145299, + "grad_norm": 28.420242591686232, + "learning_rate": 9.324068186133245e-07, + "logits/chosen": -1.9976894855499268, + "logits/rejected": -1.9886022806167603, + "logps/chosen": -171.70602416992188, + "logps/rejected": -185.99795532226562, + "loss": 0.4608, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.2952425479888916, + "rewards/margins": 1.7483227252960205, + "rewards/rejected": -3.043565034866333, + "step": 155 + }, + { + "epoch": 0.5470085470085471, + "grad_norm": 26.601543429998234, + "learning_rate": 9.277823246848536e-07, + "logits/chosen": -2.056879758834839, + "logits/rejected": -1.9998328685760498, + "logps/chosen": -186.3706817626953, + "logps/rejected": -196.63290405273438, + "loss": 0.4511, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2312135696411133, + "rewards/margins": 1.352858304977417, + "rewards/rejected": -2.5840718746185303, + "step": 160 + }, + { + "epoch": 0.5470085470085471, + "eval_logits/chosen": -2.070892095565796, + "eval_logits/rejected": -2.0279953479766846, + "eval_logps/chosen": -171.76034545898438, + "eval_logps/rejected": -189.1643829345703, + "eval_loss": 0.4683005213737488, + "eval_rewards/accuracies": 0.762499988079071, + "eval_rewards/chosen": -1.328302264213562, + "eval_rewards/margins": 1.70010507106781, + "eval_rewards/rejected": -3.028407096862793, + "eval_runtime": 509.9565, + "eval_samples_per_second": 16.303, + "eval_steps_per_second": 0.255, + "step": 160 + }, + { + "epoch": 0.5641025641025641, + "grad_norm": 41.76296476638838, + "learning_rate": 9.230170916143793e-07, + "logits/chosen": -2.1190731525421143, + "logits/rejected": -2.083359956741333, + "logps/chosen": -176.87539672851562, + "logps/rejected": -198.44384765625, + "loss": 0.4944, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2975060939788818, + "rewards/margins": 1.6890850067138672, + "rewards/rejected": -2.98659086227417, + "step": 165 + }, + { + "epoch": 0.5811965811965812, + "grad_norm": 28.83194976337172, + "learning_rate": 9.181126871497378e-07, + "logits/chosen": -2.175851583480835, + "logits/rejected": -2.1391243934631348, + "logps/chosen": -178.2881317138672, + "logps/rejected": -197.88473510742188, + "loss": 0.4813, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2544641494750977, + "rewards/margins": 1.7747846841812134, + "rewards/rejected": -3.0292489528656006, + "step": 170 + }, + { + "epoch": 0.5982905982905983, + "grad_norm": 30.93659066586097, + "learning_rate": 9.130707248257491e-07, + "logits/chosen": -2.313814640045166, + "logits/rejected": -2.2677135467529297, + "logps/chosen": -170.06781005859375, + "logps/rejected": -177.8175811767578, + "loss": 0.4863, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.0524061918258667, + "rewards/margins": 1.3644572496414185, + "rewards/rejected": -2.416863441467285, + "step": 175 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 25.018999438635433, + "learning_rate": 9.078928634333698e-07, + "logits/chosen": -2.302171230316162, + "logits/rejected": -2.2788572311401367, + "logps/chosen": -179.72390747070312, + "logps/rejected": -197.12283325195312, + "loss": 0.4553, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.6731274724006653, + "rewards/margins": 1.6728944778442383, + "rewards/rejected": -2.346021890640259, + "step": 180 + }, + { + "epoch": 0.6324786324786325, + "grad_norm": 28.576400660174777, + "learning_rate": 9.025808064739549e-07, + "logits/chosen": -2.2794651985168457, + "logits/rejected": -2.2391860485076904, + "logps/chosen": -175.87045288085938, + "logps/rejected": -189.4848175048828, + "loss": 0.4854, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8901998400688171, + "rewards/margins": 1.4675487279891968, + "rewards/rejected": -2.357748508453369, + "step": 185 + }, + { + "epoch": 0.6495726495726496, + "grad_norm": 25.73471562251865, + "learning_rate": 8.971363015988113e-07, + "logits/chosen": -2.1966824531555176, + "logits/rejected": -2.1603925228118896, + "logps/chosen": -172.0600128173828, + "logps/rejected": -191.96176147460938, + "loss": 0.4681, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.9620615243911743, + "rewards/margins": 1.4954371452331543, + "rewards/rejected": -2.457498550415039, + "step": 190 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 34.912982133976655, + "learning_rate": 8.91561140034225e-07, + "logits/chosen": -2.1389029026031494, + "logits/rejected": -2.0825791358947754, + "logps/chosen": -174.3153839111328, + "logps/rejected": -194.2677459716797, + "loss": 0.4935, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4726169109344482, + "rewards/margins": 1.4599871635437012, + "rewards/rejected": -2.9326040744781494, + "step": 195 + }, + { + "epoch": 0.6837606837606838, + "grad_norm": 25.756167591259292, + "learning_rate": 8.858571559921537e-07, + "logits/chosen": -2.135298013687134, + "logits/rejected": -2.067862033843994, + "logps/chosen": -178.73361206054688, + "logps/rejected": -193.21209716796875, + "loss": 0.4562, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.452704668045044, + "rewards/margins": 1.6391651630401611, + "rewards/rejected": -3.091869831085205, + "step": 200 + }, + { + "epoch": 0.6837606837606838, + "eval_logits/chosen": -2.1462392807006836, + "eval_logits/rejected": -2.1028637886047363, + "eval_logps/chosen": -173.41998291015625, + "eval_logps/rejected": -191.55532836914062, + "eval_loss": 0.4528014361858368, + "eval_rewards/accuracies": 0.7567307949066162, + "eval_rewards/chosen": -1.4942626953125, + "eval_rewards/margins": 1.7732419967651367, + "eval_rewards/rejected": -3.2675046920776367, + "eval_runtime": 510.9487, + "eval_samples_per_second": 16.272, + "eval_steps_per_second": 0.254, + "step": 200 + }, + { + "epoch": 0.7008547008547008, + "grad_norm": 26.77931167801656, + "learning_rate": 8.800262260667754e-07, + "logits/chosen": -2.1584880352020264, + "logits/rejected": -2.100416660308838, + "logps/chosen": -165.63743591308594, + "logps/rejected": -183.36476135253906, + "loss": 0.4653, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.409201741218567, + "rewards/margins": 1.6899499893188477, + "rewards/rejected": -3.099151611328125, + "step": 205 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 25.986078947597964, + "learning_rate": 8.740702686170954e-07, + "logits/chosen": -2.2075798511505127, + "logits/rejected": -2.151484727859497, + "logps/chosen": -179.00509643554688, + "logps/rejected": -194.68353271484375, + "loss": 0.4426, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2429417371749878, + "rewards/margins": 1.6721550226211548, + "rewards/rejected": -2.9150967597961426, + "step": 210 + }, + { + "epoch": 0.7350427350427351, + "grad_norm": 24.89101303634129, + "learning_rate": 8.679912431358109e-07, + "logits/chosen": -2.1802072525024414, + "logits/rejected": -2.1238255500793457, + "logps/chosen": -172.57705688476562, + "logps/rejected": -189.31666564941406, + "loss": 0.4521, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4200295209884644, + "rewards/margins": 1.9437878131866455, + "rewards/rejected": -3.3638176918029785, + "step": 215 + }, + { + "epoch": 0.7521367521367521, + "grad_norm": 27.617679879143566, + "learning_rate": 8.617911496046445e-07, + "logits/chosen": -2.174743413925171, + "logits/rejected": -2.1131985187530518, + "logps/chosen": -171.0723876953125, + "logps/rejected": -189.23275756835938, + "loss": 0.4655, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5752933025360107, + "rewards/margins": 1.7622945308685303, + "rewards/rejected": -3.337587833404541, + "step": 220 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 25.912722321128637, + "learning_rate": 8.554720278363547e-07, + "logits/chosen": -2.206986427307129, + "logits/rejected": -2.1668283939361572, + "logps/chosen": -175.4432830810547, + "logps/rejected": -193.02845764160156, + "loss": 0.4693, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6182912588119507, + "rewards/margins": 1.7047646045684814, + "rewards/rejected": -3.3230559825897217, + "step": 225 + }, + { + "epoch": 0.7863247863247863, + "grad_norm": 27.20181876713083, + "learning_rate": 8.490359568036445e-07, + "logits/chosen": -2.3055601119995117, + "logits/rejected": -2.2838051319122314, + "logps/chosen": -183.21449279785156, + "logps/rejected": -205.37521362304688, + "loss": 0.4524, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.517019271850586, + "rewards/margins": 1.586157202720642, + "rewards/rejected": -3.1031765937805176, + "step": 230 + }, + { + "epoch": 0.8034188034188035, + "grad_norm": 73.7275222246594, + "learning_rate": 8.424850539551856e-07, + "logits/chosen": -2.367276668548584, + "logits/rejected": -2.349586009979248, + "logps/chosen": -174.82656860351562, + "logps/rejected": -191.1867218017578, + "loss": 0.4536, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4777748584747314, + "rewards/margins": 1.7120048999786377, + "rewards/rejected": -3.1897799968719482, + "step": 235 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 23.009281700277114, + "learning_rate": 8.358214745189829e-07, + "logits/chosen": -2.4104866981506348, + "logits/rejected": -2.3766913414001465, + "logps/chosen": -184.68222045898438, + "logps/rejected": -205.4649200439453, + "loss": 0.4189, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6187865734100342, + "rewards/margins": 2.1064658164978027, + "rewards/rejected": -3.725252151489258, + "step": 240 + }, + { + "epoch": 0.8205128205128205, + "eval_logits/chosen": -2.447181224822998, + "eval_logits/rejected": -2.4165050983428955, + "eval_logps/chosen": -177.78672790527344, + "eval_logps/rejected": -197.77915954589844, + "eval_loss": 0.44941428303718567, + "eval_rewards/accuracies": 0.7663461565971375, + "eval_rewards/chosen": -1.9309390783309937, + "eval_rewards/margins": 1.958947777748108, + "eval_rewards/rejected": -3.8898868560791016, + "eval_runtime": 510.6066, + "eval_samples_per_second": 16.283, + "eval_steps_per_second": 0.255, + "step": 240 + }, + { + "epoch": 0.8376068376068376, + "grad_norm": 25.877339189588067, + "learning_rate": 8.290474107933114e-07, + "logits/chosen": -2.450867176055908, + "logits/rejected": -2.427006483078003, + "logps/chosen": -186.76683044433594, + "logps/rejected": -206.23147583007812, + "loss": 0.4441, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.9867289066314697, + "rewards/margins": 2.0301365852355957, + "rewards/rejected": -4.0168657302856445, + "step": 245 + }, + { + "epoch": 0.8547008547008547, + "grad_norm": 29.453953450785107, + "learning_rate": 8.221650914254565e-07, + "logits/chosen": -2.464049816131592, + "logits/rejected": -2.4303317070007324, + "logps/chosen": -184.5537872314453, + "logps/rejected": -196.9418487548828, + "loss": 0.4919, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.074887752532959, + "rewards/margins": 1.6710395812988281, + "rewards/rejected": -3.745927333831787, + "step": 250 + }, + { + "epoch": 0.8717948717948718, + "grad_norm": 26.569329016808155, + "learning_rate": 8.151767806784953e-07, + "logits/chosen": -2.4366822242736816, + "logits/rejected": -2.4094901084899902, + "logps/chosen": -188.01376342773438, + "logps/rejected": -199.4722442626953, + "loss": 0.4651, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7826770544052124, + "rewards/margins": 1.4153121709823608, + "rewards/rejected": -3.1979892253875732, + "step": 255 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 26.68703703609934, + "learning_rate": 8.080847776863608e-07, + "logits/chosen": -2.4146647453308105, + "logits/rejected": -2.386958360671997, + "logps/chosen": -186.34954833984375, + "logps/rejected": -200.6859588623047, + "loss": 0.4474, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.0444929599761963, + "rewards/margins": 1.9850835800170898, + "rewards/rejected": -3.029576539993286, + "step": 260 + }, + { + "epoch": 0.905982905982906, + "grad_norm": 27.397197833953705, + "learning_rate": 8.008914156974333e-07, + "logits/chosen": -2.3684728145599365, + "logits/rejected": -2.3436620235443115, + "logps/chosen": -172.65553283691406, + "logps/rejected": -194.290283203125, + "loss": 0.4427, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.7620879411697388, + "rewards/margins": 1.6837621927261353, + "rewards/rejected": -2.445849895477295, + "step": 265 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 25.92494835175286, + "learning_rate": 7.935990613069086e-07, + "logits/chosen": -2.312016010284424, + "logits/rejected": -2.2800450325012207, + "logps/chosen": -171.95416259765625, + "logps/rejected": -193.59378051757812, + "loss": 0.445, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8496273756027222, + "rewards/margins": 2.038412094116211, + "rewards/rejected": -2.8880395889282227, + "step": 270 + }, + { + "epoch": 0.9401709401709402, + "grad_norm": 24.604360112973207, + "learning_rate": 7.862101136781946e-07, + "logits/chosen": -2.2761037349700928, + "logits/rejected": -2.241076707839966, + "logps/chosen": -169.81842041015625, + "logps/rejected": -192.06903076171875, + "loss": 0.4173, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2587201595306396, + "rewards/margins": 1.723170280456543, + "rewards/rejected": -2.9818906784057617, + "step": 275 + }, + { + "epoch": 0.9572649572649573, + "grad_norm": 33.24758765533294, + "learning_rate": 7.78727003753595e-07, + "logits/chosen": -2.2211129665374756, + "logits/rejected": -2.1957130432128906, + "logps/chosen": -172.84083557128906, + "logps/rejected": -195.34005737304688, + "loss": 0.4484, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.557621717453003, + "rewards/margins": 2.1072278022766113, + "rewards/rejected": -3.6648497581481934, + "step": 280 + }, + { + "epoch": 0.9572649572649573, + "eval_logits/chosen": -2.199989080429077, + "eval_logits/rejected": -2.158634662628174, + "eval_logps/chosen": -175.8746337890625, + "eval_logps/rejected": -197.1187286376953, + "eval_loss": 0.4431803524494171, + "eval_rewards/accuracies": 0.7634615302085876, + "eval_rewards/chosen": -1.7397303581237793, + "eval_rewards/margins": 2.084113836288452, + "eval_rewards/rejected": -3.8238441944122314, + "eval_runtime": 510.3208, + "eval_samples_per_second": 16.292, + "eval_steps_per_second": 0.255, + "step": 280 + }, + { + "epoch": 0.9743589743589743, + "grad_norm": 23.855033657318305, + "learning_rate": 7.711521934545342e-07, + "logits/chosen": -2.1965622901916504, + "logits/rejected": -2.1558597087860107, + "logps/chosen": -185.0030059814453, + "logps/rejected": -205.0108642578125, + "loss": 0.4233, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4067909717559814, + "rewards/margins": 2.130743980407715, + "rewards/rejected": -3.537534713745117, + "step": 285 + }, + { + "epoch": 0.9914529914529915, + "grad_norm": 27.781853522970472, + "learning_rate": 7.63488174871594e-07, + "logits/chosen": -2.209836483001709, + "logits/rejected": -2.1382126808166504, + "logps/chosen": -181.1676788330078, + "logps/rejected": -201.53524780273438, + "loss": 0.4064, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.1297556161880493, + "rewards/margins": 2.355384349822998, + "rewards/rejected": -3.485139846801758, + "step": 290 + }, + { + "epoch": 1.0085470085470085, + "grad_norm": 16.38566211549952, + "learning_rate": 7.557374694446221e-07, + "logits/chosen": -2.191758632659912, + "logits/rejected": -2.182082176208496, + "logps/chosen": -169.5143585205078, + "logps/rejected": -191.7115936279297, + "loss": 0.3182, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.8321866989135742, + "rewards/margins": 2.2344491481781006, + "rewards/rejected": -3.066636085510254, + "step": 295 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 16.985648781909422, + "learning_rate": 7.479026271331863e-07, + "logits/chosen": -2.267702579498291, + "logits/rejected": -2.205897092819214, + "logps/chosen": -169.5579833984375, + "logps/rejected": -197.75802612304688, + "loss": 0.2168, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.39328667521476746, + "rewards/margins": 2.9768226146698, + "rewards/rejected": -3.3701090812683105, + "step": 300 + }, + { + "epoch": 1.0427350427350428, + "grad_norm": 18.84075344730222, + "learning_rate": 7.399862255776448e-07, + "logits/chosen": -2.3038105964660645, + "logits/rejected": -2.2806408405303955, + "logps/chosen": -164.28530883789062, + "logps/rejected": -197.4765625, + "loss": 0.2127, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.0233527421951294, + "rewards/margins": 3.003648042678833, + "rewards/rejected": -4.02700138092041, + "step": 305 + }, + { + "epoch": 1.0598290598290598, + "grad_norm": 22.421420935891007, + "learning_rate": 7.319908692511102e-07, + "logits/chosen": -2.4081215858459473, + "logits/rejected": -2.3740234375, + "logps/chosen": -171.662109375, + "logps/rejected": -209.674560546875, + "loss": 0.2371, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.9765853881835938, + "rewards/margins": 3.642939805984497, + "rewards/rejected": -4.619524955749512, + "step": 310 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 19.013199137435198, + "learning_rate": 7.239191886025853e-07, + "logits/chosen": -2.438504695892334, + "logits/rejected": -2.4153828620910645, + "logps/chosen": -175.62979125976562, + "logps/rejected": -207.75692749023438, + "loss": 0.2077, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.7931637763977051, + "rewards/margins": 3.6793007850646973, + "rewards/rejected": -4.472464561462402, + "step": 315 + }, + { + "epoch": 1.0940170940170941, + "grad_norm": 18.91200622344116, + "learning_rate": 7.15773839191553e-07, + "logits/chosen": -2.44122314453125, + "logits/rejected": -2.4007716178894043, + "logps/chosen": -164.8516387939453, + "logps/rejected": -196.8404998779297, + "loss": 0.222, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5334895253181458, + "rewards/margins": 3.032334804534912, + "rewards/rejected": -3.565824508666992, + "step": 320 + }, + { + "epoch": 1.0940170940170941, + "eval_logits/chosen": -2.439704418182373, + "eval_logits/rejected": -2.4060051441192627, + "eval_logps/chosen": -170.68392944335938, + "eval_logps/rejected": -188.57797241210938, + "eval_loss": 0.45035940408706665, + "eval_rewards/accuracies": 0.7759615182876587, + "eval_rewards/chosen": -1.2206590175628662, + "eval_rewards/margins": 1.7491083145141602, + "eval_rewards/rejected": -2.9697670936584473, + "eval_runtime": 509.779, + "eval_samples_per_second": 16.309, + "eval_steps_per_second": 0.255, + "step": 320 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 22.512058111846653, + "learning_rate": 7.075575008143054e-07, + "logits/chosen": -2.4442429542541504, + "logits/rejected": -2.418893337249756, + "logps/chosen": -172.38357543945312, + "logps/rejected": -206.02169799804688, + "loss": 0.2187, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.565311074256897, + "rewards/margins": 3.2330455780029297, + "rewards/rejected": -3.798356533050537, + "step": 325 + }, + { + "epoch": 1.1282051282051282, + "grad_norm": 20.168560881998502, + "learning_rate": 6.99272876622298e-07, + "logits/chosen": -2.451326847076416, + "logits/rejected": -2.412057399749756, + "logps/chosen": -176.92236328125, + "logps/rejected": -212.13027954101562, + "loss": 0.2223, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.7077795267105103, + "rewards/margins": 3.4706287384033203, + "rewards/rejected": -4.178408622741699, + "step": 330 + }, + { + "epoch": 1.1452991452991452, + "grad_norm": 22.192981205024807, + "learning_rate": 6.909226922328211e-07, + "logits/chosen": -2.409423351287842, + "logits/rejected": -2.3877720832824707, + "logps/chosen": -178.28134155273438, + "logps/rejected": -211.2721405029297, + "loss": 0.2107, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3355214595794678, + "rewards/margins": 3.4370384216308594, + "rewards/rejected": -4.77255916595459, + "step": 335 + }, + { + "epoch": 1.1623931623931625, + "grad_norm": 21.419705510014087, + "learning_rate": 6.82509694832279e-07, + "logits/chosen": -2.4000024795532227, + "logits/rejected": -2.370281219482422, + "logps/chosen": -177.16690063476562, + "logps/rejected": -216.00479125976562, + "loss": 0.2184, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.2021948099136353, + "rewards/margins": 3.6778666973114014, + "rewards/rejected": -4.880061149597168, + "step": 340 + }, + { + "epoch": 1.1794871794871795, + "grad_norm": 22.354670781786442, + "learning_rate": 6.740366522723752e-07, + "logits/chosen": -2.403994083404541, + "logits/rejected": -2.376459836959839, + "logps/chosen": -182.30508422851562, + "logps/rejected": -216.92324829101562, + "loss": 0.1978, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.2613385915756226, + "rewards/margins": 3.6934847831726074, + "rewards/rejected": -4.9548234939575195, + "step": 345 + }, + { + "epoch": 1.1965811965811965, + "grad_norm": 23.407629177854666, + "learning_rate": 6.655063521594949e-07, + "logits/chosen": -2.379652738571167, + "logits/rejected": -2.324298143386841, + "logps/chosen": -173.8491668701172, + "logps/rejected": -214.69384765625, + "loss": 0.2039, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4865599870681763, + "rewards/margins": 3.8272087574005127, + "rewards/rejected": -5.3137688636779785, + "step": 350 + }, + { + "epoch": 1.2136752136752136, + "grad_norm": 21.571334703323895, + "learning_rate": 6.569216009375929e-07, + "logits/chosen": -2.3660831451416016, + "logits/rejected": -2.339773654937744, + "logps/chosen": -173.77110290527344, + "logps/rejected": -213.0174560546875, + "loss": 0.1915, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.3100428581237793, + "rewards/margins": 3.860780715942383, + "rewards/rejected": -5.170823574066162, + "step": 355 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 22.187677873940732, + "learning_rate": 6.482852229648801e-07, + "logits/chosen": -2.3646240234375, + "logits/rejected": -2.3246006965637207, + "logps/chosen": -175.00582885742188, + "logps/rejected": -206.28280639648438, + "loss": 0.2018, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.4125807285308838, + "rewards/margins": 3.3710312843322754, + "rewards/rejected": -4.783612251281738, + "step": 360 + }, + { + "epoch": 1.2307692307692308, + "eval_logits/chosen": -2.3790221214294434, + "eval_logits/rejected": -2.34446120262146, + "eval_logps/chosen": -179.33251953125, + "eval_logps/rejected": -203.62615966796875, + "eval_loss": 0.44376233220100403, + "eval_rewards/accuracies": 0.7884615659713745, + "eval_rewards/chosen": -2.0855188369750977, + "eval_rewards/margins": 2.3890678882598877, + "eval_rewards/rejected": -4.474586486816406, + "eval_runtime": 510.4715, + "eval_samples_per_second": 16.287, + "eval_steps_per_second": 0.255, + "step": 360 + }, + { + "epoch": 1.2478632478632479, + "grad_norm": 22.165120276430873, + "learning_rate": 6.396000595846187e-07, + "logits/chosen": -2.3558402061462402, + "logits/rejected": -2.3481571674346924, + "logps/chosen": -181.36770629882812, + "logps/rejected": -209.7890167236328, + "loss": 0.2058, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.5554149150848389, + "rewards/margins": 3.3219857215881348, + "rewards/rejected": -4.8774003982543945, + "step": 365 + }, + { + "epoch": 1.264957264957265, + "grad_norm": 20.68437270723084, + "learning_rate": 6.30868968190328e-07, + "logits/chosen": -2.3541951179504395, + "logits/rejected": -2.3390259742736816, + "logps/chosen": -176.1993865966797, + "logps/rejected": -212.49215698242188, + "loss": 0.1952, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4656041860580444, + "rewards/margins": 3.678515672683716, + "rewards/rejected": -5.144120216369629, + "step": 370 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 19.201697233386618, + "learning_rate": 6.220948212857111e-07, + "logits/chosen": -2.3458924293518066, + "logits/rejected": -2.3219103813171387, + "logps/chosen": -174.4814910888672, + "logps/rejected": -214.3359375, + "loss": 0.1874, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.0674364566802979, + "rewards/margins": 3.7355704307556152, + "rewards/rejected": -4.803006172180176, + "step": 375 + }, + { + "epoch": 1.2991452991452992, + "grad_norm": 22.01586220428365, + "learning_rate": 6.13280505539608e-07, + "logits/chosen": -2.342374324798584, + "logits/rejected": -2.3162975311279297, + "logps/chosen": -183.77163696289062, + "logps/rejected": -228.7787322998047, + "loss": 0.2029, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.164026141166687, + "rewards/margins": 3.908278226852417, + "rewards/rejected": -5.072304725646973, + "step": 380 + }, + { + "epoch": 1.3162393162393162, + "grad_norm": 25.00184157353881, + "learning_rate": 6.044289208362914e-07, + "logits/chosen": -2.333132028579712, + "logits/rejected": -2.2765920162200928, + "logps/chosen": -174.96617126464844, + "logps/rejected": -214.20108032226562, + "loss": 0.1954, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.3976377248764038, + "rewards/margins": 3.710228443145752, + "rewards/rejected": -5.107865810394287, + "step": 385 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 31.735546531917564, + "learning_rate": 5.955429793214128e-07, + "logits/chosen": -2.303215503692627, + "logits/rejected": -2.2655513286590576, + "logps/chosen": -185.8195037841797, + "logps/rejected": -223.3665313720703, + "loss": 0.2087, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.0371310710906982, + "rewards/margins": 3.6536872386932373, + "rewards/rejected": -5.690817832946777, + "step": 390 + }, + { + "epoch": 1.3504273504273505, + "grad_norm": 22.637262975463738, + "learning_rate": 5.866256044439142e-07, + "logits/chosen": -2.3101253509521484, + "logits/rejected": -2.2920727729797363, + "logps/chosen": -177.775390625, + "logps/rejected": -217.62863159179688, + "loss": 0.2183, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.9160019159317017, + "rewards/margins": 3.9770541191101074, + "rewards/rejected": -5.8930559158325195, + "step": 395 + }, + { + "epoch": 1.3675213675213675, + "grad_norm": 20.130725870467003, + "learning_rate": 5.776797299942235e-07, + "logits/chosen": -2.318878412246704, + "logits/rejected": -2.2966861724853516, + "logps/chosen": -173.74664306640625, + "logps/rejected": -212.4857940673828, + "loss": 0.2017, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.386197566986084, + "rewards/margins": 3.7791614532470703, + "rewards/rejected": -5.165358543395996, + "step": 400 + }, + { + "epoch": 1.3675213675213675, + "eval_logits/chosen": -2.3350861072540283, + "eval_logits/rejected": -2.3021962642669678, + "eval_logps/chosen": -177.5862274169922, + "eval_logps/rejected": -200.29428100585938, + "eval_loss": 0.43498364090919495, + "eval_rewards/accuracies": 0.7980769276618958, + "eval_rewards/chosen": -1.910888433456421, + "eval_rewards/margins": 2.230510711669922, + "eval_rewards/rejected": -4.141399383544922, + "eval_runtime": 544.4003, + "eval_samples_per_second": 15.272, + "eval_steps_per_second": 0.239, + "step": 400 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 23.596482868635878, + "learning_rate": 5.687082991390443e-07, + "logits/chosen": -2.318471908569336, + "logits/rejected": -2.3120999336242676, + "logps/chosen": -181.3382568359375, + "logps/rejected": -223.269287109375, + "loss": 0.2194, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8589428067207336, + "rewards/margins": 3.934159517288208, + "rewards/rejected": -4.793102741241455, + "step": 405 + }, + { + "epoch": 1.4017094017094016, + "grad_norm": 21.689666892141172, + "learning_rate": 5.597142634530638e-07, + "logits/chosen": -2.320355176925659, + "logits/rejected": -2.291660785675049, + "logps/chosen": -173.66566467285156, + "logps/rejected": -208.9145050048828, + "loss": 0.1923, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6978217363357544, + "rewards/margins": 3.58097505569458, + "rewards/rejected": -4.278796195983887, + "step": 410 + }, + { + "epoch": 1.4188034188034189, + "grad_norm": 24.150486741969612, + "learning_rate": 5.507005819478924e-07, + "logits/chosen": -2.3113839626312256, + "logits/rejected": -2.2749829292297363, + "logps/chosen": -177.13357543945312, + "logps/rejected": -209.0823211669922, + "loss": 0.2273, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.8599263429641724, + "rewards/margins": 3.658784866333008, + "rewards/rejected": -4.518711090087891, + "step": 415 + }, + { + "epoch": 1.435897435897436, + "grad_norm": 21.96020470207621, + "learning_rate": 5.416702200985584e-07, + "logits/chosen": -2.2829697132110596, + "logits/rejected": -2.2764744758605957, + "logps/chosen": -176.04710388183594, + "logps/rejected": -212.1916961669922, + "loss": 0.2022, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.101433515548706, + "rewards/margins": 3.7728042602539062, + "rewards/rejected": -4.874238014221191, + "step": 420 + }, + { + "epoch": 1.452991452991453, + "grad_norm": 27.78726291669523, + "learning_rate": 5.326261488678748e-07, + "logits/chosen": -2.2171132564544678, + "logits/rejected": -2.185006856918335, + "logps/chosen": -169.00796508789062, + "logps/rejected": -199.62350463867188, + "loss": 0.2124, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3692271709442139, + "rewards/margins": 3.5437960624694824, + "rewards/rejected": -4.913023948669434, + "step": 425 + }, + { + "epoch": 1.4700854700854702, + "grad_norm": 21.241057222727857, + "learning_rate": 5.235713437290011e-07, + "logits/chosen": -2.220639705657959, + "logits/rejected": -2.189408779144287, + "logps/chosen": -174.73651123046875, + "logps/rejected": -213.7786865234375, + "loss": 0.1905, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5636754035949707, + "rewards/margins": 3.8633148670196533, + "rewards/rejected": -5.426989555358887, + "step": 430 + }, + { + "epoch": 1.4871794871794872, + "grad_norm": 21.34359675472229, + "learning_rate": 5.145087836865213e-07, + "logits/chosen": -2.236384630203247, + "logits/rejected": -2.1837754249572754, + "logps/chosen": -179.68634033203125, + "logps/rejected": -222.42453002929688, + "loss": 0.1887, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4160200357437134, + "rewards/margins": 4.235383033752441, + "rewards/rejected": -5.651402950286865, + "step": 435 + }, + { + "epoch": 1.5042735042735043, + "grad_norm": 25.914610827614307, + "learning_rate": 5.054414502963604e-07, + "logits/chosen": -2.1725914478302, + "logits/rejected": -2.133784294128418, + "logps/chosen": -171.98902893066406, + "logps/rejected": -210.50491333007812, + "loss": 0.1999, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4368568658828735, + "rewards/margins": 3.967801332473755, + "rewards/rejected": -5.404658317565918, + "step": 440 + }, + { + "epoch": 1.5042735042735043, + "eval_logits/chosen": -2.1716132164001465, + "eval_logits/rejected": -2.1360905170440674, + "eval_logps/chosen": -179.5331268310547, + "eval_logps/rejected": -203.52139282226562, + "eval_loss": 0.42879074811935425, + "eval_rewards/accuracies": 0.8048076629638672, + "eval_rewards/chosen": -2.105579137802124, + "eval_rewards/margins": 2.3585293292999268, + "eval_rewards/rejected": -4.464108467102051, + "eval_runtime": 549.544, + "eval_samples_per_second": 15.129, + "eval_steps_per_second": 0.237, + "step": 440 + }, + { + "epoch": 1.5213675213675213, + "grad_norm": 24.359905435182267, + "learning_rate": 4.963723266848609e-07, + "logits/chosen": -2.177730083465576, + "logits/rejected": -2.1423938274383545, + "logps/chosen": -177.42788696289062, + "logps/rejected": -215.1343994140625, + "loss": 0.2006, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.3451989889144897, + "rewards/margins": 4.096414089202881, + "rewards/rejected": -5.441613674163818, + "step": 445 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 23.49528331346096, + "learning_rate": 4.873043965673426e-07, + "logits/chosen": -2.2352283000946045, + "logits/rejected": -2.18994140625, + "logps/chosen": -175.6567840576172, + "logps/rejected": -212.6671905517578, + "loss": 0.1836, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -1.1007821559906006, + "rewards/margins": 3.7487030029296875, + "rewards/rejected": -4.849484920501709, + "step": 450 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 29.593399249552508, + "learning_rate": 4.782406432664698e-07, + "logits/chosen": -2.2524051666259766, + "logits/rejected": -2.222111225128174, + "logps/chosen": -173.7434844970703, + "logps/rejected": -211.3067169189453, + "loss": 0.2032, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.4029574394226074, + "rewards/margins": 3.8389480113983154, + "rewards/rejected": -5.24190616607666, + "step": 455 + }, + { + "epoch": 1.5726495726495726, + "grad_norm": 21.988260575983947, + "learning_rate": 4.691840487307457e-07, + "logits/chosen": -2.2877230644226074, + "logits/rejected": -2.240570545196533, + "logps/chosen": -178.6396484375, + "logps/rejected": -215.9919891357422, + "loss": 0.2007, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.566180944442749, + "rewards/margins": 3.822190761566162, + "rewards/rejected": -5.38837194442749, + "step": 460 + }, + { + "epoch": 1.5897435897435899, + "grad_norm": 21.383643220476998, + "learning_rate": 4.601375925534609e-07, + "logits/chosen": -2.283261775970459, + "logits/rejected": -2.250561237335205, + "logps/chosen": -173.04922485351562, + "logps/rejected": -215.74740600585938, + "loss": 0.1872, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.3229897022247314, + "rewards/margins": 4.225142478942871, + "rewards/rejected": -5.548131465911865, + "step": 465 + }, + { + "epoch": 1.606837606837607, + "grad_norm": 23.12331536249533, + "learning_rate": 4.5110425099241564e-07, + "logits/chosen": -2.2584376335144043, + "logits/rejected": -2.2355711460113525, + "logps/chosen": -181.23280334472656, + "logps/rejected": -220.05606079101562, + "loss": 0.1678, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.5019714832305908, + "rewards/margins": 4.194333076477051, + "rewards/rejected": -5.6963043212890625, + "step": 470 + }, + { + "epoch": 1.623931623931624, + "grad_norm": 20.770063381017334, + "learning_rate": 4.4208699599073867e-07, + "logits/chosen": -2.247239589691162, + "logits/rejected": -2.2174410820007324, + "logps/chosen": -173.77297973632812, + "logps/rejected": -212.5133514404297, + "loss": 0.1657, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.4749728441238403, + "rewards/margins": 3.9321682453155518, + "rewards/rejected": -5.407141208648682, + "step": 475 + }, + { + "epoch": 1.641025641025641, + "grad_norm": 25.740541616781023, + "learning_rate": 4.330887941991288e-07, + "logits/chosen": -2.2574265003204346, + "logits/rejected": -2.2185988426208496, + "logps/chosen": -179.1300811767578, + "logps/rejected": -221.7349090576172, + "loss": 0.1837, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.613585114479065, + "rewards/margins": 4.0682172775268555, + "rewards/rejected": -5.681802272796631, + "step": 480 + }, + { + "epoch": 1.641025641025641, + "eval_logits/chosen": -2.2452123165130615, + "eval_logits/rejected": -2.2126853466033936, + "eval_logps/chosen": -180.79489135742188, + "eval_logps/rejected": -205.93594360351562, + "eval_loss": 0.42615845799446106, + "eval_rewards/accuracies": 0.8125, + "eval_rewards/chosen": -2.2317543029785156, + "eval_rewards/margins": 2.4738099575042725, + "eval_rewards/rejected": -4.705564498901367, + "eval_runtime": 549.0015, + "eval_samples_per_second": 15.144, + "eval_steps_per_second": 0.237, + "step": 480 + }, + { + "epoch": 1.658119658119658, + "grad_norm": 26.878756894059855, + "learning_rate": 4.241126059998332e-07, + "logits/chosen": -2.2241618633270264, + "logits/rejected": -2.2010858058929443, + "logps/chosen": -174.66329956054688, + "logps/rejected": -213.66751098632812, + "loss": 0.1703, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2549582719802856, + "rewards/margins": 3.9710934162139893, + "rewards/rejected": -5.226052284240723, + "step": 485 + }, + { + "epoch": 1.6752136752136753, + "grad_norm": 26.319140964321747, + "learning_rate": 4.151613845326911e-07, + "logits/chosen": -2.2174792289733887, + "logits/rejected": -2.1797664165496826, + "logps/chosen": -178.86734008789062, + "logps/rejected": -219.2964324951172, + "loss": 0.2076, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.4113620519638062, + "rewards/margins": 4.0504631996154785, + "rewards/rejected": -5.461825370788574, + "step": 490 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 22.333623237534447, + "learning_rate": 4.062380747235595e-07, + "logits/chosen": -2.2221896648406982, + "logits/rejected": -2.2027692794799805, + "logps/chosen": -186.13905334472656, + "logps/rejected": -229.50302124023438, + "loss": 0.1886, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.598037838935852, + "rewards/margins": 3.9121577739715576, + "rewards/rejected": -5.510195732116699, + "step": 495 + }, + { + "epoch": 1.7094017094017095, + "grad_norm": 24.87704538149545, + "learning_rate": 3.9734561231544143e-07, + "logits/chosen": -2.236389398574829, + "logits/rejected": -2.1884818077087402, + "logps/chosen": -169.65292358398438, + "logps/rejected": -212.2926025390625, + "loss": 0.1921, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -1.4791710376739502, + "rewards/margins": 3.933107376098633, + "rewards/rejected": -5.412278175354004, + "step": 500 + }, + { + "epoch": 1.7264957264957266, + "grad_norm": 22.979952992122115, + "learning_rate": 3.8848692290263427e-07, + "logits/chosen": -2.2684485912323, + "logits/rejected": -2.250487804412842, + "logps/chosen": -182.28250122070312, + "logps/rejected": -221.0419921875, + "loss": 0.1821, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.5764131546020508, + "rewards/margins": 4.01028299331665, + "rewards/rejected": -5.586695671081543, + "step": 505 + }, + { + "epoch": 1.7435897435897436, + "grad_norm": 25.865085919031618, + "learning_rate": 3.796649209682177e-07, + "logits/chosen": -2.2985148429870605, + "logits/rejected": -2.255593776702881, + "logps/chosen": -177.64010620117188, + "logps/rejected": -215.03195190429688, + "loss": 0.1997, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3909369707107544, + "rewards/margins": 3.831480026245117, + "rewards/rejected": -5.222417831420898, + "step": 510 + }, + { + "epoch": 1.7606837606837606, + "grad_norm": 23.24338527810705, + "learning_rate": 3.708825089251979e-07, + "logits/chosen": -2.331456422805786, + "logits/rejected": -2.3176369667053223, + "logps/chosen": -175.8199920654297, + "logps/rejected": -217.2957000732422, + "loss": 0.168, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.6907331943511963, + "rewards/margins": 4.071774482727051, + "rewards/rejected": -5.762507915496826, + "step": 515 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 27.938487011364977, + "learning_rate": 3.6214257616162237e-07, + "logits/chosen": -2.3677480220794678, + "logits/rejected": -2.3415515422821045, + "logps/chosen": -183.40509033203125, + "logps/rejected": -224.7578582763672, + "loss": 0.1942, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.3930178880691528, + "rewards/margins": 4.226252555847168, + "rewards/rejected": -5.6192708015441895, + "step": 520 + }, + { + "epoch": 1.7777777777777777, + "eval_logits/chosen": -2.3675007820129395, + "eval_logits/rejected": -2.3332858085632324, + "eval_logps/chosen": -182.28289794921875, + "eval_logps/rejected": -209.16372680664062, + "eval_loss": 0.41628795862197876, + "eval_rewards/accuracies": 0.8115384578704834, + "eval_rewards/chosen": -2.380557060241699, + "eval_rewards/margins": 2.6477856636047363, + "eval_rewards/rejected": -5.028342247009277, + "eval_runtime": 510.4037, + "eval_samples_per_second": 16.289, + "eval_steps_per_second": 0.255, + "step": 520 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 23.98423738261615, + "learning_rate": 3.5344799808997837e-07, + "logits/chosen": -2.3682048320770264, + "logits/rejected": -2.332885980606079, + "logps/chosen": -176.56248474121094, + "logps/rejected": -217.3412322998047, + "loss": 0.1878, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5687954425811768, + "rewards/margins": 4.247409820556641, + "rewards/rejected": -5.816205978393555, + "step": 525 + }, + { + "epoch": 1.811965811965812, + "grad_norm": 18.73032275897564, + "learning_rate": 3.448016352011913e-07, + "logits/chosen": -2.365084171295166, + "logits/rejected": -2.3294410705566406, + "logps/chosen": -172.4662628173828, + "logps/rejected": -211.91983032226562, + "loss": 0.1854, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.4405930042266846, + "rewards/margins": 3.777637481689453, + "rewards/rejected": -5.218230247497559, + "step": 530 + }, + { + "epoch": 1.8290598290598292, + "grad_norm": 24.2914549893428, + "learning_rate": 3.3620633212353176e-07, + "logits/chosen": -2.3548572063446045, + "logits/rejected": -2.3260064125061035, + "logps/chosen": -186.42669677734375, + "logps/rejected": -228.9281768798828, + "loss": 0.197, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.2932475805282593, + "rewards/margins": 4.461441993713379, + "rewards/rejected": -5.7546892166137695, + "step": 535 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 20.987391137372303, + "learning_rate": 3.2766491668674054e-07, + "logits/chosen": -2.324582576751709, + "logits/rejected": -2.2957985401153564, + "logps/chosen": -167.59715270996094, + "logps/rejected": -203.4008026123047, + "loss": 0.1988, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.3020083904266357, + "rewards/margins": 3.8016700744628906, + "rewards/rejected": -5.1036787033081055, + "step": 540 + }, + { + "epoch": 1.8632478632478633, + "grad_norm": 22.05478633330539, + "learning_rate": 3.1918019899168167e-07, + "logits/chosen": -2.348886013031006, + "logits/rejected": -2.305481195449829, + "logps/chosen": -176.51336669921875, + "logps/rejected": -215.34512329101562, + "loss": 0.2022, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.4230921268463135, + "rewards/margins": 4.097233772277832, + "rewards/rejected": -5.520325660705566, + "step": 545 + }, + { + "epoch": 1.8803418803418803, + "grad_norm": 26.34707224120942, + "learning_rate": 3.107549704858263e-07, + "logits/chosen": -2.3584249019622803, + "logits/rejected": -2.313872814178467, + "logps/chosen": -176.78977966308594, + "logps/rejected": -219.0670623779297, + "loss": 0.196, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0785906314849854, + "rewards/margins": 4.638379096984863, + "rewards/rejected": -5.716969966888428, + "step": 550 + }, + { + "epoch": 1.8974358974358974, + "grad_norm": 23.12033204874394, + "learning_rate": 3.0239200304487555e-07, + "logits/chosen": -2.360549211502075, + "logits/rejected": -2.3217921257019043, + "logps/chosen": -178.0338592529297, + "logps/rejected": -217.72946166992188, + "loss": 0.1672, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.3084853887557983, + "rewards/margins": 4.044316291809082, + "rewards/rejected": -5.35280179977417, + "step": 555 + }, + { + "epoch": 1.9145299145299144, + "grad_norm": 22.31661468469242, + "learning_rate": 2.940940480608207e-07, + "logits/chosen": -2.3631398677825928, + "logits/rejected": -2.3302533626556396, + "logps/chosen": -184.8848419189453, + "logps/rejected": -222.1883544921875, + "loss": 0.1821, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.4888789653778076, + "rewards/margins": 3.832033157348633, + "rewards/rejected": -5.320911884307861, + "step": 560 + }, + { + "epoch": 1.9145299145299144, + "eval_logits/chosen": -2.3542850017547607, + "eval_logits/rejected": -2.3237853050231934, + "eval_logps/chosen": -180.5154571533203, + "eval_logps/rejected": -205.5892791748047, + "eval_loss": 0.4165222644805908, + "eval_rewards/accuracies": 0.817307710647583, + "eval_rewards/chosen": -2.2038121223449707, + "eval_rewards/margins": 2.467085838317871, + "eval_rewards/rejected": -4.670897960662842, + "eval_runtime": 510.5174, + "eval_samples_per_second": 16.285, + "eval_steps_per_second": 0.255, + "step": 560 + }, + { + "epoch": 1.9316239316239316, + "grad_norm": 24.71052098979795, + "learning_rate": 2.858638355367439e-07, + "logits/chosen": -2.3543615341186523, + "logits/rejected": -2.3290767669677734, + "logps/chosen": -182.73150634765625, + "logps/rejected": -218.5353240966797, + "loss": 0.2078, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.6487442255020142, + "rewards/margins": 3.928899049758911, + "rewards/rejected": -5.577642440795898, + "step": 565 + }, + { + "epoch": 1.9487179487179487, + "grad_norm": 23.610122970814867, + "learning_rate": 2.7770407318865484e-07, + "logits/chosen": -2.3738222122192383, + "logits/rejected": -2.3505990505218506, + "logps/chosen": -178.51979064941406, + "logps/rejected": -217.83358764648438, + "loss": 0.179, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.6580047607421875, + "rewards/margins": 4.061518669128418, + "rewards/rejected": -5.719522953033447, + "step": 570 + }, + { + "epoch": 1.965811965811966, + "grad_norm": 24.787697799658925, + "learning_rate": 2.696174455546599e-07, + "logits/chosen": -2.37988018989563, + "logits/rejected": -2.350247621536255, + "logps/chosen": -182.24929809570312, + "logps/rejected": -221.3693084716797, + "loss": 0.2082, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.5788228511810303, + "rewards/margins": 3.9897353649139404, + "rewards/rejected": -5.568558692932129, + "step": 575 + }, + { + "epoch": 1.982905982905983, + "grad_norm": 22.43146369740861, + "learning_rate": 2.616066131117562e-07, + "logits/chosen": -2.3819072246551514, + "logits/rejected": -2.3640804290771484, + "logps/chosen": -180.75192260742188, + "logps/rejected": -221.94863891601562, + "loss": 0.1861, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -1.5098506212234497, + "rewards/margins": 4.033259868621826, + "rewards/rejected": -5.543110370635986, + "step": 580 + }, + { + "epoch": 2.0, + "grad_norm": 24.29500576957158, + "learning_rate": 2.536742114005448e-07, + "logits/chosen": -2.3809404373168945, + "logits/rejected": -2.3699495792388916, + "logps/chosen": -178.88844299316406, + "logps/rejected": -219.03109741210938, + "loss": 0.1861, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.5433608293533325, + "rewards/margins": 4.056896209716797, + "rewards/rejected": -5.60025691986084, + "step": 585 + }, + { + "epoch": 2.017094017094017, + "grad_norm": 18.084009984910033, + "learning_rate": 2.4582285015814256e-07, + "logits/chosen": -2.3655307292938232, + "logits/rejected": -2.340487003326416, + "logps/chosen": -178.33230590820312, + "logps/rejected": -223.9752655029297, + "loss": 0.0966, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.1265462636947632, + "rewards/margins": 5.09350061416626, + "rewards/rejected": -6.2200469970703125, + "step": 590 + }, + { + "epoch": 2.034188034188034, + "grad_norm": 19.038968132230398, + "learning_rate": 2.3805511245958815e-07, + "logits/chosen": -2.3579788208007812, + "logits/rejected": -2.317511796951294, + "logps/chosen": -173.7149200439453, + "logps/rejected": -227.1968231201172, + "loss": 0.1006, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.0953209400177002, + "rewards/margins": 4.835733413696289, + "rewards/rejected": -5.93105411529541, + "step": 595 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 15.255014864865824, + "learning_rate": 2.3037355386801683e-07, + "logits/chosen": -2.3267550468444824, + "logits/rejected": -2.3092563152313232, + "logps/chosen": -169.4131622314453, + "logps/rejected": -217.4252166748047, + "loss": 0.0858, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.402895450592041, + "rewards/margins": 5.089356899261475, + "rewards/rejected": -6.492252349853516, + "step": 600 + }, + { + "epoch": 2.051282051282051, + "eval_logits/chosen": -2.3219799995422363, + "eval_logits/rejected": -2.287177562713623, + "eval_logps/chosen": -185.50662231445312, + "eval_logps/rejected": -210.85968017578125, + "eval_loss": 0.44152435660362244, + "eval_rewards/accuracies": 0.8144230842590332, + "eval_rewards/chosen": -2.702928304672241, + "eval_rewards/margins": 2.4950103759765625, + "eval_rewards/rejected": -5.197938442230225, + "eval_runtime": 510.4122, + "eval_samples_per_second": 16.289, + "eval_steps_per_second": 0.255, + "step": 600 + }, + { + "epoch": 2.0683760683760686, + "grad_norm": 18.26123103060911, + "learning_rate": 2.2278070159388872e-07, + "logits/chosen": -2.3193914890289307, + "logits/rejected": -2.2786076068878174, + "logps/chosen": -178.44021606445312, + "logps/rejected": -230.6327362060547, + "loss": 0.0852, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.5032012462615967, + "rewards/margins": 5.247129440307617, + "rewards/rejected": -6.750330924987793, + "step": 605 + }, + { + "epoch": 2.0854700854700856, + "grad_norm": 21.277477128670753, + "learning_rate": 2.1527905366354289e-07, + "logits/chosen": -2.256251573562622, + "logits/rejected": -2.2409000396728516, + "logps/chosen": -168.60833740234375, + "logps/rejected": -217.9632568359375, + "loss": 0.0849, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.7518459558486938, + "rewards/margins": 5.027651309967041, + "rewards/rejected": -6.7794976234436035, + "step": 610 + }, + { + "epoch": 2.1025641025641026, + "grad_norm": 17.989878343678413, + "learning_rate": 2.0787107809735372e-07, + "logits/chosen": -2.2655296325683594, + "logits/rejected": -2.2357075214385986, + "logps/chosen": -187.8769989013672, + "logps/rejected": -243.11782836914062, + "loss": 0.0868, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.2156667709350586, + "rewards/margins": 5.461678504943848, + "rewards/rejected": -7.677346229553223, + "step": 615 + }, + { + "epoch": 2.1196581196581197, + "grad_norm": 25.39886918820101, + "learning_rate": 2.0055921209776062e-07, + "logits/chosen": -2.260803461074829, + "logits/rejected": -2.230661392211914, + "logps/chosen": -187.24539184570312, + "logps/rejected": -241.15328979492188, + "loss": 0.0942, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -1.8768556118011475, + "rewards/margins": 5.8292131423950195, + "rewards/rejected": -7.706068992614746, + "step": 620 + }, + { + "epoch": 2.1367521367521367, + "grad_norm": 20.749497849828533, + "learning_rate": 1.9334586124743446e-07, + "logits/chosen": -2.279060125350952, + "logits/rejected": -2.242183208465576, + "logps/chosen": -184.6155242919922, + "logps/rejected": -238.03311157226562, + "loss": 0.0898, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.563646674156189, + "rewards/margins": 5.580409049987793, + "rewards/rejected": -7.144055366516113, + "step": 625 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 16.10697134046993, + "learning_rate": 1.8623339871784866e-07, + "logits/chosen": -2.2914092540740967, + "logits/rejected": -2.2620978355407715, + "logps/chosen": -175.74288940429688, + "logps/rejected": -239.6836700439453, + "loss": 0.0734, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.3752235174179077, + "rewards/margins": 5.611042499542236, + "rewards/rejected": -6.986265659332275, + "step": 630 + }, + { + "epoch": 2.1709401709401708, + "grad_norm": 18.699361706034736, + "learning_rate": 1.792241644885118e-07, + "logits/chosen": -2.3186287879943848, + "logits/rejected": -2.2809865474700928, + "logps/chosen": -181.44424438476562, + "logps/rejected": -240.8855438232422, + "loss": 0.0837, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.49187171459198, + "rewards/margins": 5.5167741775512695, + "rewards/rejected": -7.008645534515381, + "step": 635 + }, + { + "epoch": 2.1880341880341883, + "grad_norm": 14.526674830354029, + "learning_rate": 1.7232046457712162e-07, + "logits/chosen": -2.3097856044769287, + "logits/rejected": -2.2732465267181396, + "logps/chosen": -173.0911102294922, + "logps/rejected": -232.27151489257812, + "loss": 0.0832, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.6045150756835938, + "rewards/margins": 5.56905460357666, + "rewards/rejected": -7.173569679260254, + "step": 640 + }, + { + "epoch": 2.1880341880341883, + "eval_logits/chosen": -2.3247125148773193, + "eval_logits/rejected": -2.2892396450042725, + "eval_logps/chosen": -187.42820739746094, + "eval_logps/rejected": -215.4343719482422, + "eval_loss": 0.44137728214263916, + "eval_rewards/accuracies": 0.817307710647583, + "eval_rewards/chosen": -2.895087242126465, + "eval_rewards/margins": 2.760321855545044, + "eval_rewards/rejected": -5.655409336090088, + "eval_runtime": 510.2899, + "eval_samples_per_second": 16.293, + "eval_steps_per_second": 0.255, + "step": 640 + }, + { + "epoch": 2.2051282051282053, + "grad_norm": 19.763278785896016, + "learning_rate": 1.65524570280892e-07, + "logits/chosen": -2.3227803707122803, + "logits/rejected": -2.295450448989868, + "logps/chosen": -192.46340942382812, + "logps/rejected": -244.09390258789062, + "loss": 0.0723, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.7332662343978882, + "rewards/margins": 5.458445072174072, + "rewards/rejected": -7.191710472106934, + "step": 645 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 21.8863379094822, + "learning_rate": 1.5883871742930255e-07, + "logits/chosen": -2.315948247909546, + "logits/rejected": -2.2889504432678223, + "logps/chosen": -187.40028381347656, + "logps/rejected": -233.48171997070312, + "loss": 0.0845, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.8563635349273682, + "rewards/margins": 5.307656288146973, + "rewards/rejected": -7.164019584655762, + "step": 650 + }, + { + "epoch": 2.2393162393162394, + "grad_norm": 19.285682999107884, + "learning_rate": 1.522651056485173e-07, + "logits/chosen": -2.3096861839294434, + "logits/rejected": -2.283123254776001, + "logps/chosen": -177.6830596923828, + "logps/rejected": -234.69467163085938, + "loss": 0.0851, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.258054256439209, + "rewards/margins": 5.4839186668396, + "rewards/rejected": -7.741972923278809, + "step": 655 + }, + { + "epoch": 2.2564102564102564, + "grad_norm": 15.418975392685667, + "learning_rate": 1.458058976377141e-07, + "logits/chosen": -2.3327953815460205, + "logits/rejected": -2.2950236797332764, + "logps/chosen": -190.35812377929688, + "logps/rejected": -253.2418975830078, + "loss": 0.0739, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.100916624069214, + "rewards/margins": 5.835064888000488, + "rewards/rejected": -7.935981750488281, + "step": 660 + }, + { + "epoch": 2.2735042735042734, + "grad_norm": 21.281597244890587, + "learning_rate": 1.3946321845756276e-07, + "logits/chosen": -2.312046527862549, + "logits/rejected": -2.2943308353424072, + "logps/chosen": -186.85183715820312, + "logps/rejected": -244.026123046875, + "loss": 0.0756, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.037853717803955, + "rewards/margins": 5.661513805389404, + "rewards/rejected": -7.699367523193359, + "step": 665 + }, + { + "epoch": 2.2905982905982905, + "grad_norm": 24.811809940902464, + "learning_rate": 1.3323915483108662e-07, + "logits/chosen": -2.3066821098327637, + "logits/rejected": -2.2808244228363037, + "logps/chosen": -188.14840698242188, + "logps/rejected": -245.43148803710938, + "loss": 0.0952, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.6107755899429321, + "rewards/margins": 5.839807987213135, + "rewards/rejected": -7.450583457946777, + "step": 670 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 14.461116164284503, + "learning_rate": 1.2713575445713615e-07, + "logits/chosen": -2.3114190101623535, + "logits/rejected": -2.26875638961792, + "logps/chosen": -189.15365600585938, + "logps/rejected": -239.4384765625, + "loss": 0.0753, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.071650743484497, + "rewards/margins": 5.3538007736206055, + "rewards/rejected": -7.425451755523682, + "step": 675 + }, + { + "epoch": 2.324786324786325, + "grad_norm": 19.813989102406204, + "learning_rate": 1.2115502533670252e-07, + "logits/chosen": -2.304924249649048, + "logits/rejected": -2.2652640342712402, + "logps/chosen": -181.27056884765625, + "logps/rejected": -234.0359649658203, + "loss": 0.0817, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.8430267572402954, + "rewards/margins": 5.729722023010254, + "rewards/rejected": -7.57274866104126, + "step": 680 + }, + { + "epoch": 2.324786324786325, + "eval_logits/chosen": -2.305617570877075, + "eval_logits/rejected": -2.2696919441223145, + "eval_logps/chosen": -190.88035583496094, + "eval_logps/rejected": -218.89450073242188, + "eval_loss": 0.45208829641342163, + "eval_rewards/accuracies": 0.8153846263885498, + "eval_rewards/chosen": -3.2403008937835693, + "eval_rewards/margins": 2.7611193656921387, + "eval_rewards/rejected": -6.001419544219971, + "eval_runtime": 510.6949, + "eval_samples_per_second": 16.28, + "eval_steps_per_second": 0.255, + "step": 680 + }, + { + "epoch": 2.341880341880342, + "grad_norm": 13.916793548794622, + "learning_rate": 1.1529893511229066e-07, + "logits/chosen": -2.305900812149048, + "logits/rejected": -2.2629313468933105, + "logps/chosen": -184.71005249023438, + "logps/rejected": -248.4109649658203, + "loss": 0.0723, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.8482071161270142, + "rewards/margins": 5.727347373962402, + "rewards/rejected": -7.575555324554443, + "step": 685 + }, + { + "epoch": 2.358974358974359, + "grad_norm": 22.341188351956706, + "learning_rate": 1.0956941042057105e-07, + "logits/chosen": -2.3007519245147705, + "logits/rejected": -2.2794156074523926, + "logps/chosen": -185.30859375, + "logps/rejected": -237.7666778564453, + "loss": 0.0825, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.918454885482788, + "rewards/margins": 5.560369491577148, + "rewards/rejected": -7.478825569152832, + "step": 690 + }, + { + "epoch": 2.376068376068376, + "grad_norm": 17.79636384627603, + "learning_rate": 1.0396833625852147e-07, + "logits/chosen": -2.3259952068328857, + "logits/rejected": -2.29671049118042, + "logps/chosen": -190.3791961669922, + "logps/rejected": -235.37075805664062, + "loss": 0.0735, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0598902702331543, + "rewards/margins": 5.3269853591918945, + "rewards/rejected": -7.386874198913574, + "step": 695 + }, + { + "epoch": 2.393162393162393, + "grad_norm": 18.853495950999168, + "learning_rate": 9.849755536326865e-08, + "logits/chosen": -2.3200008869171143, + "logits/rejected": -2.2810311317443848, + "logps/chosen": -176.51504516601562, + "logps/rejected": -230.62057495117188, + "loss": 0.0931, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -2.021453380584717, + "rewards/margins": 5.410551071166992, + "rewards/rejected": -7.432004451751709, + "step": 700 + }, + { + "epoch": 2.41025641025641, + "grad_norm": 20.98252714366612, + "learning_rate": 9.31588676058332e-08, + "logits/chosen": -2.304353713989258, + "logits/rejected": -2.276763916015625, + "logps/chosen": -174.02536010742188, + "logps/rejected": -237.18313598632812, + "loss": 0.0747, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7528259754180908, + "rewards/margins": 5.926802158355713, + "rewards/rejected": -7.679627418518066, + "step": 705 + }, + { + "epoch": 2.427350427350427, + "grad_norm": 19.38549030227641, + "learning_rate": 8.795402939897678e-08, + "logits/chosen": -2.3206698894500732, + "logits/rejected": -2.277614116668701, + "logps/chosen": -185.8979949951172, + "logps/rejected": -236.58267211914062, + "loss": 0.0858, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0482637882232666, + "rewards/margins": 5.332827091217041, + "rewards/rejected": -7.381091117858887, + "step": 710 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 24.728604072670628, + "learning_rate": 8.288475311934839e-08, + "logits/chosen": -2.308656692504883, + "logits/rejected": -2.2804362773895264, + "logps/chosen": -186.02822875976562, + "logps/rejected": -239.73403930664062, + "loss": 0.0793, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2324843406677246, + "rewards/margins": 5.412593841552734, + "rewards/rejected": -7.645077705383301, + "step": 715 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 21.233600955151765, + "learning_rate": 7.795270654411634e-08, + "logits/chosen": -2.2966208457946777, + "logits/rejected": -2.2731049060821533, + "logps/chosen": -177.4196319580078, + "logps/rejected": -231.11868286132812, + "loss": 0.0858, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.1023478507995605, + "rewards/margins": 5.321855068206787, + "rewards/rejected": -7.424202919006348, + "step": 720 + }, + { + "epoch": 2.4615384615384617, + "eval_logits/chosen": -2.3071556091308594, + "eval_logits/rejected": -2.2708208560943604, + "eval_logps/chosen": -192.32481384277344, + "eval_logps/rejected": -221.89260864257812, + "eval_loss": 0.4478990435600281, + "eval_rewards/accuracies": 0.8221153616905212, + "eval_rewards/chosen": -3.384748935699463, + "eval_rewards/margins": 2.9164834022521973, + "eval_rewards/rejected": -6.301231861114502, + "eval_runtime": 510.4622, + "eval_samples_per_second": 16.287, + "eval_steps_per_second": 0.255, + "step": 720 + }, + { + "epoch": 2.4786324786324787, + "grad_norm": 19.099521811525374, + "learning_rate": 7.315951230227501e-08, + "logits/chosen": -2.303283214569092, + "logits/rejected": -2.2725837230682373, + "logps/chosen": -183.1450958251953, + "logps/rejected": -240.37429809570312, + "loss": 0.0794, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1688971519470215, + "rewards/margins": 5.648955345153809, + "rewards/rejected": -7.817852973937988, + "step": 725 + }, + { + "epoch": 2.4957264957264957, + "grad_norm": 26.958627340601723, + "learning_rate": 6.850674734080453e-08, + "logits/chosen": -2.3182692527770996, + "logits/rejected": -2.287144422531128, + "logps/chosen": -194.92135620117188, + "logps/rejected": -248.2643585205078, + "loss": 0.0897, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -2.0353550910949707, + "rewards/margins": 5.611361503601074, + "rewards/rejected": -7.646716117858887, + "step": 730 + }, + { + "epoch": 2.5128205128205128, + "grad_norm": 16.799646075642695, + "learning_rate": 6.399594240585965e-08, + "logits/chosen": -2.307258367538452, + "logits/rejected": -2.28397274017334, + "logps/chosen": -182.8865509033203, + "logps/rejected": -239.4032440185547, + "loss": 0.075, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -2.2343621253967285, + "rewards/margins": 5.376429080963135, + "rewards/rejected": -7.610791206359863, + "step": 735 + }, + { + "epoch": 2.52991452991453, + "grad_norm": 22.575905648213276, + "learning_rate": 5.962858153915896e-08, + "logits/chosen": -2.324538469314575, + "logits/rejected": -2.2831063270568848, + "logps/chosen": -181.92108154296875, + "logps/rejected": -236.6908416748047, + "loss": 0.0794, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.367906093597412, + "rewards/margins": 5.319285869598389, + "rewards/rejected": -7.687191963195801, + "step": 740 + }, + { + "epoch": 2.547008547008547, + "grad_norm": 18.6782244870375, + "learning_rate": 5.540610158973935e-08, + "logits/chosen": -2.314725160598755, + "logits/rejected": -2.2854819297790527, + "logps/chosen": -186.04443359375, + "logps/rejected": -246.89944458007812, + "loss": 0.0799, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.9573853015899658, + "rewards/margins": 6.006454944610596, + "rewards/rejected": -7.963840484619141, + "step": 745 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 16.439313605440155, + "learning_rate": 5.1329891741236585e-08, + "logits/chosen": -2.3117012977600098, + "logits/rejected": -2.2856240272521973, + "logps/chosen": -172.4759979248047, + "logps/rejected": -228.8611602783203, + "loss": 0.0746, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -2.0346360206604004, + "rewards/margins": 5.2914581298828125, + "rewards/rejected": -7.3260931968688965, + "step": 750 + }, + { + "epoch": 2.5811965811965814, + "grad_norm": 20.803224348795787, + "learning_rate": 4.740129305484869e-08, + "logits/chosen": -2.328427791595459, + "logits/rejected": -2.2921013832092285, + "logps/chosen": -195.317138671875, + "logps/rejected": -249.6083984375, + "loss": 0.0825, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.082848072052002, + "rewards/margins": 5.33933687210083, + "rewards/rejected": -7.422184944152832, + "step": 755 + }, + { + "epoch": 2.5982905982905984, + "grad_norm": 20.390168764254813, + "learning_rate": 4.36215980281297e-08, + "logits/chosen": -2.2899012565612793, + "logits/rejected": -2.256514072418213, + "logps/chosen": -181.3725128173828, + "logps/rejected": -234.7386932373047, + "loss": 0.0723, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.0902533531188965, + "rewards/margins": 5.608729839324951, + "rewards/rejected": -7.698983192443848, + "step": 760 + }, + { + "epoch": 2.5982905982905984, + "eval_logits/chosen": -2.3103232383728027, + "eval_logits/rejected": -2.2754170894622803, + "eval_logps/chosen": -191.91326904296875, + "eval_logps/rejected": -219.9931640625, + "eval_loss": 0.4573881924152374, + "eval_rewards/accuracies": 0.817307710647583, + "eval_rewards/chosen": -3.343592405319214, + "eval_rewards/margins": 2.767695903778076, + "eval_rewards/rejected": -6.111288547515869, + "eval_runtime": 510.3536, + "eval_samples_per_second": 16.291, + "eval_steps_per_second": 0.255, + "step": 760 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 20.03939391159064, + "learning_rate": 3.9992050169762483e-08, + "logits/chosen": -2.326202392578125, + "logits/rejected": -2.2834787368774414, + "logps/chosen": -188.00990295410156, + "logps/rejected": -243.7239990234375, + "loss": 0.0854, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.0669825077056885, + "rewards/margins": 5.66384220123291, + "rewards/rejected": -7.7308244705200195, + "step": 765 + }, + { + "epoch": 2.6324786324786325, + "grad_norm": 18.90450563159807, + "learning_rate": 3.651384359044773e-08, + "logits/chosen": -2.289638042449951, + "logits/rejected": -2.260533332824707, + "logps/chosen": -179.10984802246094, + "logps/rejected": -227.11178588867188, + "loss": 0.0839, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.0667707920074463, + "rewards/margins": 5.173145771026611, + "rewards/rejected": -7.2399163246154785, + "step": 770 + }, + { + "epoch": 2.6495726495726495, + "grad_norm": 20.019780117687827, + "learning_rate": 3.318812261004467e-08, + "logits/chosen": -2.297818660736084, + "logits/rejected": -2.2862818241119385, + "logps/chosen": -184.581787109375, + "logps/rejected": -241.4400177001953, + "loss": 0.0792, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.9384911060333252, + "rewards/margins": 5.484036922454834, + "rewards/rejected": -7.4225287437438965, + "step": 775 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 25.308189579478544, + "learning_rate": 3.001598138109407e-08, + "logits/chosen": -2.3005475997924805, + "logits/rejected": -2.2629055976867676, + "logps/chosen": -182.78228759765625, + "logps/rejected": -235.35086059570312, + "loss": 0.0885, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.8058080673217773, + "rewards/margins": 5.312202453613281, + "rewards/rejected": -7.118011474609375, + "step": 780 + }, + { + "epoch": 2.683760683760684, + "grad_norm": 13.474267468085888, + "learning_rate": 2.6998463528844217e-08, + "logits/chosen": -2.3062500953674316, + "logits/rejected": -2.2558088302612305, + "logps/chosen": -181.8199462890625, + "logps/rejected": -233.541748046875, + "loss": 0.074, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.7403972148895264, + "rewards/margins": 5.567864418029785, + "rewards/rejected": -7.308261871337891, + "step": 785 + }, + { + "epoch": 2.700854700854701, + "grad_norm": 19.30064581779934, + "learning_rate": 2.4136561807901913e-08, + "logits/chosen": -2.3051257133483887, + "logits/rejected": -2.270841598510742, + "logps/chosen": -185.358642578125, + "logps/rejected": -246.6787567138672, + "loss": 0.0762, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.9827969074249268, + "rewards/margins": 6.065325736999512, + "rewards/rejected": -8.048123359680176, + "step": 790 + }, + { + "epoch": 2.717948717948718, + "grad_norm": 19.741579494509946, + "learning_rate": 2.143121777561868e-08, + "logits/chosen": -2.287276029586792, + "logits/rejected": -2.258674144744873, + "logps/chosen": -176.90139770507812, + "logps/rejected": -232.11026000976562, + "loss": 0.0724, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.573284387588501, + "rewards/margins": 6.089115619659424, + "rewards/rejected": -7.6623992919921875, + "step": 795 + }, + { + "epoch": 2.735042735042735, + "grad_norm": 19.940662080091073, + "learning_rate": 1.8883321482321578e-08, + "logits/chosen": -2.2892422676086426, + "logits/rejected": -2.2566611766815186, + "logps/chosen": -172.860595703125, + "logps/rejected": -232.4971160888672, + "loss": 0.0717, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.6984087228775024, + "rewards/margins": 5.417222499847412, + "rewards/rejected": -7.115631103515625, + "step": 800 + }, + { + "epoch": 2.735042735042735, + "eval_logits/chosen": -2.2973055839538574, + "eval_logits/rejected": -2.2609665393829346, + "eval_logps/chosen": -191.64833068847656, + "eval_logps/rejected": -220.16880798339844, + "eval_loss": 0.4532097578048706, + "eval_rewards/accuracies": 0.8192307949066162, + "eval_rewards/chosen": -3.317098617553711, + "eval_rewards/margins": 2.8117525577545166, + "eval_rewards/rejected": -6.128850936889648, + "eval_runtime": 510.4686, + "eval_samples_per_second": 16.287, + "eval_steps_per_second": 0.255, + "step": 800 + }, + { + "epoch": 2.752136752136752, + "grad_norm": 17.41444629544834, + "learning_rate": 1.6493711178488744e-08, + "logits/chosen": -2.283872365951538, + "logits/rejected": -2.251258134841919, + "logps/chosen": -179.69078063964844, + "logps/rejected": -234.2012939453125, + "loss": 0.08, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.8895294666290283, + "rewards/margins": 5.602183818817139, + "rewards/rejected": -7.491713523864746, + "step": 805 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 17.858679577395865, + "learning_rate": 1.4263173038967624e-08, + "logits/chosen": -2.2997751235961914, + "logits/rejected": -2.249659776687622, + "logps/chosen": -184.75465393066406, + "logps/rejected": -241.6949920654297, + "loss": 0.0881, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.9601249694824219, + "rewards/margins": 5.683220863342285, + "rewards/rejected": -7.643344879150391, + "step": 810 + }, + { + "epoch": 2.786324786324786, + "grad_norm": 24.316728090401927, + "learning_rate": 1.2192440904325863e-08, + "logits/chosen": -2.2953031063079834, + "logits/rejected": -2.2512805461883545, + "logps/chosen": -180.14657592773438, + "logps/rejected": -238.9154815673828, + "loss": 0.076, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.9033313989639282, + "rewards/margins": 6.157049655914307, + "rewards/rejected": -8.060380935668945, + "step": 815 + }, + { + "epoch": 2.8034188034188032, + "grad_norm": 25.493853055907447, + "learning_rate": 1.0282196039419822e-08, + "logits/chosen": -2.296130418777466, + "logits/rejected": -2.242311954498291, + "logps/chosen": -180.66371154785156, + "logps/rejected": -233.595703125, + "loss": 0.0871, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.8362958431243896, + "rewards/margins": 5.512847423553467, + "rewards/rejected": -7.349143028259277, + "step": 820 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 19.824952977643857, + "learning_rate": 8.5330669092602e-09, + "logits/chosen": -2.300177574157715, + "logits/rejected": -2.274824619293213, + "logps/chosen": -182.87335205078125, + "logps/rejected": -240.62448120117188, + "loss": 0.084, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.805980920791626, + "rewards/margins": 5.6406683921813965, + "rewards/rejected": -7.446649074554443, + "step": 825 + }, + { + "epoch": 2.8376068376068377, + "grad_norm": 17.98540368194656, + "learning_rate": 6.945628972249207e-09, + "logits/chosen": -2.2921595573425293, + "logits/rejected": -2.252385377883911, + "logps/chosen": -187.79107666015625, + "logps/rejected": -234.39712524414062, + "loss": 0.0866, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.911435842514038, + "rewards/margins": 5.263657569885254, + "rewards/rejected": -7.175093650817871, + "step": 830 + }, + { + "epoch": 2.8547008547008548, + "grad_norm": 22.73115154368679, + "learning_rate": 5.520404490856223e-09, + "logits/chosen": -2.2854042053222656, + "logits/rejected": -2.2550876140594482, + "logps/chosen": -188.21945190429688, + "logps/rejected": -236.88558959960938, + "loss": 0.0789, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.0177128314971924, + "rewards/margins": 5.572363376617432, + "rewards/rejected": -7.590075492858887, + "step": 835 + }, + { + "epoch": 2.871794871794872, + "grad_norm": 20.407514829393712, + "learning_rate": 4.257862359794917e-09, + "logits/chosen": -2.289051055908203, + "logits/rejected": -2.247889280319214, + "logps/chosen": -179.62939453125, + "logps/rejected": -240.20358276367188, + "loss": 0.0691, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8513367176055908, + "rewards/margins": 5.8751325607299805, + "rewards/rejected": -7.726468563079834, + "step": 840 + }, + { + "epoch": 2.871794871794872, + "eval_logits/chosen": -2.296386957168579, + "eval_logits/rejected": -2.2604150772094727, + "eval_logps/chosen": -191.21661376953125, + "eval_logps/rejected": -219.7353515625, + "eval_loss": 0.45144182443618774, + "eval_rewards/accuracies": 0.8211538195610046, + "eval_rewards/chosen": -3.273927688598633, + "eval_rewards/margins": 2.811577558517456, + "eval_rewards/rejected": -6.085505485534668, + "eval_runtime": 510.3719, + "eval_samples_per_second": 16.29, + "eval_steps_per_second": 0.255, + "step": 840 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 16.797802487321363, + "learning_rate": 3.158417951758474e-09, + "logits/chosen": -2.2995002269744873, + "logits/rejected": -2.263909339904785, + "logps/chosen": -187.00927734375, + "logps/rejected": -240.8574676513672, + "loss": 0.0761, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9329954385757446, + "rewards/margins": 5.68951940536499, + "rewards/rejected": -7.622515678405762, + "step": 845 + }, + { + "epoch": 2.905982905982906, + "grad_norm": 16.791784749254507, + "learning_rate": 2.2224329807629118e-09, + "logits/chosen": -2.290558338165283, + "logits/rejected": -2.2760066986083984, + "logps/chosen": -196.16612243652344, + "logps/rejected": -246.82626342773438, + "loss": 0.0746, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.8158422708511353, + "rewards/margins": 5.765191078186035, + "rewards/rejected": -7.581032752990723, + "step": 850 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 19.558918422253715, + "learning_rate": 1.450215383144382e-09, + "logits/chosen": -2.2866015434265137, + "logits/rejected": -2.2506113052368164, + "logps/chosen": -180.97921752929688, + "logps/rejected": -238.69216918945312, + "loss": 0.079, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.7389026880264282, + "rewards/margins": 5.652696132659912, + "rewards/rejected": -7.391599178314209, + "step": 855 + }, + { + "epoch": 2.9401709401709404, + "grad_norm": 17.624810409934664, + "learning_rate": 8.420192162490458e-10, + "logits/chosen": -2.300001621246338, + "logits/rejected": -2.256424903869629, + "logps/chosen": -175.36312866210938, + "logps/rejected": -237.1394500732422, + "loss": 0.0725, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.7655397653579712, + "rewards/margins": 5.986645698547363, + "rewards/rejected": -7.752184867858887, + "step": 860 + }, + { + "epoch": 2.9572649572649574, + "grad_norm": 18.143952678520897, + "learning_rate": 3.98044574848877e-10, + "logits/chosen": -2.295402765274048, + "logits/rejected": -2.252981424331665, + "logps/chosen": -168.66152954101562, + "logps/rejected": -224.2865753173828, + "loss": 0.085, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.086804151535034, + "rewards/margins": 5.356126308441162, + "rewards/rejected": -7.442930698394775, + "step": 865 + }, + { + "epoch": 2.9743589743589745, + "grad_norm": 16.814549347952457, + "learning_rate": 1.1843752531104368e-10, + "logits/chosen": -2.302126407623291, + "logits/rejected": -2.25750732421875, + "logps/chosen": -182.16998291015625, + "logps/rejected": -239.323486328125, + "loss": 0.0743, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.7848079204559326, + "rewards/margins": 5.679649829864502, + "rewards/rejected": -7.464458465576172, + "step": 870 + }, + { + "epoch": 2.9914529914529915, + "grad_norm": 21.29362677599653, + "learning_rate": 3.290057542459923e-12, + "logits/chosen": -2.307149887084961, + "logits/rejected": -2.273266077041626, + "logps/chosen": -187.17897033691406, + "logps/rejected": -248.2684326171875, + "loss": 0.0748, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9107911586761475, + "rewards/margins": 5.949364185333252, + "rewards/rejected": -7.8601555824279785, + "step": 875 + }, + { + "epoch": 2.994871794871795, + "step": 876, + "total_flos": 1.0330118169821184e+16, + "train_loss": 0.2632859356310134, + "train_runtime": 40450.3897, + "train_samples_per_second": 5.549, + "train_steps_per_second": 0.022 + } + ], + "logging_steps": 5, + "max_steps": 876, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 40, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0330118169821184e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}