htlou's picture
Upload folder using huggingface_hub
5acd714 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0940170940170941,
"eval_steps": 40,
"global_step": 320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017094017094017096,
"grad_norm": 35.038580788061665,
"learning_rate": 5e-07,
"logits/chosen": -2.7457876205444336,
"logits/rejected": -2.7444841861724854,
"logps/chosen": -164.26461791992188,
"logps/rejected": -170.55870056152344,
"loss": 0.6935,
"rewards/accuracies": 0.26875001192092896,
"rewards/chosen": 0.003455913159996271,
"rewards/margins": -0.0019886991940438747,
"rewards/rejected": 0.0054446132853627205,
"step": 5
},
{
"epoch": 0.03418803418803419,
"grad_norm": 36.203903910498276,
"learning_rate": 1e-06,
"logits/chosen": -2.7106502056121826,
"logits/rejected": -2.716397523880005,
"logps/chosen": -171.80043029785156,
"logps/rejected": -165.20602416992188,
"loss": 0.6875,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.012000308372080326,
"rewards/margins": 0.0025437879376113415,
"rewards/rejected": 0.009456520900130272,
"step": 10
},
{
"epoch": 0.05128205128205128,
"grad_norm": 33.9576577784673,
"learning_rate": 9.999177507263144e-07,
"logits/chosen": -2.651571750640869,
"logits/rejected": -2.629457473754883,
"logps/chosen": -174.04080200195312,
"logps/rejected": -174.0542755126953,
"loss": 0.6698,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.23909731209278107,
"rewards/margins": 0.10868903249502182,
"rewards/rejected": 0.13040827214717865,
"step": 15
},
{
"epoch": 0.06837606837606838,
"grad_norm": 34.33646066636181,
"learning_rate": 9.996710299650301e-07,
"logits/chosen": -2.476440668106079,
"logits/rejected": -2.450225353240967,
"logps/chosen": -158.1311798095703,
"logps/rejected": -158.0066680908203,
"loss": 0.6613,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.4318675100803375,
"rewards/margins": 0.14549395442008972,
"rewards/rejected": 0.2863735556602478,
"step": 20
},
{
"epoch": 0.08547008547008547,
"grad_norm": 33.16430522723429,
"learning_rate": 9.992599188865604e-07,
"logits/chosen": -2.3086318969726562,
"logits/rejected": -2.3104796409606934,
"logps/chosen": -150.59771728515625,
"logps/rejected": -156.85037231445312,
"loss": 0.6494,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.5047669410705566,
"rewards/margins": 0.16554531455039978,
"rewards/rejected": 0.33922165632247925,
"step": 25
},
{
"epoch": 0.10256410256410256,
"grad_norm": 34.52861424862365,
"learning_rate": 9.98684552745256e-07,
"logits/chosen": -2.217874050140381,
"logits/rejected": -2.2254481315612793,
"logps/chosen": -161.29412841796875,
"logps/rejected": -161.40841674804688,
"loss": 0.6295,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.4176379144191742,
"rewards/margins": 0.26531916856765747,
"rewards/rejected": 0.15231874585151672,
"step": 30
},
{
"epoch": 0.11965811965811966,
"grad_norm": 31.455117829218544,
"learning_rate": 9.979451208349055e-07,
"logits/chosen": -2.2608728408813477,
"logits/rejected": -2.246007204055786,
"logps/chosen": -171.71456909179688,
"logps/rejected": -174.46578979492188,
"loss": 0.6305,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.01912705972790718,
"rewards/margins": 0.31441593170166016,
"rewards/rejected": -0.33354294300079346,
"step": 35
},
{
"epoch": 0.13675213675213677,
"grad_norm": 31.67318837058587,
"learning_rate": 9.970418664264595e-07,
"logits/chosen": -2.345672130584717,
"logits/rejected": -2.331491470336914,
"logps/chosen": -171.24766540527344,
"logps/rejected": -176.8189697265625,
"loss": 0.5989,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.27867692708969116,
"rewards/margins": 0.5290472507476807,
"rewards/rejected": -0.8077241778373718,
"step": 40
},
{
"epoch": 0.13675213675213677,
"eval_logits/chosen": -2.4102065563201904,
"eval_logits/rejected": -2.401230573654175,
"eval_logps/chosen": -162.36439514160156,
"eval_logps/rejected": -167.4954071044922,
"eval_loss": 0.6069236993789673,
"eval_rewards/accuracies": 0.6365384459495544,
"eval_rewards/chosen": -0.388705849647522,
"eval_rewards/margins": 0.47280558943748474,
"eval_rewards/rejected": -0.8615114688873291,
"eval_runtime": 509.918,
"eval_samples_per_second": 16.305,
"eval_steps_per_second": 0.255,
"step": 40
},
{
"epoch": 0.15384615384615385,
"grad_norm": 36.18313806223269,
"learning_rate": 9.95975086687994e-07,
"logits/chosen": -2.44050669670105,
"logits/rejected": -2.4460220336914062,
"logps/chosen": -163.82875061035156,
"logps/rejected": -167.35989379882812,
"loss": 0.6146,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.31098368763923645,
"rewards/margins": 0.46269193291664124,
"rewards/rejected": -0.7736755609512329,
"step": 45
},
{
"epoch": 0.17094017094017094,
"grad_norm": 31.13412274683678,
"learning_rate": 9.947451325869439e-07,
"logits/chosen": -2.501091718673706,
"logits/rejected": -2.4991250038146973,
"logps/chosen": -172.09686279296875,
"logps/rejected": -177.7747802734375,
"loss": 0.577,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.212348073720932,
"rewards/margins": 0.6062799692153931,
"rewards/rejected": -0.8186280131340027,
"step": 50
},
{
"epoch": 0.18803418803418803,
"grad_norm": 31.508672436862835,
"learning_rate": 9.933524087746347e-07,
"logits/chosen": -2.437525510787964,
"logits/rejected": -2.4285693168640137,
"logps/chosen": -168.1316375732422,
"logps/rejected": -175.23193359375,
"loss": 0.571,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.513076901435852,
"rewards/margins": 0.7702310681343079,
"rewards/rejected": -1.2833080291748047,
"step": 55
},
{
"epoch": 0.20512820512820512,
"grad_norm": 30.148068867306787,
"learning_rate": 9.917973734531549e-07,
"logits/chosen": -2.431530475616455,
"logits/rejected": -2.431729793548584,
"logps/chosen": -159.38168334960938,
"logps/rejected": -170.52500915527344,
"loss": 0.5762,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.34855490922927856,
"rewards/margins": 0.5969334244728088,
"rewards/rejected": -0.9454883337020874,
"step": 60
},
{
"epoch": 0.2222222222222222,
"grad_norm": 32.03814968183332,
"learning_rate": 9.90080538224607e-07,
"logits/chosen": -2.533193588256836,
"logits/rejected": -2.5252978801727295,
"logps/chosen": -157.30966186523438,
"logps/rejected": -166.26011657714844,
"loss": 0.5643,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.007600936107337475,
"rewards/margins": 0.5010749697685242,
"rewards/rejected": -0.5086758732795715,
"step": 65
},
{
"epoch": 0.23931623931623933,
"grad_norm": 29.16308768569833,
"learning_rate": 9.882024679227938e-07,
"logits/chosen": -2.5899624824523926,
"logits/rejected": -2.5779967308044434,
"logps/chosen": -178.4553985595703,
"logps/rejected": -179.71542358398438,
"loss": 0.5464,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.47189587354660034,
"rewards/margins": 0.8304598927497864,
"rewards/rejected": -1.3023556470870972,
"step": 70
},
{
"epoch": 0.2564102564102564,
"grad_norm": 28.918531347661485,
"learning_rate": 9.861637804273881e-07,
"logits/chosen": -2.578892469406128,
"logits/rejected": -2.5758416652679443,
"logps/chosen": -162.60537719726562,
"logps/rejected": -170.6789093017578,
"loss": 0.5553,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.45147842168807983,
"rewards/margins": 0.6994724273681641,
"rewards/rejected": -1.1509509086608887,
"step": 75
},
{
"epoch": 0.27350427350427353,
"grad_norm": 26.98866754941649,
"learning_rate": 9.83965146460653e-07,
"logits/chosen": -2.54936146736145,
"logits/rejected": -2.5406956672668457,
"logps/chosen": -168.81484985351562,
"logps/rejected": -179.770751953125,
"loss": 0.5452,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6899678111076355,
"rewards/margins": 0.8549306988716125,
"rewards/rejected": -1.544898509979248,
"step": 80
},
{
"epoch": 0.27350427350427353,
"eval_logits/chosen": -2.53336238861084,
"eval_logits/rejected": -2.517695665359497,
"eval_logps/chosen": -167.28964233398438,
"eval_logps/rejected": -177.21824645996094,
"eval_loss": 0.5331124663352966,
"eval_rewards/accuracies": 0.7134615182876587,
"eval_rewards/chosen": -0.8812309503555298,
"eval_rewards/margins": 0.9525622725486755,
"eval_rewards/rejected": -1.8337931632995605,
"eval_runtime": 510.0922,
"eval_samples_per_second": 16.299,
"eval_steps_per_second": 0.255,
"step": 80
},
{
"epoch": 0.2905982905982906,
"grad_norm": 34.783908892421536,
"learning_rate": 9.816072893667758e-07,
"logits/chosen": -2.5432825088500977,
"logits/rejected": -2.5159504413604736,
"logps/chosen": -174.62197875976562,
"logps/rejected": -185.89413452148438,
"loss": 0.5581,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.0434839725494385,
"rewards/margins": 1.0283188819885254,
"rewards/rejected": -2.0718026161193848,
"step": 85
},
{
"epoch": 0.3076923076923077,
"grad_norm": 26.697686805838906,
"learning_rate": 9.790909848738904e-07,
"logits/chosen": -2.5102508068084717,
"logits/rejected": -2.5222485065460205,
"logps/chosen": -175.47544860839844,
"logps/rejected": -183.92678833007812,
"loss": 0.5208,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.9199908971786499,
"rewards/margins": 0.8521744608879089,
"rewards/rejected": -1.7721655368804932,
"step": 90
},
{
"epoch": 0.3247863247863248,
"grad_norm": 30.125094604814798,
"learning_rate": 9.764170608388647e-07,
"logits/chosen": -2.514260768890381,
"logits/rejected": -2.4829812049865723,
"logps/chosen": -167.62655639648438,
"logps/rejected": -174.2395477294922,
"loss": 0.5242,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.6241778135299683,
"rewards/margins": 1.0742968320846558,
"rewards/rejected": -1.6984745264053345,
"step": 95
},
{
"epoch": 0.3418803418803419,
"grad_norm": 27.550843374580296,
"learning_rate": 9.735863969749371e-07,
"logits/chosen": -2.4171032905578613,
"logits/rejected": -2.381608486175537,
"logps/chosen": -177.05935668945312,
"logps/rejected": -188.4621124267578,
"loss": 0.5002,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.7831762433052063,
"rewards/margins": 1.0672458410263062,
"rewards/rejected": -1.8504221439361572,
"step": 100
},
{
"epoch": 0.358974358974359,
"grad_norm": 30.39392617500016,
"learning_rate": 9.705999245622956e-07,
"logits/chosen": -2.3619236946105957,
"logits/rejected": -2.3391060829162598,
"logps/chosen": -170.48300170898438,
"logps/rejected": -183.28384399414062,
"loss": 0.5026,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8889726400375366,
"rewards/margins": 0.9097515940666199,
"rewards/rejected": -1.7987244129180908,
"step": 105
},
{
"epoch": 0.37606837606837606,
"grad_norm": 26.741945030347612,
"learning_rate": 9.674586261416873e-07,
"logits/chosen": -2.2946972846984863,
"logits/rejected": -2.2440435886383057,
"logps/chosen": -179.06390380859375,
"logps/rejected": -188.00010681152344,
"loss": 0.5206,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6539386510848999,
"rewards/margins": 1.0372655391693115,
"rewards/rejected": -1.691204309463501,
"step": 110
},
{
"epoch": 0.39316239316239315,
"grad_norm": 33.116742735027486,
"learning_rate": 9.641635351911664e-07,
"logits/chosen": -2.218276262283325,
"logits/rejected": -2.18500018119812,
"logps/chosen": -171.17381286621094,
"logps/rejected": -183.25845336914062,
"loss": 0.4801,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.9279203414916992,
"rewards/margins": 1.2727015018463135,
"rewards/rejected": -2.200622081756592,
"step": 115
},
{
"epoch": 0.41025641025641024,
"grad_norm": 27.185641229760538,
"learning_rate": 9.607157357860821e-07,
"logits/chosen": -2.124584436416626,
"logits/rejected": -2.0961549282073975,
"logps/chosen": -189.48277282714844,
"logps/rejected": -203.43951416015625,
"loss": 0.5026,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.2869656085968018,
"rewards/margins": 1.3039339780807495,
"rewards/rejected": -2.5908992290496826,
"step": 120
},
{
"epoch": 0.41025641025641024,
"eval_logits/chosen": -2.0268211364746094,
"eval_logits/rejected": -1.9764775037765503,
"eval_logps/chosen": -172.888671875,
"eval_logps/rejected": -185.58355712890625,
"eval_loss": 0.49246644973754883,
"eval_rewards/accuracies": 0.7442307472229004,
"eval_rewards/chosen": -1.441135048866272,
"eval_rewards/margins": 1.2291908264160156,
"eval_rewards/rejected": -2.670325756072998,
"eval_runtime": 510.1247,
"eval_samples_per_second": 16.298,
"eval_steps_per_second": 0.255,
"step": 120
},
{
"epoch": 0.42735042735042733,
"grad_norm": 31.03461706328688,
"learning_rate": 9.571163622424225e-07,
"logits/chosen": -1.944964051246643,
"logits/rejected": -1.9178746938705444,
"logps/chosen": -175.3327178955078,
"logps/rejected": -188.2616729736328,
"loss": 0.5017,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.579502820968628,
"rewards/margins": 1.2485122680664062,
"rewards/rejected": -2.828014850616455,
"step": 125
},
{
"epoch": 0.4444444444444444,
"grad_norm": 29.080520770184428,
"learning_rate": 9.533665987436261e-07,
"logits/chosen": -1.8825464248657227,
"logits/rejected": -1.8078832626342773,
"logps/chosen": -178.3484649658203,
"logps/rejected": -197.55380249023438,
"loss": 0.4983,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.5868518352508545,
"rewards/margins": 1.2471343278884888,
"rewards/rejected": -2.8339860439300537,
"step": 130
},
{
"epoch": 0.46153846153846156,
"grad_norm": 28.903021536294002,
"learning_rate": 9.494676789509899e-07,
"logits/chosen": -1.8585374355316162,
"logits/rejected": -1.8128669261932373,
"logps/chosen": -178.5911407470703,
"logps/rejected": -195.90933227539062,
"loss": 0.492,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2965319156646729,
"rewards/margins": 1.4173026084899902,
"rewards/rejected": -2.713834285736084,
"step": 135
},
{
"epoch": 0.47863247863247865,
"grad_norm": 27.5476391641307,
"learning_rate": 9.454208855977985e-07,
"logits/chosen": -1.920654296875,
"logits/rejected": -1.8412939310073853,
"logps/chosen": -179.1053924560547,
"logps/rejected": -196.11526489257812,
"loss": 0.4753,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.5140180587768555,
"rewards/margins": 1.5388453006744385,
"rewards/rejected": -3.052863121032715,
"step": 140
},
{
"epoch": 0.49572649572649574,
"grad_norm": 30.03317842923354,
"learning_rate": 9.41227550067308e-07,
"logits/chosen": -1.9514515399932861,
"logits/rejected": -1.949883222579956,
"logps/chosen": -178.63250732421875,
"logps/rejected": -191.42721557617188,
"loss": 0.4803,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.513338327407837,
"rewards/margins": 1.4887291193008423,
"rewards/rejected": -3.0020670890808105,
"step": 145
},
{
"epoch": 0.5128205128205128,
"grad_norm": 30.28469957381902,
"learning_rate": 9.36889051954725e-07,
"logits/chosen": -2.0093894004821777,
"logits/rejected": -1.9657704830169678,
"logps/chosen": -180.35043334960938,
"logps/rejected": -197.2502899169922,
"loss": 0.4895,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.602224588394165,
"rewards/margins": 1.6883083581924438,
"rewards/rejected": -3.2905325889587402,
"step": 150
},
{
"epoch": 0.5299145299145299,
"grad_norm": 28.420242591686232,
"learning_rate": 9.324068186133245e-07,
"logits/chosen": -1.9976894855499268,
"logits/rejected": -1.9886022806167603,
"logps/chosen": -171.70602416992188,
"logps/rejected": -185.99795532226562,
"loss": 0.4608,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.2952425479888916,
"rewards/margins": 1.7483227252960205,
"rewards/rejected": -3.043565034866333,
"step": 155
},
{
"epoch": 0.5470085470085471,
"grad_norm": 26.601543429998234,
"learning_rate": 9.277823246848536e-07,
"logits/chosen": -2.056879758834839,
"logits/rejected": -1.9998328685760498,
"logps/chosen": -186.3706817626953,
"logps/rejected": -196.63290405273438,
"loss": 0.4511,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2312135696411133,
"rewards/margins": 1.352858304977417,
"rewards/rejected": -2.5840718746185303,
"step": 160
},
{
"epoch": 0.5470085470085471,
"eval_logits/chosen": -2.070892095565796,
"eval_logits/rejected": -2.0279953479766846,
"eval_logps/chosen": -171.76034545898438,
"eval_logps/rejected": -189.1643829345703,
"eval_loss": 0.4683005213737488,
"eval_rewards/accuracies": 0.762499988079071,
"eval_rewards/chosen": -1.328302264213562,
"eval_rewards/margins": 1.70010507106781,
"eval_rewards/rejected": -3.028407096862793,
"eval_runtime": 509.9565,
"eval_samples_per_second": 16.303,
"eval_steps_per_second": 0.255,
"step": 160
},
{
"epoch": 0.5641025641025641,
"grad_norm": 41.76296476638838,
"learning_rate": 9.230170916143793e-07,
"logits/chosen": -2.1190731525421143,
"logits/rejected": -2.083359956741333,
"logps/chosen": -176.87539672851562,
"logps/rejected": -198.44384765625,
"loss": 0.4944,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.2975060939788818,
"rewards/margins": 1.6890850067138672,
"rewards/rejected": -2.98659086227417,
"step": 165
},
{
"epoch": 0.5811965811965812,
"grad_norm": 28.83194976337172,
"learning_rate": 9.181126871497378e-07,
"logits/chosen": -2.175851583480835,
"logits/rejected": -2.1391243934631348,
"logps/chosen": -178.2881317138672,
"logps/rejected": -197.88473510742188,
"loss": 0.4813,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.2544641494750977,
"rewards/margins": 1.7747846841812134,
"rewards/rejected": -3.0292489528656006,
"step": 170
},
{
"epoch": 0.5982905982905983,
"grad_norm": 30.93659066586097,
"learning_rate": 9.130707248257491e-07,
"logits/chosen": -2.313814640045166,
"logits/rejected": -2.2677135467529297,
"logps/chosen": -170.06781005859375,
"logps/rejected": -177.8175811767578,
"loss": 0.4863,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.0524061918258667,
"rewards/margins": 1.3644572496414185,
"rewards/rejected": -2.416863441467285,
"step": 175
},
{
"epoch": 0.6153846153846154,
"grad_norm": 25.018999438635433,
"learning_rate": 9.078928634333698e-07,
"logits/chosen": -2.302171230316162,
"logits/rejected": -2.2788572311401367,
"logps/chosen": -179.72390747070312,
"logps/rejected": -197.12283325195312,
"loss": 0.4553,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.6731274724006653,
"rewards/margins": 1.6728944778442383,
"rewards/rejected": -2.346021890640259,
"step": 180
},
{
"epoch": 0.6324786324786325,
"grad_norm": 28.576400660174777,
"learning_rate": 9.025808064739549e-07,
"logits/chosen": -2.2794651985168457,
"logits/rejected": -2.2391860485076904,
"logps/chosen": -175.87045288085938,
"logps/rejected": -189.4848175048828,
"loss": 0.4854,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.8901998400688171,
"rewards/margins": 1.4675487279891968,
"rewards/rejected": -2.357748508453369,
"step": 185
},
{
"epoch": 0.6495726495726496,
"grad_norm": 25.73471562251865,
"learning_rate": 8.971363015988113e-07,
"logits/chosen": -2.1966824531555176,
"logits/rejected": -2.1603925228118896,
"logps/chosen": -172.0600128173828,
"logps/rejected": -191.96176147460938,
"loss": 0.4681,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.9620615243911743,
"rewards/margins": 1.4954371452331543,
"rewards/rejected": -2.457498550415039,
"step": 190
},
{
"epoch": 0.6666666666666666,
"grad_norm": 34.912982133976655,
"learning_rate": 8.91561140034225e-07,
"logits/chosen": -2.1389029026031494,
"logits/rejected": -2.0825791358947754,
"logps/chosen": -174.3153839111328,
"logps/rejected": -194.2677459716797,
"loss": 0.4935,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.4726169109344482,
"rewards/margins": 1.4599871635437012,
"rewards/rejected": -2.9326040744781494,
"step": 195
},
{
"epoch": 0.6837606837606838,
"grad_norm": 25.756167591259292,
"learning_rate": 8.858571559921537e-07,
"logits/chosen": -2.135298013687134,
"logits/rejected": -2.067862033843994,
"logps/chosen": -178.73361206054688,
"logps/rejected": -193.21209716796875,
"loss": 0.4562,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.452704668045044,
"rewards/margins": 1.6391651630401611,
"rewards/rejected": -3.091869831085205,
"step": 200
},
{
"epoch": 0.6837606837606838,
"eval_logits/chosen": -2.1462392807006836,
"eval_logits/rejected": -2.1028637886047363,
"eval_logps/chosen": -173.41998291015625,
"eval_logps/rejected": -191.55532836914062,
"eval_loss": 0.4528014361858368,
"eval_rewards/accuracies": 0.7567307949066162,
"eval_rewards/chosen": -1.4942626953125,
"eval_rewards/margins": 1.7732419967651367,
"eval_rewards/rejected": -3.2675046920776367,
"eval_runtime": 510.9487,
"eval_samples_per_second": 16.272,
"eval_steps_per_second": 0.254,
"step": 200
},
{
"epoch": 0.7008547008547008,
"grad_norm": 26.77931167801656,
"learning_rate": 8.800262260667754e-07,
"logits/chosen": -2.1584880352020264,
"logits/rejected": -2.100416660308838,
"logps/chosen": -165.63743591308594,
"logps/rejected": -183.36476135253906,
"loss": 0.4653,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.409201741218567,
"rewards/margins": 1.6899499893188477,
"rewards/rejected": -3.099151611328125,
"step": 205
},
{
"epoch": 0.717948717948718,
"grad_norm": 25.986078947597964,
"learning_rate": 8.740702686170954e-07,
"logits/chosen": -2.2075798511505127,
"logits/rejected": -2.151484727859497,
"logps/chosen": -179.00509643554688,
"logps/rejected": -194.68353271484375,
"loss": 0.4426,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.2429417371749878,
"rewards/margins": 1.6721550226211548,
"rewards/rejected": -2.9150967597961426,
"step": 210
},
{
"epoch": 0.7350427350427351,
"grad_norm": 24.89101303634129,
"learning_rate": 8.679912431358109e-07,
"logits/chosen": -2.1802072525024414,
"logits/rejected": -2.1238255500793457,
"logps/chosen": -172.57705688476562,
"logps/rejected": -189.31666564941406,
"loss": 0.4521,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.4200295209884644,
"rewards/margins": 1.9437878131866455,
"rewards/rejected": -3.3638176918029785,
"step": 215
},
{
"epoch": 0.7521367521367521,
"grad_norm": 27.617679879143566,
"learning_rate": 8.617911496046445e-07,
"logits/chosen": -2.174743413925171,
"logits/rejected": -2.1131985187530518,
"logps/chosen": -171.0723876953125,
"logps/rejected": -189.23275756835938,
"loss": 0.4655,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.5752933025360107,
"rewards/margins": 1.7622945308685303,
"rewards/rejected": -3.337587833404541,
"step": 220
},
{
"epoch": 0.7692307692307693,
"grad_norm": 25.912722321128637,
"learning_rate": 8.554720278363547e-07,
"logits/chosen": -2.206986427307129,
"logits/rejected": -2.1668283939361572,
"logps/chosen": -175.4432830810547,
"logps/rejected": -193.02845764160156,
"loss": 0.4693,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.6182912588119507,
"rewards/margins": 1.7047646045684814,
"rewards/rejected": -3.3230559825897217,
"step": 225
},
{
"epoch": 0.7863247863247863,
"grad_norm": 27.20181876713083,
"learning_rate": 8.490359568036445e-07,
"logits/chosen": -2.3055601119995117,
"logits/rejected": -2.2838051319122314,
"logps/chosen": -183.21449279785156,
"logps/rejected": -205.37521362304688,
"loss": 0.4524,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.517019271850586,
"rewards/margins": 1.586157202720642,
"rewards/rejected": -3.1031765937805176,
"step": 230
},
{
"epoch": 0.8034188034188035,
"grad_norm": 73.7275222246594,
"learning_rate": 8.424850539551856e-07,
"logits/chosen": -2.367276668548584,
"logits/rejected": -2.349586009979248,
"logps/chosen": -174.82656860351562,
"logps/rejected": -191.1867218017578,
"loss": 0.4536,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4777748584747314,
"rewards/margins": 1.7120048999786377,
"rewards/rejected": -3.1897799968719482,
"step": 235
},
{
"epoch": 0.8205128205128205,
"grad_norm": 23.009281700277114,
"learning_rate": 8.358214745189829e-07,
"logits/chosen": -2.4104866981506348,
"logits/rejected": -2.3766913414001465,
"logps/chosen": -184.68222045898438,
"logps/rejected": -205.4649200439453,
"loss": 0.4189,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.6187865734100342,
"rewards/margins": 2.1064658164978027,
"rewards/rejected": -3.725252151489258,
"step": 240
},
{
"epoch": 0.8205128205128205,
"eval_logits/chosen": -2.447181224822998,
"eval_logits/rejected": -2.4165050983428955,
"eval_logps/chosen": -177.78672790527344,
"eval_logps/rejected": -197.77915954589844,
"eval_loss": 0.44941428303718567,
"eval_rewards/accuracies": 0.7663461565971375,
"eval_rewards/chosen": -1.9309390783309937,
"eval_rewards/margins": 1.958947777748108,
"eval_rewards/rejected": -3.8898868560791016,
"eval_runtime": 510.6066,
"eval_samples_per_second": 16.283,
"eval_steps_per_second": 0.255,
"step": 240
},
{
"epoch": 0.8376068376068376,
"grad_norm": 25.877339189588067,
"learning_rate": 8.290474107933114e-07,
"logits/chosen": -2.450867176055908,
"logits/rejected": -2.427006483078003,
"logps/chosen": -186.76683044433594,
"logps/rejected": -206.23147583007812,
"loss": 0.4441,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.9867289066314697,
"rewards/margins": 2.0301365852355957,
"rewards/rejected": -4.0168657302856445,
"step": 245
},
{
"epoch": 0.8547008547008547,
"grad_norm": 29.453953450785107,
"learning_rate": 8.221650914254565e-07,
"logits/chosen": -2.464049816131592,
"logits/rejected": -2.4303317070007324,
"logps/chosen": -184.5537872314453,
"logps/rejected": -196.9418487548828,
"loss": 0.4919,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.074887752532959,
"rewards/margins": 1.6710395812988281,
"rewards/rejected": -3.745927333831787,
"step": 250
},
{
"epoch": 0.8717948717948718,
"grad_norm": 26.569329016808155,
"learning_rate": 8.151767806784953e-07,
"logits/chosen": -2.4366822242736816,
"logits/rejected": -2.4094901084899902,
"logps/chosen": -188.01376342773438,
"logps/rejected": -199.4722442626953,
"loss": 0.4651,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.7826770544052124,
"rewards/margins": 1.4153121709823608,
"rewards/rejected": -3.1979892253875732,
"step": 255
},
{
"epoch": 0.8888888888888888,
"grad_norm": 26.68703703609934,
"learning_rate": 8.080847776863608e-07,
"logits/chosen": -2.4146647453308105,
"logits/rejected": -2.386958360671997,
"logps/chosen": -186.34954833984375,
"logps/rejected": -200.6859588623047,
"loss": 0.4474,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.0444929599761963,
"rewards/margins": 1.9850835800170898,
"rewards/rejected": -3.029576539993286,
"step": 260
},
{
"epoch": 0.905982905982906,
"grad_norm": 27.397197833953705,
"learning_rate": 8.008914156974333e-07,
"logits/chosen": -2.3684728145599365,
"logits/rejected": -2.3436620235443115,
"logps/chosen": -172.65553283691406,
"logps/rejected": -194.290283203125,
"loss": 0.4427,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.7620879411697388,
"rewards/margins": 1.6837621927261353,
"rewards/rejected": -2.445849895477295,
"step": 265
},
{
"epoch": 0.9230769230769231,
"grad_norm": 25.92494835175286,
"learning_rate": 7.935990613069086e-07,
"logits/chosen": -2.312016010284424,
"logits/rejected": -2.2800450325012207,
"logps/chosen": -171.95416259765625,
"logps/rejected": -193.59378051757812,
"loss": 0.445,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8496273756027222,
"rewards/margins": 2.038412094116211,
"rewards/rejected": -2.8880395889282227,
"step": 270
},
{
"epoch": 0.9401709401709402,
"grad_norm": 24.604360112973207,
"learning_rate": 7.862101136781946e-07,
"logits/chosen": -2.2761037349700928,
"logits/rejected": -2.241076707839966,
"logps/chosen": -169.81842041015625,
"logps/rejected": -192.06903076171875,
"loss": 0.4173,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.2587201595306396,
"rewards/margins": 1.723170280456543,
"rewards/rejected": -2.9818906784057617,
"step": 275
},
{
"epoch": 0.9572649572649573,
"grad_norm": 33.24758765533294,
"learning_rate": 7.78727003753595e-07,
"logits/chosen": -2.2211129665374756,
"logits/rejected": -2.1957130432128906,
"logps/chosen": -172.84083557128906,
"logps/rejected": -195.34005737304688,
"loss": 0.4484,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.557621717453003,
"rewards/margins": 2.1072278022766113,
"rewards/rejected": -3.6648497581481934,
"step": 280
},
{
"epoch": 0.9572649572649573,
"eval_logits/chosen": -2.199989080429077,
"eval_logits/rejected": -2.158634662628174,
"eval_logps/chosen": -175.8746337890625,
"eval_logps/rejected": -197.1187286376953,
"eval_loss": 0.4431803524494171,
"eval_rewards/accuracies": 0.7634615302085876,
"eval_rewards/chosen": -1.7397303581237793,
"eval_rewards/margins": 2.084113836288452,
"eval_rewards/rejected": -3.8238441944122314,
"eval_runtime": 510.3208,
"eval_samples_per_second": 16.292,
"eval_steps_per_second": 0.255,
"step": 280
},
{
"epoch": 0.9743589743589743,
"grad_norm": 23.855033657318305,
"learning_rate": 7.711521934545342e-07,
"logits/chosen": -2.1965622901916504,
"logits/rejected": -2.1558597087860107,
"logps/chosen": -185.0030059814453,
"logps/rejected": -205.0108642578125,
"loss": 0.4233,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4067909717559814,
"rewards/margins": 2.130743980407715,
"rewards/rejected": -3.537534713745117,
"step": 285
},
{
"epoch": 0.9914529914529915,
"grad_norm": 27.781853522970472,
"learning_rate": 7.63488174871594e-07,
"logits/chosen": -2.209836483001709,
"logits/rejected": -2.1382126808166504,
"logps/chosen": -181.1676788330078,
"logps/rejected": -201.53524780273438,
"loss": 0.4064,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.1297556161880493,
"rewards/margins": 2.355384349822998,
"rewards/rejected": -3.485139846801758,
"step": 290
},
{
"epoch": 1.0085470085470085,
"grad_norm": 16.38566211549952,
"learning_rate": 7.557374694446221e-07,
"logits/chosen": -2.191758632659912,
"logits/rejected": -2.182082176208496,
"logps/chosen": -169.5143585205078,
"logps/rejected": -191.7115936279297,
"loss": 0.3182,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.8321866989135742,
"rewards/margins": 2.2344491481781006,
"rewards/rejected": -3.066636085510254,
"step": 295
},
{
"epoch": 1.0256410256410255,
"grad_norm": 16.985648781909422,
"learning_rate": 7.479026271331863e-07,
"logits/chosen": -2.267702579498291,
"logits/rejected": -2.205897092819214,
"logps/chosen": -169.5579833984375,
"logps/rejected": -197.75802612304688,
"loss": 0.2168,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -0.39328667521476746,
"rewards/margins": 2.9768226146698,
"rewards/rejected": -3.3701090812683105,
"step": 300
},
{
"epoch": 1.0427350427350428,
"grad_norm": 18.84075344730222,
"learning_rate": 7.399862255776448e-07,
"logits/chosen": -2.3038105964660645,
"logits/rejected": -2.2806408405303955,
"logps/chosen": -164.28530883789062,
"logps/rejected": -197.4765625,
"loss": 0.2127,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -1.0233527421951294,
"rewards/margins": 3.003648042678833,
"rewards/rejected": -4.02700138092041,
"step": 305
},
{
"epoch": 1.0598290598290598,
"grad_norm": 22.421420935891007,
"learning_rate": 7.319908692511102e-07,
"logits/chosen": -2.4081215858459473,
"logits/rejected": -2.3740234375,
"logps/chosen": -171.662109375,
"logps/rejected": -209.674560546875,
"loss": 0.2371,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.9765853881835938,
"rewards/margins": 3.642939805984497,
"rewards/rejected": -4.619524955749512,
"step": 310
},
{
"epoch": 1.0769230769230769,
"grad_norm": 19.013199137435198,
"learning_rate": 7.239191886025853e-07,
"logits/chosen": -2.438504695892334,
"logits/rejected": -2.4153828620910645,
"logps/chosen": -175.62979125976562,
"logps/rejected": -207.75692749023438,
"loss": 0.2077,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.7931637763977051,
"rewards/margins": 3.6793007850646973,
"rewards/rejected": -4.472464561462402,
"step": 315
},
{
"epoch": 1.0940170940170941,
"grad_norm": 18.91200622344116,
"learning_rate": 7.15773839191553e-07,
"logits/chosen": -2.44122314453125,
"logits/rejected": -2.4007716178894043,
"logps/chosen": -164.8516387939453,
"logps/rejected": -196.8404998779297,
"loss": 0.222,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.5334895253181458,
"rewards/margins": 3.032334804534912,
"rewards/rejected": -3.565824508666992,
"step": 320
},
{
"epoch": 1.0940170940170941,
"eval_logits/chosen": -2.439704418182373,
"eval_logits/rejected": -2.4060051441192627,
"eval_logps/chosen": -170.68392944335938,
"eval_logps/rejected": -188.57797241210938,
"eval_loss": 0.45035940408706665,
"eval_rewards/accuracies": 0.7759615182876587,
"eval_rewards/chosen": -1.2206590175628662,
"eval_rewards/margins": 1.7491083145141602,
"eval_rewards/rejected": -2.9697670936584473,
"eval_runtime": 509.779,
"eval_samples_per_second": 16.309,
"eval_steps_per_second": 0.255,
"step": 320
}
],
"logging_steps": 5,
"max_steps": 876,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 40,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3773262181957632.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}