{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.7094017094017095,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017094017094017096,
"grad_norm": 35.00202204303521,
"learning_rate": 5e-07,
"logits/chosen": -2.7455849647521973,
"logits/rejected": -2.7442612648010254,
"logps/chosen": -164.2725830078125,
"logps/rejected": -170.57113647460938,
"loss": 0.6934,
"rewards/accuracies": 0.23749999701976776,
"rewards/chosen": 0.0026612328365445137,
"rewards/margins": -0.001539617427624762,
"rewards/rejected": 0.004200850613415241,
"step": 5
},
{
"epoch": 0.03418803418803419,
"grad_norm": 36.29266486314593,
"learning_rate": 1e-06,
"logits/chosen": -2.709902763366699,
"logits/rejected": -2.7155404090881348,
"logps/chosen": -171.80032348632812,
"logps/rejected": -165.20169067382812,
"loss": 0.6879,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.012009668163955212,
"rewards/margins": 0.0021203968208283186,
"rewards/rejected": 0.009889272041618824,
"step": 10
},
{
"epoch": 0.05128205128205128,
"grad_norm": 33.83921269470837,
"learning_rate": 9.999177507263144e-07,
"logits/chosen": -2.6502068042755127,
"logits/rejected": -2.628007411956787,
"logps/chosen": -174.082275390625,
"logps/rejected": -174.13429260253906,
"loss": 0.6698,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.23495244979858398,
"rewards/margins": 0.1125468835234642,
"rewards/rejected": 0.12240554392337799,
"step": 15
},
{
"epoch": 0.06837606837606838,
"grad_norm": 34.14427373918799,
"learning_rate": 9.996710299650301e-07,
"logits/chosen": -2.473665714263916,
"logits/rejected": -2.4469008445739746,
"logps/chosen": -158.2163848876953,
"logps/rejected": -158.0710906982422,
"loss": 0.661,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.4233472943305969,
"rewards/margins": 0.1434161365032196,
"rewards/rejected": 0.2799311578273773,
"step": 20
},
{
"epoch": 0.08547008547008547,
"grad_norm": 33.2696083475879,
"learning_rate": 9.992599188865604e-07,
"logits/chosen": -2.314507007598877,
"logits/rejected": -2.3168132305145264,
"logps/chosen": -150.67019653320312,
"logps/rejected": -156.8417510986328,
"loss": 0.6501,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.4975205063819885,
"rewards/margins": 0.15743504464626312,
"rewards/rejected": 0.3400854766368866,
"step": 25
},
{
"epoch": 0.10256410256410256,
"grad_norm": 34.42253361988952,
"learning_rate": 9.98684552745256e-07,
"logits/chosen": -2.243194103240967,
"logits/rejected": -2.251340866088867,
"logps/chosen": -161.2266845703125,
"logps/rejected": -161.32298278808594,
"loss": 0.6289,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.4243805408477783,
"rewards/margins": 0.2635195851325989,
"rewards/rejected": 0.16086098551750183,
"step": 30
},
{
"epoch": 0.11965811965811966,
"grad_norm": 31.414296706456245,
"learning_rate": 9.979451208349055e-07,
"logits/chosen": -2.30315899848938,
"logits/rejected": -2.289762496948242,
"logps/chosen": -171.71713256835938,
"logps/rejected": -174.50900268554688,
"loss": 0.6296,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.019384615123271942,
"rewards/margins": 0.318477988243103,
"rewards/rejected": -0.3378625512123108,
"step": 35
},
{
"epoch": 0.13675213675213677,
"grad_norm": 32.071830655862556,
"learning_rate": 9.970418664264595e-07,
"logits/chosen": -2.3935599327087402,
"logits/rejected": -2.3812546730041504,
"logps/chosen": -171.0698699951172,
"logps/rejected": -176.58578491210938,
"loss": 0.5991,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.26089853048324585,
"rewards/margins": 0.5235068202018738,
"rewards/rejected": -0.7844053506851196,
"step": 40
},
{
"epoch": 0.15384615384615385,
"grad_norm": 36.19466541168301,
"learning_rate": 9.95975086687994e-07,
"logits/chosen": -2.4914021492004395,
"logits/rejected": -2.4973323345184326,
"logps/chosen": -163.68099975585938,
"logps/rejected": -167.174072265625,
"loss": 0.6141,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2962096929550171,
"rewards/margins": 0.4588828682899475,
"rewards/rejected": -0.7550925016403198,
"step": 45
},
{
"epoch": 0.17094017094017094,
"grad_norm": 31.16276115760231,
"learning_rate": 9.947451325869439e-07,
"logits/chosen": -2.5575203895568848,
"logits/rejected": -2.557717800140381,
"logps/chosen": -172.04318237304688,
"logps/rejected": -177.67672729492188,
"loss": 0.5777,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.2069791853427887,
"rewards/margins": 0.6018465757369995,
"rewards/rejected": -0.808825671672821,
"step": 50
},
{
"epoch": 0.17094017094017094,
"eval_logits/chosen": -2.5221025943756104,
"eval_logits/rejected": -2.5152711868286133,
"eval_logps/chosen": -163.01820373535156,
"eval_logps/rejected": -169.54832458496094,
"eval_loss": 0.5812540650367737,
"eval_rewards/accuracies": 0.6682692170143127,
"eval_rewards/chosen": -0.45408713817596436,
"eval_rewards/margins": 0.6127156615257263,
"eval_rewards/rejected": -1.0668028593063354,
"eval_runtime": 510.3361,
"eval_samples_per_second": 16.291,
"eval_steps_per_second": 0.255,
"step": 50
},
{
"epoch": 0.18803418803418803,
"grad_norm": 31.575578721339145,
"learning_rate": 9.933524087746347e-07,
"logits/chosen": -2.490377426147461,
"logits/rejected": -2.4825081825256348,
"logps/chosen": -168.06161499023438,
"logps/rejected": -175.0494384765625,
"loss": 0.5706,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.5060762763023376,
"rewards/margins": 0.7589826583862305,
"rewards/rejected": -1.2650587558746338,
"step": 55
},
{
"epoch": 0.20512820512820512,
"grad_norm": 30.171745273288415,
"learning_rate": 9.917973734531549e-07,
"logits/chosen": -2.48228120803833,
"logits/rejected": -2.4833157062530518,
"logps/chosen": -159.47142028808594,
"logps/rejected": -170.63671875,
"loss": 0.5753,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.35752761363983154,
"rewards/margins": 0.5991309881210327,
"rewards/rejected": -0.9566585421562195,
"step": 60
},
{
"epoch": 0.2222222222222222,
"grad_norm": 32.13878319029882,
"learning_rate": 9.90080538224607e-07,
"logits/chosen": -2.585407018661499,
"logits/rejected": -2.5767769813537598,
"logps/chosen": -157.43936157226562,
"logps/rejected": -166.13589477539062,
"loss": 0.566,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.02057185396552086,
"rewards/margins": 0.47568243741989136,
"rewards/rejected": -0.4962543547153473,
"step": 65
},
{
"epoch": 0.23931623931623933,
"grad_norm": 29.494674721856043,
"learning_rate": 9.882024679227938e-07,
"logits/chosen": -2.6504979133605957,
"logits/rejected": -2.6398470401763916,
"logps/chosen": -178.0801239013672,
"logps/rejected": -179.46328735351562,
"loss": 0.5444,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.43436694145202637,
"rewards/margins": 0.8427752256393433,
"rewards/rejected": -1.27714204788208,
"step": 70
},
{
"epoch": 0.2564102564102564,
"grad_norm": 28.856733948308104,
"learning_rate": 9.861637804273881e-07,
"logits/chosen": -2.660489082336426,
"logits/rejected": -2.655539035797119,
"logps/chosen": -162.1233673095703,
"logps/rejected": -170.16131591796875,
"loss": 0.5568,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4032784402370453,
"rewards/margins": 0.6959114074707031,
"rewards/rejected": -1.0991899967193604,
"step": 75
},
{
"epoch": 0.27350427350427353,
"grad_norm": 26.646061534818323,
"learning_rate": 9.83965146460653e-07,
"logits/chosen": -2.6391615867614746,
"logits/rejected": -2.628577709197998,
"logps/chosen": -168.58099365234375,
"logps/rejected": -179.22805786132812,
"loss": 0.5448,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6665827035903931,
"rewards/margins": 0.8240470886230469,
"rewards/rejected": -1.4906299114227295,
"step": 80
},
{
"epoch": 0.2905982905982906,
"grad_norm": 36.04159750418885,
"learning_rate": 9.816072893667758e-07,
"logits/chosen": -2.6322970390319824,
"logits/rejected": -2.6053385734558105,
"logps/chosen": -174.82640075683594,
"logps/rejected": -186.0735626220703,
"loss": 0.5579,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.0639268159866333,
"rewards/margins": 1.0258175134658813,
"rewards/rejected": -2.0897443294525146,
"step": 85
},
{
"epoch": 0.3076923076923077,
"grad_norm": 26.922939193632168,
"learning_rate": 9.790909848738904e-07,
"logits/chosen": -2.60801362991333,
"logits/rejected": -2.6101624965667725,
"logps/chosen": -176.20538330078125,
"logps/rejected": -184.7812957763672,
"loss": 0.5215,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.9929834604263306,
"rewards/margins": 0.8646324276924133,
"rewards/rejected": -1.8576160669326782,
"step": 90
},
{
"epoch": 0.3247863247863248,
"grad_norm": 30.3564450245371,
"learning_rate": 9.764170608388647e-07,
"logits/chosen": -2.6054036617279053,
"logits/rejected": -2.5733799934387207,
"logps/chosen": -168.037109375,
"logps/rejected": -174.51144409179688,
"loss": 0.5197,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.6652337312698364,
"rewards/margins": 1.060430884361267,
"rewards/rejected": -1.725664734840393,
"step": 95
},
{
"epoch": 0.3418803418803419,
"grad_norm": 28.936164680674203,
"learning_rate": 9.735863969749371e-07,
"logits/chosen": -2.5255179405212402,
"logits/rejected": -2.4874520301818848,
"logps/chosen": -177.73861694335938,
"logps/rejected": -189.82369995117188,
"loss": 0.4982,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.8511013984680176,
"rewards/margins": 1.1354777812957764,
"rewards/rejected": -1.986579179763794,
"step": 100
},
{
"epoch": 0.3418803418803419,
"eval_logits/chosen": -2.484687328338623,
"eval_logits/rejected": -2.460559368133545,
"eval_logps/chosen": -168.28323364257812,
"eval_logps/rejected": -180.8539276123047,
"eval_loss": 0.5161151885986328,
"eval_rewards/accuracies": 0.7211538553237915,
"eval_rewards/chosen": -0.9805887937545776,
"eval_rewards/margins": 1.2167747020721436,
"eval_rewards/rejected": -2.1973636150360107,
"eval_runtime": 510.3447,
"eval_samples_per_second": 16.291,
"eval_steps_per_second": 0.255,
"step": 100
},
{
"epoch": 0.358974358974359,
"grad_norm": 31.089971589067016,
"learning_rate": 9.705999245622956e-07,
"logits/chosen": -2.4702706336975098,
"logits/rejected": -2.4523651599884033,
"logps/chosen": -170.59246826171875,
"logps/rejected": -182.99813842773438,
"loss": 0.4991,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.8999192118644714,
"rewards/margins": 0.8702341318130493,
"rewards/rejected": -1.770153284072876,
"step": 105
},
{
"epoch": 0.37606837606837606,
"grad_norm": 27.339023914835686,
"learning_rate": 9.674586261416873e-07,
"logits/chosen": -2.4866347312927246,
"logits/rejected": -2.4518179893493652,
"logps/chosen": -179.46290588378906,
"logps/rejected": -188.7920379638672,
"loss": 0.5213,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.6938365697860718,
"rewards/margins": 1.0765600204467773,
"rewards/rejected": -1.7703965902328491,
"step": 110
},
{
"epoch": 0.39316239316239315,
"grad_norm": 31.312902469600562,
"learning_rate": 9.641635351911664e-07,
"logits/chosen": -2.4456398487091064,
"logits/rejected": -2.426159381866455,
"logps/chosen": -170.3855438232422,
"logps/rejected": -181.9676513671875,
"loss": 0.4823,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8490931391716003,
"rewards/margins": 1.2224478721618652,
"rewards/rejected": -2.0715408325195312,
"step": 115
},
{
"epoch": 0.41025641025641024,
"grad_norm": 25.966469642807997,
"learning_rate": 9.607157357860821e-07,
"logits/chosen": -2.4072113037109375,
"logits/rejected": -2.3874144554138184,
"logps/chosen": -187.41197204589844,
"logps/rejected": -201.69454956054688,
"loss": 0.5037,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.0798847675323486,
"rewards/margins": 1.3365159034729004,
"rewards/rejected": -2.41640043258667,
"step": 120
},
{
"epoch": 0.42735042735042733,
"grad_norm": 32.18242375190423,
"learning_rate": 9.571163622424225e-07,
"logits/chosen": -2.2766659259796143,
"logits/rejected": -2.252072811126709,
"logps/chosen": -174.78514099121094,
"logps/rejected": -187.40646362304688,
"loss": 0.5019,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.5247443914413452,
"rewards/margins": 1.2177503108978271,
"rewards/rejected": -2.742494821548462,
"step": 125
},
{
"epoch": 0.4444444444444444,
"grad_norm": 30.579550576640443,
"learning_rate": 9.533665987436261e-07,
"logits/chosen": -2.182610034942627,
"logits/rejected": -2.128113269805908,
"logps/chosen": -178.1033477783203,
"logps/rejected": -197.4458465576172,
"loss": 0.4957,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5623412132263184,
"rewards/margins": 1.2608497142791748,
"rewards/rejected": -2.823190689086914,
"step": 130
},
{
"epoch": 0.46153846153846156,
"grad_norm": 29.287644373971865,
"learning_rate": 9.494676789509899e-07,
"logits/chosen": -2.1067867279052734,
"logits/rejected": -2.0683400630950928,
"logps/chosen": -176.67918395996094,
"logps/rejected": -193.65371704101562,
"loss": 0.4939,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1053364276885986,
"rewards/margins": 1.3829355239868164,
"rewards/rejected": -2.488272190093994,
"step": 135
},
{
"epoch": 0.47863247863247865,
"grad_norm": 27.563555703636343,
"learning_rate": 9.454208855977985e-07,
"logits/chosen": -2.0855822563171387,
"logits/rejected": -2.013296127319336,
"logps/chosen": -178.40390014648438,
"logps/rejected": -196.03305053710938,
"loss": 0.4715,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.443866491317749,
"rewards/margins": 1.600778341293335,
"rewards/rejected": -3.044644832611084,
"step": 140
},
{
"epoch": 0.49572649572649574,
"grad_norm": 30.234814125811326,
"learning_rate": 9.41227550067308e-07,
"logits/chosen": -2.0734238624572754,
"logits/rejected": -2.0634400844573975,
"logps/chosen": -179.51080322265625,
"logps/rejected": -191.87046813964844,
"loss": 0.4798,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6011661291122437,
"rewards/margins": 1.445229172706604,
"rewards/rejected": -3.0463955402374268,
"step": 145
},
{
"epoch": 0.5128205128205128,
"grad_norm": 31.371346339775513,
"learning_rate": 9.36889051954725e-07,
"logits/chosen": -2.127821683883667,
"logits/rejected": -2.080082416534424,
"logps/chosen": -180.66383361816406,
"logps/rejected": -196.1031494140625,
"loss": 0.4954,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.6335647106170654,
"rewards/margins": 1.5422546863555908,
"rewards/rejected": -3.1758196353912354,
"step": 150
},
{
"epoch": 0.5128205128205128,
"eval_logits/chosen": -2.1472573280334473,
"eval_logits/rejected": -2.0990829467773438,
"eval_logps/chosen": -173.8290557861328,
"eval_logps/rejected": -191.683349609375,
"eval_loss": 0.47699737548828125,
"eval_rewards/accuracies": 0.754807710647583,
"eval_rewards/chosen": -1.5351712703704834,
"eval_rewards/margins": 1.7451337575912476,
"eval_rewards/rejected": -3.2803049087524414,
"eval_runtime": 510.7048,
"eval_samples_per_second": 16.279,
"eval_steps_per_second": 0.255,
"step": 150
},
{
"epoch": 0.5299145299145299,
"grad_norm": 27.790777356361556,
"learning_rate": 9.324068186133245e-07,
"logits/chosen": -2.1372084617614746,
"logits/rejected": -2.124948024749756,
"logps/chosen": -172.3369598388672,
"logps/rejected": -186.5850372314453,
"loss": 0.4644,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.3583369255065918,
"rewards/margins": 1.7439367771148682,
"rewards/rejected": -3.102273464202881,
"step": 155
},
{
"epoch": 0.5470085470085471,
"grad_norm": 27.612583401785376,
"learning_rate": 9.277823246848536e-07,
"logits/chosen": -2.2635793685913086,
"logits/rejected": -2.2123026847839355,
"logps/chosen": -186.25137329101562,
"logps/rejected": -196.69564819335938,
"loss": 0.4547,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2192834615707397,
"rewards/margins": 1.371063470840454,
"rewards/rejected": -2.5903468132019043,
"step": 160
},
{
"epoch": 0.5641025641025641,
"grad_norm": 28.168886287584876,
"learning_rate": 9.230170916143793e-07,
"logits/chosen": -2.3309006690979004,
"logits/rejected": -2.2978808879852295,
"logps/chosen": -174.7559814453125,
"logps/rejected": -195.68280029296875,
"loss": 0.4967,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0855658054351807,
"rewards/margins": 1.6249233484268188,
"rewards/rejected": -2.710489273071289,
"step": 165
},
{
"epoch": 0.5811965811965812,
"grad_norm": 28.93959851544435,
"learning_rate": 9.181126871497378e-07,
"logits/chosen": -2.376833915710449,
"logits/rejected": -2.340681552886963,
"logps/chosen": -175.3675537109375,
"logps/rejected": -194.9619903564453,
"loss": 0.4651,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.9624043703079224,
"rewards/margins": 1.7745708227157593,
"rewards/rejected": -2.7369751930236816,
"step": 170
},
{
"epoch": 0.5982905982905983,
"grad_norm": 30.43477724579486,
"learning_rate": 9.130707248257491e-07,
"logits/chosen": -2.458378553390503,
"logits/rejected": -2.4171223640441895,
"logps/chosen": -168.79849243164062,
"logps/rejected": -178.6556396484375,
"loss": 0.4728,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9254748225212097,
"rewards/margins": 1.5751961469650269,
"rewards/rejected": -2.500671148300171,
"step": 175
},
{
"epoch": 0.6153846153846154,
"grad_norm": 25.220318056395065,
"learning_rate": 9.078928634333698e-07,
"logits/chosen": -2.4454641342163086,
"logits/rejected": -2.4170265197753906,
"logps/chosen": -181.41317749023438,
"logps/rejected": -199.88668823242188,
"loss": 0.4526,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.84205561876297,
"rewards/margins": 1.7803510427474976,
"rewards/rejected": -2.622406482696533,
"step": 180
},
{
"epoch": 0.6324786324786325,
"grad_norm": 29.414031929374275,
"learning_rate": 9.025808064739549e-07,
"logits/chosen": -2.4103400707244873,
"logits/rejected": -2.370731830596924,
"logps/chosen": -178.70916748046875,
"logps/rejected": -193.0004119873047,
"loss": 0.4891,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.174070119857788,
"rewards/margins": 1.535239338874817,
"rewards/rejected": -2.7093093395233154,
"step": 185
},
{
"epoch": 0.6495726495726496,
"grad_norm": 26.0372223221703,
"learning_rate": 8.971363015988113e-07,
"logits/chosen": -2.3428735733032227,
"logits/rejected": -2.2986531257629395,
"logps/chosen": -173.8651580810547,
"logps/rejected": -194.9317626953125,
"loss": 0.4643,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1425771713256836,
"rewards/margins": 1.611919641494751,
"rewards/rejected": -2.7544968128204346,
"step": 190
},
{
"epoch": 0.6666666666666666,
"grad_norm": 33.87434178682573,
"learning_rate": 8.91561140034225e-07,
"logits/chosen": -2.2664923667907715,
"logits/rejected": -2.2088184356689453,
"logps/chosen": -172.7240753173828,
"logps/rejected": -193.1275177001953,
"loss": 0.5029,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3134868144989014,
"rewards/margins": 1.5050963163375854,
"rewards/rejected": -2.8185834884643555,
"step": 195
},
{
"epoch": 0.6837606837606838,
"grad_norm": 25.21313391058931,
"learning_rate": 8.858571559921537e-07,
"logits/chosen": -2.191737174987793,
"logits/rejected": -2.1188113689422607,
"logps/chosen": -174.46722412109375,
"logps/rejected": -188.4197540283203,
"loss": 0.4567,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.026064157485962,
"rewards/margins": 1.5865710973739624,
"rewards/rejected": -2.612635374069214,
"step": 200
},
{
"epoch": 0.6837606837606838,
"eval_logits/chosen": -2.1586899757385254,
"eval_logits/rejected": -2.1090493202209473,
"eval_logps/chosen": -170.4287872314453,
"eval_logps/rejected": -187.2865447998047,
"eval_loss": 0.45979756116867065,
"eval_rewards/accuracies": 0.7596153616905212,
"eval_rewards/chosen": -1.1951465606689453,
"eval_rewards/margins": 1.64547860622406,
"eval_rewards/rejected": -2.840625047683716,
"eval_runtime": 510.4854,
"eval_samples_per_second": 16.286,
"eval_steps_per_second": 0.255,
"step": 200
},
{
"epoch": 0.7008547008547008,
"grad_norm": 35.24537564842276,
"learning_rate": 8.800262260667754e-07,
"logits/chosen": -2.1282899379730225,
"logits/rejected": -2.0742104053497314,
"logps/chosen": -165.1837921142578,
"logps/rejected": -182.61190795898438,
"loss": 0.4603,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3638368844985962,
"rewards/margins": 1.6600275039672852,
"rewards/rejected": -3.023864507675171,
"step": 205
},
{
"epoch": 0.717948717948718,
"grad_norm": 28.161564325401752,
"learning_rate": 8.740702686170954e-07,
"logits/chosen": -2.131873369216919,
"logits/rejected": -2.0582406520843506,
"logps/chosen": -184.79843139648438,
"logps/rejected": -201.92471313476562,
"loss": 0.4458,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.8222748041152954,
"rewards/margins": 1.8169399499893188,
"rewards/rejected": -3.639214277267456,
"step": 210
},
{
"epoch": 0.7350427350427351,
"grad_norm": 27.38246704123344,
"learning_rate": 8.679912431358109e-07,
"logits/chosen": -2.149195909500122,
"logits/rejected": -2.0881500244140625,
"logps/chosen": -176.96768188476562,
"logps/rejected": -193.83583068847656,
"loss": 0.4503,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8590911626815796,
"rewards/margins": 1.9566419124603271,
"rewards/rejected": -3.8157334327697754,
"step": 215
},
{
"epoch": 0.7521367521367521,
"grad_norm": 26.680467589239278,
"learning_rate": 8.617911496046445e-07,
"logits/chosen": -2.1819872856140137,
"logits/rejected": -2.1153197288513184,
"logps/chosen": -172.77716064453125,
"logps/rejected": -191.78298950195312,
"loss": 0.4676,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7457681894302368,
"rewards/margins": 1.8468406200408936,
"rewards/rejected": -3.592608690261841,
"step": 220
},
{
"epoch": 0.7692307692307693,
"grad_norm": 27.17086789362823,
"learning_rate": 8.554720278363547e-07,
"logits/chosen": -2.2397422790527344,
"logits/rejected": -2.199204444885254,
"logps/chosen": -173.76963806152344,
"logps/rejected": -191.92172241210938,
"loss": 0.474,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4509254693984985,
"rewards/margins": 1.7614561319351196,
"rewards/rejected": -3.212381362915039,
"step": 225
},
{
"epoch": 0.7863247863247863,
"grad_norm": 34.41944082003437,
"learning_rate": 8.490359568036445e-07,
"logits/chosen": -2.3623924255371094,
"logits/rejected": -2.332777976989746,
"logps/chosen": -181.22314453125,
"logps/rejected": -203.68490600585938,
"loss": 0.458,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3178858757019043,
"rewards/margins": 1.6162595748901367,
"rewards/rejected": -2.934145450592041,
"step": 230
},
{
"epoch": 0.8034188034188035,
"grad_norm": 26.240821537730366,
"learning_rate": 8.424850539551856e-07,
"logits/chosen": -2.457951068878174,
"logits/rejected": -2.430043935775757,
"logps/chosen": -174.00967407226562,
"logps/rejected": -190.4671173095703,
"loss": 0.4457,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3960864543914795,
"rewards/margins": 1.7217321395874023,
"rewards/rejected": -3.117818593978882,
"step": 235
},
{
"epoch": 0.8205128205128205,
"grad_norm": 23.836616559503103,
"learning_rate": 8.358214745189829e-07,
"logits/chosen": -2.5360074043273926,
"logits/rejected": -2.502570867538452,
"logps/chosen": -183.65618896484375,
"logps/rejected": -204.5669403076172,
"loss": 0.4236,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.516182541847229,
"rewards/margins": 2.1192710399627686,
"rewards/rejected": -3.635453462600708,
"step": 240
},
{
"epoch": 0.8376068376068376,
"grad_norm": 26.25101678311949,
"learning_rate": 8.290474107933114e-07,
"logits/chosen": -2.5879926681518555,
"logits/rejected": -2.5638298988342285,
"logps/chosen": -184.1548614501953,
"logps/rejected": -202.95677185058594,
"loss": 0.4494,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.7255306243896484,
"rewards/margins": 1.9638643264770508,
"rewards/rejected": -3.68939471244812,
"step": 245
},
{
"epoch": 0.8547008547008547,
"grad_norm": 28.730820152276525,
"learning_rate": 8.221650914254565e-07,
"logits/chosen": -2.592283248901367,
"logits/rejected": -2.561645984649658,
"logps/chosen": -183.74407958984375,
"logps/rejected": -195.3282470703125,
"loss": 0.4873,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.9939171075820923,
"rewards/margins": 1.590651273727417,
"rewards/rejected": -3.584568500518799,
"step": 250
},
{
"epoch": 0.8547008547008547,
"eval_logits/chosen": -2.5724236965179443,
"eval_logits/rejected": -2.545708417892456,
"eval_logps/chosen": -177.6819305419922,
"eval_logps/rejected": -195.5203399658203,
"eval_loss": 0.44874703884124756,
"eval_rewards/accuracies": 0.7634615302085876,
"eval_rewards/chosen": -1.9204591512680054,
"eval_rewards/margins": 1.743545413017273,
"eval_rewards/rejected": -3.664004325866699,
"eval_runtime": 510.735,
"eval_samples_per_second": 16.278,
"eval_steps_per_second": 0.255,
"step": 250
},
{
"epoch": 0.8717948717948718,
"grad_norm": 26.28610055865234,
"learning_rate": 8.151767806784953e-07,
"logits/chosen": -2.5476808547973633,
"logits/rejected": -2.5247130393981934,
"logps/chosen": -188.99472045898438,
"logps/rejected": -200.0706329345703,
"loss": 0.4745,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.8807718753814697,
"rewards/margins": 1.3770544528961182,
"rewards/rejected": -3.257826328277588,
"step": 255
},
{
"epoch": 0.8888888888888888,
"grad_norm": 31.895303400766686,
"learning_rate": 8.080847776863608e-07,
"logits/chosen": -2.523097038269043,
"logits/rejected": -2.4968159198760986,
"logps/chosen": -187.11276245117188,
"logps/rejected": -202.34288024902344,
"loss": 0.4478,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.120813250541687,
"rewards/margins": 2.0744528770446777,
"rewards/rejected": -3.1952662467956543,
"step": 260
},
{
"epoch": 0.905982905982906,
"grad_norm": 26.320681707210774,
"learning_rate": 8.008914156974333e-07,
"logits/chosen": -2.462744951248169,
"logits/rejected": -2.4353935718536377,
"logps/chosen": -173.44297790527344,
"logps/rejected": -195.9517822265625,
"loss": 0.4374,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.8408319354057312,
"rewards/margins": 1.7711703777313232,
"rewards/rejected": -2.612001895904541,
"step": 265
},
{
"epoch": 0.9230769230769231,
"grad_norm": 26.997987002073426,
"learning_rate": 7.935990613069086e-07,
"logits/chosen": -2.4033281803131104,
"logits/rejected": -2.369520425796509,
"logps/chosen": -171.97085571289062,
"logps/rejected": -191.66697692871094,
"loss": 0.4474,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8512971997261047,
"rewards/margins": 1.844062089920044,
"rewards/rejected": -2.695359230041504,
"step": 270
},
{
"epoch": 0.9401709401709402,
"grad_norm": 27.58725138274707,
"learning_rate": 7.862101136781946e-07,
"logits/chosen": -2.3467745780944824,
"logits/rejected": -2.3106236457824707,
"logps/chosen": -171.92318725585938,
"logps/rejected": -194.26400756835938,
"loss": 0.4207,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.4691975116729736,
"rewards/margins": 1.7321897745132446,
"rewards/rejected": -3.201387405395508,
"step": 275
},
{
"epoch": 0.9572649572649573,
"grad_norm": 28.716376412825436,
"learning_rate": 7.78727003753595e-07,
"logits/chosen": -2.287221670150757,
"logits/rejected": -2.260993480682373,
"logps/chosen": -175.48435974121094,
"logps/rejected": -199.2796173095703,
"loss": 0.4469,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8219757080078125,
"rewards/margins": 2.2368292808532715,
"rewards/rejected": -4.058804512023926,
"step": 280
},
{
"epoch": 0.9743589743589743,
"grad_norm": 22.896854237985796,
"learning_rate": 7.711521934545342e-07,
"logits/chosen": -2.2684969902038574,
"logits/rejected": -2.2303268909454346,
"logps/chosen": -187.36607360839844,
"logps/rejected": -207.65261840820312,
"loss": 0.4153,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6430988311767578,
"rewards/margins": 2.1586105823516846,
"rewards/rejected": -3.8017096519470215,
"step": 285
},
{
"epoch": 0.9914529914529915,
"grad_norm": 27.30975328138857,
"learning_rate": 7.63488174871594e-07,
"logits/chosen": -2.2886035442352295,
"logits/rejected": -2.2276930809020996,
"logps/chosen": -183.37933349609375,
"logps/rejected": -201.49429321289062,
"loss": 0.4079,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.3509235382080078,
"rewards/margins": 2.130119800567627,
"rewards/rejected": -3.481043577194214,
"step": 290
},
{
"epoch": 1.0085470085470085,
"grad_norm": 20.392666769639558,
"learning_rate": 7.557374694446221e-07,
"logits/chosen": -2.264070510864258,
"logits/rejected": -2.248257637023926,
"logps/chosen": -170.48977661132812,
"logps/rejected": -192.82720947265625,
"loss": 0.3189,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.9297264814376831,
"rewards/margins": 2.2484724521636963,
"rewards/rejected": -3.1781985759735107,
"step": 295
},
{
"epoch": 1.0256410256410255,
"grad_norm": 16.68267756771692,
"learning_rate": 7.479026271331863e-07,
"logits/chosen": -2.3048789501190186,
"logits/rejected": -2.2495861053466797,
"logps/chosen": -170.22055053710938,
"logps/rejected": -198.54359436035156,
"loss": 0.2176,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -0.4595426619052887,
"rewards/margins": 2.989124059677124,
"rewards/rejected": -3.44866681098938,
"step": 300
},
{
"epoch": 1.0256410256410255,
"eval_logits/chosen": -2.3489644527435303,
"eval_logits/rejected": -2.3130385875701904,
"eval_logps/chosen": -170.4687957763672,
"eval_logps/rejected": -190.08233642578125,
"eval_loss": 0.4382702112197876,
"eval_rewards/accuracies": 0.7846153974533081,
"eval_rewards/chosen": -1.1991456747055054,
"eval_rewards/margins": 1.9210587739944458,
"eval_rewards/rejected": -3.1202046871185303,
"eval_runtime": 511.3486,
"eval_samples_per_second": 16.259,
"eval_steps_per_second": 0.254,
"step": 300
},
{
"epoch": 1.0427350427350428,
"grad_norm": 18.410064070731565,
"learning_rate": 7.399862255776448e-07,
"logits/chosen": -2.331937074661255,
"logits/rejected": -2.315340995788574,
"logps/chosen": -161.1502227783203,
"logps/rejected": -192.48519897460938,
"loss": 0.2177,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -0.7098430395126343,
"rewards/margins": 2.818021535873413,
"rewards/rejected": -3.527864933013916,
"step": 305
},
{
"epoch": 1.0598290598290598,
"grad_norm": 21.826180204028997,
"learning_rate": 7.319908692511102e-07,
"logits/chosen": -2.384028911590576,
"logits/rejected": -2.3547444343566895,
"logps/chosen": -168.48654174804688,
"logps/rejected": -203.76600646972656,
"loss": 0.2374,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -0.6590297818183899,
"rewards/margins": 3.3696398735046387,
"rewards/rejected": -4.028670310974121,
"step": 310
},
{
"epoch": 1.0769230769230769,
"grad_norm": 19.05079845541507,
"learning_rate": 7.239191886025853e-07,
"logits/chosen": -2.4289774894714355,
"logits/rejected": -2.406567096710205,
"logps/chosen": -175.2897186279297,
"logps/rejected": -207.41915893554688,
"loss": 0.2069,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -0.7591577768325806,
"rewards/margins": 3.679530620574951,
"rewards/rejected": -4.438688278198242,
"step": 315
},
{
"epoch": 1.0940170940170941,
"grad_norm": 20.698533938628618,
"learning_rate": 7.15773839191553e-07,
"logits/chosen": -2.4449350833892822,
"logits/rejected": -2.4075865745544434,
"logps/chosen": -166.36514282226562,
"logps/rejected": -199.648193359375,
"loss": 0.2229,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -0.6848402619361877,
"rewards/margins": 3.1617534160614014,
"rewards/rejected": -3.8465933799743652,
"step": 320
},
{
"epoch": 1.1111111111111112,
"grad_norm": 23.234017756053024,
"learning_rate": 7.075575008143054e-07,
"logits/chosen": -2.4447779655456543,
"logits/rejected": -2.4182941913604736,
"logps/chosen": -173.80941772460938,
"logps/rejected": -208.5805206298828,
"loss": 0.2179,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7078952789306641,
"rewards/margins": 3.346341609954834,
"rewards/rejected": -4.05423641204834,
"step": 325
},
{
"epoch": 1.1282051282051282,
"grad_norm": 20.797265468675864,
"learning_rate": 6.99272876622298e-07,
"logits/chosen": -2.4514238834381104,
"logits/rejected": -2.4154598712921143,
"logps/chosen": -175.63339233398438,
"logps/rejected": -210.8448944091797,
"loss": 0.2255,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -0.5788818597793579,
"rewards/margins": 3.4709906578063965,
"rewards/rejected": -4.049872875213623,
"step": 330
},
{
"epoch": 1.1452991452991452,
"grad_norm": 22.69234920407025,
"learning_rate": 6.909226922328211e-07,
"logits/chosen": -2.4148731231689453,
"logits/rejected": -2.396899461746216,
"logps/chosen": -174.77548217773438,
"logps/rejected": -205.1797332763672,
"loss": 0.2147,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -0.9849351644515991,
"rewards/margins": 3.1783816814422607,
"rewards/rejected": -4.1633172035217285,
"step": 335
},
{
"epoch": 1.1623931623931625,
"grad_norm": 21.207875658566053,
"learning_rate": 6.82509694832279e-07,
"logits/chosen": -2.411642551422119,
"logits/rejected": -2.3832614421844482,
"logps/chosen": -174.6155242919922,
"logps/rejected": -210.0262451171875,
"loss": 0.2199,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.9470588564872742,
"rewards/margins": 3.335145950317383,
"rewards/rejected": -4.282204627990723,
"step": 340
},
{
"epoch": 1.1794871794871795,
"grad_norm": 22.62073459229978,
"learning_rate": 6.740366522723752e-07,
"logits/chosen": -2.4228436946868896,
"logits/rejected": -2.3924076557159424,
"logps/chosen": -181.4166259765625,
"logps/rejected": -214.4353485107422,
"loss": 0.1949,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -1.1724941730499268,
"rewards/margins": 3.5335402488708496,
"rewards/rejected": -4.7060346603393555,
"step": 345
},
{
"epoch": 1.1965811965811965,
"grad_norm": 24.043667365803458,
"learning_rate": 6.655063521594949e-07,
"logits/chosen": -2.4030003547668457,
"logits/rejected": -2.3506176471710205,
"logps/chosen": -175.32406616210938,
"logps/rejected": -218.69326782226562,
"loss": 0.2095,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -1.6340477466583252,
"rewards/margins": 4.079660415649414,
"rewards/rejected": -5.713706970214844,
"step": 350
},
{
"epoch": 1.1965811965811965,
"eval_logits/chosen": -2.3941731452941895,
"eval_logits/rejected": -2.3655929565429688,
"eval_logps/chosen": -182.0218963623047,
"eval_logps/rejected": -207.61233520507812,
"eval_loss": 0.4536636769771576,
"eval_rewards/accuracies": 0.7932692170143127,
"eval_rewards/chosen": -2.3544540405273438,
"eval_rewards/margins": 2.5187504291534424,
"eval_rewards/rejected": -4.873203754425049,
"eval_runtime": 510.8381,
"eval_samples_per_second": 16.275,
"eval_steps_per_second": 0.254,
"step": 350
},
{
"epoch": 1.2136752136752136,
"grad_norm": 21.542790573622174,
"learning_rate": 6.569216009375929e-07,
"logits/chosen": -2.3871009349823,
"logits/rejected": -2.3618922233581543,
"logps/chosen": -174.978271484375,
"logps/rejected": -215.9545440673828,
"loss": 0.191,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -1.4307578802108765,
"rewards/margins": 4.03377628326416,
"rewards/rejected": -5.464534759521484,
"step": 355
},
{
"epoch": 1.2307692307692308,
"grad_norm": 21.574671776242766,
"learning_rate": 6.482852229648801e-07,
"logits/chosen": -2.388662576675415,
"logits/rejected": -2.3574576377868652,
"logps/chosen": -173.49014282226562,
"logps/rejected": -204.75392150878906,
"loss": 0.1953,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.2610100507736206,
"rewards/margins": 3.369713544845581,
"rewards/rejected": -4.630723476409912,
"step": 360
},
{
"epoch": 1.2478632478632479,
"grad_norm": 18.919534146199894,
"learning_rate": 6.396000595846187e-07,
"logits/chosen": -2.393681049346924,
"logits/rejected": -2.3899941444396973,
"logps/chosen": -177.26788330078125,
"logps/rejected": -204.59396362304688,
"loss": 0.205,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -1.1454333066940308,
"rewards/margins": 3.212463855743408,
"rewards/rejected": -4.35789680480957,
"step": 365
},
{
"epoch": 1.264957264957265,
"grad_norm": 20.063931866077127,
"learning_rate": 6.30868968190328e-07,
"logits/chosen": -2.399217128753662,
"logits/rejected": -2.3884811401367188,
"logps/chosen": -173.6759490966797,
"logps/rejected": -208.24868774414062,
"loss": 0.1916,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.2132608890533447,
"rewards/margins": 3.5065104961395264,
"rewards/rejected": -4.719771862030029,
"step": 370
},
{
"epoch": 1.282051282051282,
"grad_norm": 19.65973417080842,
"learning_rate": 6.220948212857111e-07,
"logits/chosen": -2.3873801231384277,
"logits/rejected": -2.36773419380188,
"logps/chosen": -176.6146240234375,
"logps/rejected": -217.32119750976562,
"loss": 0.1865,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -1.2807495594024658,
"rewards/margins": 3.8207836151123047,
"rewards/rejected": -5.101532459259033,
"step": 375
},
{
"epoch": 1.2991452991452992,
"grad_norm": 22.23986474829352,
"learning_rate": 6.13280505539608e-07,
"logits/chosen": -2.38179349899292,
"logits/rejected": -2.3515422344207764,
"logps/chosen": -188.8070068359375,
"logps/rejected": -235.27456665039062,
"loss": 0.2025,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -1.6675641536712646,
"rewards/margins": 4.054324150085449,
"rewards/rejected": -5.721888542175293,
"step": 380
},
{
"epoch": 1.3162393162393162,
"grad_norm": 25.60349271225297,
"learning_rate": 6.044289208362914e-07,
"logits/chosen": -2.3684115409851074,
"logits/rejected": -2.3191847801208496,
"logps/chosen": -178.47006225585938,
"logps/rejected": -219.3975372314453,
"loss": 0.1953,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.7480275630950928,
"rewards/margins": 3.8794853687286377,
"rewards/rejected": -5.6275129318237305,
"step": 385
},
{
"epoch": 1.3333333333333333,
"grad_norm": 26.515708025155362,
"learning_rate": 5.955429793214128e-07,
"logits/chosen": -2.3351917266845703,
"logits/rejected": -2.3046581745147705,
"logps/chosen": -185.07766723632812,
"logps/rejected": -222.58560180664062,
"loss": 0.2143,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -1.9629478454589844,
"rewards/margins": 3.649775981903076,
"rewards/rejected": -5.612724304199219,
"step": 390
},
{
"epoch": 1.3504273504273505,
"grad_norm": 22.854515057042835,
"learning_rate": 5.866256044439142e-07,
"logits/chosen": -2.3319296836853027,
"logits/rejected": -2.317962169647217,
"logps/chosen": -176.97047424316406,
"logps/rejected": -215.57284545898438,
"loss": 0.2196,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.8355095386505127,
"rewards/margins": 3.851968765258789,
"rewards/rejected": -5.687478065490723,
"step": 395
},
{
"epoch": 1.3675213675213675,
"grad_norm": 19.88579820520089,
"learning_rate": 5.776797299942235e-07,
"logits/chosen": -2.33012056350708,
"logits/rejected": -2.312140941619873,
"logps/chosen": -172.976806640625,
"logps/rejected": -211.60244750976562,
"loss": 0.1952,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.3092162609100342,
"rewards/margins": 3.7678093910217285,
"rewards/rejected": -5.077025890350342,
"step": 400
},
{
"epoch": 1.3675213675213675,
"eval_logits/chosen": -2.3361263275146484,
"eval_logits/rejected": -2.3058180809020996,
"eval_logps/chosen": -178.1995391845703,
"eval_logps/rejected": -200.75048828125,
"eval_loss": 0.43530720472335815,
"eval_rewards/accuracies": 0.7961538434028625,
"eval_rewards/chosen": -1.972220540046692,
"eval_rewards/margins": 2.21479868888855,
"eval_rewards/rejected": -4.1870198249816895,
"eval_runtime": 510.4272,
"eval_samples_per_second": 16.288,
"eval_steps_per_second": 0.255,
"step": 400
},
{
"epoch": 1.3846153846153846,
"grad_norm": 23.602459821195318,
"learning_rate": 5.687082991390443e-07,
"logits/chosen": -2.3179829120635986,
"logits/rejected": -2.312061071395874,
"logps/chosen": -182.93972778320312,
"logps/rejected": -224.7725372314453,
"loss": 0.2185,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -1.019089937210083,
"rewards/margins": 3.9243361949920654,
"rewards/rejected": -4.943426132202148,
"step": 405
},
{
"epoch": 1.4017094017094016,
"grad_norm": 21.267642660984247,
"learning_rate": 5.597142634530638e-07,
"logits/chosen": -2.3200526237487793,
"logits/rejected": -2.2980990409851074,
"logps/chosen": -177.72433471679688,
"logps/rejected": -213.3227081298828,
"loss": 0.188,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -1.1036903858184814,
"rewards/margins": 3.615924835205078,
"rewards/rejected": -4.719615459442139,
"step": 410
},
{
"epoch": 1.4188034188034189,
"grad_norm": 25.068644731473288,
"learning_rate": 5.507005819478924e-07,
"logits/chosen": -2.3367302417755127,
"logits/rejected": -2.3034427165985107,
"logps/chosen": -181.5726776123047,
"logps/rejected": -214.39950561523438,
"loss": 0.2292,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -1.3038368225097656,
"rewards/margins": 3.746593475341797,
"rewards/rejected": -5.0504302978515625,
"step": 415
},
{
"epoch": 1.435897435897436,
"grad_norm": 22.91245347503312,
"learning_rate": 5.416702200985584e-07,
"logits/chosen": -2.3304390907287598,
"logits/rejected": -2.318666458129883,
"logps/chosen": -179.07366943359375,
"logps/rejected": -215.1077117919922,
"loss": 0.2055,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.4040899276733398,
"rewards/margins": 3.7617506980895996,
"rewards/rejected": -5.165841102600098,
"step": 420
},
{
"epoch": 1.452991452991453,
"grad_norm": 26.341700780637733,
"learning_rate": 5.326261488678748e-07,
"logits/chosen": -2.280813694000244,
"logits/rejected": -2.246502637863159,
"logps/chosen": -170.13156127929688,
"logps/rejected": -199.88107299804688,
"loss": 0.2133,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.4815866947174072,
"rewards/margins": 3.457195281982422,
"rewards/rejected": -4.938782215118408,
"step": 425
},
{
"epoch": 1.4700854700854702,
"grad_norm": 21.52180672361721,
"learning_rate": 5.235713437290011e-07,
"logits/chosen": -2.295285701751709,
"logits/rejected": -2.2669897079467773,
"logps/chosen": -173.1139373779297,
"logps/rejected": -211.16552734375,
"loss": 0.189,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -1.4014195203781128,
"rewards/margins": 3.7642555236816406,
"rewards/rejected": -5.165675163269043,
"step": 430
},
{
"epoch": 1.4871794871794872,
"grad_norm": 20.666518232391027,
"learning_rate": 5.145087836865213e-07,
"logits/chosen": -2.298299789428711,
"logits/rejected": -2.257429599761963,
"logps/chosen": -179.28443908691406,
"logps/rejected": -220.42227172851562,
"loss": 0.1814,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -1.3758282661437988,
"rewards/margins": 4.075347900390625,
"rewards/rejected": -5.451176643371582,
"step": 435
},
{
"epoch": 1.5042735042735043,
"grad_norm": 25.557504448431423,
"learning_rate": 5.054414502963604e-07,
"logits/chosen": -2.2477543354034424,
"logits/rejected": -2.210355758666992,
"logps/chosen": -171.3217315673828,
"logps/rejected": -210.87026977539062,
"loss": 0.1984,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.3701257705688477,
"rewards/margins": 4.071067810058594,
"rewards/rejected": -5.4411940574646,
"step": 440
},
{
"epoch": 1.5213675213675213,
"grad_norm": 24.9362563093443,
"learning_rate": 4.963723266848609e-07,
"logits/chosen": -2.230846881866455,
"logits/rejected": -2.202775716781616,
"logps/chosen": -177.6237030029297,
"logps/rejected": -215.97097778320312,
"loss": 0.1919,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.3647804260253906,
"rewards/margins": 4.160490989685059,
"rewards/rejected": -5.525271415710449,
"step": 445
},
{
"epoch": 1.5384615384615383,
"grad_norm": 24.288168777721136,
"learning_rate": 4.873043965673426e-07,
"logits/chosen": -2.2525508403778076,
"logits/rejected": -2.2148168087005615,
"logps/chosen": -175.71871948242188,
"logps/rejected": -212.42724609375,
"loss": 0.1819,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.1069761514663696,
"rewards/margins": 3.7185158729553223,
"rewards/rejected": -4.825491905212402,
"step": 450
},
{
"epoch": 1.5384615384615383,
"eval_logits/chosen": -2.261197566986084,
"eval_logits/rejected": -2.2281806468963623,
"eval_logps/chosen": -178.94309997558594,
"eval_logps/rejected": -203.29603576660156,
"eval_loss": 0.4320841133594513,
"eval_rewards/accuracies": 0.807692289352417,
"eval_rewards/chosen": -2.046576976776123,
"eval_rewards/margins": 2.394998550415039,
"eval_rewards/rejected": -4.441575050354004,
"eval_runtime": 510.5379,
"eval_samples_per_second": 16.285,
"eval_steps_per_second": 0.255,
"step": 450
},
{
"epoch": 1.5555555555555556,
"grad_norm": 26.933268532412992,
"learning_rate": 4.782406432664698e-07,
"logits/chosen": -2.26383376121521,
"logits/rejected": -2.237532377243042,
"logps/chosen": -173.788818359375,
"logps/rejected": -212.0675811767578,
"loss": 0.1977,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.4074937105178833,
"rewards/margins": 3.9105000495910645,
"rewards/rejected": -5.317993640899658,
"step": 455
},
{
"epoch": 1.5726495726495726,
"grad_norm": 23.337744592069985,
"learning_rate": 4.691840487307457e-07,
"logits/chosen": -2.299950361251831,
"logits/rejected": -2.2528157234191895,
"logps/chosen": -179.13462829589844,
"logps/rejected": -217.1257781982422,
"loss": 0.1976,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.615675926208496,
"rewards/margins": 3.8860745429992676,
"rewards/rejected": -5.5017499923706055,
"step": 460
},
{
"epoch": 1.5897435897435899,
"grad_norm": 21.62949139825628,
"learning_rate": 4.601375925534609e-07,
"logits/chosen": -2.297147512435913,
"logits/rejected": -2.263063430786133,
"logps/chosen": -173.408203125,
"logps/rejected": -216.2610321044922,
"loss": 0.1913,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -1.3588893413543701,
"rewards/margins": 4.240605354309082,
"rewards/rejected": -5.599493980407715,
"step": 465
},
{
"epoch": 1.606837606837607,
"grad_norm": 21.79188153692548,
"learning_rate": 4.5110425099241564e-07,
"logits/chosen": -2.28120756149292,
"logits/rejected": -2.2566189765930176,
"logps/chosen": -181.21665954589844,
"logps/rejected": -219.48165893554688,
"loss": 0.1652,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -1.5003584623336792,
"rewards/margins": 4.1385087966918945,
"rewards/rejected": -5.638867378234863,
"step": 470
},
{
"epoch": 1.623931623931624,
"grad_norm": 25.738944423129137,
"learning_rate": 4.4208699599073867e-07,
"logits/chosen": -2.2613961696624756,
"logits/rejected": -2.23286509513855,
"logps/chosen": -173.91307067871094,
"logps/rejected": -212.1099090576172,
"loss": 0.1659,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -1.48898184299469,
"rewards/margins": 3.877812147140503,
"rewards/rejected": -5.366794586181641,
"step": 475
},
{
"epoch": 1.641025641025641,
"grad_norm": 26.101379731114886,
"learning_rate": 4.330887941991288e-07,
"logits/chosen": -2.2699358463287354,
"logits/rejected": -2.2326736450195312,
"logps/chosen": -178.81298828125,
"logps/rejected": -221.71041870117188,
"loss": 0.187,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.581876516342163,
"rewards/margins": 4.097477436065674,
"rewards/rejected": -5.679353713989258,
"step": 480
},
{
"epoch": 1.658119658119658,
"grad_norm": 26.808664761300346,
"learning_rate": 4.241126059998332e-07,
"logits/chosen": -2.2472572326660156,
"logits/rejected": -2.229518413543701,
"logps/chosen": -174.30648803710938,
"logps/rejected": -213.32943725585938,
"loss": 0.1747,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -1.2192766666412354,
"rewards/margins": 3.972968578338623,
"rewards/rejected": -5.1922454833984375,
"step": 485
},
{
"epoch": 1.6752136752136753,
"grad_norm": 33.9138732828219,
"learning_rate": 4.151613845326911e-07,
"logits/chosen": -2.2618186473846436,
"logits/rejected": -2.229682445526123,
"logps/chosen": -176.83139038085938,
"logps/rejected": -215.8927001953125,
"loss": 0.2152,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.2077680826187134,
"rewards/margins": 3.913682222366333,
"rewards/rejected": -5.121450424194336,
"step": 490
},
{
"epoch": 1.6923076923076923,
"grad_norm": 21.688254773299708,
"learning_rate": 4.062380747235595e-07,
"logits/chosen": -2.2666947841644287,
"logits/rejected": -2.251443386077881,
"logps/chosen": -181.6000518798828,
"logps/rejected": -223.68405151367188,
"loss": 0.1913,
"rewards/accuracies": 0.90625,
"rewards/chosen": -1.144136667251587,
"rewards/margins": 3.784163236618042,
"rewards/rejected": -4.928299903869629,
"step": 495
},
{
"epoch": 1.7094017094017095,
"grad_norm": 23.63480044253807,
"learning_rate": 3.9734561231544143e-07,
"logits/chosen": -2.277580738067627,
"logits/rejected": -2.2314701080322266,
"logps/chosen": -164.60968017578125,
"logps/rejected": -206.13772583007812,
"loss": 0.1932,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.9748468399047852,
"rewards/margins": 3.8219451904296875,
"rewards/rejected": -4.7967915534973145,
"step": 500
},
{
"epoch": 1.7094017094017095,
"eval_logits/chosen": -2.296976327896118,
"eval_logits/rejected": -2.2658658027648926,
"eval_logps/chosen": -177.0738983154297,
"eval_logps/rejected": -200.2040557861328,
"eval_loss": 0.4246754050254822,
"eval_rewards/accuracies": 0.8086538314819336,
"eval_rewards/chosen": -1.859657645225525,
"eval_rewards/margins": 2.272717237472534,
"eval_rewards/rejected": -4.1323747634887695,
"eval_runtime": 510.9457,
"eval_samples_per_second": 16.272,
"eval_steps_per_second": 0.254,
"step": 500
}
],
"logging_steps": 5,
"max_steps": 876,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5895985343496192.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}