{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3675213675213675, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017094017094017096, "grad_norm": 35.00202204303521, "learning_rate": 5e-07, "logits/chosen": -2.7455849647521973, "logits/rejected": -2.7442612648010254, "logps/chosen": -164.2725830078125, "logps/rejected": -170.57113647460938, "loss": 0.6934, "rewards/accuracies": 0.23749999701976776, "rewards/chosen": 0.0026612328365445137, "rewards/margins": -0.001539617427624762, "rewards/rejected": 0.004200850613415241, "step": 5 }, { "epoch": 0.03418803418803419, "grad_norm": 36.29266486314593, "learning_rate": 1e-06, "logits/chosen": -2.709902763366699, "logits/rejected": -2.7155404090881348, "logps/chosen": -171.80032348632812, "logps/rejected": -165.20169067382812, "loss": 0.6879, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.012009668163955212, "rewards/margins": 0.0021203968208283186, "rewards/rejected": 0.009889272041618824, "step": 10 }, { "epoch": 0.05128205128205128, "grad_norm": 33.83921269470837, "learning_rate": 9.999177507263144e-07, "logits/chosen": -2.6502068042755127, "logits/rejected": -2.628007411956787, "logps/chosen": -174.082275390625, "logps/rejected": -174.13429260253906, "loss": 0.6698, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.23495244979858398, "rewards/margins": 0.1125468835234642, "rewards/rejected": 0.12240554392337799, "step": 15 }, { "epoch": 0.06837606837606838, "grad_norm": 34.14427373918799, "learning_rate": 9.996710299650301e-07, "logits/chosen": -2.473665714263916, "logits/rejected": -2.4469008445739746, "logps/chosen": -158.2163848876953, "logps/rejected": -158.0710906982422, "loss": 0.661, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.4233472943305969, "rewards/margins": 0.1434161365032196, "rewards/rejected": 0.2799311578273773, "step": 20 }, { "epoch": 0.08547008547008547, "grad_norm": 33.2696083475879, "learning_rate": 9.992599188865604e-07, "logits/chosen": -2.314507007598877, "logits/rejected": -2.3168132305145264, "logps/chosen": -150.67019653320312, "logps/rejected": -156.8417510986328, "loss": 0.6501, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.4975205063819885, "rewards/margins": 0.15743504464626312, "rewards/rejected": 0.3400854766368866, "step": 25 }, { "epoch": 0.10256410256410256, "grad_norm": 34.42253361988952, "learning_rate": 9.98684552745256e-07, "logits/chosen": -2.243194103240967, "logits/rejected": -2.251340866088867, "logps/chosen": -161.2266845703125, "logps/rejected": -161.32298278808594, "loss": 0.6289, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.4243805408477783, "rewards/margins": 0.2635195851325989, "rewards/rejected": 0.16086098551750183, "step": 30 }, { "epoch": 0.11965811965811966, "grad_norm": 31.414296706456245, "learning_rate": 9.979451208349055e-07, "logits/chosen": -2.30315899848938, "logits/rejected": -2.289762496948242, "logps/chosen": -171.71713256835938, "logps/rejected": -174.50900268554688, "loss": 0.6296, "rewards/accuracies": 0.65625, "rewards/chosen": -0.019384615123271942, "rewards/margins": 0.318477988243103, "rewards/rejected": -0.3378625512123108, "step": 35 }, { "epoch": 0.13675213675213677, "grad_norm": 32.071830655862556, "learning_rate": 9.970418664264595e-07, "logits/chosen": -2.3935599327087402, "logits/rejected": -2.3812546730041504, "logps/chosen": -171.0698699951172, "logps/rejected": -176.58578491210938, "loss": 0.5991, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.26089853048324585, "rewards/margins": 0.5235068202018738, "rewards/rejected": -0.7844053506851196, "step": 40 }, { "epoch": 0.15384615384615385, "grad_norm": 36.19466541168301, "learning_rate": 9.95975086687994e-07, "logits/chosen": -2.4914021492004395, "logits/rejected": -2.4973323345184326, "logps/chosen": -163.68099975585938, "logps/rejected": -167.174072265625, "loss": 0.6141, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2962096929550171, "rewards/margins": 0.4588828682899475, "rewards/rejected": -0.7550925016403198, "step": 45 }, { "epoch": 0.17094017094017094, "grad_norm": 31.16276115760231, "learning_rate": 9.947451325869439e-07, "logits/chosen": -2.5575203895568848, "logits/rejected": -2.557717800140381, "logps/chosen": -172.04318237304688, "logps/rejected": -177.67672729492188, "loss": 0.5777, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2069791853427887, "rewards/margins": 0.6018465757369995, "rewards/rejected": -0.808825671672821, "step": 50 }, { "epoch": 0.17094017094017094, "eval_logits/chosen": -2.5221025943756104, "eval_logits/rejected": -2.5152711868286133, "eval_logps/chosen": -163.01820373535156, "eval_logps/rejected": -169.54832458496094, "eval_loss": 0.5812540650367737, "eval_rewards/accuracies": 0.6682692170143127, "eval_rewards/chosen": -0.45408713817596436, "eval_rewards/margins": 0.6127156615257263, "eval_rewards/rejected": -1.0668028593063354, "eval_runtime": 510.3361, "eval_samples_per_second": 16.291, "eval_steps_per_second": 0.255, "step": 50 }, { "epoch": 0.18803418803418803, "grad_norm": 31.575578721339145, "learning_rate": 9.933524087746347e-07, "logits/chosen": -2.490377426147461, "logits/rejected": -2.4825081825256348, "logps/chosen": -168.06161499023438, "logps/rejected": -175.0494384765625, "loss": 0.5706, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5060762763023376, "rewards/margins": 0.7589826583862305, "rewards/rejected": -1.2650587558746338, "step": 55 }, { "epoch": 0.20512820512820512, "grad_norm": 30.171745273288415, "learning_rate": 9.917973734531549e-07, "logits/chosen": -2.48228120803833, "logits/rejected": -2.4833157062530518, "logps/chosen": -159.47142028808594, "logps/rejected": -170.63671875, "loss": 0.5753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.35752761363983154, "rewards/margins": 0.5991309881210327, "rewards/rejected": -0.9566585421562195, "step": 60 }, { "epoch": 0.2222222222222222, "grad_norm": 32.13878319029882, "learning_rate": 9.90080538224607e-07, "logits/chosen": -2.585407018661499, "logits/rejected": -2.5767769813537598, "logps/chosen": -157.43936157226562, "logps/rejected": -166.13589477539062, "loss": 0.566, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.02057185396552086, "rewards/margins": 0.47568243741989136, "rewards/rejected": -0.4962543547153473, "step": 65 }, { "epoch": 0.23931623931623933, "grad_norm": 29.494674721856043, "learning_rate": 9.882024679227938e-07, "logits/chosen": -2.6504979133605957, "logits/rejected": -2.6398470401763916, "logps/chosen": -178.0801239013672, "logps/rejected": -179.46328735351562, "loss": 0.5444, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.43436694145202637, "rewards/margins": 0.8427752256393433, "rewards/rejected": -1.27714204788208, "step": 70 }, { "epoch": 0.2564102564102564, "grad_norm": 28.856733948308104, "learning_rate": 9.861637804273881e-07, "logits/chosen": -2.660489082336426, "logits/rejected": -2.655539035797119, "logps/chosen": -162.1233673095703, "logps/rejected": -170.16131591796875, "loss": 0.5568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4032784402370453, "rewards/margins": 0.6959114074707031, "rewards/rejected": -1.0991899967193604, "step": 75 }, { "epoch": 0.27350427350427353, "grad_norm": 26.646061534818323, "learning_rate": 9.83965146460653e-07, "logits/chosen": -2.6391615867614746, "logits/rejected": -2.628577709197998, "logps/chosen": -168.58099365234375, "logps/rejected": -179.22805786132812, "loss": 0.5448, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6665827035903931, "rewards/margins": 0.8240470886230469, "rewards/rejected": -1.4906299114227295, "step": 80 }, { "epoch": 0.2905982905982906, "grad_norm": 36.04159750418885, "learning_rate": 9.816072893667758e-07, "logits/chosen": -2.6322970390319824, "logits/rejected": -2.6053385734558105, "logps/chosen": -174.82640075683594, "logps/rejected": -186.0735626220703, "loss": 0.5579, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0639268159866333, "rewards/margins": 1.0258175134658813, "rewards/rejected": -2.0897443294525146, "step": 85 }, { "epoch": 0.3076923076923077, "grad_norm": 26.922939193632168, "learning_rate": 9.790909848738904e-07, "logits/chosen": -2.60801362991333, "logits/rejected": -2.6101624965667725, "logps/chosen": -176.20538330078125, "logps/rejected": -184.7812957763672, "loss": 0.5215, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9929834604263306, "rewards/margins": 0.8646324276924133, "rewards/rejected": -1.8576160669326782, "step": 90 }, { "epoch": 0.3247863247863248, "grad_norm": 30.3564450245371, "learning_rate": 9.764170608388647e-07, "logits/chosen": -2.6054036617279053, "logits/rejected": -2.5733799934387207, "logps/chosen": -168.037109375, "logps/rejected": -174.51144409179688, "loss": 0.5197, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6652337312698364, "rewards/margins": 1.060430884361267, "rewards/rejected": -1.725664734840393, "step": 95 }, { "epoch": 0.3418803418803419, "grad_norm": 28.936164680674203, "learning_rate": 9.735863969749371e-07, "logits/chosen": -2.5255179405212402, "logits/rejected": -2.4874520301818848, "logps/chosen": -177.73861694335938, "logps/rejected": -189.82369995117188, "loss": 0.4982, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8511013984680176, "rewards/margins": 1.1354777812957764, "rewards/rejected": -1.986579179763794, "step": 100 }, { "epoch": 0.3418803418803419, "eval_logits/chosen": -2.484687328338623, "eval_logits/rejected": -2.460559368133545, "eval_logps/chosen": -168.28323364257812, "eval_logps/rejected": -180.8539276123047, "eval_loss": 0.5161151885986328, "eval_rewards/accuracies": 0.7211538553237915, "eval_rewards/chosen": -0.9805887937545776, "eval_rewards/margins": 1.2167747020721436, "eval_rewards/rejected": -2.1973636150360107, "eval_runtime": 510.3447, "eval_samples_per_second": 16.291, "eval_steps_per_second": 0.255, "step": 100 }, { "epoch": 0.358974358974359, "grad_norm": 31.089971589067016, "learning_rate": 9.705999245622956e-07, "logits/chosen": -2.4702706336975098, "logits/rejected": -2.4523651599884033, "logps/chosen": -170.59246826171875, "logps/rejected": -182.99813842773438, "loss": 0.4991, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8999192118644714, "rewards/margins": 0.8702341318130493, "rewards/rejected": -1.770153284072876, "step": 105 }, { "epoch": 0.37606837606837606, "grad_norm": 27.339023914835686, "learning_rate": 9.674586261416873e-07, "logits/chosen": -2.4866347312927246, "logits/rejected": -2.4518179893493652, "logps/chosen": -179.46290588378906, "logps/rejected": -188.7920379638672, "loss": 0.5213, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6938365697860718, "rewards/margins": 1.0765600204467773, "rewards/rejected": -1.7703965902328491, "step": 110 }, { "epoch": 0.39316239316239315, "grad_norm": 31.312902469600562, "learning_rate": 9.641635351911664e-07, "logits/chosen": -2.4456398487091064, "logits/rejected": -2.426159381866455, "logps/chosen": -170.3855438232422, "logps/rejected": -181.9676513671875, "loss": 0.4823, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8490931391716003, "rewards/margins": 1.2224478721618652, "rewards/rejected": -2.0715408325195312, "step": 115 }, { "epoch": 0.41025641025641024, "grad_norm": 25.966469642807997, "learning_rate": 9.607157357860821e-07, "logits/chosen": -2.4072113037109375, "logits/rejected": -2.3874144554138184, "logps/chosen": -187.41197204589844, "logps/rejected": -201.69454956054688, "loss": 0.5037, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0798847675323486, "rewards/margins": 1.3365159034729004, "rewards/rejected": -2.41640043258667, "step": 120 }, { "epoch": 0.42735042735042733, "grad_norm": 32.18242375190423, "learning_rate": 9.571163622424225e-07, "logits/chosen": -2.2766659259796143, "logits/rejected": -2.252072811126709, "logps/chosen": -174.78514099121094, "logps/rejected": -187.40646362304688, "loss": 0.5019, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5247443914413452, "rewards/margins": 1.2177503108978271, "rewards/rejected": -2.742494821548462, "step": 125 }, { "epoch": 0.4444444444444444, "grad_norm": 30.579550576640443, "learning_rate": 9.533665987436261e-07, "logits/chosen": -2.182610034942627, "logits/rejected": -2.128113269805908, "logps/chosen": -178.1033477783203, "logps/rejected": -197.4458465576172, "loss": 0.4957, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5623412132263184, "rewards/margins": 1.2608497142791748, "rewards/rejected": -2.823190689086914, "step": 130 }, { "epoch": 0.46153846153846156, "grad_norm": 29.287644373971865, "learning_rate": 9.494676789509899e-07, "logits/chosen": -2.1067867279052734, "logits/rejected": -2.0683400630950928, "logps/chosen": -176.67918395996094, "logps/rejected": -193.65371704101562, "loss": 0.4939, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1053364276885986, "rewards/margins": 1.3829355239868164, "rewards/rejected": -2.488272190093994, "step": 135 }, { "epoch": 0.47863247863247865, "grad_norm": 27.563555703636343, "learning_rate": 9.454208855977985e-07, "logits/chosen": -2.0855822563171387, "logits/rejected": -2.013296127319336, "logps/chosen": -178.40390014648438, "logps/rejected": -196.03305053710938, "loss": 0.4715, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.443866491317749, "rewards/margins": 1.600778341293335, "rewards/rejected": -3.044644832611084, "step": 140 }, { "epoch": 0.49572649572649574, "grad_norm": 30.234814125811326, "learning_rate": 9.41227550067308e-07, "logits/chosen": -2.0734238624572754, "logits/rejected": -2.0634400844573975, "logps/chosen": -179.51080322265625, "logps/rejected": -191.87046813964844, "loss": 0.4798, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6011661291122437, "rewards/margins": 1.445229172706604, "rewards/rejected": -3.0463955402374268, "step": 145 }, { "epoch": 0.5128205128205128, "grad_norm": 31.371346339775513, "learning_rate": 9.36889051954725e-07, "logits/chosen": -2.127821683883667, "logits/rejected": -2.080082416534424, "logps/chosen": -180.66383361816406, "logps/rejected": -196.1031494140625, "loss": 0.4954, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6335647106170654, "rewards/margins": 1.5422546863555908, "rewards/rejected": -3.1758196353912354, "step": 150 }, { "epoch": 0.5128205128205128, "eval_logits/chosen": -2.1472573280334473, "eval_logits/rejected": -2.0990829467773438, "eval_logps/chosen": -173.8290557861328, "eval_logps/rejected": -191.683349609375, "eval_loss": 0.47699737548828125, "eval_rewards/accuracies": 0.754807710647583, "eval_rewards/chosen": -1.5351712703704834, "eval_rewards/margins": 1.7451337575912476, "eval_rewards/rejected": -3.2803049087524414, "eval_runtime": 510.7048, "eval_samples_per_second": 16.279, "eval_steps_per_second": 0.255, "step": 150 }, { "epoch": 0.5299145299145299, "grad_norm": 27.790777356361556, "learning_rate": 9.324068186133245e-07, "logits/chosen": -2.1372084617614746, "logits/rejected": -2.124948024749756, "logps/chosen": -172.3369598388672, "logps/rejected": -186.5850372314453, "loss": 0.4644, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3583369255065918, "rewards/margins": 1.7439367771148682, "rewards/rejected": -3.102273464202881, "step": 155 }, { "epoch": 0.5470085470085471, "grad_norm": 27.612583401785376, "learning_rate": 9.277823246848536e-07, "logits/chosen": -2.2635793685913086, "logits/rejected": -2.2123026847839355, "logps/chosen": -186.25137329101562, "logps/rejected": -196.69564819335938, "loss": 0.4547, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2192834615707397, "rewards/margins": 1.371063470840454, "rewards/rejected": -2.5903468132019043, "step": 160 }, { "epoch": 0.5641025641025641, "grad_norm": 28.168886287584876, "learning_rate": 9.230170916143793e-07, "logits/chosen": -2.3309006690979004, "logits/rejected": -2.2978808879852295, "logps/chosen": -174.7559814453125, "logps/rejected": -195.68280029296875, "loss": 0.4967, "rewards/accuracies": 0.75, "rewards/chosen": -1.0855658054351807, "rewards/margins": 1.6249233484268188, "rewards/rejected": -2.710489273071289, "step": 165 }, { "epoch": 0.5811965811965812, "grad_norm": 28.93959851544435, "learning_rate": 9.181126871497378e-07, "logits/chosen": -2.376833915710449, "logits/rejected": -2.340681552886963, "logps/chosen": -175.3675537109375, "logps/rejected": -194.9619903564453, "loss": 0.4651, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9624043703079224, "rewards/margins": 1.7745708227157593, "rewards/rejected": -2.7369751930236816, "step": 170 }, { "epoch": 0.5982905982905983, "grad_norm": 30.43477724579486, "learning_rate": 9.130707248257491e-07, "logits/chosen": -2.458378553390503, "logits/rejected": -2.4171223640441895, "logps/chosen": -168.79849243164062, "logps/rejected": -178.6556396484375, "loss": 0.4728, "rewards/accuracies": 0.75, "rewards/chosen": -0.9254748225212097, "rewards/margins": 1.5751961469650269, "rewards/rejected": -2.500671148300171, "step": 175 }, { "epoch": 0.6153846153846154, "grad_norm": 25.220318056395065, "learning_rate": 9.078928634333698e-07, "logits/chosen": -2.4454641342163086, "logits/rejected": -2.4170265197753906, "logps/chosen": -181.41317749023438, "logps/rejected": -199.88668823242188, "loss": 0.4526, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.84205561876297, "rewards/margins": 1.7803510427474976, "rewards/rejected": -2.622406482696533, "step": 180 }, { "epoch": 0.6324786324786325, "grad_norm": 29.414031929374275, "learning_rate": 9.025808064739549e-07, "logits/chosen": -2.4103400707244873, "logits/rejected": -2.370731830596924, "logps/chosen": -178.70916748046875, "logps/rejected": -193.0004119873047, "loss": 0.4891, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.174070119857788, "rewards/margins": 1.535239338874817, "rewards/rejected": -2.7093093395233154, "step": 185 }, { "epoch": 0.6495726495726496, "grad_norm": 26.0372223221703, "learning_rate": 8.971363015988113e-07, "logits/chosen": -2.3428735733032227, "logits/rejected": -2.2986531257629395, "logps/chosen": -173.8651580810547, "logps/rejected": -194.9317626953125, "loss": 0.4643, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1425771713256836, "rewards/margins": 1.611919641494751, "rewards/rejected": -2.7544968128204346, "step": 190 }, { "epoch": 0.6666666666666666, "grad_norm": 33.87434178682573, "learning_rate": 8.91561140034225e-07, "logits/chosen": -2.2664923667907715, "logits/rejected": -2.2088184356689453, "logps/chosen": -172.7240753173828, "logps/rejected": -193.1275177001953, "loss": 0.5029, "rewards/accuracies": 0.75, "rewards/chosen": -1.3134868144989014, "rewards/margins": 1.5050963163375854, "rewards/rejected": -2.8185834884643555, "step": 195 }, { "epoch": 0.6837606837606838, "grad_norm": 25.21313391058931, "learning_rate": 8.858571559921537e-07, "logits/chosen": -2.191737174987793, "logits/rejected": -2.1188113689422607, "logps/chosen": -174.46722412109375, "logps/rejected": -188.4197540283203, "loss": 0.4567, "rewards/accuracies": 0.75, "rewards/chosen": -1.026064157485962, "rewards/margins": 1.5865710973739624, "rewards/rejected": -2.612635374069214, "step": 200 }, { "epoch": 0.6837606837606838, "eval_logits/chosen": -2.1586899757385254, "eval_logits/rejected": -2.1090493202209473, "eval_logps/chosen": -170.4287872314453, "eval_logps/rejected": -187.2865447998047, "eval_loss": 0.45979756116867065, "eval_rewards/accuracies": 0.7596153616905212, "eval_rewards/chosen": -1.1951465606689453, "eval_rewards/margins": 1.64547860622406, "eval_rewards/rejected": -2.840625047683716, "eval_runtime": 510.4854, "eval_samples_per_second": 16.286, "eval_steps_per_second": 0.255, "step": 200 }, { "epoch": 0.7008547008547008, "grad_norm": 35.24537564842276, "learning_rate": 8.800262260667754e-07, "logits/chosen": -2.1282899379730225, "logits/rejected": -2.0742104053497314, "logps/chosen": -165.1837921142578, "logps/rejected": -182.61190795898438, "loss": 0.4603, "rewards/accuracies": 0.75, "rewards/chosen": -1.3638368844985962, "rewards/margins": 1.6600275039672852, "rewards/rejected": -3.023864507675171, "step": 205 }, { "epoch": 0.717948717948718, "grad_norm": 28.161564325401752, "learning_rate": 8.740702686170954e-07, "logits/chosen": -2.131873369216919, "logits/rejected": -2.0582406520843506, "logps/chosen": -184.79843139648438, "logps/rejected": -201.92471313476562, "loss": 0.4458, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8222748041152954, "rewards/margins": 1.8169399499893188, "rewards/rejected": -3.639214277267456, "step": 210 }, { "epoch": 0.7350427350427351, "grad_norm": 27.38246704123344, "learning_rate": 8.679912431358109e-07, "logits/chosen": -2.149195909500122, "logits/rejected": -2.0881500244140625, "logps/chosen": -176.96768188476562, "logps/rejected": -193.83583068847656, "loss": 0.4503, "rewards/accuracies": 0.75, "rewards/chosen": -1.8590911626815796, "rewards/margins": 1.9566419124603271, "rewards/rejected": -3.8157334327697754, "step": 215 }, { "epoch": 0.7521367521367521, "grad_norm": 26.680467589239278, "learning_rate": 8.617911496046445e-07, "logits/chosen": -2.1819872856140137, "logits/rejected": -2.1153197288513184, "logps/chosen": -172.77716064453125, "logps/rejected": -191.78298950195312, "loss": 0.4676, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7457681894302368, "rewards/margins": 1.8468406200408936, "rewards/rejected": -3.592608690261841, "step": 220 }, { "epoch": 0.7692307692307693, "grad_norm": 27.17086789362823, "learning_rate": 8.554720278363547e-07, "logits/chosen": -2.2397422790527344, "logits/rejected": -2.199204444885254, "logps/chosen": -173.76963806152344, "logps/rejected": -191.92172241210938, "loss": 0.474, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4509254693984985, "rewards/margins": 1.7614561319351196, "rewards/rejected": -3.212381362915039, "step": 225 }, { "epoch": 0.7863247863247863, "grad_norm": 34.41944082003437, "learning_rate": 8.490359568036445e-07, "logits/chosen": -2.3623924255371094, "logits/rejected": -2.332777976989746, "logps/chosen": -181.22314453125, "logps/rejected": -203.68490600585938, "loss": 0.458, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3178858757019043, "rewards/margins": 1.6162595748901367, "rewards/rejected": -2.934145450592041, "step": 230 }, { "epoch": 0.8034188034188035, "grad_norm": 26.240821537730366, "learning_rate": 8.424850539551856e-07, "logits/chosen": -2.457951068878174, "logits/rejected": -2.430043935775757, "logps/chosen": -174.00967407226562, "logps/rejected": -190.4671173095703, "loss": 0.4457, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3960864543914795, "rewards/margins": 1.7217321395874023, "rewards/rejected": -3.117818593978882, "step": 235 }, { "epoch": 0.8205128205128205, "grad_norm": 23.836616559503103, "learning_rate": 8.358214745189829e-07, "logits/chosen": -2.5360074043273926, "logits/rejected": -2.502570867538452, "logps/chosen": -183.65618896484375, "logps/rejected": -204.5669403076172, "loss": 0.4236, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.516182541847229, "rewards/margins": 2.1192710399627686, "rewards/rejected": -3.635453462600708, "step": 240 }, { "epoch": 0.8376068376068376, "grad_norm": 26.25101678311949, "learning_rate": 8.290474107933114e-07, "logits/chosen": -2.5879926681518555, "logits/rejected": -2.5638298988342285, "logps/chosen": -184.1548614501953, "logps/rejected": -202.95677185058594, "loss": 0.4494, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7255306243896484, "rewards/margins": 1.9638643264770508, "rewards/rejected": -3.68939471244812, "step": 245 }, { "epoch": 0.8547008547008547, "grad_norm": 28.730820152276525, "learning_rate": 8.221650914254565e-07, "logits/chosen": -2.592283248901367, "logits/rejected": -2.561645984649658, "logps/chosen": -183.74407958984375, "logps/rejected": -195.3282470703125, "loss": 0.4873, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9939171075820923, "rewards/margins": 1.590651273727417, "rewards/rejected": -3.584568500518799, "step": 250 }, { "epoch": 0.8547008547008547, "eval_logits/chosen": -2.5724236965179443, "eval_logits/rejected": -2.545708417892456, "eval_logps/chosen": -177.6819305419922, "eval_logps/rejected": -195.5203399658203, "eval_loss": 0.44874703884124756, "eval_rewards/accuracies": 0.7634615302085876, "eval_rewards/chosen": -1.9204591512680054, "eval_rewards/margins": 1.743545413017273, "eval_rewards/rejected": -3.664004325866699, "eval_runtime": 510.735, "eval_samples_per_second": 16.278, "eval_steps_per_second": 0.255, "step": 250 }, { "epoch": 0.8717948717948718, "grad_norm": 26.28610055865234, "learning_rate": 8.151767806784953e-07, "logits/chosen": -2.5476808547973633, "logits/rejected": -2.5247130393981934, "logps/chosen": -188.99472045898438, "logps/rejected": -200.0706329345703, "loss": 0.4745, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.8807718753814697, "rewards/margins": 1.3770544528961182, "rewards/rejected": -3.257826328277588, "step": 255 }, { "epoch": 0.8888888888888888, "grad_norm": 31.895303400766686, "learning_rate": 8.080847776863608e-07, "logits/chosen": -2.523097038269043, "logits/rejected": -2.4968159198760986, "logps/chosen": -187.11276245117188, "logps/rejected": -202.34288024902344, "loss": 0.4478, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.120813250541687, "rewards/margins": 2.0744528770446777, "rewards/rejected": -3.1952662467956543, "step": 260 }, { "epoch": 0.905982905982906, "grad_norm": 26.320681707210774, "learning_rate": 8.008914156974333e-07, "logits/chosen": -2.462744951248169, "logits/rejected": -2.4353935718536377, "logps/chosen": -173.44297790527344, "logps/rejected": -195.9517822265625, "loss": 0.4374, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.8408319354057312, "rewards/margins": 1.7711703777313232, "rewards/rejected": -2.612001895904541, "step": 265 }, { "epoch": 0.9230769230769231, "grad_norm": 26.997987002073426, "learning_rate": 7.935990613069086e-07, "logits/chosen": -2.4033281803131104, "logits/rejected": -2.369520425796509, "logps/chosen": -171.97085571289062, "logps/rejected": -191.66697692871094, "loss": 0.4474, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8512971997261047, "rewards/margins": 1.844062089920044, "rewards/rejected": -2.695359230041504, "step": 270 }, { "epoch": 0.9401709401709402, "grad_norm": 27.58725138274707, "learning_rate": 7.862101136781946e-07, "logits/chosen": -2.3467745780944824, "logits/rejected": -2.3106236457824707, "logps/chosen": -171.92318725585938, "logps/rejected": -194.26400756835938, "loss": 0.4207, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.4691975116729736, "rewards/margins": 1.7321897745132446, "rewards/rejected": -3.201387405395508, "step": 275 }, { "epoch": 0.9572649572649573, "grad_norm": 28.716376412825436, "learning_rate": 7.78727003753595e-07, "logits/chosen": -2.287221670150757, "logits/rejected": -2.260993480682373, "logps/chosen": -175.48435974121094, "logps/rejected": -199.2796173095703, "loss": 0.4469, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8219757080078125, "rewards/margins": 2.2368292808532715, "rewards/rejected": -4.058804512023926, "step": 280 }, { "epoch": 0.9743589743589743, "grad_norm": 22.896854237985796, "learning_rate": 7.711521934545342e-07, "logits/chosen": -2.2684969902038574, "logits/rejected": -2.2303268909454346, "logps/chosen": -187.36607360839844, "logps/rejected": -207.65261840820312, "loss": 0.4153, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6430988311767578, "rewards/margins": 2.1586105823516846, "rewards/rejected": -3.8017096519470215, "step": 285 }, { "epoch": 0.9914529914529915, "grad_norm": 27.30975328138857, "learning_rate": 7.63488174871594e-07, "logits/chosen": -2.2886035442352295, "logits/rejected": -2.2276930809020996, "logps/chosen": -183.37933349609375, "logps/rejected": -201.49429321289062, "loss": 0.4079, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3509235382080078, "rewards/margins": 2.130119800567627, "rewards/rejected": -3.481043577194214, "step": 290 }, { "epoch": 1.0085470085470085, "grad_norm": 20.392666769639558, "learning_rate": 7.557374694446221e-07, "logits/chosen": -2.264070510864258, "logits/rejected": -2.248257637023926, "logps/chosen": -170.48977661132812, "logps/rejected": -192.82720947265625, "loss": 0.3189, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9297264814376831, "rewards/margins": 2.2484724521636963, "rewards/rejected": -3.1781985759735107, "step": 295 }, { "epoch": 1.0256410256410255, "grad_norm": 16.68267756771692, "learning_rate": 7.479026271331863e-07, "logits/chosen": -2.3048789501190186, "logits/rejected": -2.2495861053466797, "logps/chosen": -170.22055053710938, "logps/rejected": -198.54359436035156, "loss": 0.2176, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.4595426619052887, "rewards/margins": 2.989124059677124, "rewards/rejected": -3.44866681098938, "step": 300 }, { "epoch": 1.0256410256410255, "eval_logits/chosen": -2.3489644527435303, "eval_logits/rejected": -2.3130385875701904, "eval_logps/chosen": -170.4687957763672, "eval_logps/rejected": -190.08233642578125, "eval_loss": 0.4382702112197876, "eval_rewards/accuracies": 0.7846153974533081, "eval_rewards/chosen": -1.1991456747055054, "eval_rewards/margins": 1.9210587739944458, "eval_rewards/rejected": -3.1202046871185303, "eval_runtime": 511.3486, "eval_samples_per_second": 16.259, "eval_steps_per_second": 0.254, "step": 300 }, { "epoch": 1.0427350427350428, "grad_norm": 18.410064070731565, "learning_rate": 7.399862255776448e-07, "logits/chosen": -2.331937074661255, "logits/rejected": -2.315340995788574, "logps/chosen": -161.1502227783203, "logps/rejected": -192.48519897460938, "loss": 0.2177, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.7098430395126343, "rewards/margins": 2.818021535873413, "rewards/rejected": -3.527864933013916, "step": 305 }, { "epoch": 1.0598290598290598, "grad_norm": 21.826180204028997, "learning_rate": 7.319908692511102e-07, "logits/chosen": -2.384028911590576, "logits/rejected": -2.3547444343566895, "logps/chosen": -168.48654174804688, "logps/rejected": -203.76600646972656, "loss": 0.2374, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.6590297818183899, "rewards/margins": 3.3696398735046387, "rewards/rejected": -4.028670310974121, "step": 310 }, { "epoch": 1.0769230769230769, "grad_norm": 19.05079845541507, "learning_rate": 7.239191886025853e-07, "logits/chosen": -2.4289774894714355, "logits/rejected": -2.406567096710205, "logps/chosen": -175.2897186279297, "logps/rejected": -207.41915893554688, "loss": 0.2069, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.7591577768325806, "rewards/margins": 3.679530620574951, "rewards/rejected": -4.438688278198242, "step": 315 }, { "epoch": 1.0940170940170941, "grad_norm": 20.698533938628618, "learning_rate": 7.15773839191553e-07, "logits/chosen": -2.4449350833892822, "logits/rejected": -2.4075865745544434, "logps/chosen": -166.36514282226562, "logps/rejected": -199.648193359375, "loss": 0.2229, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.6848402619361877, "rewards/margins": 3.1617534160614014, "rewards/rejected": -3.8465933799743652, "step": 320 }, { "epoch": 1.1111111111111112, "grad_norm": 23.234017756053024, "learning_rate": 7.075575008143054e-07, "logits/chosen": -2.4447779655456543, "logits/rejected": -2.4182941913604736, "logps/chosen": -173.80941772460938, "logps/rejected": -208.5805206298828, "loss": 0.2179, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7078952789306641, "rewards/margins": 3.346341609954834, "rewards/rejected": -4.05423641204834, "step": 325 }, { "epoch": 1.1282051282051282, "grad_norm": 20.797265468675864, "learning_rate": 6.99272876622298e-07, "logits/chosen": -2.4514238834381104, "logits/rejected": -2.4154598712921143, "logps/chosen": -175.63339233398438, "logps/rejected": -210.8448944091797, "loss": 0.2255, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.5788818597793579, "rewards/margins": 3.4709906578063965, "rewards/rejected": -4.049872875213623, "step": 330 }, { "epoch": 1.1452991452991452, "grad_norm": 22.69234920407025, "learning_rate": 6.909226922328211e-07, "logits/chosen": -2.4148731231689453, "logits/rejected": -2.396899461746216, "logps/chosen": -174.77548217773438, "logps/rejected": -205.1797332763672, "loss": 0.2147, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.9849351644515991, "rewards/margins": 3.1783816814422607, "rewards/rejected": -4.1633172035217285, "step": 335 }, { "epoch": 1.1623931623931625, "grad_norm": 21.207875658566053, "learning_rate": 6.82509694832279e-07, "logits/chosen": -2.411642551422119, "logits/rejected": -2.3832614421844482, "logps/chosen": -174.6155242919922, "logps/rejected": -210.0262451171875, "loss": 0.2199, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.9470588564872742, "rewards/margins": 3.335145950317383, "rewards/rejected": -4.282204627990723, "step": 340 }, { "epoch": 1.1794871794871795, "grad_norm": 22.62073459229978, "learning_rate": 6.740366522723752e-07, "logits/chosen": -2.4228436946868896, "logits/rejected": -2.3924076557159424, "logps/chosen": -181.4166259765625, "logps/rejected": -214.4353485107422, "loss": 0.1949, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.1724941730499268, "rewards/margins": 3.5335402488708496, "rewards/rejected": -4.7060346603393555, "step": 345 }, { "epoch": 1.1965811965811965, "grad_norm": 24.043667365803458, "learning_rate": 6.655063521594949e-07, "logits/chosen": -2.4030003547668457, "logits/rejected": -2.3506176471710205, "logps/chosen": -175.32406616210938, "logps/rejected": -218.69326782226562, "loss": 0.2095, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.6340477466583252, "rewards/margins": 4.079660415649414, "rewards/rejected": -5.713706970214844, "step": 350 }, { "epoch": 1.1965811965811965, "eval_logits/chosen": -2.3941731452941895, "eval_logits/rejected": -2.3655929565429688, "eval_logps/chosen": -182.0218963623047, "eval_logps/rejected": -207.61233520507812, "eval_loss": 0.4536636769771576, "eval_rewards/accuracies": 0.7932692170143127, "eval_rewards/chosen": -2.3544540405273438, "eval_rewards/margins": 2.5187504291534424, "eval_rewards/rejected": -4.873203754425049, "eval_runtime": 510.8381, "eval_samples_per_second": 16.275, "eval_steps_per_second": 0.254, "step": 350 }, { "epoch": 1.2136752136752136, "grad_norm": 21.542790573622174, "learning_rate": 6.569216009375929e-07, "logits/chosen": -2.3871009349823, "logits/rejected": -2.3618922233581543, "logps/chosen": -174.978271484375, "logps/rejected": -215.9545440673828, "loss": 0.191, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.4307578802108765, "rewards/margins": 4.03377628326416, "rewards/rejected": -5.464534759521484, "step": 355 }, { "epoch": 1.2307692307692308, "grad_norm": 21.574671776242766, "learning_rate": 6.482852229648801e-07, "logits/chosen": -2.388662576675415, "logits/rejected": -2.3574576377868652, "logps/chosen": -173.49014282226562, "logps/rejected": -204.75392150878906, "loss": 0.1953, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.2610100507736206, "rewards/margins": 3.369713544845581, "rewards/rejected": -4.630723476409912, "step": 360 }, { "epoch": 1.2478632478632479, "grad_norm": 18.919534146199894, "learning_rate": 6.396000595846187e-07, "logits/chosen": -2.393681049346924, "logits/rejected": -2.3899941444396973, "logps/chosen": -177.26788330078125, "logps/rejected": -204.59396362304688, "loss": 0.205, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.1454333066940308, "rewards/margins": 3.212463855743408, "rewards/rejected": -4.35789680480957, "step": 365 }, { "epoch": 1.264957264957265, "grad_norm": 20.063931866077127, "learning_rate": 6.30868968190328e-07, "logits/chosen": -2.399217128753662, "logits/rejected": -2.3884811401367188, "logps/chosen": -173.6759490966797, "logps/rejected": -208.24868774414062, "loss": 0.1916, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2132608890533447, "rewards/margins": 3.5065104961395264, "rewards/rejected": -4.719771862030029, "step": 370 }, { "epoch": 1.282051282051282, "grad_norm": 19.65973417080842, "learning_rate": 6.220948212857111e-07, "logits/chosen": -2.3873801231384277, "logits/rejected": -2.36773419380188, "logps/chosen": -176.6146240234375, "logps/rejected": -217.32119750976562, "loss": 0.1865, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.2807495594024658, "rewards/margins": 3.8207836151123047, "rewards/rejected": -5.101532459259033, "step": 375 }, { "epoch": 1.2991452991452992, "grad_norm": 22.23986474829352, "learning_rate": 6.13280505539608e-07, "logits/chosen": -2.38179349899292, "logits/rejected": -2.3515422344207764, "logps/chosen": -188.8070068359375, "logps/rejected": -235.27456665039062, "loss": 0.2025, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.6675641536712646, "rewards/margins": 4.054324150085449, "rewards/rejected": -5.721888542175293, "step": 380 }, { "epoch": 1.3162393162393162, "grad_norm": 25.60349271225297, "learning_rate": 6.044289208362914e-07, "logits/chosen": -2.3684115409851074, "logits/rejected": -2.3191847801208496, "logps/chosen": -178.47006225585938, "logps/rejected": -219.3975372314453, "loss": 0.1953, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7480275630950928, "rewards/margins": 3.8794853687286377, "rewards/rejected": -5.6275129318237305, "step": 385 }, { "epoch": 1.3333333333333333, "grad_norm": 26.515708025155362, "learning_rate": 5.955429793214128e-07, "logits/chosen": -2.3351917266845703, "logits/rejected": -2.3046581745147705, "logps/chosen": -185.07766723632812, "logps/rejected": -222.58560180664062, "loss": 0.2143, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.9629478454589844, "rewards/margins": 3.649775981903076, "rewards/rejected": -5.612724304199219, "step": 390 }, { "epoch": 1.3504273504273505, "grad_norm": 22.854515057042835, "learning_rate": 5.866256044439142e-07, "logits/chosen": -2.3319296836853027, "logits/rejected": -2.317962169647217, "logps/chosen": -176.97047424316406, "logps/rejected": -215.57284545898438, "loss": 0.2196, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.8355095386505127, "rewards/margins": 3.851968765258789, "rewards/rejected": -5.687478065490723, "step": 395 }, { "epoch": 1.3675213675213675, "grad_norm": 19.88579820520089, "learning_rate": 5.776797299942235e-07, "logits/chosen": -2.33012056350708, "logits/rejected": -2.312140941619873, "logps/chosen": -172.976806640625, "logps/rejected": -211.60244750976562, "loss": 0.1952, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.3092162609100342, "rewards/margins": 3.7678093910217285, "rewards/rejected": -5.077025890350342, "step": 400 }, { "epoch": 1.3675213675213675, "eval_logits/chosen": -2.3361263275146484, "eval_logits/rejected": -2.3058180809020996, "eval_logps/chosen": -178.1995391845703, "eval_logps/rejected": -200.75048828125, "eval_loss": 0.43530720472335815, "eval_rewards/accuracies": 0.7961538434028625, "eval_rewards/chosen": -1.972220540046692, "eval_rewards/margins": 2.21479868888855, "eval_rewards/rejected": -4.1870198249816895, "eval_runtime": 510.4272, "eval_samples_per_second": 16.288, "eval_steps_per_second": 0.255, "step": 400 } ], "logging_steps": 5, "max_steps": 876, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4716694698196992.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }