{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9906542056074765, "eval_steps": 50, "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06230529595015576, "grad_norm": 64.78196225600344, "learning_rate": 5e-07, "logits/chosen": -2.7243542671203613, "logits/rejected": -2.7354743480682373, "logps/chosen": -260.3916320800781, "logps/rejected": -244.31298828125, "loss": 0.6904, "rewards/accuracies": 0.3062500059604645, "rewards/chosen": 0.012873289175331593, "rewards/margins": 0.0037455155979841948, "rewards/rejected": 0.009127774275839329, "step": 5 }, { "epoch": 0.12461059190031153, "grad_norm": 45.8994374201786, "learning_rate": 1e-06, "logits/chosen": -2.6493823528289795, "logits/rejected": -2.650038957595825, "logps/chosen": -234.55868530273438, "logps/rejected": -202.4860076904297, "loss": 0.6405, "rewards/accuracies": 0.59375, "rewards/chosen": 0.4499489665031433, "rewards/margins": 0.15348409116268158, "rewards/rejected": 0.29646486043930054, "step": 10 }, { "epoch": 0.18691588785046728, "grad_norm": 39.36900659105447, "learning_rate": 9.988343845952696e-07, "logits/chosen": -2.487607479095459, "logits/rejected": -2.481687068939209, "logps/chosen": -227.55001831054688, "logps/rejected": -217.2289581298828, "loss": 0.6294, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.2727148532867432, "rewards/margins": 0.4717913269996643, "rewards/rejected": 0.8009236454963684, "step": 15 }, { "epoch": 0.24922118380062305, "grad_norm": 48.3442406976696, "learning_rate": 9.953429730181652e-07, "logits/chosen": -2.409071683883667, "logits/rejected": -2.3922438621520996, "logps/chosen": -244.6020965576172, "logps/rejected": -230.30615234375, "loss": 0.6128, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.3646974563598633, "rewards/margins": 0.8445035815238953, "rewards/rejected": 0.5201937556266785, "step": 20 }, { "epoch": 0.3115264797507788, "grad_norm": 43.25375347405899, "learning_rate": 9.895420438411615e-07, "logits/chosen": -2.404625654220581, "logits/rejected": -2.380873203277588, "logps/chosen": -260.6168212890625, "logps/rejected": -226.8410186767578, "loss": 0.5858, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 1.0510780811309814, "rewards/margins": 1.0232232809066772, "rewards/rejected": 0.027854669839143753, "step": 25 }, { "epoch": 0.37383177570093457, "grad_norm": 35.13451219232732, "learning_rate": 9.814586436738997e-07, "logits/chosen": -2.457035779953003, "logits/rejected": -2.448774576187134, "logps/chosen": -257.907470703125, "logps/rejected": -218.4612579345703, "loss": 0.5578, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.1478369235992432, "rewards/margins": 1.3836512565612793, "rewards/rejected": -0.23581421375274658, "step": 30 }, { "epoch": 0.43613707165109034, "grad_norm": 41.912297815106236, "learning_rate": 9.711304610594102e-07, "logits/chosen": -2.485905408859253, "logits/rejected": -2.459043025970459, "logps/chosen": -246.7182159423828, "logps/rejected": -219.68246459960938, "loss": 0.593, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6787748336791992, "rewards/margins": 1.0257090330123901, "rewards/rejected": -0.34693413972854614, "step": 35 }, { "epoch": 0.4984423676012461, "grad_norm": 47.725973440299406, "learning_rate": 9.586056507527264e-07, "logits/chosen": -2.4728102684020996, "logits/rejected": -2.4659764766693115, "logps/chosen": -242.521728515625, "logps/rejected": -244.7397003173828, "loss": 0.5851, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.7883692979812622, "rewards/margins": 1.2785449028015137, "rewards/rejected": -0.4901755452156067, "step": 40 }, { "epoch": 0.5607476635514018, "grad_norm": 36.45457422056591, "learning_rate": 9.439426092011875e-07, "logits/chosen": -2.435615301132202, "logits/rejected": -2.4443929195404053, "logps/chosen": -280.24755859375, "logps/rejected": -212.96292114257812, "loss": 0.558, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.2600438594818115, "rewards/margins": 1.4364185333251953, "rewards/rejected": -0.17637479305267334, "step": 45 }, { "epoch": 0.6230529595015576, "grad_norm": 37.12133742483065, "learning_rate": 9.272097022732443e-07, "logits/chosen": -2.447756052017212, "logits/rejected": -2.4266324043273926, "logps/chosen": -246.95840454101562, "logps/rejected": -214.29959106445312, "loss": 0.5451, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 1.2870022058486938, "rewards/margins": 1.596084475517273, "rewards/rejected": -0.3090822994709015, "step": 50 }, { "epoch": 0.6230529595015576, "eval_logits/chosen": -2.4445273876190186, "eval_logits/rejected": -2.4265356063842773, "eval_logps/chosen": -268.5102844238281, "eval_logps/rejected": -221.56869506835938, "eval_loss": 0.5847126245498657, "eval_rewards/accuracies": 0.7465277910232544, "eval_rewards/chosen": 1.2566064596176147, "eval_rewards/margins": 1.209053635597229, "eval_rewards/rejected": 0.04755274951457977, "eval_runtime": 151.9316, "eval_samples_per_second": 15.007, "eval_steps_per_second": 0.237, "step": 50 }, { "epoch": 0.6853582554517134, "grad_norm": 40.79262464200207, "learning_rate": 9.084849465052209e-07, "logits/chosen": -2.3776774406433105, "logits/rejected": -2.3956453800201416, "logps/chosen": -257.2913513183594, "logps/rejected": -197.3280029296875, "loss": 0.5601, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 1.311511516571045, "rewards/margins": 1.5678449869155884, "rewards/rejected": -0.2563334107398987, "step": 55 }, { "epoch": 0.7476635514018691, "grad_norm": 48.34269415680465, "learning_rate": 8.878556453522099e-07, "logits/chosen": -2.3521735668182373, "logits/rejected": -2.3181591033935547, "logps/chosen": -233.2803497314453, "logps/rejected": -218.5789031982422, "loss": 0.5833, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7265979647636414, "rewards/margins": 1.4349805116653442, "rewards/rejected": -0.7083825469017029, "step": 60 }, { "epoch": 0.8099688473520249, "grad_norm": 37.4927506245011, "learning_rate": 8.654179821390621e-07, "logits/chosen": -2.362112522125244, "logits/rejected": -2.3338117599487305, "logps/chosen": -267.119140625, "logps/rejected": -216.8646240234375, "loss": 0.5886, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.6950536966323853, "rewards/margins": 1.5640451908111572, "rewards/rejected": -0.8689913749694824, "step": 65 }, { "epoch": 0.8722741433021807, "grad_norm": 39.43481962342951, "learning_rate": 8.41276571609327e-07, "logits/chosen": -2.3775908946990967, "logits/rejected": -2.365400552749634, "logps/chosen": -277.951904296875, "logps/rejected": -225.5980682373047, "loss": 0.5456, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.0310388803482056, "rewards/margins": 1.5813945531845093, "rewards/rejected": -0.5503557324409485, "step": 70 }, { "epoch": 0.9345794392523364, "grad_norm": 41.2092804317616, "learning_rate": 8.155439721630264e-07, "logits/chosen": -2.337092638015747, "logits/rejected": -2.31592059135437, "logps/chosen": -252.63143920898438, "logps/rejected": -201.25433349609375, "loss": 0.5874, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.8362587690353394, "rewards/margins": 1.4375728368759155, "rewards/rejected": -0.6013139486312866, "step": 75 }, { "epoch": 0.9968847352024922, "grad_norm": 32.58419185513781, "learning_rate": 7.883401610574336e-07, "logits/chosen": -2.255605459213257, "logits/rejected": -2.23878812789917, "logps/chosen": -257.6908264160156, "logps/rejected": -198.56063842773438, "loss": 0.5088, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.8411666750907898, "rewards/margins": 1.4409376382827759, "rewards/rejected": -0.5997709035873413, "step": 80 }, { "epoch": 1.0591900311526479, "grad_norm": 19.724965519418905, "learning_rate": 7.597919750177168e-07, "logits/chosen": -2.2557170391082764, "logits/rejected": -2.2346577644348145, "logps/chosen": -261.6788330078125, "logps/rejected": -238.0162353515625, "loss": 0.2444, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7993049621582031, "rewards/margins": 3.0407943725585938, "rewards/rejected": -1.2414895296096802, "step": 85 }, { "epoch": 1.1214953271028036, "grad_norm": 22.70642349887756, "learning_rate": 7.30032518865576e-07, "logits/chosen": -2.3579375743865967, "logits/rejected": -2.339137554168701, "logps/chosen": -234.2419891357422, "logps/rejected": -222.1098175048828, "loss": 0.2253, "rewards/accuracies": 0.9375, "rewards/chosen": 2.0569443702697754, "rewards/margins": 3.1539688110351562, "rewards/rejected": -1.0970245599746704, "step": 90 }, { "epoch": 1.1838006230529594, "grad_norm": 26.513380455541846, "learning_rate": 6.992005449231207e-07, "logits/chosen": -2.4564146995544434, "logits/rejected": -2.480577230453491, "logps/chosen": -241.68600463867188, "logps/rejected": -214.6504364013672, "loss": 0.2597, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.810720682144165, "rewards/margins": 3.2071242332458496, "rewards/rejected": -1.3964035511016846, "step": 95 }, { "epoch": 1.2461059190031152, "grad_norm": 25.112538207360036, "learning_rate": 6.67439806085493e-07, "logits/chosen": -2.5166354179382324, "logits/rejected": -2.513838291168213, "logps/chosen": -245.987548828125, "logps/rejected": -252.9072265625, "loss": 0.2411, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6626945734024048, "rewards/margins": 3.801987409591675, "rewards/rejected": -2.1392927169799805, "step": 100 }, { "epoch": 1.2461059190031152, "eval_logits/chosen": -2.5858781337738037, "eval_logits/rejected": -2.563420534133911, "eval_logps/chosen": -270.93853759765625, "eval_logps/rejected": -233.65652465820312, "eval_loss": 0.5330603718757629, "eval_rewards/accuracies": 0.7881944179534912, "eval_rewards/chosen": 1.0137810707092285, "eval_rewards/margins": 2.17501163482666, "eval_rewards/rejected": -1.1612308025360107, "eval_runtime": 151.6106, "eval_samples_per_second": 15.039, "eval_steps_per_second": 0.237, "step": 100 }, { "epoch": 1.308411214953271, "grad_norm": 21.36224069653613, "learning_rate": 6.348983855785121e-07, "logits/chosen": -2.5647153854370117, "logits/rejected": -2.575026273727417, "logps/chosen": -243.7207489013672, "logps/rejected": -250.43545532226562, "loss": 0.2593, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4892621040344238, "rewards/margins": 3.568880796432495, "rewards/rejected": -2.0796191692352295, "step": 105 }, { "epoch": 1.3707165109034267, "grad_norm": 20.24131121354291, "learning_rate": 6.01728006526317e-07, "logits/chosen": -2.5996501445770264, "logits/rejected": -2.613658905029297, "logps/chosen": -234.7429962158203, "logps/rejected": -225.66238403320312, "loss": 0.251, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5861287117004395, "rewards/margins": 3.3715546131134033, "rewards/rejected": -1.7854257822036743, "step": 110 }, { "epoch": 1.4330218068535825, "grad_norm": 20.01846071105953, "learning_rate": 5.680833245481234e-07, "logits/chosen": -2.6137535572052, "logits/rejected": -2.6190667152404785, "logps/chosen": -240.59274291992188, "logps/rejected": -214.5608367919922, "loss": 0.2555, "rewards/accuracies": 0.9375, "rewards/chosen": 2.0559613704681396, "rewards/margins": 3.704247236251831, "rewards/rejected": -1.6482856273651123, "step": 115 }, { "epoch": 1.4953271028037383, "grad_norm": 20.827725724325912, "learning_rate": 5.341212066823355e-07, "logits/chosen": -2.6061787605285645, "logits/rejected": -2.5920939445495605, "logps/chosen": -247.16384887695312, "logps/rejected": -219.4833526611328, "loss": 0.2356, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.6824995279312134, "rewards/margins": 3.495360851287842, "rewards/rejected": -1.812861442565918, "step": 120 }, { "epoch": 1.557632398753894, "grad_norm": 23.949899148725493, "learning_rate": 5e-07, "logits/chosen": -2.5854973793029785, "logits/rejected": -2.567333936691284, "logps/chosen": -265.8291931152344, "logps/rejected": -234.7073974609375, "loss": 0.2704, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.8032957315444946, "rewards/margins": 3.318470001220703, "rewards/rejected": -1.5151745080947876, "step": 125 }, { "epoch": 1.6199376947040498, "grad_norm": 24.77516023906911, "learning_rate": 4.6587879331766457e-07, "logits/chosen": -2.5356271266937256, "logits/rejected": -2.5272650718688965, "logps/chosen": -213.00497436523438, "logps/rejected": -224.9855194091797, "loss": 0.2557, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8242075443267822, "rewards/margins": 3.5226008892059326, "rewards/rejected": -1.69839346408844, "step": 130 }, { "epoch": 1.6822429906542056, "grad_norm": 24.357268196093703, "learning_rate": 4.3191667545187675e-07, "logits/chosen": -2.485233783721924, "logits/rejected": -2.4866693019866943, "logps/chosen": -238.76327514648438, "logps/rejected": -205.4741973876953, "loss": 0.3279, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.1047568321228027, "rewards/margins": 3.5537636280059814, "rewards/rejected": -1.4490069150924683, "step": 135 }, { "epoch": 1.7445482866043613, "grad_norm": 28.92490643539179, "learning_rate": 3.9827199347368317e-07, "logits/chosen": -2.4208996295928955, "logits/rejected": -2.425780773162842, "logps/chosen": -253.2147674560547, "logps/rejected": -217.5721893310547, "loss": 0.2906, "rewards/accuracies": 0.875, "rewards/chosen": 1.5468943119049072, "rewards/margins": 3.1680281162261963, "rewards/rejected": -1.621133804321289, "step": 140 }, { "epoch": 1.8068535825545171, "grad_norm": 22.981053169106552, "learning_rate": 3.651016144214878e-07, "logits/chosen": -2.4239203929901123, "logits/rejected": -2.3991637229919434, "logps/chosen": -270.43402099609375, "logps/rejected": -230.588134765625, "loss": 0.2683, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 2.071660280227661, "rewards/margins": 3.900665760040283, "rewards/rejected": -1.829005241394043, "step": 145 }, { "epoch": 1.8691588785046729, "grad_norm": 24.36269176225689, "learning_rate": 3.325601939145069e-07, "logits/chosen": -2.3990695476531982, "logits/rejected": -2.389209270477295, "logps/chosen": -237.0928955078125, "logps/rejected": -214.083984375, "loss": 0.2838, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.1656646728515625, "rewards/margins": 3.661673069000244, "rewards/rejected": -1.4960081577301025, "step": 150 }, { "epoch": 1.8691588785046729, "eval_logits/chosen": -2.4266276359558105, "eval_logits/rejected": -2.3947973251342773, "eval_logps/chosen": -268.1986083984375, "eval_logps/rejected": -233.29901123046875, "eval_loss": 0.5178025960922241, "eval_rewards/accuracies": 0.7986111044883728, "eval_rewards/chosen": 1.287774920463562, "eval_rewards/margins": 2.413252830505371, "eval_rewards/rejected": -1.1254781484603882, "eval_runtime": 151.6475, "eval_samples_per_second": 15.035, "eval_steps_per_second": 0.237, "step": 150 }, { "epoch": 1.9314641744548287, "grad_norm": 22.358701210505725, "learning_rate": 3.007994550768793e-07, "logits/chosen": -2.397007703781128, "logits/rejected": -2.3971073627471924, "logps/chosen": -242.742919921875, "logps/rejected": -243.31558227539062, "loss": 0.3023, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.8354260921478271, "rewards/margins": 3.713207960128784, "rewards/rejected": -1.8777821063995361, "step": 155 }, { "epoch": 1.9937694704049844, "grad_norm": 18.438345971642573, "learning_rate": 2.699674811344239e-07, "logits/chosen": -2.359415054321289, "logits/rejected": -2.3509092330932617, "logps/chosen": -237.6903076171875, "logps/rejected": -228.841552734375, "loss": 0.2334, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5688589811325073, "rewards/margins": 3.6319289207458496, "rewards/rejected": -2.063070297241211, "step": 160 }, { "epoch": 2.05607476635514, "grad_norm": 15.458418492004268, "learning_rate": 2.4020802498228334e-07, "logits/chosen": -2.3363451957702637, "logits/rejected": -2.3300869464874268, "logps/chosen": -224.5348663330078, "logps/rejected": -242.59542846679688, "loss": 0.1776, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.179067850112915, "rewards/margins": 3.411412000656128, "rewards/rejected": -2.232343912124634, "step": 165 }, { "epoch": 2.1183800623052957, "grad_norm": 18.095729715855835, "learning_rate": 2.1165983894256646e-07, "logits/chosen": -2.354485034942627, "logits/rejected": -2.313121795654297, "logps/chosen": -241.8025665283203, "logps/rejected": -231.444580078125, "loss": 0.1694, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.9406883716583252, "rewards/margins": 3.57853627204895, "rewards/rejected": -1.637847900390625, "step": 170 }, { "epoch": 2.1806853582554515, "grad_norm": 15.90752453900369, "learning_rate": 1.8445602783697373e-07, "logits/chosen": -2.3171322345733643, "logits/rejected": -2.33518648147583, "logps/chosen": -234.36575317382812, "logps/rejected": -217.76229858398438, "loss": 0.1457, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.836796522140503, "rewards/margins": 3.68644380569458, "rewards/rejected": -1.8496471643447876, "step": 175 }, { "epoch": 2.2429906542056073, "grad_norm": 13.872989061840107, "learning_rate": 1.5872342839067304e-07, "logits/chosen": -2.3331127166748047, "logits/rejected": -2.3253281116485596, "logps/chosen": -236.17605590820312, "logps/rejected": -230.16336059570312, "loss": 0.1305, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 2.0620455741882324, "rewards/margins": 4.138816833496094, "rewards/rejected": -2.076770544052124, "step": 180 }, { "epoch": 2.305295950155763, "grad_norm": 16.862615654071075, "learning_rate": 1.3458201786093794e-07, "logits/chosen": -2.3675315380096436, "logits/rejected": -2.361820697784424, "logps/chosen": -233.4048614501953, "logps/rejected": -234.2860870361328, "loss": 0.1533, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 2.2529256343841553, "rewards/margins": 4.307948112487793, "rewards/rejected": -2.055023193359375, "step": 185 }, { "epoch": 2.367601246105919, "grad_norm": 18.352780454288954, "learning_rate": 1.1214435464779003e-07, "logits/chosen": -2.3591573238372803, "logits/rejected": -2.358222246170044, "logps/chosen": -261.05035400390625, "logps/rejected": -228.7847900390625, "loss": 0.1603, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.2873449325561523, "rewards/margins": 4.070608139038086, "rewards/rejected": -1.7832629680633545, "step": 190 }, { "epoch": 2.4299065420560746, "grad_norm": 15.489042019269524, "learning_rate": 9.1515053494779e-08, "logits/chosen": -2.3544795513153076, "logits/rejected": -2.3470654487609863, "logps/chosen": -261.56707763671875, "logps/rejected": -247.87741088867188, "loss": 0.1255, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 2.4453256130218506, "rewards/margins": 4.215699672698975, "rewards/rejected": -1.7703742980957031, "step": 195 }, { "epoch": 2.4922118380062304, "grad_norm": 15.905641597447591, "learning_rate": 7.279029772675571e-08, "logits/chosen": -2.3422131538391113, "logits/rejected": -2.34535813331604, "logps/chosen": -261.5428466796875, "logps/rejected": -238.24478149414062, "loss": 0.1415, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.112720012664795, "rewards/margins": 4.672650337219238, "rewards/rejected": -2.5599300861358643, "step": 200 }, { "epoch": 2.4922118380062304, "eval_logits/chosen": -2.377676486968994, "eval_logits/rejected": -2.3378918170928955, "eval_logps/chosen": -267.9961242675781, "eval_logps/rejected": -234.715087890625, "eval_loss": 0.532503604888916, "eval_rewards/accuracies": 0.8090277910232544, "eval_rewards/chosen": 1.3080248832702637, "eval_rewards/margins": 2.575108766555786, "eval_rewards/rejected": -1.267083764076233, "eval_runtime": 151.6115, "eval_samples_per_second": 15.038, "eval_steps_per_second": 0.237, "step": 200 }, { "epoch": 2.554517133956386, "grad_norm": 16.249183319695316, "learning_rate": 5.605739079881239e-08, "logits/chosen": -2.3523454666137695, "logits/rejected": -2.3391835689544678, "logps/chosen": -247.360107421875, "logps/rejected": -231.1007843017578, "loss": 0.1524, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.8860536813735962, "rewards/margins": 4.352172374725342, "rewards/rejected": -2.466118574142456, "step": 205 }, { "epoch": 2.616822429906542, "grad_norm": 15.111045956204817, "learning_rate": 4.139434924727358e-08, "logits/chosen": -2.333988666534424, "logits/rejected": -2.3273518085479736, "logps/chosen": -238.6435089111328, "logps/rejected": -244.59951782226562, "loss": 0.1293, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.8741600513458252, "rewards/margins": 4.382053375244141, "rewards/rejected": -2.5078930854797363, "step": 210 }, { "epoch": 2.6791277258566977, "grad_norm": 20.15600996108675, "learning_rate": 2.88695389405898e-08, "logits/chosen": -2.346717357635498, "logits/rejected": -2.3138866424560547, "logps/chosen": -252.88015747070312, "logps/rejected": -246.45449829101562, "loss": 0.1573, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.8761266469955444, "rewards/margins": 4.2409467697143555, "rewards/rejected": -2.3648200035095215, "step": 215 }, { "epoch": 2.7414330218068534, "grad_norm": 17.558745126946615, "learning_rate": 1.8541356326100433e-08, "logits/chosen": -2.336160898208618, "logits/rejected": -2.306347370147705, "logps/chosen": -264.2869567871094, "logps/rejected": -244.0496826171875, "loss": 0.1516, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 2.0822863578796387, "rewards/margins": 4.703708171844482, "rewards/rejected": -2.6214218139648438, "step": 220 }, { "epoch": 2.803738317757009, "grad_norm": 11.413996749154455, "learning_rate": 1.0457956158838544e-08, "logits/chosen": -2.3174614906311035, "logits/rejected": -2.295732021331787, "logps/chosen": -244.0086669921875, "logps/rejected": -221.67776489257812, "loss": 0.133, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.9246057271957397, "rewards/margins": 4.322554588317871, "rewards/rejected": -2.3979482650756836, "step": 225 }, { "epoch": 2.866043613707165, "grad_norm": 24.637369187556377, "learning_rate": 4.657026981834622e-09, "logits/chosen": -2.326481819152832, "logits/rejected": -2.330221652984619, "logps/chosen": -233.2654571533203, "logps/rejected": -257.41412353515625, "loss": 0.1537, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7080739736557007, "rewards/margins": 4.544651508331299, "rewards/rejected": -2.8365769386291504, "step": 230 }, { "epoch": 2.9283489096573208, "grad_norm": 16.185060485352604, "learning_rate": 1.165615404730369e-09, "logits/chosen": -2.324538469314575, "logits/rejected": -2.2891266345977783, "logps/chosen": -235.43801879882812, "logps/rejected": -247.25717163085938, "loss": 0.1482, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 2.030827522277832, "rewards/margins": 5.017447471618652, "rewards/rejected": -2.9866199493408203, "step": 235 }, { "epoch": 2.9906542056074765, "grad_norm": 14.802366180254003, "learning_rate": 0.0, "logits/chosen": -2.3141098022460938, "logits/rejected": -2.321915626525879, "logps/chosen": -221.14602661132812, "logps/rejected": -229.0386505126953, "loss": 0.1532, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.363684058189392, "rewards/margins": 3.9382190704345703, "rewards/rejected": -2.5745348930358887, "step": 240 } ], "logging_steps": 5, "max_steps": 240, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2829829665718272.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }