{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.981366459627329, "eval_steps": 50, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12422360248447205, "grad_norm": 66.62363784407601, "learning_rate": 5e-07, "logits/chosen": -2.737199068069458, "logits/rejected": -2.716709613800049, "logps/chosen": -271.75335693359375, "logps/rejected": -215.45742797851562, "loss": 0.6894, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": 0.016401026397943497, "rewards/margins": 0.012943076901137829, "rewards/rejected": 0.0034579492639750242, "step": 5 }, { "epoch": 0.2484472049689441, "grad_norm": 48.6825109729797, "learning_rate": 1e-06, "logits/chosen": -2.673478364944458, "logits/rejected": -2.6825709342956543, "logps/chosen": -274.2002258300781, "logps/rejected": -210.3507537841797, "loss": 0.6196, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.5796890258789062, "rewards/margins": 0.2897328734397888, "rewards/rejected": 0.28995609283447266, "step": 10 }, { "epoch": 0.37267080745341613, "grad_norm": 46.74043921332661, "learning_rate": 9.949107209404663e-07, "logits/chosen": -2.474240303039551, "logits/rejected": -2.441648006439209, "logps/chosen": -231.01181030273438, "logps/rejected": -208.47927856445312, "loss": 0.6484, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.5896726846694946, "rewards/margins": 0.8002876043319702, "rewards/rejected": 0.7893850207328796, "step": 15 }, { "epoch": 0.4968944099378882, "grad_norm": 51.05927483012936, "learning_rate": 9.797464868072486e-07, "logits/chosen": -2.29701566696167, "logits/rejected": -2.2914681434631348, "logps/chosen": -262.2198181152344, "logps/rejected": -218.2954864501953, "loss": 0.6125, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 1.8730179071426392, "rewards/margins": 1.2619432210922241, "rewards/rejected": 0.6110745668411255, "step": 20 }, { "epoch": 0.6211180124223602, "grad_norm": 53.85418375459847, "learning_rate": 9.548159976772592e-07, "logits/chosen": -2.2311034202575684, "logits/rejected": -2.2267913818359375, "logps/chosen": -259.2829895019531, "logps/rejected": -203.93215942382812, "loss": 0.581, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 1.702134370803833, "rewards/margins": 1.3699487447738647, "rewards/rejected": 0.3321855068206787, "step": 25 }, { "epoch": 0.7453416149068323, "grad_norm": 41.921620963182335, "learning_rate": 9.206267664155906e-07, "logits/chosen": -2.2750821113586426, "logits/rejected": -2.2540435791015625, "logps/chosen": -252.75881958007812, "logps/rejected": -227.4959716796875, "loss": 0.5342, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 1.7333396673202515, "rewards/margins": 1.6640625, "rewards/rejected": 0.06927712261676788, "step": 30 }, { "epoch": 0.8695652173913043, "grad_norm": 36.22502143325224, "learning_rate": 8.778747871771291e-07, "logits/chosen": -2.3146414756774902, "logits/rejected": -2.2875092029571533, "logps/chosen": -263.4291687011719, "logps/rejected": -221.8418731689453, "loss": 0.5177, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.4538803100585938, "rewards/margins": 1.299953579902649, "rewards/rejected": 0.153926819562912, "step": 35 }, { "epoch": 0.9937888198757764, "grad_norm": 45.69470972967703, "learning_rate": 8.274303669726426e-07, "logits/chosen": -2.289536714553833, "logits/rejected": -2.2876548767089844, "logps/chosen": -274.63818359375, "logps/rejected": -218.11038208007812, "loss": 0.5869, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 1.579737901687622, "rewards/margins": 1.5463366508483887, "rewards/rejected": 0.03340107947587967, "step": 40 }, { "epoch": 1.1180124223602483, "grad_norm": 25.091084074273006, "learning_rate": 7.703204087277988e-07, "logits/chosen": -2.2782795429229736, "logits/rejected": -2.2675344944000244, "logps/chosen": -241.4102020263672, "logps/rejected": -232.1530303955078, "loss": 0.2415, "rewards/accuracies": 0.90625, "rewards/chosen": 2.0038599967956543, "rewards/margins": 2.8345634937286377, "rewards/rejected": -0.8307037353515625, "step": 45 }, { "epoch": 1.2422360248447206, "grad_norm": 21.01590218402833, "learning_rate": 7.077075065009433e-07, "logits/chosen": -2.304749011993408, "logits/rejected": -2.2815442085266113, "logps/chosen": -248.9634552001953, "logps/rejected": -262.0959777832031, "loss": 0.2594, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.365910053253174, "rewards/margins": 3.3114895820617676, "rewards/rejected": -0.9455796480178833, "step": 50 }, { "epoch": 1.2422360248447206, "eval_logits/chosen": -2.2956461906433105, "eval_logits/rejected": -2.294363498687744, "eval_logps/chosen": -252.96337890625, "eval_logps/rejected": -231.52207946777344, "eval_loss": 0.5901808142662048, "eval_rewards/accuracies": 0.7986111044883728, "eval_rewards/chosen": 1.6738612651824951, "eval_rewards/margins": 1.8752751350402832, "eval_rewards/rejected": -0.20141386985778809, "eval_runtime": 75.2847, "eval_samples_per_second": 15.143, "eval_steps_per_second": 0.239, "step": 50 }, { "epoch": 1.3664596273291925, "grad_norm": 18.646514200070648, "learning_rate": 6.408662784207149e-07, "logits/chosen": -2.307347297668457, "logits/rejected": -2.263925075531006, "logps/chosen": -249.4068145751953, "logps/rejected": -214.16445922851562, "loss": 0.2176, "rewards/accuracies": 0.90625, "rewards/chosen": 2.3753602504730225, "rewards/margins": 3.139112949371338, "rewards/rejected": -0.7637524008750916, "step": 55 }, { "epoch": 1.4906832298136645, "grad_norm": 20.993852286888092, "learning_rate": 5.711574191366427e-07, "logits/chosen": -2.32312273979187, "logits/rejected": -2.313824415206909, "logps/chosen": -243.2293243408203, "logps/rejected": -206.0082550048828, "loss": 0.2417, "rewards/accuracies": 0.9375, "rewards/chosen": 2.353982448577881, "rewards/margins": 3.16640043258667, "rewards/rejected": -0.8124181032180786, "step": 60 }, { "epoch": 1.6149068322981366, "grad_norm": 19.030512980103158, "learning_rate": 5e-07, "logits/chosen": -2.377202272415161, "logits/rejected": -2.331650495529175, "logps/chosen": -248.04483032226562, "logps/rejected": -222.1618194580078, "loss": 0.2488, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.3110172748565674, "rewards/margins": 3.183046579360962, "rewards/rejected": -0.8720294833183289, "step": 65 }, { "epoch": 1.7391304347826086, "grad_norm": 28.536465499864658, "learning_rate": 4.2884258086335745e-07, "logits/chosen": -2.3466696739196777, "logits/rejected": -2.3642795085906982, "logps/chosen": -262.2099304199219, "logps/rejected": -250.7125701904297, "loss": 0.3053, "rewards/accuracies": 0.90625, "rewards/chosen": 2.6217122077941895, "rewards/margins": 3.932690382003784, "rewards/rejected": -1.3109780550003052, "step": 70 }, { "epoch": 1.8633540372670807, "grad_norm": 27.768310670938217, "learning_rate": 3.591337215792851e-07, "logits/chosen": -2.3459136486053467, "logits/rejected": -2.3365659713745117, "logps/chosen": -250.7726593017578, "logps/rejected": -221.8275909423828, "loss": 0.3016, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.6287598609924316, "rewards/margins": 3.746206760406494, "rewards/rejected": -1.117447018623352, "step": 75 }, { "epoch": 1.9875776397515528, "grad_norm": 24.938158584160053, "learning_rate": 2.922924934990568e-07, "logits/chosen": -2.3689913749694824, "logits/rejected": -2.3461122512817383, "logps/chosen": -255.01962280273438, "logps/rejected": -232.1419677734375, "loss": 0.2954, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 3.2598438262939453, "rewards/margins": 4.275376319885254, "rewards/rejected": -1.015533208847046, "step": 80 }, { "epoch": 2.111801242236025, "grad_norm": 22.47951274264517, "learning_rate": 2.2967959127220137e-07, "logits/chosen": -2.3769583702087402, "logits/rejected": -2.313356399536133, "logps/chosen": -245.34432983398438, "logps/rejected": -256.83477783203125, "loss": 0.1701, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.893587112426758, "rewards/margins": 4.111905574798584, "rewards/rejected": -1.218318223953247, "step": 85 }, { "epoch": 2.2360248447204967, "grad_norm": 16.587973457804008, "learning_rate": 1.725696330273575e-07, "logits/chosen": -2.3217408657073975, "logits/rejected": -2.322180986404419, "logps/chosen": -261.08746337890625, "logps/rejected": -253.5300750732422, "loss": 0.1669, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.13687801361084, "rewards/margins": 4.253005027770996, "rewards/rejected": -1.1161267757415771, "step": 90 }, { "epoch": 2.360248447204969, "grad_norm": 14.3562650408135, "learning_rate": 1.2212521282287093e-07, "logits/chosen": -2.2643933296203613, "logits/rejected": -2.2418789863586426, "logps/chosen": -230.6456298828125, "logps/rejected": -250.2694549560547, "loss": 0.1554, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 2.822519302368164, "rewards/margins": 4.386029243469238, "rewards/rejected": -1.5635101795196533, "step": 95 }, { "epoch": 2.4844720496894412, "grad_norm": 18.916632747497697, "learning_rate": 7.937323358440934e-08, "logits/chosen": -2.2358391284942627, "logits/rejected": -2.2123188972473145, "logps/chosen": -245.39889526367188, "logps/rejected": -213.0316925048828, "loss": 0.1658, "rewards/accuracies": 0.96875, "rewards/chosen": 2.8756985664367676, "rewards/margins": 4.066061973571777, "rewards/rejected": -1.1903636455535889, "step": 100 }, { "epoch": 2.4844720496894412, "eval_logits/chosen": -2.241981029510498, "eval_logits/rejected": -2.237220048904419, "eval_logps/chosen": -249.4546661376953, "eval_logps/rejected": -232.948974609375, "eval_loss": 0.5874444842338562, "eval_rewards/accuracies": 0.8333333134651184, "eval_rewards/chosen": 2.0247349739074707, "eval_rewards/margins": 2.3688364028930664, "eval_rewards/rejected": -0.34410178661346436, "eval_runtime": 74.9734, "eval_samples_per_second": 15.205, "eval_steps_per_second": 0.24, "step": 100 }, { "epoch": 2.608695652173913, "grad_norm": 12.416778851124059, "learning_rate": 4.518400232274078e-08, "logits/chosen": -2.229137659072876, "logits/rejected": -2.201681613922119, "logps/chosen": -237.0298309326172, "logps/rejected": -240.43429565429688, "loss": 0.1344, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.9165587425231934, "rewards/margins": 4.170973300933838, "rewards/rejected": -1.2544142007827759, "step": 105 }, { "epoch": 2.732919254658385, "grad_norm": 15.39582049536958, "learning_rate": 2.025351319275137e-08, "logits/chosen": -2.226637840270996, "logits/rejected": -2.1901309490203857, "logps/chosen": -232.8177947998047, "logps/rejected": -221.37646484375, "loss": 0.1763, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 2.9458189010620117, "rewards/margins": 4.043347358703613, "rewards/rejected": -1.0975282192230225, "step": 110 }, { "epoch": 2.857142857142857, "grad_norm": 17.081473104516775, "learning_rate": 5.0892790595336575e-09, "logits/chosen": -2.220303773880005, "logits/rejected": -2.2176926136016846, "logps/chosen": -252.9158172607422, "logps/rejected": -242.60400390625, "loss": 0.1587, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 3.0858166217803955, "rewards/margins": 4.5951924324035645, "rewards/rejected": -1.5093762874603271, "step": 115 }, { "epoch": 2.981366459627329, "grad_norm": 17.254264529974623, "learning_rate": 0.0, "logits/chosen": -2.205538511276245, "logits/rejected": -2.1971898078918457, "logps/chosen": -235.06808471679688, "logps/rejected": -225.6627197265625, "loss": 0.1671, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.8083322048187256, "rewards/margins": 4.552498817443848, "rewards/rejected": -1.7441661357879639, "step": 120 } ], "logging_steps": 5, "max_steps": 120, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1414680891359232.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }