{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.986175115207373, "eval_steps": 40, "global_step": 162, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09216589861751152, "grad_norm": 82.64344716971713, "learning_rate": 5e-07, "logits/chosen": -2.732285976409912, "logits/rejected": -2.7352840900421143, "logps/chosen": -345.11505126953125, "logps/rejected": -257.46209716796875, "loss": 0.6879, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.032340794801712036, "rewards/margins": 0.014306592755019665, "rewards/rejected": 0.018034199252724648, "step": 5 }, { "epoch": 0.18433179723502305, "grad_norm": 55.27766576693423, "learning_rate": 1e-06, "logits/chosen": -2.6805875301361084, "logits/rejected": -2.6747162342071533, "logps/chosen": -333.76953125, "logps/rejected": -232.7953338623047, "loss": 0.5893, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.9853774309158325, "rewards/margins": 0.4447619318962097, "rewards/rejected": 0.540615439414978, "step": 10 }, { "epoch": 0.2764976958525346, "grad_norm": 46.427604431133965, "learning_rate": 9.973324900566213e-07, "logits/chosen": -2.435732364654541, "logits/rejected": -2.423825740814209, "logps/chosen": -310.85443115234375, "logps/rejected": -208.2480926513672, "loss": 0.5442, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 2.8085920810699463, "rewards/margins": 1.5502256155014038, "rewards/rejected": 1.258366346359253, "step": 15 }, { "epoch": 0.3686635944700461, "grad_norm": 39.94842616368073, "learning_rate": 9.893584226636772e-07, "logits/chosen": -2.2710189819335938, "logits/rejected": -2.255997896194458, "logps/chosen": -323.7245788574219, "logps/rejected": -217.50869750976562, "loss": 0.5228, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 3.2350196838378906, "rewards/margins": 1.5412657260894775, "rewards/rejected": 1.6937541961669922, "step": 20 }, { "epoch": 0.4608294930875576, "grad_norm": 48.62034534876174, "learning_rate": 9.761628814374072e-07, "logits/chosen": -2.125060558319092, "logits/rejected": -2.0805716514587402, "logps/chosen": -299.4482421875, "logps/rejected": -230.19662475585938, "loss": 0.4827, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 2.9857540130615234, "rewards/margins": 1.7392915487289429, "rewards/rejected": 1.246462345123291, "step": 25 }, { "epoch": 0.5529953917050692, "grad_norm": 46.06223541077971, "learning_rate": 9.578866633275286e-07, "logits/chosen": -1.9920810461044312, "logits/rejected": -1.9682658910751343, "logps/chosen": -302.1333923339844, "logps/rejected": -240.9350128173828, "loss": 0.5361, "rewards/accuracies": 0.78125, "rewards/chosen": 3.1254260540008545, "rewards/margins": 1.7768207788467407, "rewards/rejected": 1.3486052751541138, "step": 30 }, { "epoch": 0.6451612903225806, "grad_norm": 43.606270955387636, "learning_rate": 9.347247763081833e-07, "logits/chosen": -1.911285638809204, "logits/rejected": -1.8874857425689697, "logps/chosen": -318.9488525390625, "logps/rejected": -232.88931274414062, "loss": 0.446, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 3.207639217376709, "rewards/margins": 2.153695583343506, "rewards/rejected": 1.0539438724517822, "step": 35 }, { "epoch": 0.7373271889400922, "grad_norm": 39.922691563648584, "learning_rate": 9.069243586350975e-07, "logits/chosen": -1.8659719228744507, "logits/rejected": -1.837323546409607, "logps/chosen": -316.7166748046875, "logps/rejected": -247.98440551757812, "loss": 0.5108, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 2.811978816986084, "rewards/margins": 1.7324354648590088, "rewards/rejected": 1.0795437097549438, "step": 40 }, { "epoch": 0.7373271889400922, "eval_logits/chosen": -1.8276515007019043, "eval_logits/rejected": -1.8153278827667236, "eval_logps/chosen": -300.355224609375, "eval_logps/rejected": -250.92237854003906, "eval_loss": 0.49955785274505615, "eval_rewards/accuracies": 0.800000011920929, "eval_rewards/chosen": 2.801389455795288, "eval_rewards/margins": 2.0170528888702393, "eval_rewards/rejected": 0.7843364477157593, "eval_runtime": 105.9206, "eval_samples_per_second": 14.511, "eval_steps_per_second": 0.236, "step": 40 }, { "epoch": 0.8294930875576036, "grad_norm": 37.49193294339852, "learning_rate": 8.74782041870563e-07, "logits/chosen": -1.816886305809021, "logits/rejected": -1.783071517944336, "logps/chosen": -297.0343017578125, "logps/rejected": -222.45816040039062, "loss": 0.459, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 2.688863515853882, "rewards/margins": 2.0437610149383545, "rewards/rejected": 0.6451026201248169, "step": 45 }, { "epoch": 0.9216589861751152, "grad_norm": 43.56619989347279, "learning_rate": 8.386407858128706e-07, "logits/chosen": -1.8166990280151367, "logits/rejected": -1.7925169467926025, "logps/chosen": -314.9337158203125, "logps/rejected": -244.59609985351562, "loss": 0.4783, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 2.7192726135253906, "rewards/margins": 1.5147265195846558, "rewards/rejected": 1.2045462131500244, "step": 50 }, { "epoch": 1.0138248847926268, "grad_norm": 22.442449713126035, "learning_rate": 7.988862191016203e-07, "logits/chosen": -1.842585802078247, "logits/rejected": -1.831575632095337, "logps/chosen": -302.282958984375, "logps/rejected": -239.08016967773438, "loss": 0.3881, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 2.8818259239196777, "rewards/margins": 2.1279635429382324, "rewards/rejected": 0.7538624405860901, "step": 55 }, { "epoch": 1.1059907834101383, "grad_norm": 15.231974843280263, "learning_rate": 7.559425245448005e-07, "logits/chosen": -1.9866081476211548, "logits/rejected": -1.9679405689239502, "logps/chosen": -297.02923583984375, "logps/rejected": -269.2718200683594, "loss": 0.1072, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.7900550365448, "rewards/margins": 4.232701301574707, "rewards/rejected": -0.4426456391811371, "step": 60 }, { "epoch": 1.1981566820276497, "grad_norm": 18.307111489589293, "learning_rate": 7.102679130713537e-07, "logits/chosen": -2.104555130004883, "logits/rejected": -2.1196866035461426, "logps/chosen": -333.6317138671875, "logps/rejected": -247.2637939453125, "loss": 0.1207, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.633069038391113, "rewards/margins": 4.566880226135254, "rewards/rejected": 0.06618879735469818, "step": 65 }, { "epoch": 1.2903225806451613, "grad_norm": 21.546461613665333, "learning_rate": 6.623497346023417e-07, "logits/chosen": -2.1033987998962402, "logits/rejected": -2.08524751663208, "logps/chosen": -305.8551940917969, "logps/rejected": -242.0274200439453, "loss": 0.134, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.447857856750488, "rewards/margins": 4.793730735778809, "rewards/rejected": -0.34587258100509644, "step": 70 }, { "epoch": 1.3824884792626728, "grad_norm": 18.32116519016538, "learning_rate": 6.126992780079031e-07, "logits/chosen": -1.9051921367645264, "logits/rejected": -1.8795219659805298, "logps/chosen": -289.46319580078125, "logps/rejected": -261.77264404296875, "loss": 0.1416, "rewards/accuracies": 0.96875, "rewards/chosen": 3.8727810382843018, "rewards/margins": 4.719055652618408, "rewards/rejected": -0.846274733543396, "step": 75 }, { "epoch": 1.4746543778801844, "grad_norm": 20.574623734909377, "learning_rate": 5.618463156346739e-07, "logits/chosen": -1.8352515697479248, "logits/rejected": -1.7718498706817627, "logps/chosen": -299.86773681640625, "logps/rejected": -235.0732421875, "loss": 0.1548, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 3.8516736030578613, "rewards/margins": 4.764138221740723, "rewards/rejected": -0.9124643206596375, "step": 80 }, { "epoch": 1.4746543778801844, "eval_logits/chosen": -1.7812533378601074, "eval_logits/rejected": -1.7896403074264526, "eval_logps/chosen": -302.47705078125, "eval_logps/rejected": -261.8770446777344, "eval_loss": 0.5515583157539368, "eval_rewards/accuracies": 0.8149999976158142, "eval_rewards/chosen": 2.5892090797424316, "eval_rewards/margins": 2.9003350734710693, "eval_rewards/rejected": -0.3111259341239929, "eval_runtime": 104.7508, "eval_samples_per_second": 14.673, "eval_steps_per_second": 0.239, "step": 80 }, { "epoch": 1.5668202764976957, "grad_norm": 16.930391905891195, "learning_rate": 5.103334506137772e-07, "logits/chosen": -1.8332993984222412, "logits/rejected": -1.7760483026504517, "logps/chosen": -305.48089599609375, "logps/rejected": -244.80807495117188, "loss": 0.1306, "rewards/accuracies": 0.96875, "rewards/chosen": 3.6695034503936768, "rewards/margins": 4.8659348487854, "rewards/rejected": -1.1964311599731445, "step": 85 }, { "epoch": 1.6589861751152073, "grad_norm": 21.11449418778823, "learning_rate": 4.5871032726383385e-07, "logits/chosen": -1.8413625955581665, "logits/rejected": -1.817728042602539, "logps/chosen": -304.38458251953125, "logps/rejected": -250.87478637695312, "loss": 0.177, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.8436717987060547, "rewards/margins": 5.000934600830078, "rewards/rejected": -1.1572625637054443, "step": 90 }, { "epoch": 1.7511520737327189, "grad_norm": 25.731965158002115, "learning_rate": 4.075277663642208e-07, "logits/chosen": -1.8851152658462524, "logits/rejected": -1.852927565574646, "logps/chosen": -295.5654296875, "logps/rejected": -266.7487487792969, "loss": 0.1983, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.588088274002075, "rewards/margins": 4.905179023742676, "rewards/rejected": -1.3170902729034424, "step": 95 }, { "epoch": 1.8433179723502304, "grad_norm": 21.671116502246942, "learning_rate": 3.5733188787544746e-07, "logits/chosen": -1.9296363592147827, "logits/rejected": -1.9144790172576904, "logps/chosen": -288.9561462402344, "logps/rejected": -262.56787109375, "loss": 0.158, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.9653868675231934, "rewards/margins": 4.910740852355957, "rewards/rejected": -0.9453536868095398, "step": 100 }, { "epoch": 1.935483870967742, "grad_norm": 23.77879577555757, "learning_rate": 3.086582838174551e-07, "logits/chosen": -2.025709390640259, "logits/rejected": -1.9932079315185547, "logps/chosen": -320.6540832519531, "logps/rejected": -243.01376342773438, "loss": 0.1593, "rewards/accuracies": 0.96875, "rewards/chosen": 4.19329309463501, "rewards/margins": 4.572449684143066, "rewards/rejected": -0.37915733456611633, "step": 105 }, { "epoch": 2.0276497695852536, "grad_norm": 7.998912412806614, "learning_rate": 2.620263034814632e-07, "logits/chosen": -2.079678773880005, "logits/rejected": -2.073202133178711, "logps/chosen": -289.73419189453125, "logps/rejected": -268.50738525390625, "loss": 0.109, "rewards/accuracies": 0.96875, "rewards/chosen": 4.6027398109436035, "rewards/margins": 5.521824359893799, "rewards/rejected": -0.9190845489501953, "step": 110 }, { "epoch": 2.119815668202765, "grad_norm": 11.517879985604935, "learning_rate": 2.1793351195237446e-07, "logits/chosen": -2.0473215579986572, "logits/rejected": -2.0440163612365723, "logps/chosen": -293.99163818359375, "logps/rejected": -247.82156372070312, "loss": 0.0524, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 4.580502510070801, "rewards/margins": 6.078469753265381, "rewards/rejected": -1.4979677200317383, "step": 115 }, { "epoch": 2.2119815668202767, "grad_norm": 10.044969791434443, "learning_rate": 1.768503810695295e-07, "logits/chosen": -2.0370969772338867, "logits/rejected": -2.016648769378662, "logps/chosen": -296.79254150390625, "logps/rejected": -247.7318878173828, "loss": 0.0739, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 4.419327735900879, "rewards/margins": 5.548590183258057, "rewards/rejected": -1.1292626857757568, "step": 120 }, { "epoch": 2.2119815668202767, "eval_logits/chosen": -1.9930505752563477, "eval_logits/rejected": -2.023695707321167, "eval_logps/chosen": -299.45892333984375, "eval_logps/rejected": -260.5440673828125, "eval_loss": 0.5419167876243591, "eval_rewards/accuracies": 0.8149999976158142, "eval_rewards/chosen": 2.891019821166992, "eval_rewards/margins": 3.0688512325286865, "eval_rewards/rejected": -0.17783160507678986, "eval_runtime": 104.7566, "eval_samples_per_second": 14.672, "eval_steps_per_second": 0.239, "step": 120 }, { "epoch": 2.3041474654377883, "grad_norm": 9.632925545018697, "learning_rate": 1.3921526947346901e-07, "logits/chosen": -1.9562991857528687, "logits/rejected": -1.9737581014633179, "logps/chosen": -285.3409729003906, "logps/rejected": -246.0544891357422, "loss": 0.0495, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 4.24416971206665, "rewards/margins": 5.664097785949707, "rewards/rejected": -1.419929027557373, "step": 125 }, { "epoch": 2.3963133640552994, "grad_norm": 13.62582621522216, "learning_rate": 1.0542974530180327e-07, "logits/chosen": -1.9481559991836548, "logits/rejected": -1.9454014301300049, "logps/chosen": -314.75067138671875, "logps/rejected": -236.7272186279297, "loss": 0.0637, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 4.275069236755371, "rewards/margins": 5.598044395446777, "rewards/rejected": -1.3229751586914062, "step": 130 }, { "epoch": 2.488479262672811, "grad_norm": 7.059837317710805, "learning_rate": 7.585430144121318e-08, "logits/chosen": -1.8857179880142212, "logits/rejected": -1.8879835605621338, "logps/chosen": -294.124755859375, "logps/rejected": -270.05633544921875, "loss": 0.0368, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 4.17112922668457, "rewards/margins": 5.703551292419434, "rewards/rejected": -1.532421350479126, "step": 135 }, { "epoch": 2.5806451612903225, "grad_norm": 14.302811017421536, "learning_rate": 5.080450905401057e-08, "logits/chosen": -1.9461250305175781, "logits/rejected": -1.869018793106079, "logps/chosen": -277.7373046875, "logps/rejected": -269.21160888671875, "loss": 0.0801, "rewards/accuracies": 0.96875, "rewards/chosen": 4.184547424316406, "rewards/margins": 5.217525482177734, "rewards/rejected": -1.0329779386520386, "step": 140 }, { "epoch": 2.672811059907834, "grad_norm": 10.869742441859588, "learning_rate": 3.054765042128521e-08, "logits/chosen": -1.9559131860733032, "logits/rejected": -1.9399007558822632, "logps/chosen": -302.7723693847656, "logps/rejected": -274.4209289550781, "loss": 0.0562, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 4.239922523498535, "rewards/margins": 5.70468282699585, "rewards/rejected": -1.4647598266601562, "step": 145 }, { "epoch": 2.7649769585253456, "grad_norm": 21.93426765644073, "learning_rate": 1.5299867030334813e-08, "logits/chosen": -1.9453132152557373, "logits/rejected": -1.9195010662078857, "logps/chosen": -313.5506286621094, "logps/rejected": -258.9598693847656, "loss": 0.0751, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 4.334193229675293, "rewards/margins": 5.6270341873168945, "rewards/rejected": -1.292839765548706, "step": 150 }, { "epoch": 2.857142857142857, "grad_norm": 13.389771777225755, "learning_rate": 5.223853336398632e-09, "logits/chosen": -1.9442565441131592, "logits/rejected": -1.934654951095581, "logps/chosen": -288.51190185546875, "logps/rejected": -266.2658996582031, "loss": 0.0671, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 4.63767671585083, "rewards/margins": 5.82180643081665, "rewards/rejected": -1.1841299533843994, "step": 155 }, { "epoch": 2.9493087557603688, "grad_norm": 15.653208968679852, "learning_rate": 4.271208063494902e-10, "logits/chosen": -1.9627739191055298, "logits/rejected": -1.9069459438323975, "logps/chosen": -293.91802978515625, "logps/rejected": -261.7667541503906, "loss": 0.0785, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 4.489874839782715, "rewards/margins": 5.796733379364014, "rewards/rejected": -1.3068585395812988, "step": 160 }, { "epoch": 2.9493087557603688, "eval_logits/chosen": -1.9315097332000732, "eval_logits/rejected": -1.9545789957046509, "eval_logps/chosen": -298.7916564941406, "eval_logps/rejected": -260.6390075683594, "eval_loss": 0.5450286269187927, "eval_rewards/accuracies": 0.824999988079071, "eval_rewards/chosen": 2.957746982574463, "eval_rewards/margins": 3.1450705528259277, "eval_rewards/rejected": -0.18732379376888275, "eval_runtime": 104.7018, "eval_samples_per_second": 14.68, "eval_steps_per_second": 0.239, "step": 160 }, { "epoch": 2.986175115207373, "step": 162, "total_flos": 1909982962384896.0, "train_loss": 0.24334129419775657, "train_runtime": 6038.8466, "train_samples_per_second": 6.87, "train_steps_per_second": 0.027 } ], "logging_steps": 5, "max_steps": 162, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1909982962384896.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }