{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7246376811594203, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.036231884057971016, "grad_norm": 61.75757328159282, "learning_rate": 5e-07, "logits/chosen": -2.732090473175049, "logits/rejected": -2.7100460529327393, "logps/chosen": -182.59107971191406, "logps/rejected": -189.5584716796875, "loss": 0.6889, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.00281245238147676, "rewards/margins": 0.0058334446512162685, "rewards/rejected": -0.008645896799862385, "step": 5 }, { "epoch": 0.07246376811594203, "grad_norm": 44.951594498596215, "learning_rate": 1e-06, "logits/chosen": -2.754081964492798, "logits/rejected": -2.752152919769287, "logps/chosen": -197.337158203125, "logps/rejected": -184.00933837890625, "loss": 0.6274, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.03147688880562782, "rewards/margins": 0.1896156221628189, "rewards/rejected": -0.1581387221813202, "step": 10 }, { "epoch": 0.10869565217391304, "grad_norm": 51.34158391398985, "learning_rate": 9.996221126793764e-07, "logits/chosen": -2.694983959197998, "logits/rejected": -2.692361831665039, "logps/chosen": -203.20387268066406, "logps/rejected": -204.64244079589844, "loss": 0.5838, "rewards/accuracies": 0.75, "rewards/chosen": 0.6150370836257935, "rewards/margins": 0.9413955807685852, "rewards/rejected": -0.32635849714279175, "step": 15 }, { "epoch": 0.14492753623188406, "grad_norm": 34.76477183019994, "learning_rate": 9.984890219128145e-07, "logits/chosen": -2.612672805786133, "logits/rejected": -2.5829074382781982, "logps/chosen": -188.62716674804688, "logps/rejected": -192.87452697753906, "loss": 0.5142, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.8545471429824829, "rewards/margins": 1.280996561050415, "rewards/rejected": -0.4264492094516754, "step": 20 }, { "epoch": 0.18115942028985507, "grad_norm": 36.75278346647978, "learning_rate": 9.966024404228493e-07, "logits/chosen": -2.450106143951416, "logits/rejected": -2.4297895431518555, "logps/chosen": -179.98348999023438, "logps/rejected": -179.38925170898438, "loss": 0.5032, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.38695499300956726, "rewards/margins": 0.8900691866874695, "rewards/rejected": -0.5031141638755798, "step": 25 }, { "epoch": 0.21739130434782608, "grad_norm": 31.781918105397544, "learning_rate": 9.939652198703783e-07, "logits/chosen": -2.324214458465576, "logits/rejected": -2.325657367706299, "logps/chosen": -188.5428466796875, "logps/rejected": -193.8271942138672, "loss": 0.4995, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.6558719873428345, "rewards/margins": 1.2207121849060059, "rewards/rejected": -0.5648401975631714, "step": 30 }, { "epoch": 0.2536231884057971, "grad_norm": 39.36776247005876, "learning_rate": 9.905813465442354e-07, "logits/chosen": -2.236240863800049, "logits/rejected": -2.2105681896209717, "logps/chosen": -203.98277282714844, "logps/rejected": -194.84640502929688, "loss": 0.5091, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8834564089775085, "rewards/margins": 1.2675695419311523, "rewards/rejected": -0.3841131329536438, "step": 35 }, { "epoch": 0.2898550724637681, "grad_norm": 30.817630358317576, "learning_rate": 9.864559353357187e-07, "logits/chosen": -2.068774700164795, "logits/rejected": -2.0603950023651123, "logps/chosen": -182.76817321777344, "logps/rejected": -185.9797821044922, "loss": 0.4873, "rewards/accuracies": 0.78125, "rewards/chosen": 1.03325617313385, "rewards/margins": 1.0384714603424072, "rewards/rejected": -0.005215352866798639, "step": 40 }, { "epoch": 0.32608695652173914, "grad_norm": 29.09268118121073, "learning_rate": 9.815952220071804e-07, "logits/chosen": -1.8718488216400146, "logits/rejected": -1.8250553607940674, "logps/chosen": -195.60968017578125, "logps/rejected": -221.5565643310547, "loss": 0.4597, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3850222826004028, "rewards/margins": 1.8469291925430298, "rewards/rejected": -0.4619070589542389, "step": 45 }, { "epoch": 0.36231884057971014, "grad_norm": 29.526743630011346, "learning_rate": 9.76006553766365e-07, "logits/chosen": -1.653713583946228, "logits/rejected": -1.6171553134918213, "logps/chosen": -198.85989379882812, "logps/rejected": -203.60678100585938, "loss": 0.4516, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.8292047381401062, "rewards/margins": 1.6851797103881836, "rewards/rejected": -0.8559748530387878, "step": 50 }, { "epoch": 0.36231884057971014, "eval_logits/chosen": -1.7065542936325073, "eval_logits/rejected": -1.630993127822876, "eval_logps/chosen": -192.20655822753906, "eval_logps/rejected": -206.51295471191406, "eval_loss": 0.4420754015445709, "eval_rewards/accuracies": 0.7903226017951965, "eval_rewards/chosen": 0.8112886548042297, "eval_rewards/margins": 1.641775369644165, "eval_rewards/rejected": -0.8304866552352905, "eval_runtime": 247.7543, "eval_samples_per_second": 15.83, "eval_steps_per_second": 0.25, "step": 50 }, { "epoch": 0.39855072463768115, "grad_norm": 30.94859785748943, "learning_rate": 9.696983781607415e-07, "logits/chosen": -1.7253024578094482, "logits/rejected": -1.6905288696289062, "logps/chosen": -182.9173126220703, "logps/rejected": -171.9159698486328, "loss": 0.4573, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.6172864437103271, "rewards/margins": 1.648385763168335, "rewards/rejected": -1.031099557876587, "step": 55 }, { "epoch": 0.43478260869565216, "grad_norm": 40.75469044830845, "learning_rate": 9.626802303086209e-07, "logits/chosen": -1.87893807888031, "logits/rejected": -1.8299003839492798, "logps/chosen": -186.30145263671875, "logps/rejected": -193.9145965576172, "loss": 0.4264, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.4657188057899475, "rewards/margins": 1.7288262844085693, "rewards/rejected": -1.2631075382232666, "step": 60 }, { "epoch": 0.47101449275362317, "grad_norm": 35.556274541495966, "learning_rate": 9.549627184863528e-07, "logits/chosen": -2.016784906387329, "logits/rejected": -1.9150521755218506, "logps/chosen": -191.3840789794922, "logps/rejected": -192.66639709472656, "loss": 0.4289, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.0974711924791336, "rewards/margins": 1.6010549068450928, "rewards/rejected": -1.5035837888717651, "step": 65 }, { "epoch": 0.5072463768115942, "grad_norm": 26.46585227154451, "learning_rate": 9.465575080933957e-07, "logits/chosen": -1.853308916091919, "logits/rejected": -1.7947351932525635, "logps/chosen": -172.3099822998047, "logps/rejected": -208.057373046875, "loss": 0.3948, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.21238946914672852, "rewards/margins": 1.8403332233428955, "rewards/rejected": -1.627943754196167, "step": 70 }, { "epoch": 0.5434782608695652, "grad_norm": 31.533541728553253, "learning_rate": 9.374773040194878e-07, "logits/chosen": -1.8850362300872803, "logits/rejected": -1.8103622198104858, "logps/chosen": -205.5053253173828, "logps/rejected": -210.96981811523438, "loss": 0.4364, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.17310531437397003, "rewards/margins": 1.8103282451629639, "rewards/rejected": -1.6372228860855103, "step": 75 }, { "epoch": 0.5797101449275363, "grad_norm": 29.780905727815526, "learning_rate": 9.277358314405818e-07, "logits/chosen": -1.7906593084335327, "logits/rejected": -1.742597222328186, "logps/chosen": -188.9757080078125, "logps/rejected": -205.398193359375, "loss": 0.3987, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.19728976488113403, "rewards/margins": 1.879663109779358, "rewards/rejected": -2.0769529342651367, "step": 80 }, { "epoch": 0.6159420289855072, "grad_norm": 34.4646468352745, "learning_rate": 9.173478150725651e-07, "logits/chosen": -1.7377640008926392, "logits/rejected": -1.6257518529891968, "logps/chosen": -210.00320434570312, "logps/rejected": -215.84835815429688, "loss": 0.4258, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.08612661063671112, "rewards/margins": 2.4435980319976807, "rewards/rejected": -2.357471227645874, "step": 85 }, { "epoch": 0.6521739130434783, "grad_norm": 29.12537980218493, "learning_rate": 9.063289569141251e-07, "logits/chosen": -1.7976572513580322, "logits/rejected": -1.739854097366333, "logps/chosen": -214.8435821533203, "logps/rejected": -224.52005004882812, "loss": 0.4147, "rewards/accuracies": 0.84375, "rewards/chosen": 0.46363013982772827, "rewards/margins": 2.330965518951416, "rewards/rejected": -1.867335557937622, "step": 90 }, { "epoch": 0.6884057971014492, "grad_norm": 35.00421638148543, "learning_rate": 8.946959125124051e-07, "logits/chosen": -1.861108422279358, "logits/rejected": -1.780923843383789, "logps/chosen": -207.5733184814453, "logps/rejected": -193.34400939941406, "loss": 0.4121, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4294491708278656, "rewards/margins": 2.142913341522217, "rewards/rejected": -1.7134641408920288, "step": 95 }, { "epoch": 0.7246376811594203, "grad_norm": 31.611698501726103, "learning_rate": 8.824662657873238e-07, "logits/chosen": -1.8221423625946045, "logits/rejected": -1.802095651626587, "logps/chosen": -173.2090301513672, "logps/rejected": -206.5529327392578, "loss": 0.3759, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.04721298813819885, "rewards/margins": 1.9821780920028687, "rewards/rejected": -2.029391050338745, "step": 100 }, { "epoch": 0.7246376811594203, "eval_logits/chosen": -1.8523844480514526, "eval_logits/rejected": -1.7929590940475464, "eval_logps/chosen": -200.7910614013672, "eval_logps/rejected": -220.96961975097656, "eval_loss": 0.4121003746986389, "eval_rewards/accuracies": 0.8145161271095276, "eval_rewards/chosen": -0.047160252928733826, "eval_rewards/margins": 2.2289960384368896, "eval_rewards/rejected": -2.276156187057495, "eval_runtime": 247.371, "eval_samples_per_second": 15.855, "eval_steps_per_second": 0.251, "step": 100 } ], "logging_steps": 5, "max_steps": 414, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1178822762299392.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }