{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3888888888888888, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06944444444444445, "grad_norm": 36.21096742368932, "learning_rate": 5e-07, "logits/chosen": -2.735914945602417, "logits/rejected": -2.7412195205688477, "logps/chosen": -166.00094604492188, "logps/rejected": -162.81643676757812, "loss": 0.6928, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": 0.00429560337215662, "rewards/margins": 0.0009204222005791962, "rewards/rejected": 0.0033751812297850847, "step": 5 }, { "epoch": 0.1388888888888889, "grad_norm": 35.97443184449595, "learning_rate": 1e-06, "logits/chosen": -2.742196798324585, "logits/rejected": -2.7352712154388428, "logps/chosen": -163.42056274414062, "logps/rejected": -168.62094116210938, "loss": 0.6867, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.10343559086322784, "rewards/margins": 0.005917676724493504, "rewards/rejected": 0.09751791507005692, "step": 10 }, { "epoch": 0.20833333333333334, "grad_norm": 35.479696664348296, "learning_rate": 9.985471028179154e-07, "logits/chosen": -2.715827465057373, "logits/rejected": -2.7099735736846924, "logps/chosen": -164.28744506835938, "logps/rejected": -166.86209106445312, "loss": 0.6686, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.31278976798057556, "rewards/margins": 0.10602164268493652, "rewards/rejected": 0.20676811039447784, "step": 15 }, { "epoch": 0.2777777777777778, "grad_norm": 34.26437345645622, "learning_rate": 9.94196854912548e-07, "logits/chosen": -2.6752734184265137, "logits/rejected": -2.670536518096924, "logps/chosen": -162.92506408691406, "logps/rejected": -162.58132934570312, "loss": 0.653, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.23271910846233368, "rewards/margins": 0.16488614678382874, "rewards/rejected": 0.06783294677734375, "step": 20 }, { "epoch": 0.3472222222222222, "grad_norm": 38.12636989971247, "learning_rate": 9.869745381355905e-07, "logits/chosen": -2.612743854522705, "logits/rejected": -2.601036310195923, "logps/chosen": -169.65054321289062, "logps/rejected": -170.94589233398438, "loss": 0.6341, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.23453514277935028, "rewards/margins": 0.1853707879781723, "rewards/rejected": 0.04916436970233917, "step": 25 }, { "epoch": 0.4166666666666667, "grad_norm": 37.15638790112506, "learning_rate": 9.769221256218162e-07, "logits/chosen": -2.6376729011535645, "logits/rejected": -2.6211869716644287, "logps/chosen": -169.70230102539062, "logps/rejected": -169.1073760986328, "loss": 0.6276, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.197641983628273, "rewards/margins": 0.23505587875843048, "rewards/rejected": -0.03741389513015747, "step": 30 }, { "epoch": 0.4861111111111111, "grad_norm": 31.96578650923538, "learning_rate": 9.64098037858483e-07, "logits/chosen": -2.6476080417633057, "logits/rejected": -2.638826847076416, "logps/chosen": -164.2353515625, "logps/rejected": -171.78424072265625, "loss": 0.6001, "rewards/accuracies": 0.65625, "rewards/chosen": 0.011483956128358841, "rewards/margins": 0.3633750379085541, "rewards/rejected": -0.35189107060432434, "step": 35 }, { "epoch": 0.5555555555555556, "grad_norm": 35.60629409012632, "learning_rate": 9.485768031694871e-07, "logits/chosen": -2.6523194313049316, "logits/rejected": -2.621492385864258, "logps/chosen": -168.99270629882812, "logps/rejected": -177.50718688964844, "loss": 0.5939, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2566075921058655, "rewards/margins": 0.5398613214492798, "rewards/rejected": -0.79646897315979, "step": 40 }, { "epoch": 0.625, "grad_norm": 32.80701192573668, "learning_rate": 9.304486245873971e-07, "logits/chosen": -2.657984495162964, "logits/rejected": -2.6483747959136963, "logps/chosen": -163.6527557373047, "logps/rejected": -167.71705627441406, "loss": 0.5942, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.24404068291187286, "rewards/margins": 0.5225220918655396, "rewards/rejected": -0.766562819480896, "step": 45 }, { "epoch": 0.6944444444444444, "grad_norm": 32.94692651420362, "learning_rate": 9.098188556305262e-07, "logits/chosen": -2.732595682144165, "logits/rejected": -2.7179951667785645, "logps/chosen": -159.82009887695312, "logps/rejected": -163.01516723632812, "loss": 0.575, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.31872302293777466, "rewards/margins": 0.4740700125694275, "rewards/rejected": -0.7927930951118469, "step": 50 }, { "epoch": 0.6944444444444444, "eval_logits/chosen": -2.7981717586517334, "eval_logits/rejected": -2.7966415882110596, "eval_logps/chosen": -171.31138610839844, "eval_logps/rejected": -180.07443237304688, "eval_loss": 0.5679606199264526, "eval_rewards/accuracies": 0.69140625, "eval_rewards/chosen": -0.5232083201408386, "eval_rewards/margins": 0.6266617178916931, "eval_rewards/rejected": -1.1498699188232422, "eval_runtime": 127.2891, "eval_samples_per_second": 16.05, "eval_steps_per_second": 0.251, "step": 50 }, { "epoch": 0.7638888888888888, "grad_norm": 34.97882221943595, "learning_rate": 8.868073880316123e-07, "logits/chosen": -2.835651397705078, "logits/rejected": -2.836982250213623, "logps/chosen": -173.93702697753906, "logps/rejected": -180.1125030517578, "loss": 0.5562, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5288220643997192, "rewards/margins": 0.7004331350326538, "rewards/rejected": -1.229255199432373, "step": 55 }, { "epoch": 0.8333333333333334, "grad_norm": 31.166211388759624, "learning_rate": 8.615479549763755e-07, "logits/chosen": -2.8652548789978027, "logits/rejected": -2.8450732231140137, "logps/chosen": -169.28530883789062, "logps/rejected": -171.57772827148438, "loss": 0.5334, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.237405925989151, "rewards/margins": 0.7358155846595764, "rewards/rejected": -0.973221480846405, "step": 60 }, { "epoch": 0.9027777777777778, "grad_norm": 31.846767846888632, "learning_rate": 8.341873539012443e-07, "logits/chosen": -2.8254175186157227, "logits/rejected": -2.8189828395843506, "logps/chosen": -178.87318420410156, "logps/rejected": -185.26193237304688, "loss": 0.5483, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3370054364204407, "rewards/margins": 0.6962798237800598, "rewards/rejected": -1.0332852602005005, "step": 65 }, { "epoch": 0.9722222222222222, "grad_norm": 30.952350544641195, "learning_rate": 8.048845933670271e-07, "logits/chosen": -2.7731075286865234, "logits/rejected": -2.7470154762268066, "logps/chosen": -181.693359375, "logps/rejected": -192.0342254638672, "loss": 0.5378, "rewards/accuracies": 0.71875, "rewards/chosen": -0.617510974407196, "rewards/margins": 0.8088364601135254, "rewards/rejected": -1.4263474941253662, "step": 70 }, { "epoch": 1.0416666666666667, "grad_norm": 20.244570418413698, "learning_rate": 7.738099689665539e-07, "logits/chosen": -2.679137706756592, "logits/rejected": -2.676011800765991, "logps/chosen": -172.0131072998047, "logps/rejected": -184.72222900390625, "loss": 0.3552, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.7075360417366028, "rewards/margins": 1.6730060577392578, "rewards/rejected": -2.380542278289795, "step": 75 }, { "epoch": 1.1111111111111112, "grad_norm": 18.51901755323729, "learning_rate": 7.41144073636728e-07, "logits/chosen": -2.663628339767456, "logits/rejected": -2.6579511165618896, "logps/chosen": -181.416748046875, "logps/rejected": -201.2371063232422, "loss": 0.2457, "rewards/accuracies": 0.90625, "rewards/chosen": 0.018878469243645668, "rewards/margins": 2.686278820037842, "rewards/rejected": -2.667400360107422, "step": 80 }, { "epoch": 1.1805555555555556, "grad_norm": 20.24347077505837, "learning_rate": 7.070767481266492e-07, "logits/chosen": -2.68660569190979, "logits/rejected": -2.6812427043914795, "logps/chosen": -160.11874389648438, "logps/rejected": -179.59771728515625, "loss": 0.227, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.016312014311552048, "rewards/margins": 2.060859203338623, "rewards/rejected": -2.0771713256835938, "step": 85 }, { "epoch": 1.25, "grad_norm": 20.15132514330672, "learning_rate": 6.718059777212565e-07, "logits/chosen": -2.69787859916687, "logits/rejected": -2.7063913345336914, "logps/chosen": -165.69448852539062, "logps/rejected": -190.65296936035156, "loss": 0.2041, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.18171457946300507, "rewards/margins": 2.763362407684326, "rewards/rejected": -2.9450771808624268, "step": 90 }, { "epoch": 1.3194444444444444, "grad_norm": 21.629755831470078, "learning_rate": 6.355367416322778e-07, "logits/chosen": -2.7282795906066895, "logits/rejected": -2.7191052436828613, "logps/chosen": -176.59262084960938, "logps/rejected": -204.98123168945312, "loss": 0.2028, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1713067591190338, "rewards/margins": 3.3725147247314453, "rewards/rejected": -3.543820858001709, "step": 95 }, { "epoch": 1.3888888888888888, "grad_norm": 26.9593328849758, "learning_rate": 5.984798217433531e-07, "logits/chosen": -2.690068006515503, "logits/rejected": -2.69694185256958, "logps/chosen": -170.9009246826172, "logps/rejected": -199.11679077148438, "loss": 0.2161, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.24792905151844025, "rewards/margins": 3.1425349712371826, "rewards/rejected": -3.3904640674591064, "step": 100 }, { "epoch": 1.3888888888888888, "eval_logits/chosen": -2.6713719367980957, "eval_logits/rejected": -2.6708080768585205, "eval_logps/chosen": -177.34860229492188, "eval_logps/rejected": -191.46810913085938, "eval_loss": 0.541614830493927, "eval_rewards/accuracies": 0.74609375, "eval_rewards/chosen": -1.1269280910491943, "eval_rewards/margins": 1.162311315536499, "eval_rewards/rejected": -2.2892394065856934, "eval_runtime": 126.9171, "eval_samples_per_second": 16.097, "eval_steps_per_second": 0.252, "step": 100 } ], "logging_steps": 5, "max_steps": 216, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1178822762299392.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }