{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9158878504672896, "eval_steps": 50, "global_step": 78, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18691588785046728, "grad_norm": 61.21687204105591, "learning_rate": 5e-07, "logits/chosen": -2.7639615535736084, "logits/rejected": -2.753985643386841, "logps/chosen": -265.36212158203125, "logps/rejected": -219.6597900390625, "loss": 0.6903, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": 0.01777561381459236, "rewards/margins": 0.004832454025745392, "rewards/rejected": 0.012943158857524395, "step": 5 }, { "epoch": 0.37383177570093457, "grad_norm": 46.174004471862034, "learning_rate": 1e-06, "logits/chosen": -2.7134578227996826, "logits/rejected": -2.718207597732544, "logps/chosen": -287.67529296875, "logps/rejected": -230.2861785888672, "loss": 0.6281, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6617986559867859, "rewards/margins": 0.29742884635925293, "rewards/rejected": 0.36436980962753296, "step": 10 }, { "epoch": 0.5607476635514018, "grad_norm": 46.436393575993726, "learning_rate": 9.867190271803463e-07, "logits/chosen": -2.5830750465393066, "logits/rejected": -2.587627410888672, "logps/chosen": -273.77752685546875, "logps/rejected": -220.65695190429688, "loss": 0.6333, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.6920897960662842, "rewards/margins": 0.8979904055595398, "rewards/rejected": 0.7940995097160339, "step": 15 }, { "epoch": 0.7476635514018691, "grad_norm": 43.837511462770195, "learning_rate": 9.475816456775312e-07, "logits/chosen": -2.471787929534912, "logits/rejected": -2.478492259979248, "logps/chosen": -246.120361328125, "logps/rejected": -214.670654296875, "loss": 0.5943, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.5776749849319458, "rewards/margins": 1.0568134784698486, "rewards/rejected": 0.5208614468574524, "step": 20 }, { "epoch": 0.9345794392523364, "grad_norm": 37.09162689266799, "learning_rate": 8.846669854914395e-07, "logits/chosen": -2.4390480518341064, "logits/rejected": -2.411414861679077, "logps/chosen": -269.1220397949219, "logps/rejected": -211.203369140625, "loss": 0.5775, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.8889753818511963, "rewards/margins": 1.2695468664169312, "rewards/rejected": 0.619428277015686, "step": 25 }, { "epoch": 1.1214953271028036, "grad_norm": 23.14113734554105, "learning_rate": 8.013173181896282e-07, "logits/chosen": -2.3594138622283936, "logits/rejected": -2.3916964530944824, "logps/chosen": -255.5403289794922, "logps/rejected": -206.4452667236328, "loss": 0.4246, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.106290340423584, "rewards/margins": 2.024784564971924, "rewards/rejected": 0.08150559663772583, "step": 30 }, { "epoch": 1.308411214953271, "grad_norm": 20.944832198946614, "learning_rate": 7.019605024359474e-07, "logits/chosen": -2.3514552116394043, "logits/rejected": -2.332791805267334, "logps/chosen": -238.28518676757812, "logps/rejected": -240.40560913085938, "loss": 0.2842, "rewards/accuracies": 0.875, "rewards/chosen": 2.428344249725342, "rewards/margins": 3.228165864944458, "rewards/rejected": -0.7998219728469849, "step": 35 }, { "epoch": 1.4953271028037383, "grad_norm": 30.791834420795908, "learning_rate": 5.918747589082852e-07, "logits/chosen": -2.33073353767395, "logits/rejected": -2.3257832527160645, "logps/chosen": -250.0792694091797, "logps/rejected": -221.3015594482422, "loss": 0.2477, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.562706232070923, "rewards/margins": 3.5494167804718018, "rewards/rejected": -0.9867107272148132, "step": 40 }, { "epoch": 1.6822429906542056, "grad_norm": 27.339770904226572, "learning_rate": 4.769082706771303e-07, "logits/chosen": -2.354419708251953, "logits/rejected": -2.345761775970459, "logps/chosen": -251.4537811279297, "logps/rejected": -224.0857391357422, "loss": 0.282, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.958096742630005, "rewards/margins": 3.320951461791992, "rewards/rejected": -0.3628546893596649, "step": 45 }, { "epoch": 1.8691588785046729, "grad_norm": 26.19581986930712, "learning_rate": 3.6316850496395855e-07, "logits/chosen": -2.395775318145752, "logits/rejected": -2.3773467540740967, "logps/chosen": -227.96786499023438, "logps/rejected": -228.518310546875, "loss": 0.3348, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 2.7191131114959717, "rewards/margins": 2.9623184204101562, "rewards/rejected": -0.24320510029792786, "step": 50 }, { "epoch": 1.8691588785046729, "eval_logits/chosen": -2.406548023223877, "eval_logits/rejected": -2.4020678997039795, "eval_logps/chosen": -249.0997314453125, "eval_logps/rejected": -190.4173583984375, "eval_loss": 0.5792086124420166, "eval_rewards/accuracies": 0.8125, "eval_rewards/chosen": 2.028245210647583, "eval_rewards/margins": 1.727249026298523, "eval_rewards/rejected": 0.300996333360672, "eval_runtime": 50.8966, "eval_samples_per_second": 14.932, "eval_steps_per_second": 0.236, "step": 50 }, { "epoch": 2.05607476635514, "grad_norm": 19.061520787069963, "learning_rate": 2.566977607165719e-07, "logits/chosen": -2.425044298171997, "logits/rejected": -2.430022716522217, "logps/chosen": -258.14617919921875, "logps/rejected": -232.40603637695312, "loss": 0.2745, "rewards/accuracies": 0.90625, "rewards/chosen": 2.87212872505188, "rewards/margins": 3.6639626026153564, "rewards/rejected": -0.7918335795402527, "step": 55 }, { "epoch": 2.2429906542056073, "grad_norm": 15.460701798223598, "learning_rate": 1.631521781767214e-07, "logits/chosen": -2.4355878829956055, "logits/rejected": -2.4206368923187256, "logps/chosen": -223.0980987548828, "logps/rejected": -222.94241333007812, "loss": 0.1636, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.6068944931030273, "rewards/margins": 3.432859420776367, "rewards/rejected": -0.8259647488594055, "step": 60 }, { "epoch": 2.4299065420560746, "grad_norm": 16.614589190551243, "learning_rate": 8.75012627008489e-08, "logits/chosen": -2.4444804191589355, "logits/rejected": -2.4162566661834717, "logps/chosen": -244.12841796875, "logps/rejected": -229.2833251953125, "loss": 0.1617, "rewards/accuracies": 0.9375, "rewards/chosen": 2.8097472190856934, "rewards/margins": 3.784661054611206, "rewards/rejected": -0.9749139547348022, "step": 65 }, { "epoch": 2.616822429906542, "grad_norm": 19.056173520446013, "learning_rate": 3.376388529782215e-08, "logits/chosen": -2.4610393047332764, "logits/rejected": -2.440760612487793, "logps/chosen": -251.9339599609375, "logps/rejected": -247.04275512695312, "loss": 0.1661, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.2026991844177246, "rewards/margins": 4.342484474182129, "rewards/rejected": -1.1397849321365356, "step": 70 }, { "epoch": 2.803738317757009, "grad_norm": 22.14435014976595, "learning_rate": 4.794784562397458e-09, "logits/chosen": -2.439197063446045, "logits/rejected": -2.4480080604553223, "logps/chosen": -245.83792114257812, "logps/rejected": -227.4940185546875, "loss": 0.1707, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.1310062408447266, "rewards/margins": 3.876368761062622, "rewards/rejected": -0.7453619241714478, "step": 75 } ], "logging_steps": 5, "max_steps": 78, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 919378820333568.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }