{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0695187165775402, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.053475935828877004, "grad_norm": 57.38362641680358, "learning_rate": 5e-07, "logits/chosen": -2.734001874923706, "logits/rejected": -2.714400053024292, "logps/chosen": -259.45416259765625, "logps/rejected": -213.60086059570312, "loss": 0.69, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": 0.022585459053516388, "rewards/margins": 0.012519368901848793, "rewards/rejected": 0.010066090151667595, "step": 5 }, { "epoch": 0.10695187165775401, "grad_norm": 47.481455328039516, "learning_rate": 1e-06, "logits/chosen": -2.6481270790100098, "logits/rejected": -2.652583599090576, "logps/chosen": -257.953369140625, "logps/rejected": -188.35134887695312, "loss": 0.6387, "rewards/accuracies": 0.6875, "rewards/chosen": 0.550362229347229, "rewards/margins": 0.1817861795425415, "rewards/rejected": 0.3685761094093323, "step": 10 }, { "epoch": 0.16042780748663102, "grad_norm": 57.27168970604102, "learning_rate": 9.991477798614637e-07, "logits/chosen": -2.517893075942993, "logits/rejected": -2.512808322906494, "logps/chosen": -238.94729614257812, "logps/rejected": -195.82278442382812, "loss": 0.6343, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.433282732963562, "rewards/margins": 0.7576474547386169, "rewards/rejected": 0.6756354570388794, "step": 15 }, { "epoch": 0.21390374331550802, "grad_norm": 49.36931799258527, "learning_rate": 9.965940245625131e-07, "logits/chosen": -2.3814330101013184, "logits/rejected": -2.3650126457214355, "logps/chosen": -240.35299682617188, "logps/rejected": -224.2460174560547, "loss": 0.6444, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.3586658239364624, "rewards/margins": 0.8488560914993286, "rewards/rejected": 0.5098099112510681, "step": 20 }, { "epoch": 0.26737967914438504, "grad_norm": 38.23771695176387, "learning_rate": 9.923474395499264e-07, "logits/chosen": -2.3290395736694336, "logits/rejected": -2.300835371017456, "logps/chosen": -240.2759246826172, "logps/rejected": -190.5952911376953, "loss": 0.5839, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 1.567830204963684, "rewards/margins": 1.0694384574890137, "rewards/rejected": 0.4983917772769928, "step": 25 }, { "epoch": 0.32085561497326204, "grad_norm": 38.55306482248171, "learning_rate": 9.86422500924775e-07, "logits/chosen": -2.385282039642334, "logits/rejected": -2.380516767501831, "logps/chosen": -237.0162353515625, "logps/rejected": -203.95974731445312, "loss": 0.5888, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 1.3996493816375732, "rewards/margins": 0.9638010263442993, "rewards/rejected": 0.43584829568862915, "step": 30 }, { "epoch": 0.37433155080213903, "grad_norm": 40.05177774617033, "learning_rate": 9.788394060951227e-07, "logits/chosen": -2.4351038932800293, "logits/rejected": -2.440431833267212, "logps/chosen": -247.30227661132812, "logps/rejected": -201.399658203125, "loss": 0.6124, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.2308628559112549, "rewards/margins": 1.0766284465789795, "rewards/rejected": 0.15423443913459778, "step": 35 }, { "epoch": 0.42780748663101603, "grad_norm": 38.64888093844348, "learning_rate": 9.696240049254742e-07, "logits/chosen": -2.4633519649505615, "logits/rejected": -2.466259241104126, "logps/chosen": -249.7180938720703, "logps/rejected": -194.3883056640625, "loss": 0.5959, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.1658284664154053, "rewards/margins": 1.0980390310287476, "rewards/rejected": 0.06778934597969055, "step": 40 }, { "epoch": 0.48128342245989303, "grad_norm": 41.079430965451316, "learning_rate": 9.588077116176756e-07, "logits/chosen": -2.4081149101257324, "logits/rejected": -2.406311273574829, "logps/chosen": -238.9816436767578, "logps/rejected": -201.0675048828125, "loss": 0.6076, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.1393609046936035, "rewards/margins": 1.4659656286239624, "rewards/rejected": -0.32660484313964844, "step": 45 }, { "epoch": 0.5347593582887701, "grad_norm": 46.70159372711324, "learning_rate": 9.464273976236516e-07, "logits/chosen": -2.4118716716766357, "logits/rejected": -2.408468246459961, "logps/chosen": -219.6370849609375, "logps/rejected": -186.2394561767578, "loss": 0.5857, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8120909929275513, "rewards/margins": 0.906074047088623, "rewards/rejected": -0.09398309141397476, "step": 50 }, { "epoch": 0.5347593582887701, "eval_logits/chosen": -2.4070043563842773, "eval_logits/rejected": -2.395042896270752, "eval_logps/chosen": -246.7726287841797, "eval_logps/rejected": -220.63446044921875, "eval_loss": 0.5587548613548279, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": 0.8626245856285095, "eval_rewards/margins": 1.329900860786438, "eval_rewards/rejected": -0.46727630496025085, "eval_runtime": 180.0372, "eval_samples_per_second": 14.775, "eval_steps_per_second": 0.233, "step": 50 }, { "epoch": 0.5882352941176471, "grad_norm": 38.079371712676014, "learning_rate": 9.325252659550308e-07, "logits/chosen": -2.387826919555664, "logits/rejected": -2.3770289421081543, "logps/chosen": -226.2966766357422, "logps/rejected": -208.65219116210938, "loss": 0.5977, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6860524415969849, "rewards/margins": 0.8900126218795776, "rewards/rejected": -0.2039601057767868, "step": 55 }, { "epoch": 0.6417112299465241, "grad_norm": 37.176849832612625, "learning_rate": 9.171487073181197e-07, "logits/chosen": -2.309826374053955, "logits/rejected": -2.319997549057007, "logps/chosen": -230.22189331054688, "logps/rejected": -206.3903350830078, "loss": 0.5777, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.7191376686096191, "rewards/margins": 1.5079154968261719, "rewards/rejected": -0.7887779474258423, "step": 60 }, { "epoch": 0.6951871657754011, "grad_norm": 36.39927326888579, "learning_rate": 9.003501385646448e-07, "logits/chosen": -2.2107081413269043, "logits/rejected": -2.21248722076416, "logps/chosen": -238.040771484375, "logps/rejected": -207.3212127685547, "loss": 0.5854, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5230123996734619, "rewards/margins": 1.2387675046920776, "rewards/rejected": -0.715755045413971, "step": 65 }, { "epoch": 0.7486631016042781, "grad_norm": 34.98626076335809, "learning_rate": 8.821868240089676e-07, "logits/chosen": -2.1564557552337646, "logits/rejected": -2.1244616508483887, "logps/chosen": -232.23452758789062, "logps/rejected": -215.52719116210938, "loss": 0.5572, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 1.0822027921676636, "rewards/margins": 1.5664498805999756, "rewards/rejected": -0.48424673080444336, "step": 70 }, { "epoch": 0.8021390374331551, "grad_norm": 33.602724500864966, "learning_rate": 8.62720680220876e-07, "logits/chosen": -2.2200913429260254, "logits/rejected": -2.161189079284668, "logps/chosen": -232.7351837158203, "logps/rejected": -218.43045043945312, "loss": 0.5712, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5824815630912781, "rewards/margins": 1.2068629264831543, "rewards/rejected": -0.6243813037872314, "step": 75 }, { "epoch": 0.8556149732620321, "grad_norm": 32.25811213479902, "learning_rate": 8.420180649593929e-07, "logits/chosen": -2.223334789276123, "logits/rejected": -2.206092119216919, "logps/chosen": -237.7117156982422, "logps/rejected": -213.7598114013672, "loss": 0.5111, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.454035222530365, "rewards/margins": 1.7620937824249268, "rewards/rejected": -1.3080583810806274, "step": 80 }, { "epoch": 0.9090909090909091, "grad_norm": 28.217025801566006, "learning_rate": 8.201495509671036e-07, "logits/chosen": -2.205242872238159, "logits/rejected": -2.2124757766723633, "logps/chosen": -250.62673950195312, "logps/rejected": -233.39517211914062, "loss": 0.5286, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.5412265658378601, "rewards/margins": 1.507828712463379, "rewards/rejected": -0.9666021466255188, "step": 85 }, { "epoch": 0.9625668449197861, "grad_norm": 33.081462718842836, "learning_rate": 7.971896853961042e-07, "logits/chosen": -2.286748170852661, "logits/rejected": -2.2566237449645996, "logps/chosen": -235.38076782226562, "logps/rejected": -223.08413696289062, "loss": 0.5372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0189321041107178, "rewards/margins": 1.5707125663757324, "rewards/rejected": -0.5517805814743042, "step": 90 }, { "epoch": 1.0160427807486632, "grad_norm": 19.031150325303443, "learning_rate": 7.732167356856654e-07, "logits/chosen": -2.383920192718506, "logits/rejected": -2.3609161376953125, "logps/chosen": -249.6156768798828, "logps/rejected": -201.63851928710938, "loss": 0.3778, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3106629848480225, "rewards/margins": 2.160562038421631, "rewards/rejected": -0.8498989939689636, "step": 95 }, { "epoch": 1.0695187165775402, "grad_norm": 21.9932361270153, "learning_rate": 7.48312422757881e-07, "logits/chosen": -2.432481050491333, "logits/rejected": -2.416440010070801, "logps/chosen": -221.5184326171875, "logps/rejected": -238.3723602294922, "loss": 0.2515, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.44422447681427, "rewards/margins": 3.0625545978546143, "rewards/rejected": -1.6183300018310547, "step": 100 }, { "epoch": 1.0695187165775402, "eval_logits/chosen": -2.3926949501037598, "eval_logits/rejected": -2.3801937103271484, "eval_logps/chosen": -243.76559448242188, "eval_logps/rejected": -223.52978515625, "eval_loss": 0.50016188621521, "eval_rewards/accuracies": 0.7827380895614624, "eval_rewards/chosen": 1.1633288860321045, "eval_rewards/margins": 1.9201369285583496, "eval_rewards/rejected": -0.7568081617355347, "eval_runtime": 179.117, "eval_samples_per_second": 14.851, "eval_steps_per_second": 0.234, "step": 100 } ], "logging_steps": 5, "max_steps": 279, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1178822762299392.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }