{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 41, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 363.8611145019531, "epoch": 0.024390243902439025, "grad_norm": 0.5100746750831604, "learning_rate": 2e-06, "loss": -0.0, "reward": 0.4490741044282913, "reward_std": 0.13515063002705574, "rewards/semantic_prob_reward": 0.4490741044282913, "step": 1 }, { "completion_length": 363.4166717529297, "epoch": 0.04878048780487805, "grad_norm": 0.5934435129165649, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.3368055820465088, "reward_std": 0.10725738108158112, "rewards/semantic_prob_reward": 0.3368055820465088, "step": 2 }, { "completion_length": 338.88194274902344, "epoch": 0.07317073170731707, "grad_norm": 0.560379683971405, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.43287038803100586, "reward_std": 0.16148831695318222, "rewards/semantic_prob_reward": 0.43287038803100586, "step": 3 }, { "completion_length": 355.94444274902344, "epoch": 0.0975609756097561, "grad_norm": 0.5169845819473267, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.47337964177131653, "reward_std": 0.1428069844841957, "rewards/semantic_prob_reward": 0.47337964177131653, "step": 4 }, { "completion_length": 288.8958282470703, "epoch": 0.12195121951219512, "grad_norm": 0.570539116859436, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.5069444924592972, "reward_std": 0.09097362495958805, "rewards/semantic_prob_reward": 0.5069444924592972, "step": 5 }, { "completion_length": 293.9166717529297, "epoch": 0.14634146341463414, "grad_norm": 0.5433425903320312, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.5787037014961243, "reward_std": 0.13018950819969177, "rewards/semantic_prob_reward": 0.5787037014961243, "step": 6 }, { "completion_length": 259.09722900390625, "epoch": 0.17073170731707318, "grad_norm": 0.5698252320289612, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.48842595517635345, "reward_std": 0.09774653986096382, "rewards/semantic_prob_reward": 0.48842595517635345, "step": 7 }, { "completion_length": 304.1111145019531, "epoch": 0.1951219512195122, "grad_norm": 0.5670663714408875, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.49537035822868347, "reward_std": 0.15591006726026535, "rewards/semantic_prob_reward": 0.49537035822868347, "step": 8 }, { "completion_length": 260.49305725097656, "epoch": 0.21951219512195122, "grad_norm": 0.5627779960632324, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.6678241193294525, "reward_std": 0.13818088918924332, "rewards/semantic_prob_reward": 0.6678241193294525, "step": 9 }, { "completion_length": 263.2986068725586, "epoch": 0.24390243902439024, "grad_norm": 0.6707796454429626, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.41550928354263306, "reward_std": 0.17615114152431488, "rewards/semantic_prob_reward": 0.41550928354263306, "step": 10 }, { "completion_length": 233.84027862548828, "epoch": 0.2682926829268293, "grad_norm": 0.583340585231781, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.690972238779068, "reward_std": 0.12268242612481117, "rewards/semantic_prob_reward": 0.690972238779068, "step": 11 }, { "completion_length": 270.34027099609375, "epoch": 0.2926829268292683, "grad_norm": 0.48444247245788574, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.637731522321701, "reward_std": 0.07967884186655283, "rewards/semantic_prob_reward": 0.637731522321701, "step": 12 }, { "completion_length": 272.8958435058594, "epoch": 0.3170731707317073, "grad_norm": 0.6469090580940247, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.5243055671453476, "reward_std": 0.17853275686502457, "rewards/semantic_prob_reward": 0.5243055671453476, "step": 13 }, { "completion_length": 232.6041717529297, "epoch": 0.34146341463414637, "grad_norm": 0.7080578804016113, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.4687500149011612, "reward_std": 0.14688704162836075, "rewards/semantic_prob_reward": 0.4687500149011612, "step": 14 }, { "completion_length": 221.69444274902344, "epoch": 0.36585365853658536, "grad_norm": 0.6455399394035339, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.5879629850387573, "reward_std": 0.1403595134615898, "rewards/semantic_prob_reward": 0.5879629850387573, "step": 15 }, { "completion_length": 219.1111068725586, "epoch": 0.3902439024390244, "grad_norm": 0.6930660605430603, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.5509259253740311, "reward_std": 0.16750450432300568, "rewards/semantic_prob_reward": 0.5509259253740311, "step": 16 }, { "completion_length": 180.20833587646484, "epoch": 0.4146341463414634, "grad_norm": 0.6736430525779724, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.6087963134050369, "reward_std": 0.14237143099308014, "rewards/semantic_prob_reward": 0.6087963134050369, "step": 17 }, { "completion_length": 200.77083587646484, "epoch": 0.43902439024390244, "grad_norm": 0.5983349680900574, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.7013888955116272, "reward_std": 0.1272590346634388, "rewards/semantic_prob_reward": 0.7013888955116272, "step": 18 }, { "completion_length": 206.70833587646484, "epoch": 0.4634146341463415, "grad_norm": 0.787395715713501, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.47337962687015533, "reward_std": 0.14758140221238136, "rewards/semantic_prob_reward": 0.47337962687015533, "step": 19 }, { "completion_length": 183.70833587646484, "epoch": 0.4878048780487805, "grad_norm": 0.7048752903938293, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.6863426268100739, "reward_std": 0.16743485629558563, "rewards/semantic_prob_reward": 0.6863426268100739, "step": 20 }, { "completion_length": 172.81945037841797, "epoch": 0.5121951219512195, "grad_norm": 0.6450062394142151, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.7210648357868195, "reward_std": 0.10877622663974762, "rewards/semantic_prob_reward": 0.7210648357868195, "step": 21 }, { "completion_length": 176.85416412353516, "epoch": 0.5365853658536586, "grad_norm": 0.705051064491272, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.6932870447635651, "reward_std": 0.16415037959814072, "rewards/semantic_prob_reward": 0.6932870447635651, "step": 22 }, { "completion_length": 182.38195037841797, "epoch": 0.5609756097560976, "grad_norm": 0.6152874231338501, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.6469907164573669, "reward_std": 0.08873938024044037, "rewards/semantic_prob_reward": 0.6469907164573669, "step": 23 }, { "completion_length": 185.54861450195312, "epoch": 0.5853658536585366, "grad_norm": 0.6029895544052124, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.5891203582286835, "reward_std": 0.09021448716521263, "rewards/semantic_prob_reward": 0.5891203582286835, "step": 24 }, { "completion_length": 167.72222137451172, "epoch": 0.6097560975609756, "grad_norm": 0.5513088703155518, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.7974537312984467, "reward_std": 0.11378278583288193, "rewards/semantic_prob_reward": 0.7974537312984467, "step": 25 }, { "completion_length": 164.90277862548828, "epoch": 0.6341463414634146, "grad_norm": 0.6331700682640076, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.6886574327945709, "reward_std": 0.08421878889203072, "rewards/semantic_prob_reward": 0.6886574327945709, "step": 26 }, { "completion_length": 172.83333587646484, "epoch": 0.6585365853658537, "grad_norm": 0.6214303374290466, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.7604165971279144, "reward_std": 0.13277369365096092, "rewards/semantic_prob_reward": 0.7604165971279144, "step": 27 }, { "completion_length": 188.97222137451172, "epoch": 0.6829268292682927, "grad_norm": 0.528020441532135, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.8263888955116272, "reward_std": 0.1292901635169983, "rewards/semantic_prob_reward": 0.8263888955116272, "step": 28 }, { "completion_length": 186.65277862548828, "epoch": 0.7073170731707317, "grad_norm": 0.6529655456542969, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.6898148357868195, "reward_std": 0.1409335434436798, "rewards/semantic_prob_reward": 0.6898148357868195, "step": 29 }, { "completion_length": 184.77083587646484, "epoch": 0.7317073170731707, "grad_norm": 0.6158609390258789, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.7719907462596893, "reward_std": 0.16161875426769257, "rewards/semantic_prob_reward": 0.7719907462596893, "step": 30 }, { "completion_length": 201.9166717529297, "epoch": 0.7560975609756098, "grad_norm": 0.5692559480667114, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.6377314627170563, "reward_std": 0.10584507510066032, "rewards/semantic_prob_reward": 0.6377314627170563, "step": 31 }, { "completion_length": 185.46527862548828, "epoch": 0.7804878048780488, "grad_norm": 0.5227845907211304, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.78125, "reward_std": 0.11403343454003334, "rewards/semantic_prob_reward": 0.78125, "step": 32 }, { "completion_length": 185.25695037841797, "epoch": 0.8048780487804879, "grad_norm": 0.53897625207901, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.7696759402751923, "reward_std": 0.0648084469139576, "rewards/semantic_prob_reward": 0.7696759402751923, "step": 33 }, { "completion_length": 198.29861450195312, "epoch": 0.8292682926829268, "grad_norm": 0.5177846550941467, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.7083333432674408, "reward_std": 0.09422570466995239, "rewards/semantic_prob_reward": 0.7083333432674408, "step": 34 }, { "completion_length": 180.54861450195312, "epoch": 0.8536585365853658, "grad_norm": 0.4751005470752716, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.7835648655891418, "reward_std": 0.08273815922439098, "rewards/semantic_prob_reward": 0.7835648655891418, "step": 35 }, { "completion_length": 181.69445037841797, "epoch": 0.8780487804878049, "grad_norm": 0.6460740566253662, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.7673611044883728, "reward_std": 0.14799243956804276, "rewards/semantic_prob_reward": 0.7673611044883728, "step": 36 }, { "completion_length": 180.45833587646484, "epoch": 0.9024390243902439, "grad_norm": 0.5229371190071106, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.8067129850387573, "reward_std": 0.1316433995962143, "rewards/semantic_prob_reward": 0.8067129850387573, "step": 37 }, { "completion_length": 177.90277862548828, "epoch": 0.926829268292683, "grad_norm": 0.4125826060771942, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.8946759402751923, "reward_std": 0.07761897146701813, "rewards/semantic_prob_reward": 0.8946759402751923, "step": 38 }, { "completion_length": 188.02777862548828, "epoch": 0.9512195121951219, "grad_norm": 0.5801049470901489, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.8564814925193787, "reward_std": 0.13684561476111412, "rewards/semantic_prob_reward": 0.8564814925193787, "step": 39 }, { "completion_length": 166.98611450195312, "epoch": 0.975609756097561, "grad_norm": 0.4894116222858429, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.8495369851589203, "reward_std": 0.052677496802061796, "rewards/semantic_prob_reward": 0.8495369851589203, "step": 40 }, { "completion_length": 159.5, "epoch": 1.0, "grad_norm": 0.39254793524742126, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.8541666865348816, "reward_std": 0.12028130888938904, "rewards/semantic_prob_reward": 0.8541666865348816, "step": 41 }, { "epoch": 1.0, "step": 41, "total_flos": 0.0, "train_loss": 5.627308753934262e-08, "train_runtime": 2104.6375, "train_samples_per_second": 0.233, "train_steps_per_second": 0.019 } ], "logging_steps": 1, "max_steps": 41, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }