|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.08016032064128256, |
|
"eval_steps": 500, |
|
"global_step": 300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 94.5250015258789, |
|
"epoch": 0.0002672010688042752, |
|
"grad_norm": 8.340370178222656, |
|
"kl": 0.0, |
|
"learning_rate": 9.997327632282202e-07, |
|
"loss": 0.0, |
|
"reward": 0.43238671123981476, |
|
"reward_std": 0.10043386369943619, |
|
"rewards/iou_timestamp_reward": 0.43238671123981476, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 90.17499923706055, |
|
"epoch": 0.0005344021376085504, |
|
"grad_norm": 15.389718055725098, |
|
"kl": 0.0009441375732421875, |
|
"learning_rate": 9.994655264564402e-07, |
|
"loss": 0.0, |
|
"reward": 0.49521203339099884, |
|
"reward_std": 0.1284736730158329, |
|
"rewards/iou_timestamp_reward": 0.49521203339099884, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 94.7750015258789, |
|
"epoch": 0.0008016032064128256, |
|
"grad_norm": 14.409242630004883, |
|
"kl": 0.001331329345703125, |
|
"learning_rate": 9.991982896846605e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5351988673210144, |
|
"reward_std": 0.16380538046360016, |
|
"rewards/iou_timestamp_reward": 0.5351988673210144, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 92.67499923706055, |
|
"epoch": 0.0010688042752171009, |
|
"grad_norm": 12.140655517578125, |
|
"kl": 0.001255035400390625, |
|
"learning_rate": 9.989310529128807e-07, |
|
"loss": 0.0001, |
|
"reward": 0.3954579681158066, |
|
"reward_std": 0.1440194696187973, |
|
"rewards/iou_timestamp_reward": 0.3954579681158066, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 93.0250015258789, |
|
"epoch": 0.0013360053440213762, |
|
"grad_norm": 11.99720573425293, |
|
"kl": 0.00121307373046875, |
|
"learning_rate": 9.98663816141101e-07, |
|
"loss": 0.0, |
|
"reward": 0.5536792427301407, |
|
"reward_std": 0.1233767680823803, |
|
"rewards/iou_timestamp_reward": 0.5536792427301407, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 95.32500076293945, |
|
"epoch": 0.0016032064128256513, |
|
"grad_norm": 31.318050384521484, |
|
"kl": 0.001331329345703125, |
|
"learning_rate": 9.983965793693212e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5504237711429596, |
|
"reward_std": 0.17604028433561325, |
|
"rewards/iou_timestamp_reward": 0.5504237711429596, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 96.47500228881836, |
|
"epoch": 0.0018704074816299266, |
|
"grad_norm": 31.36016273498535, |
|
"kl": 0.001834869384765625, |
|
"learning_rate": 9.981293425975412e-07, |
|
"loss": 0.0001, |
|
"reward": 0.3581559360027313, |
|
"reward_std": 0.07435427978634834, |
|
"rewards/iou_timestamp_reward": 0.3581559360027313, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 90.57500076293945, |
|
"epoch": 0.0021376085504342017, |
|
"grad_norm": 12.298426628112793, |
|
"kl": 0.00214385986328125, |
|
"learning_rate": 9.978621058257615e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5335834175348282, |
|
"reward_std": 0.09592974931001663, |
|
"rewards/iou_timestamp_reward": 0.5335834175348282, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 84.61250305175781, |
|
"epoch": 0.002404809619238477, |
|
"grad_norm": 8.306353569030762, |
|
"kl": 0.00254058837890625, |
|
"learning_rate": 9.975948690539817e-07, |
|
"loss": 0.0001, |
|
"reward": 0.4508528411388397, |
|
"reward_std": 0.1074596531689167, |
|
"rewards/iou_timestamp_reward": 0.4508528411388397, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 90.4000015258789, |
|
"epoch": 0.0026720106880427524, |
|
"grad_norm": 9.342815399169922, |
|
"kl": 0.002593994140625, |
|
"learning_rate": 9.97327632282202e-07, |
|
"loss": 0.0001, |
|
"reward": 0.567167192697525, |
|
"reward_std": 0.13745272532105446, |
|
"rewards/iou_timestamp_reward": 0.567167192697525, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 87.07500076293945, |
|
"epoch": 0.0029392117568470275, |
|
"grad_norm": 22.01643943786621, |
|
"kl": 0.00392913818359375, |
|
"learning_rate": 9.970603955104222e-07, |
|
"loss": 0.0002, |
|
"reward": 0.4692964553833008, |
|
"reward_std": 0.0816560797393322, |
|
"rewards/iou_timestamp_reward": 0.4692964553833008, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 86.0625, |
|
"epoch": 0.0032064128256513026, |
|
"grad_norm": 19.23089599609375, |
|
"kl": 0.00360870361328125, |
|
"learning_rate": 9.967931587386423e-07, |
|
"loss": 0.0001, |
|
"reward": 0.35264405608177185, |
|
"reward_std": 0.0850561149418354, |
|
"rewards/iou_timestamp_reward": 0.35264405608177185, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 102.80000305175781, |
|
"epoch": 0.0034736138944555777, |
|
"grad_norm": 19.64198875427246, |
|
"kl": 0.0048370361328125, |
|
"learning_rate": 9.965259219668625e-07, |
|
"loss": 0.0002, |
|
"reward": 0.48492929339408875, |
|
"reward_std": 0.13556858897209167, |
|
"rewards/iou_timestamp_reward": 0.48492929339408875, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 85.86249923706055, |
|
"epoch": 0.0037408149632598532, |
|
"grad_norm": 9.946146011352539, |
|
"kl": 0.0070953369140625, |
|
"learning_rate": 9.962586851950827e-07, |
|
"loss": 0.0003, |
|
"reward": 0.5720771998167038, |
|
"reward_std": 0.12591452151536942, |
|
"rewards/iou_timestamp_reward": 0.5720771998167038, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 90.0625, |
|
"epoch": 0.004008016032064128, |
|
"grad_norm": 13.546260833740234, |
|
"kl": 0.006072998046875, |
|
"learning_rate": 9.95991448423303e-07, |
|
"loss": 0.0002, |
|
"reward": 0.5297239720821381, |
|
"reward_std": 0.10710490867495537, |
|
"rewards/iou_timestamp_reward": 0.5297239720821381, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 83.01250076293945, |
|
"epoch": 0.0042752171008684035, |
|
"grad_norm": 8.076241493225098, |
|
"kl": 0.0060882568359375, |
|
"learning_rate": 9.957242116515232e-07, |
|
"loss": 0.0002, |
|
"reward": 0.29969294369220734, |
|
"reward_std": 0.09227840602397919, |
|
"rewards/iou_timestamp_reward": 0.29969294369220734, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 75.48750305175781, |
|
"epoch": 0.004542418169672679, |
|
"grad_norm": 7.656135559082031, |
|
"kl": 0.0081329345703125, |
|
"learning_rate": 9.954569748797435e-07, |
|
"loss": 0.0003, |
|
"reward": 0.5397222191095352, |
|
"reward_std": 0.12004014477133751, |
|
"rewards/iou_timestamp_reward": 0.5397222191095352, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 89.41250228881836, |
|
"epoch": 0.004809619238476954, |
|
"grad_norm": 15.919727325439453, |
|
"kl": 0.0092315673828125, |
|
"learning_rate": 9.951897381079635e-07, |
|
"loss": 0.0004, |
|
"reward": 0.4049786627292633, |
|
"reward_std": 0.09774354100227356, |
|
"rewards/iou_timestamp_reward": 0.4049786627292633, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 84.35000228881836, |
|
"epoch": 0.005076820307281229, |
|
"grad_norm": 7.11904764175415, |
|
"kl": 0.008270263671875, |
|
"learning_rate": 9.949225013361838e-07, |
|
"loss": 0.0003, |
|
"reward": 0.36138640344142914, |
|
"reward_std": 0.09036249667406082, |
|
"rewards/iou_timestamp_reward": 0.36138640344142914, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 84.35000228881836, |
|
"epoch": 0.005344021376085505, |
|
"grad_norm": 31.289043426513672, |
|
"kl": 0.0091552734375, |
|
"learning_rate": 9.94655264564404e-07, |
|
"loss": 0.0004, |
|
"reward": 0.4811190813779831, |
|
"reward_std": 0.09721268340945244, |
|
"rewards/iou_timestamp_reward": 0.4811190813779831, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 81.3375015258789, |
|
"epoch": 0.0056112224448897794, |
|
"grad_norm": 25.466672897338867, |
|
"kl": 0.010284423828125, |
|
"learning_rate": 9.943880277926243e-07, |
|
"loss": 0.0004, |
|
"reward": 0.5279562920331955, |
|
"reward_std": 0.05652128346264362, |
|
"rewards/iou_timestamp_reward": 0.5279562920331955, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 82.80000305175781, |
|
"epoch": 0.005878423513694055, |
|
"grad_norm": 19.14952278137207, |
|
"kl": 0.009918212890625, |
|
"learning_rate": 9.941207910208445e-07, |
|
"loss": 0.0004, |
|
"reward": 0.4550338685512543, |
|
"reward_std": 0.08861872926354408, |
|
"rewards/iou_timestamp_reward": 0.4550338685512543, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 82.03750228881836, |
|
"epoch": 0.00614562458249833, |
|
"grad_norm": 20.35788345336914, |
|
"kl": 0.010284423828125, |
|
"learning_rate": 9.938535542490645e-07, |
|
"loss": 0.0004, |
|
"reward": 0.5463799238204956, |
|
"reward_std": 0.1116589643061161, |
|
"rewards/iou_timestamp_reward": 0.5463799238204956, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 82.80000305175781, |
|
"epoch": 0.006412825651302605, |
|
"grad_norm": 14.981146812438965, |
|
"kl": 0.0108642578125, |
|
"learning_rate": 9.935863174772848e-07, |
|
"loss": 0.0004, |
|
"reward": 0.4238290339708328, |
|
"reward_std": 0.0759831890463829, |
|
"rewards/iou_timestamp_reward": 0.4238290339708328, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 81.72500228881836, |
|
"epoch": 0.006680026720106881, |
|
"grad_norm": 15.833313941955566, |
|
"kl": 0.011749267578125, |
|
"learning_rate": 9.93319080705505e-07, |
|
"loss": 0.0005, |
|
"reward": 0.3965422213077545, |
|
"reward_std": 0.07669836282730103, |
|
"rewards/iou_timestamp_reward": 0.3965422213077545, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 93.2750015258789, |
|
"epoch": 0.006947227788911155, |
|
"grad_norm": 45.69765090942383, |
|
"kl": 0.011199951171875, |
|
"learning_rate": 9.930518439337253e-07, |
|
"loss": 0.0004, |
|
"reward": 0.548987627029419, |
|
"reward_std": 0.1403028443455696, |
|
"rewards/iou_timestamp_reward": 0.548987627029419, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 81.04999923706055, |
|
"epoch": 0.007214428857715431, |
|
"grad_norm": 10.686071395874023, |
|
"kl": 0.0106201171875, |
|
"learning_rate": 9.927846071619455e-07, |
|
"loss": 0.0004, |
|
"reward": 0.3573782742023468, |
|
"reward_std": 0.11069009453058243, |
|
"rewards/iou_timestamp_reward": 0.3573782742023468, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 78.625, |
|
"epoch": 0.0074816299265197065, |
|
"grad_norm": 8.17104721069336, |
|
"kl": 0.012176513671875, |
|
"learning_rate": 9.925173703901655e-07, |
|
"loss": 0.0005, |
|
"reward": 0.34273259341716766, |
|
"reward_std": 0.08138729631900787, |
|
"rewards/iou_timestamp_reward": 0.34273259341716766, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 78.26250076293945, |
|
"epoch": 0.007748830995323981, |
|
"grad_norm": 31.360939025878906, |
|
"kl": 0.013702392578125, |
|
"learning_rate": 9.922501336183858e-07, |
|
"loss": 0.0005, |
|
"reward": 0.45369820296764374, |
|
"reward_std": 0.10742013901472092, |
|
"rewards/iou_timestamp_reward": 0.45369820296764374, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 75.22500228881836, |
|
"epoch": 0.008016032064128256, |
|
"grad_norm": 9.121045112609863, |
|
"kl": 0.01019287109375, |
|
"learning_rate": 9.91982896846606e-07, |
|
"loss": 0.0004, |
|
"reward": 0.3313401788473129, |
|
"reward_std": 0.13614049553871155, |
|
"rewards/iou_timestamp_reward": 0.3313401788473129, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 89.67500305175781, |
|
"epoch": 0.008283233132932532, |
|
"grad_norm": 19.901443481445312, |
|
"kl": 0.013580322265625, |
|
"learning_rate": 9.917156600748263e-07, |
|
"loss": 0.0005, |
|
"reward": 0.5152908265590668, |
|
"reward_std": 0.09363758563995361, |
|
"rewards/iou_timestamp_reward": 0.5152908265590668, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 87.0250015258789, |
|
"epoch": 0.008550434201736807, |
|
"grad_norm": 18.844900131225586, |
|
"kl": 0.01513671875, |
|
"learning_rate": 9.914484233030465e-07, |
|
"loss": 0.0006, |
|
"reward": 0.47241438925266266, |
|
"reward_std": 0.11852370575070381, |
|
"rewards/iou_timestamp_reward": 0.47241438925266266, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 77.43750381469727, |
|
"epoch": 0.008817635270541082, |
|
"grad_norm": 10.22252082824707, |
|
"kl": 0.01824951171875, |
|
"learning_rate": 9.911811865312668e-07, |
|
"loss": 0.0007, |
|
"reward": 0.44154876470565796, |
|
"reward_std": 0.08548092842102051, |
|
"rewards/iou_timestamp_reward": 0.44154876470565796, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 82.98750305175781, |
|
"epoch": 0.009084836339345358, |
|
"grad_norm": 17.666988372802734, |
|
"kl": 0.01806640625, |
|
"learning_rate": 9.909139497594868e-07, |
|
"loss": 0.0007, |
|
"reward": 0.3770228624343872, |
|
"reward_std": 0.09334432706236839, |
|
"rewards/iou_timestamp_reward": 0.3770228624343872, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 80.98750305175781, |
|
"epoch": 0.009352037408149633, |
|
"grad_norm": 13.110654830932617, |
|
"kl": 0.016082763671875, |
|
"learning_rate": 9.90646712987707e-07, |
|
"loss": 0.0006, |
|
"reward": 0.4078673869371414, |
|
"reward_std": 0.059798676520586014, |
|
"rewards/iou_timestamp_reward": 0.4078673869371414, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 92.25000381469727, |
|
"epoch": 0.009619238476953907, |
|
"grad_norm": 8.614245414733887, |
|
"kl": 0.01861572265625, |
|
"learning_rate": 9.903794762159273e-07, |
|
"loss": 0.0007, |
|
"reward": 0.35014285147190094, |
|
"reward_std": 0.0985935740172863, |
|
"rewards/iou_timestamp_reward": 0.35014285147190094, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 80.56250381469727, |
|
"epoch": 0.009886439545758184, |
|
"grad_norm": 10.115817070007324, |
|
"kl": 0.01837158203125, |
|
"learning_rate": 9.901122394441475e-07, |
|
"loss": 0.0007, |
|
"reward": 0.40472397208213806, |
|
"reward_std": 0.12752560898661613, |
|
"rewards/iou_timestamp_reward": 0.40472397208213806, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 81.17500305175781, |
|
"epoch": 0.010153640614562458, |
|
"grad_norm": 39.667320251464844, |
|
"kl": 0.015625, |
|
"learning_rate": 9.898450026723678e-07, |
|
"loss": 0.0006, |
|
"reward": 0.46072031557559967, |
|
"reward_std": 0.09041710384190083, |
|
"rewards/iou_timestamp_reward": 0.46072031557559967, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 81.61250305175781, |
|
"epoch": 0.010420841683366733, |
|
"grad_norm": 13.683932304382324, |
|
"kl": 0.016387939453125, |
|
"learning_rate": 9.895777659005878e-07, |
|
"loss": 0.0007, |
|
"reward": 0.543788880109787, |
|
"reward_std": 0.10228582844138145, |
|
"rewards/iou_timestamp_reward": 0.543788880109787, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 81.35000228881836, |
|
"epoch": 0.01068804275217101, |
|
"grad_norm": 12.15194034576416, |
|
"kl": 0.017791748046875, |
|
"learning_rate": 9.89310529128808e-07, |
|
"loss": 0.0007, |
|
"reward": 0.4217175394296646, |
|
"reward_std": 0.12605417147278786, |
|
"rewards/iou_timestamp_reward": 0.4217175394296646, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 81.98750305175781, |
|
"epoch": 0.010955243820975284, |
|
"grad_norm": 121.88482666015625, |
|
"kl": 0.01617431640625, |
|
"learning_rate": 9.890432923570283e-07, |
|
"loss": 0.0006, |
|
"reward": 0.36619891971349716, |
|
"reward_std": 0.10043040104210377, |
|
"rewards/iou_timestamp_reward": 0.36619891971349716, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 76.97500228881836, |
|
"epoch": 0.011222444889779559, |
|
"grad_norm": 14.41795825958252, |
|
"kl": 0.0224609375, |
|
"learning_rate": 9.887760555852485e-07, |
|
"loss": 0.0009, |
|
"reward": 0.3520086780190468, |
|
"reward_std": 0.09863714687526226, |
|
"rewards/iou_timestamp_reward": 0.3520086780190468, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 84.72500228881836, |
|
"epoch": 0.011489645958583834, |
|
"grad_norm": 14.718040466308594, |
|
"kl": 0.01531982421875, |
|
"learning_rate": 9.885088188134688e-07, |
|
"loss": 0.0006, |
|
"reward": 0.5319889932870865, |
|
"reward_std": 0.11806132644414902, |
|
"rewards/iou_timestamp_reward": 0.5319889932870865, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 83.61250305175781, |
|
"epoch": 0.01175684702738811, |
|
"grad_norm": 18.669189453125, |
|
"kl": 0.0208740234375, |
|
"learning_rate": 9.882415820416888e-07, |
|
"loss": 0.0008, |
|
"reward": 0.5544195771217346, |
|
"reward_std": 0.11862267926335335, |
|
"rewards/iou_timestamp_reward": 0.5544195771217346, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 82.10000228881836, |
|
"epoch": 0.012024048096192385, |
|
"grad_norm": 13.057522773742676, |
|
"kl": 0.01544189453125, |
|
"learning_rate": 9.87974345269909e-07, |
|
"loss": 0.0006, |
|
"reward": 0.4692879766225815, |
|
"reward_std": 0.08527671545743942, |
|
"rewards/iou_timestamp_reward": 0.4692879766225815, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 81.8125, |
|
"epoch": 0.01229124916499666, |
|
"grad_norm": 62.2025260925293, |
|
"kl": 0.021392822265625, |
|
"learning_rate": 9.877071084981293e-07, |
|
"loss": 0.0009, |
|
"reward": 0.4865482598543167, |
|
"reward_std": 0.11775477230548859, |
|
"rewards/iou_timestamp_reward": 0.4865482598543167, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 81.88750457763672, |
|
"epoch": 0.012558450233800936, |
|
"grad_norm": 11.27963638305664, |
|
"kl": 0.02313232421875, |
|
"learning_rate": 9.874398717263496e-07, |
|
"loss": 0.0009, |
|
"reward": 0.4721195548772812, |
|
"reward_std": 0.0720259640365839, |
|
"rewards/iou_timestamp_reward": 0.4721195548772812, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 81.82500457763672, |
|
"epoch": 0.01282565130260521, |
|
"grad_norm": 9.9296875, |
|
"kl": 0.016571044921875, |
|
"learning_rate": 9.871726349545698e-07, |
|
"loss": 0.0007, |
|
"reward": 0.48466672003269196, |
|
"reward_std": 0.15841123461723328, |
|
"rewards/iou_timestamp_reward": 0.48466672003269196, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 79.66250228881836, |
|
"epoch": 0.013092852371409485, |
|
"grad_norm": 10.52200698852539, |
|
"kl": 0.01617431640625, |
|
"learning_rate": 9.8690539818279e-07, |
|
"loss": 0.0006, |
|
"reward": 0.3634609803557396, |
|
"reward_std": 0.13740423694252968, |
|
"rewards/iou_timestamp_reward": 0.3634609803557396, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 82.2750015258789, |
|
"epoch": 0.013360053440213761, |
|
"grad_norm": 19.474987030029297, |
|
"kl": 0.0179443359375, |
|
"learning_rate": 9.8663816141101e-07, |
|
"loss": 0.0007, |
|
"reward": 0.5607456266880035, |
|
"reward_std": 0.11081211641430855, |
|
"rewards/iou_timestamp_reward": 0.5607456266880035, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 91.9625015258789, |
|
"epoch": 0.013627254509018036, |
|
"grad_norm": 8.144207000732422, |
|
"kl": 0.0185546875, |
|
"learning_rate": 9.863709246392303e-07, |
|
"loss": 0.0007, |
|
"reward": 0.5219191014766693, |
|
"reward_std": 0.12948721647262573, |
|
"rewards/iou_timestamp_reward": 0.5219191014766693, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 82.75, |
|
"epoch": 0.01389445557782231, |
|
"grad_norm": 20.344608306884766, |
|
"kl": 0.02337646484375, |
|
"learning_rate": 9.861036878674504e-07, |
|
"loss": 0.0009, |
|
"reward": 0.4568335562944412, |
|
"reward_std": 0.06874274648725986, |
|
"rewards/iou_timestamp_reward": 0.4568335562944412, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 86.7750015258789, |
|
"epoch": 0.014161656646626587, |
|
"grad_norm": 21.093334197998047, |
|
"kl": 0.01800537109375, |
|
"learning_rate": 9.858364510956706e-07, |
|
"loss": 0.0007, |
|
"reward": 0.5438729226589203, |
|
"reward_std": 0.14354269951581955, |
|
"rewards/iou_timestamp_reward": 0.5438729226589203, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 81.17500305175781, |
|
"epoch": 0.014428857715430862, |
|
"grad_norm": 13.528135299682617, |
|
"kl": 0.02130126953125, |
|
"learning_rate": 9.855692143238909e-07, |
|
"loss": 0.0009, |
|
"reward": 0.43492650985717773, |
|
"reward_std": 0.08151087537407875, |
|
"rewards/iou_timestamp_reward": 0.43492650985717773, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 80.125, |
|
"epoch": 0.014696058784235137, |
|
"grad_norm": 28.23933219909668, |
|
"kl": 0.0201416015625, |
|
"learning_rate": 9.85301977552111e-07, |
|
"loss": 0.0008, |
|
"reward": 0.5359328091144562, |
|
"reward_std": 0.11250880360603333, |
|
"rewards/iou_timestamp_reward": 0.5359328091144562, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 82.36249923706055, |
|
"epoch": 0.014963259853039413, |
|
"grad_norm": 28.83526611328125, |
|
"kl": 0.0228271484375, |
|
"learning_rate": 9.850347407803313e-07, |
|
"loss": 0.0009, |
|
"reward": 0.41243691742420197, |
|
"reward_std": 0.12655200064182281, |
|
"rewards/iou_timestamp_reward": 0.41243691742420197, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 82.375, |
|
"epoch": 0.015230460921843688, |
|
"grad_norm": 11.394326210021973, |
|
"kl": 0.023193359375, |
|
"learning_rate": 9.847675040085516e-07, |
|
"loss": 0.0009, |
|
"reward": 0.44033171236515045, |
|
"reward_std": 0.06593804247677326, |
|
"rewards/iou_timestamp_reward": 0.44033171236515045, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 77.88750076293945, |
|
"epoch": 0.015497661990647962, |
|
"grad_norm": 40.8941535949707, |
|
"kl": 0.025146484375, |
|
"learning_rate": 9.845002672367716e-07, |
|
"loss": 0.001, |
|
"reward": 0.3637496903538704, |
|
"reward_std": 0.040244522504508495, |
|
"rewards/iou_timestamp_reward": 0.3637496903538704, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 82.3375015258789, |
|
"epoch": 0.01576486305945224, |
|
"grad_norm": 14.911782264709473, |
|
"kl": 0.02606201171875, |
|
"learning_rate": 9.842330304649919e-07, |
|
"loss": 0.001, |
|
"reward": 0.5161260962486267, |
|
"reward_std": 0.07627514749765396, |
|
"rewards/iou_timestamp_reward": 0.5161260962486267, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 80.5374984741211, |
|
"epoch": 0.01603206412825651, |
|
"grad_norm": 17.467588424682617, |
|
"kl": 0.02093505859375, |
|
"learning_rate": 9.839657936932121e-07, |
|
"loss": 0.0008, |
|
"reward": 0.3705238401889801, |
|
"reward_std": 0.06593567878007889, |
|
"rewards/iou_timestamp_reward": 0.3705238401889801, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 77.6624984741211, |
|
"epoch": 0.016299265197060788, |
|
"grad_norm": 21.349925994873047, |
|
"kl": 0.0181884765625, |
|
"learning_rate": 9.836985569214324e-07, |
|
"loss": 0.0007, |
|
"reward": 0.662094235420227, |
|
"reward_std": 0.09039272367954254, |
|
"rewards/iou_timestamp_reward": 0.662094235420227, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 84.00000381469727, |
|
"epoch": 0.016566466265865065, |
|
"grad_norm": 7.912447929382324, |
|
"kl": 0.02349853515625, |
|
"learning_rate": 9.834313201496526e-07, |
|
"loss": 0.0009, |
|
"reward": 0.55202616751194, |
|
"reward_std": 0.07802803814411163, |
|
"rewards/iou_timestamp_reward": 0.55202616751194, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 79.85000228881836, |
|
"epoch": 0.016833667334669337, |
|
"grad_norm": 18.316444396972656, |
|
"kl": 0.03076171875, |
|
"learning_rate": 9.831640833778726e-07, |
|
"loss": 0.0012, |
|
"reward": 0.49671244621276855, |
|
"reward_std": 0.07321073487401009, |
|
"rewards/iou_timestamp_reward": 0.49671244621276855, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 75.6500015258789, |
|
"epoch": 0.017100868403473614, |
|
"grad_norm": 32.6612663269043, |
|
"kl": 0.0294189453125, |
|
"learning_rate": 9.828968466060929e-07, |
|
"loss": 0.0012, |
|
"reward": 0.4359857589006424, |
|
"reward_std": 0.1010651346296072, |
|
"rewards/iou_timestamp_reward": 0.4359857589006424, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 84.8125, |
|
"epoch": 0.01736806947227789, |
|
"grad_norm": 19.969472885131836, |
|
"kl": 0.0228271484375, |
|
"learning_rate": 9.826296098343131e-07, |
|
"loss": 0.0009, |
|
"reward": 0.36607126891613007, |
|
"reward_std": 0.08596941456198692, |
|
"rewards/iou_timestamp_reward": 0.36607126891613007, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 82.43750381469727, |
|
"epoch": 0.017635270541082163, |
|
"grad_norm": 10.52409553527832, |
|
"kl": 0.0196533203125, |
|
"learning_rate": 9.823623730625334e-07, |
|
"loss": 0.0008, |
|
"reward": 0.4937574118375778, |
|
"reward_std": 0.09046215564012527, |
|
"rewards/iou_timestamp_reward": 0.4937574118375778, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 82.68750381469727, |
|
"epoch": 0.01790247160988644, |
|
"grad_norm": 13.098502159118652, |
|
"kl": 0.02655029296875, |
|
"learning_rate": 9.820951362907536e-07, |
|
"loss": 0.0011, |
|
"reward": 0.5216490924358368, |
|
"reward_std": 0.08065400645136833, |
|
"rewards/iou_timestamp_reward": 0.5216490924358368, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 80.88750076293945, |
|
"epoch": 0.018169672678690716, |
|
"grad_norm": 7.5821452140808105, |
|
"kl": 0.02789306640625, |
|
"learning_rate": 9.818278995189736e-07, |
|
"loss": 0.0011, |
|
"reward": 0.5076297074556351, |
|
"reward_std": 0.08006440289318562, |
|
"rewards/iou_timestamp_reward": 0.5076297074556351, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 82.78750228881836, |
|
"epoch": 0.01843687374749499, |
|
"grad_norm": 20.07110595703125, |
|
"kl": 0.0191650390625, |
|
"learning_rate": 9.815606627471939e-07, |
|
"loss": 0.0008, |
|
"reward": 0.32277095317840576, |
|
"reward_std": 0.05622895993292332, |
|
"rewards/iou_timestamp_reward": 0.32277095317840576, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 83.57500457763672, |
|
"epoch": 0.018704074816299265, |
|
"grad_norm": 58.287322998046875, |
|
"kl": 0.02606201171875, |
|
"learning_rate": 9.812934259754141e-07, |
|
"loss": 0.001, |
|
"reward": 0.47261086106300354, |
|
"reward_std": 0.1284152865409851, |
|
"rewards/iou_timestamp_reward": 0.47261086106300354, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 76.6875, |
|
"epoch": 0.018971275885103542, |
|
"grad_norm": 15.780878067016602, |
|
"kl": 0.026611328125, |
|
"learning_rate": 9.810261892036344e-07, |
|
"loss": 0.0011, |
|
"reward": 0.4276278167963028, |
|
"reward_std": 0.10460241883993149, |
|
"rewards/iou_timestamp_reward": 0.4276278167963028, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 76.17500305175781, |
|
"epoch": 0.019238476953907815, |
|
"grad_norm": 354.0404968261719, |
|
"kl": 0.02935791015625, |
|
"learning_rate": 9.807589524318546e-07, |
|
"loss": 0.0012, |
|
"reward": 0.6388321816921234, |
|
"reward_std": 0.10210368782281876, |
|
"rewards/iou_timestamp_reward": 0.6388321816921234, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 74.32500076293945, |
|
"epoch": 0.01950567802271209, |
|
"grad_norm": 8.050577163696289, |
|
"kl": 0.02825927734375, |
|
"learning_rate": 9.804917156600749e-07, |
|
"loss": 0.0011, |
|
"reward": 0.3472304940223694, |
|
"reward_std": 0.16149252653121948, |
|
"rewards/iou_timestamp_reward": 0.3472304940223694, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 83.6624984741211, |
|
"epoch": 0.019772879091516368, |
|
"grad_norm": 16.733179092407227, |
|
"kl": 0.0260009765625, |
|
"learning_rate": 9.80224478888295e-07, |
|
"loss": 0.001, |
|
"reward": 0.6152356266975403, |
|
"reward_std": 0.09945449233055115, |
|
"rewards/iou_timestamp_reward": 0.6152356266975403, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 82.03750228881836, |
|
"epoch": 0.02004008016032064, |
|
"grad_norm": 11.940522193908691, |
|
"kl": 0.038330078125, |
|
"learning_rate": 9.799572421165151e-07, |
|
"loss": 0.0015, |
|
"reward": 0.6018393933773041, |
|
"reward_std": 0.10049806535243988, |
|
"rewards/iou_timestamp_reward": 0.6018393933773041, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 77.38750076293945, |
|
"epoch": 0.020307281229124917, |
|
"grad_norm": 16.911163330078125, |
|
"kl": 0.03070068359375, |
|
"learning_rate": 9.796900053447354e-07, |
|
"loss": 0.0012, |
|
"reward": 0.39946897327899933, |
|
"reward_std": 0.07368297874927521, |
|
"rewards/iou_timestamp_reward": 0.39946897327899933, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 91.1500015258789, |
|
"epoch": 0.020574482297929193, |
|
"grad_norm": 23.92142105102539, |
|
"kl": 0.0250244140625, |
|
"learning_rate": 9.794227685729556e-07, |
|
"loss": 0.001, |
|
"reward": 0.6169492304325104, |
|
"reward_std": 0.1484670452773571, |
|
"rewards/iou_timestamp_reward": 0.6169492304325104, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 84.6624984741211, |
|
"epoch": 0.020841683366733466, |
|
"grad_norm": 7.9083685874938965, |
|
"kl": 0.02764892578125, |
|
"learning_rate": 9.791555318011759e-07, |
|
"loss": 0.0011, |
|
"reward": 0.3675663024187088, |
|
"reward_std": 0.09683777764439583, |
|
"rewards/iou_timestamp_reward": 0.3675663024187088, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 81.45000076293945, |
|
"epoch": 0.021108884435537743, |
|
"grad_norm": 31.15521812438965, |
|
"kl": 0.0291748046875, |
|
"learning_rate": 9.78888295029396e-07, |
|
"loss": 0.0012, |
|
"reward": 0.3303828537464142, |
|
"reward_std": 0.08385182730853558, |
|
"rewards/iou_timestamp_reward": 0.3303828537464142, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 79.38750076293945, |
|
"epoch": 0.02137608550434202, |
|
"grad_norm": 45.90896987915039, |
|
"kl": 0.0291748046875, |
|
"learning_rate": 9.786210582576162e-07, |
|
"loss": 0.0012, |
|
"reward": 0.5361809432506561, |
|
"reward_std": 0.1461251862347126, |
|
"rewards/iou_timestamp_reward": 0.5361809432506561, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 82.07500076293945, |
|
"epoch": 0.021643286573146292, |
|
"grad_norm": 305.8792419433594, |
|
"kl": 0.02691650390625, |
|
"learning_rate": 9.783538214858364e-07, |
|
"loss": 0.0011, |
|
"reward": 0.3591422736644745, |
|
"reward_std": 0.06028897315263748, |
|
"rewards/iou_timestamp_reward": 0.3591422736644745, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 85.0875015258789, |
|
"epoch": 0.02191048764195057, |
|
"grad_norm": 37.03975296020508, |
|
"kl": 0.02716064453125, |
|
"learning_rate": 9.780865847140567e-07, |
|
"loss": 0.0011, |
|
"reward": 0.5269813537597656, |
|
"reward_std": 0.13692177832126617, |
|
"rewards/iou_timestamp_reward": 0.5269813537597656, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 85.3499984741211, |
|
"epoch": 0.02217768871075484, |
|
"grad_norm": 44.27296447753906, |
|
"kl": 0.0279541015625, |
|
"learning_rate": 9.77819347942277e-07, |
|
"loss": 0.0011, |
|
"reward": 0.5033409297466278, |
|
"reward_std": 0.14104880392551422, |
|
"rewards/iou_timestamp_reward": 0.5033409297466278, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 82.22500228881836, |
|
"epoch": 0.022444889779559118, |
|
"grad_norm": 14.83884048461914, |
|
"kl": 0.0341796875, |
|
"learning_rate": 9.77552111170497e-07, |
|
"loss": 0.0014, |
|
"reward": 0.3850397616624832, |
|
"reward_std": 0.07732919976115227, |
|
"rewards/iou_timestamp_reward": 0.3850397616624832, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 81.88750076293945, |
|
"epoch": 0.022712090848363394, |
|
"grad_norm": 10.014225959777832, |
|
"kl": 0.02880859375, |
|
"learning_rate": 9.772848743987172e-07, |
|
"loss": 0.0012, |
|
"reward": 0.5256166011095047, |
|
"reward_std": 0.0673129353672266, |
|
"rewards/iou_timestamp_reward": 0.5256166011095047, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 84.23749923706055, |
|
"epoch": 0.022979291917167667, |
|
"grad_norm": 14.976663589477539, |
|
"kl": 0.021484375, |
|
"learning_rate": 9.770176376269374e-07, |
|
"loss": 0.0009, |
|
"reward": 0.4509493559598923, |
|
"reward_std": 0.10696043819189072, |
|
"rewards/iou_timestamp_reward": 0.4509493559598923, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 78.01250457763672, |
|
"epoch": 0.023246492985971944, |
|
"grad_norm": 15.414386749267578, |
|
"kl": 0.02142333984375, |
|
"learning_rate": 9.767504008551577e-07, |
|
"loss": 0.0009, |
|
"reward": 0.58517786860466, |
|
"reward_std": 0.13582511618733406, |
|
"rewards/iou_timestamp_reward": 0.58517786860466, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 87.87500381469727, |
|
"epoch": 0.02351369405477622, |
|
"grad_norm": 7.675116062164307, |
|
"kl": 0.02691650390625, |
|
"learning_rate": 9.76483164083378e-07, |
|
"loss": 0.0011, |
|
"reward": 0.4487306475639343, |
|
"reward_std": 0.14682594686746597, |
|
"rewards/iou_timestamp_reward": 0.4487306475639343, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 81.05000305175781, |
|
"epoch": 0.023780895123580493, |
|
"grad_norm": 13.316726684570312, |
|
"kl": 0.02508544921875, |
|
"learning_rate": 9.762159273115982e-07, |
|
"loss": 0.001, |
|
"reward": 0.707331657409668, |
|
"reward_std": 0.11380118876695633, |
|
"rewards/iou_timestamp_reward": 0.707331657409668, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 89.36250305175781, |
|
"epoch": 0.02404809619238477, |
|
"grad_norm": 14.177164077758789, |
|
"kl": 0.02587890625, |
|
"learning_rate": 9.759486905398182e-07, |
|
"loss": 0.001, |
|
"reward": 0.5567902773618698, |
|
"reward_std": 0.11174913495779037, |
|
"rewards/iou_timestamp_reward": 0.5567902773618698, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 94.0875015258789, |
|
"epoch": 0.024315297261189046, |
|
"grad_norm": 12.30399227142334, |
|
"kl": 0.02972412109375, |
|
"learning_rate": 9.756814537680384e-07, |
|
"loss": 0.0012, |
|
"reward": 0.503425657749176, |
|
"reward_std": 0.10230003297328949, |
|
"rewards/iou_timestamp_reward": 0.503425657749176, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 78.0875015258789, |
|
"epoch": 0.02458249832999332, |
|
"grad_norm": 24.13161277770996, |
|
"kl": 0.0277099609375, |
|
"learning_rate": 9.754142169962587e-07, |
|
"loss": 0.0011, |
|
"reward": 0.24431391805410385, |
|
"reward_std": 0.07724586687982082, |
|
"rewards/iou_timestamp_reward": 0.24431391805410385, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 85.22500228881836, |
|
"epoch": 0.024849699398797595, |
|
"grad_norm": 10.364203453063965, |
|
"kl": 0.0269775390625, |
|
"learning_rate": 9.75146980224479e-07, |
|
"loss": 0.0011, |
|
"reward": 0.5370314419269562, |
|
"reward_std": 0.11360052973031998, |
|
"rewards/iou_timestamp_reward": 0.5370314419269562, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 81.45000457763672, |
|
"epoch": 0.02511690046760187, |
|
"grad_norm": 12.295506477355957, |
|
"kl": 0.0411376953125, |
|
"learning_rate": 9.748797434526992e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5579471290111542, |
|
"reward_std": 0.09515488892793655, |
|
"rewards/iou_timestamp_reward": 0.5579471290111542, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 80.38750076293945, |
|
"epoch": 0.025384101536406144, |
|
"grad_norm": 15.645418167114258, |
|
"kl": 0.03857421875, |
|
"learning_rate": 9.746125066809192e-07, |
|
"loss": 0.0015, |
|
"reward": 0.5186438858509064, |
|
"reward_std": 0.11734926328063011, |
|
"rewards/iou_timestamp_reward": 0.5186438858509064, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 81.93750381469727, |
|
"epoch": 0.02565130260521042, |
|
"grad_norm": 9.69706916809082, |
|
"kl": 0.032958984375, |
|
"learning_rate": 9.743452699091394e-07, |
|
"loss": 0.0013, |
|
"reward": 0.4298373907804489, |
|
"reward_std": 0.12638958543539047, |
|
"rewards/iou_timestamp_reward": 0.4298373907804489, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 86.4124984741211, |
|
"epoch": 0.025918503674014697, |
|
"grad_norm": 10.817190170288086, |
|
"kl": 0.03369140625, |
|
"learning_rate": 9.740780331373597e-07, |
|
"loss": 0.0013, |
|
"reward": 0.4257134795188904, |
|
"reward_std": 0.0996755063533783, |
|
"rewards/iou_timestamp_reward": 0.4257134795188904, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 87.60000228881836, |
|
"epoch": 0.02618570474281897, |
|
"grad_norm": 13.339404106140137, |
|
"kl": 0.03131103515625, |
|
"learning_rate": 9.7381079636558e-07, |
|
"loss": 0.0012, |
|
"reward": 0.466649129986763, |
|
"reward_std": 0.12218038365244865, |
|
"rewards/iou_timestamp_reward": 0.466649129986763, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 83.7125015258789, |
|
"epoch": 0.026452905811623247, |
|
"grad_norm": 6.138172626495361, |
|
"kl": 0.0277099609375, |
|
"learning_rate": 9.735435595938002e-07, |
|
"loss": 0.0011, |
|
"reward": 0.3628492206335068, |
|
"reward_std": 0.08555204793810844, |
|
"rewards/iou_timestamp_reward": 0.3628492206335068, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 83.5250015258789, |
|
"epoch": 0.026720106880427523, |
|
"grad_norm": 12.114334106445312, |
|
"kl": 0.02606201171875, |
|
"learning_rate": 9.732763228220202e-07, |
|
"loss": 0.001, |
|
"reward": 0.47904497385025024, |
|
"reward_std": 0.11454221606254578, |
|
"rewards/iou_timestamp_reward": 0.47904497385025024, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 89.67500305175781, |
|
"epoch": 0.026987307949231796, |
|
"grad_norm": 7.100372314453125, |
|
"kl": 0.025146484375, |
|
"learning_rate": 9.730090860502405e-07, |
|
"loss": 0.001, |
|
"reward": 0.5214036107063293, |
|
"reward_std": 0.09529952518641949, |
|
"rewards/iou_timestamp_reward": 0.5214036107063293, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 88.9375, |
|
"epoch": 0.027254509018036072, |
|
"grad_norm": 13.426268577575684, |
|
"kl": 0.0247802734375, |
|
"learning_rate": 9.727418492784607e-07, |
|
"loss": 0.001, |
|
"reward": 0.45033982396125793, |
|
"reward_std": 0.11235591396689415, |
|
"rewards/iou_timestamp_reward": 0.45033982396125793, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 93.35000228881836, |
|
"epoch": 0.02752171008684035, |
|
"grad_norm": 9.743562698364258, |
|
"kl": 0.029052734375, |
|
"learning_rate": 9.724746125066807e-07, |
|
"loss": 0.0012, |
|
"reward": 0.32814496755599976, |
|
"reward_std": 0.10226564481854439, |
|
"rewards/iou_timestamp_reward": 0.32814496755599976, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 97.7874984741211, |
|
"epoch": 0.02778891115564462, |
|
"grad_norm": 11.970060348510742, |
|
"kl": 0.0242919921875, |
|
"learning_rate": 9.72207375734901e-07, |
|
"loss": 0.001, |
|
"reward": 0.47338829934597015, |
|
"reward_std": 0.12420981377363205, |
|
"rewards/iou_timestamp_reward": 0.47338829934597015, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 89.0250015258789, |
|
"epoch": 0.028056112224448898, |
|
"grad_norm": 8.114420890808105, |
|
"kl": 0.02874755859375, |
|
"learning_rate": 9.719401389631212e-07, |
|
"loss": 0.0012, |
|
"reward": 0.38226931542158127, |
|
"reward_std": 0.1174611747264862, |
|
"rewards/iou_timestamp_reward": 0.38226931542158127, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 93.92499923706055, |
|
"epoch": 0.028323313293253174, |
|
"grad_norm": 17.39056968688965, |
|
"kl": 0.03118896484375, |
|
"learning_rate": 9.716729021913415e-07, |
|
"loss": 0.0012, |
|
"reward": 0.4228970557451248, |
|
"reward_std": 0.13135263323783875, |
|
"rewards/iou_timestamp_reward": 0.4228970557451248, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 102.91250228881836, |
|
"epoch": 0.028590514362057447, |
|
"grad_norm": 17.785310745239258, |
|
"kl": 0.0283203125, |
|
"learning_rate": 9.714056654195617e-07, |
|
"loss": 0.0011, |
|
"reward": 0.5794477760791779, |
|
"reward_std": 0.10743552818894386, |
|
"rewards/iou_timestamp_reward": 0.5794477760791779, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 99.43750381469727, |
|
"epoch": 0.028857715430861724, |
|
"grad_norm": 19.4038028717041, |
|
"kl": 0.0311279296875, |
|
"learning_rate": 9.711384286477817e-07, |
|
"loss": 0.0012, |
|
"reward": 0.45486655831336975, |
|
"reward_std": 0.10233093053102493, |
|
"rewards/iou_timestamp_reward": 0.45486655831336975, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 95.72500228881836, |
|
"epoch": 0.029124916499666, |
|
"grad_norm": 23.040130615234375, |
|
"kl": 0.030517578125, |
|
"learning_rate": 9.70871191876002e-07, |
|
"loss": 0.0012, |
|
"reward": 0.40837743878364563, |
|
"reward_std": 0.12567105516791344, |
|
"rewards/iou_timestamp_reward": 0.40837743878364563, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 96.85000228881836, |
|
"epoch": 0.029392117568470273, |
|
"grad_norm": 8.903193473815918, |
|
"kl": 0.030029296875, |
|
"learning_rate": 9.706039551042222e-07, |
|
"loss": 0.0012, |
|
"reward": 0.5052085667848587, |
|
"reward_std": 0.13449953123927116, |
|
"rewards/iou_timestamp_reward": 0.5052085667848587, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 91.8125, |
|
"epoch": 0.02965931863727455, |
|
"grad_norm": 20.629566192626953, |
|
"kl": 0.02423095703125, |
|
"learning_rate": 9.703367183324425e-07, |
|
"loss": 0.001, |
|
"reward": 0.323476105928421, |
|
"reward_std": 0.07313787564635277, |
|
"rewards/iou_timestamp_reward": 0.323476105928421, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 107.6500015258789, |
|
"epoch": 0.029926519706078826, |
|
"grad_norm": 10.181814193725586, |
|
"kl": 0.0274658203125, |
|
"learning_rate": 9.700694815606627e-07, |
|
"loss": 0.0011, |
|
"reward": 0.5477354377508163, |
|
"reward_std": 0.11607206612825394, |
|
"rewards/iou_timestamp_reward": 0.5477354377508163, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 94.04999923706055, |
|
"epoch": 0.0301937207748831, |
|
"grad_norm": 16.322965621948242, |
|
"kl": 0.0316162109375, |
|
"learning_rate": 9.69802244788883e-07, |
|
"loss": 0.0013, |
|
"reward": 0.45577606558799744, |
|
"reward_std": 0.10110274329781532, |
|
"rewards/iou_timestamp_reward": 0.45577606558799744, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 95.125, |
|
"epoch": 0.030460921843687375, |
|
"grad_norm": 20.937755584716797, |
|
"kl": 0.0269775390625, |
|
"learning_rate": 9.69535008017103e-07, |
|
"loss": 0.0011, |
|
"reward": 0.5386803448200226, |
|
"reward_std": 0.09999115392565727, |
|
"rewards/iou_timestamp_reward": 0.5386803448200226, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 92.45000076293945, |
|
"epoch": 0.03072812291249165, |
|
"grad_norm": 17.845489501953125, |
|
"kl": 0.02886962890625, |
|
"learning_rate": 9.692677712453233e-07, |
|
"loss": 0.0012, |
|
"reward": 0.44140908122062683, |
|
"reward_std": 0.12080599367618561, |
|
"rewards/iou_timestamp_reward": 0.44140908122062683, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 91.2125015258789, |
|
"epoch": 0.030995323981295925, |
|
"grad_norm": 11.29624080657959, |
|
"kl": 0.04010009765625, |
|
"learning_rate": 9.690005344735435e-07, |
|
"loss": 0.0016, |
|
"reward": 0.4579700082540512, |
|
"reward_std": 0.1115545816719532, |
|
"rewards/iou_timestamp_reward": 0.4579700082540512, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 91.4375, |
|
"epoch": 0.0312625250501002, |
|
"grad_norm": 9.191158294677734, |
|
"kl": 0.026611328125, |
|
"learning_rate": 9.687332977017637e-07, |
|
"loss": 0.0011, |
|
"reward": 0.5297419875860214, |
|
"reward_std": 0.08494473621249199, |
|
"rewards/iou_timestamp_reward": 0.5297419875860214, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 86.70000076293945, |
|
"epoch": 0.03152972611890448, |
|
"grad_norm": 155.39776611328125, |
|
"kl": 0.0263671875, |
|
"learning_rate": 9.68466060929984e-07, |
|
"loss": 0.0011, |
|
"reward": 0.3678671717643738, |
|
"reward_std": 0.11950035020709038, |
|
"rewards/iou_timestamp_reward": 0.3678671717643738, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 90.4625015258789, |
|
"epoch": 0.031796927187708754, |
|
"grad_norm": 18.508604049682617, |
|
"kl": 0.02783203125, |
|
"learning_rate": 9.68198824158204e-07, |
|
"loss": 0.0011, |
|
"reward": 0.4056996703147888, |
|
"reward_std": 0.19096750020980835, |
|
"rewards/iou_timestamp_reward": 0.4056996703147888, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 84.625, |
|
"epoch": 0.03206412825651302, |
|
"grad_norm": 11.088163375854492, |
|
"kl": 0.02911376953125, |
|
"learning_rate": 9.679315873864243e-07, |
|
"loss": 0.0012, |
|
"reward": 0.4425441175699234, |
|
"reward_std": 0.09214091300964355, |
|
"rewards/iou_timestamp_reward": 0.4425441175699234, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 89.11250305175781, |
|
"epoch": 0.0323313293253173, |
|
"grad_norm": 20.468366622924805, |
|
"kl": 0.0357666015625, |
|
"learning_rate": 9.676643506146445e-07, |
|
"loss": 0.0014, |
|
"reward": 0.4712321311235428, |
|
"reward_std": 0.10969820618629456, |
|
"rewards/iou_timestamp_reward": 0.4712321311235428, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 90.73750305175781, |
|
"epoch": 0.032598530394121576, |
|
"grad_norm": 20.332138061523438, |
|
"kl": 0.03021240234375, |
|
"learning_rate": 9.673971138428648e-07, |
|
"loss": 0.0012, |
|
"reward": 0.5420858561992645, |
|
"reward_std": 0.14077245816588402, |
|
"rewards/iou_timestamp_reward": 0.5420858561992645, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 80.5374984741211, |
|
"epoch": 0.03286573146292585, |
|
"grad_norm": 11.35522747039795, |
|
"kl": 0.03619384765625, |
|
"learning_rate": 9.67129877071085e-07, |
|
"loss": 0.0014, |
|
"reward": 0.4410727024078369, |
|
"reward_std": 0.13154280930757523, |
|
"rewards/iou_timestamp_reward": 0.4410727024078369, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 80.17499923706055, |
|
"epoch": 0.03313293253173013, |
|
"grad_norm": 15.309946060180664, |
|
"kl": 0.04296875, |
|
"learning_rate": 9.66862640299305e-07, |
|
"loss": 0.0017, |
|
"reward": 0.4710334688425064, |
|
"reward_std": 0.08776786364614964, |
|
"rewards/iou_timestamp_reward": 0.4710334688425064, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 81.87500381469727, |
|
"epoch": 0.033400133600534405, |
|
"grad_norm": 11.466004371643066, |
|
"kl": 0.02783203125, |
|
"learning_rate": 9.665954035275253e-07, |
|
"loss": 0.0011, |
|
"reward": 0.4691763147711754, |
|
"reward_std": 0.08689278736710548, |
|
"rewards/iou_timestamp_reward": 0.4691763147711754, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 82.82500076293945, |
|
"epoch": 0.033667334669338675, |
|
"grad_norm": 34.043914794921875, |
|
"kl": 0.0379638671875, |
|
"learning_rate": 9.663281667557455e-07, |
|
"loss": 0.0015, |
|
"reward": 0.44279107451438904, |
|
"reward_std": 0.12102306261658669, |
|
"rewards/iou_timestamp_reward": 0.44279107451438904, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 86.63750076293945, |
|
"epoch": 0.03393453573814295, |
|
"grad_norm": 193.87367248535156, |
|
"kl": 0.0396728515625, |
|
"learning_rate": 9.660609299839658e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5007461756467819, |
|
"reward_std": 0.10710695758461952, |
|
"rewards/iou_timestamp_reward": 0.5007461756467819, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 88.38750076293945, |
|
"epoch": 0.03420173680694723, |
|
"grad_norm": 12.08212947845459, |
|
"kl": 0.0350341796875, |
|
"learning_rate": 9.65793693212186e-07, |
|
"loss": 0.0014, |
|
"reward": 0.4938567131757736, |
|
"reward_std": 0.12255796045064926, |
|
"rewards/iou_timestamp_reward": 0.4938567131757736, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 82.0625, |
|
"epoch": 0.034468937875751504, |
|
"grad_norm": 12.13739013671875, |
|
"kl": 0.0396728515625, |
|
"learning_rate": 9.655264564404063e-07, |
|
"loss": 0.0016, |
|
"reward": 0.4408050402998924, |
|
"reward_std": 0.07024678587913513, |
|
"rewards/iou_timestamp_reward": 0.4408050402998924, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 89.25000381469727, |
|
"epoch": 0.03473613894455578, |
|
"grad_norm": 9.668633460998535, |
|
"kl": 0.0413818359375, |
|
"learning_rate": 9.652592196686263e-07, |
|
"loss": 0.0017, |
|
"reward": 0.5121255666017532, |
|
"reward_std": 0.08106988109648228, |
|
"rewards/iou_timestamp_reward": 0.5121255666017532, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 83.66250228881836, |
|
"epoch": 0.03500334001336005, |
|
"grad_norm": 13.181618690490723, |
|
"kl": 0.03662109375, |
|
"learning_rate": 9.649919828968465e-07, |
|
"loss": 0.0015, |
|
"reward": 0.5275007039308548, |
|
"reward_std": 0.07400373369455338, |
|
"rewards/iou_timestamp_reward": 0.5275007039308548, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 87.2249984741211, |
|
"epoch": 0.035270541082164326, |
|
"grad_norm": 12.78144645690918, |
|
"kl": 0.04296875, |
|
"learning_rate": 9.647247461250668e-07, |
|
"loss": 0.0017, |
|
"reward": 0.5786421597003937, |
|
"reward_std": 0.12599972262978554, |
|
"rewards/iou_timestamp_reward": 0.5786421597003937, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 89.05000305175781, |
|
"epoch": 0.0355377421509686, |
|
"grad_norm": 14.639240264892578, |
|
"kl": 0.0394287109375, |
|
"learning_rate": 9.64457509353287e-07, |
|
"loss": 0.0016, |
|
"reward": 0.391816645860672, |
|
"reward_std": 0.11170131340622902, |
|
"rewards/iou_timestamp_reward": 0.391816645860672, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 92.57500076293945, |
|
"epoch": 0.03580494321977288, |
|
"grad_norm": 61.03050231933594, |
|
"kl": 0.0511474609375, |
|
"learning_rate": 9.641902725815073e-07, |
|
"loss": 0.002, |
|
"reward": 0.49499718844890594, |
|
"reward_std": 0.11343395337462425, |
|
"rewards/iou_timestamp_reward": 0.49499718844890594, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 84.36250305175781, |
|
"epoch": 0.036072144288577156, |
|
"grad_norm": 16.839923858642578, |
|
"kl": 0.0423583984375, |
|
"learning_rate": 9.639230358097273e-07, |
|
"loss": 0.0017, |
|
"reward": 0.3318554311990738, |
|
"reward_std": 0.05935862101614475, |
|
"rewards/iou_timestamp_reward": 0.3318554311990738, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 85.5875015258789, |
|
"epoch": 0.03633934535738143, |
|
"grad_norm": 14.992598533630371, |
|
"kl": 0.037353515625, |
|
"learning_rate": 9.636557990379475e-07, |
|
"loss": 0.0015, |
|
"reward": 0.4685795456171036, |
|
"reward_std": 0.06687245890498161, |
|
"rewards/iou_timestamp_reward": 0.4685795456171036, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 88.4000015258789, |
|
"epoch": 0.0366065464261857, |
|
"grad_norm": 17.69700813293457, |
|
"kl": 0.038330078125, |
|
"learning_rate": 9.633885622661678e-07, |
|
"loss": 0.0015, |
|
"reward": 0.6672376096248627, |
|
"reward_std": 0.05268973857164383, |
|
"rewards/iou_timestamp_reward": 0.6672376096248627, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 87.1500015258789, |
|
"epoch": 0.03687374749498998, |
|
"grad_norm": 12.710575103759766, |
|
"kl": 0.0401611328125, |
|
"learning_rate": 9.63121325494388e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5832545459270477, |
|
"reward_std": 0.10356196388602257, |
|
"rewards/iou_timestamp_reward": 0.5832545459270477, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 89.20000076293945, |
|
"epoch": 0.037140948563794254, |
|
"grad_norm": 16.954652786254883, |
|
"kl": 0.038818359375, |
|
"learning_rate": 9.628540887226083e-07, |
|
"loss": 0.0016, |
|
"reward": 0.3957640081644058, |
|
"reward_std": 0.09107673540711403, |
|
"rewards/iou_timestamp_reward": 0.3957640081644058, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 89.10000228881836, |
|
"epoch": 0.03740814963259853, |
|
"grad_norm": 10.147904396057129, |
|
"kl": 0.041748046875, |
|
"learning_rate": 9.625868519508283e-07, |
|
"loss": 0.0017, |
|
"reward": 0.5282624512910843, |
|
"reward_std": 0.10442003235220909, |
|
"rewards/iou_timestamp_reward": 0.5282624512910843, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 84.7249984741211, |
|
"epoch": 0.03767535070140281, |
|
"grad_norm": 14.729109764099121, |
|
"kl": 0.0543212890625, |
|
"learning_rate": 9.623196151790486e-07, |
|
"loss": 0.0022, |
|
"reward": 0.4562620669603348, |
|
"reward_std": 0.09092384111136198, |
|
"rewards/iou_timestamp_reward": 0.4562620669603348, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 89.50000381469727, |
|
"epoch": 0.037942551770207084, |
|
"grad_norm": 17.36848258972168, |
|
"kl": 0.038330078125, |
|
"learning_rate": 9.620523784072688e-07, |
|
"loss": 0.0015, |
|
"reward": 0.5253326147794724, |
|
"reward_std": 0.11369239538908005, |
|
"rewards/iou_timestamp_reward": 0.5253326147794724, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 82.37500381469727, |
|
"epoch": 0.03820975283901135, |
|
"grad_norm": 10.699419975280762, |
|
"kl": 0.0457763671875, |
|
"learning_rate": 9.61785141635489e-07, |
|
"loss": 0.0018, |
|
"reward": 0.29642877727746964, |
|
"reward_std": 0.052388858050107956, |
|
"rewards/iou_timestamp_reward": 0.29642877727746964, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 91.28750228881836, |
|
"epoch": 0.03847695390781563, |
|
"grad_norm": 57.955970764160156, |
|
"kl": 0.03057861328125, |
|
"learning_rate": 9.615179048637093e-07, |
|
"loss": 0.0012, |
|
"reward": 0.49990953505039215, |
|
"reward_std": 0.09969761222600937, |
|
"rewards/iou_timestamp_reward": 0.49990953505039215, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 88.41250228881836, |
|
"epoch": 0.038744154976619906, |
|
"grad_norm": 23.626953125, |
|
"kl": 0.0404052734375, |
|
"learning_rate": 9.612506680919295e-07, |
|
"loss": 0.0016, |
|
"reward": 0.41132599115371704, |
|
"reward_std": 0.07006280310451984, |
|
"rewards/iou_timestamp_reward": 0.41132599115371704, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 90.97500228881836, |
|
"epoch": 0.03901135604542418, |
|
"grad_norm": 19.23369598388672, |
|
"kl": 0.0394287109375, |
|
"learning_rate": 9.609834313201496e-07, |
|
"loss": 0.0016, |
|
"reward": 0.4373021870851517, |
|
"reward_std": 0.12405096367001534, |
|
"rewards/iou_timestamp_reward": 0.4373021870851517, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 84.13750076293945, |
|
"epoch": 0.03927855711422846, |
|
"grad_norm": 9.589532852172852, |
|
"kl": 0.039794921875, |
|
"learning_rate": 9.607161945483698e-07, |
|
"loss": 0.0016, |
|
"reward": 0.409616619348526, |
|
"reward_std": 0.11418266966938972, |
|
"rewards/iou_timestamp_reward": 0.409616619348526, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 78.5875015258789, |
|
"epoch": 0.039545758183032735, |
|
"grad_norm": 14.857934951782227, |
|
"kl": 0.0352783203125, |
|
"learning_rate": 9.6044895777659e-07, |
|
"loss": 0.0014, |
|
"reward": 0.5202317535877228, |
|
"reward_std": 0.11668656021356583, |
|
"rewards/iou_timestamp_reward": 0.5202317535877228, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 82.8125, |
|
"epoch": 0.039812959251837005, |
|
"grad_norm": 10.997519493103027, |
|
"kl": 0.03094482421875, |
|
"learning_rate": 9.601817210048103e-07, |
|
"loss": 0.0012, |
|
"reward": 0.42350925505161285, |
|
"reward_std": 0.12441174313426018, |
|
"rewards/iou_timestamp_reward": 0.42350925505161285, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 79.18750381469727, |
|
"epoch": 0.04008016032064128, |
|
"grad_norm": 11.452859878540039, |
|
"kl": 0.0384521484375, |
|
"learning_rate": 9.599144842330306e-07, |
|
"loss": 0.0015, |
|
"reward": 0.42812803387641907, |
|
"reward_std": 0.07005837932229042, |
|
"rewards/iou_timestamp_reward": 0.42812803387641907, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 85.4375, |
|
"epoch": 0.04034736138944556, |
|
"grad_norm": 21.108501434326172, |
|
"kl": 0.0325927734375, |
|
"learning_rate": 9.596472474612506e-07, |
|
"loss": 0.0013, |
|
"reward": 0.3734997771680355, |
|
"reward_std": 0.10468435287475586, |
|
"rewards/iou_timestamp_reward": 0.3734997771680355, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 83.7750015258789, |
|
"epoch": 0.040614562458249834, |
|
"grad_norm": 17.950271606445312, |
|
"kl": 0.045654296875, |
|
"learning_rate": 9.593800106894708e-07, |
|
"loss": 0.0018, |
|
"reward": 0.40336285531520844, |
|
"reward_std": 0.08605783060193062, |
|
"rewards/iou_timestamp_reward": 0.40336285531520844, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 80.5875015258789, |
|
"epoch": 0.04088176352705411, |
|
"grad_norm": 15.713683128356934, |
|
"kl": 0.03759765625, |
|
"learning_rate": 9.59112773917691e-07, |
|
"loss": 0.0015, |
|
"reward": 0.4455360919237137, |
|
"reward_std": 0.10942557081580162, |
|
"rewards/iou_timestamp_reward": 0.4455360919237137, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 93.82500076293945, |
|
"epoch": 0.04114896459585839, |
|
"grad_norm": 130.20535278320312, |
|
"kl": 0.048583984375, |
|
"learning_rate": 9.588455371459111e-07, |
|
"loss": 0.0019, |
|
"reward": 0.47216907143592834, |
|
"reward_std": 0.12527307868003845, |
|
"rewards/iou_timestamp_reward": 0.47216907143592834, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 77.4375, |
|
"epoch": 0.041416165664662656, |
|
"grad_norm": 33.787925720214844, |
|
"kl": 0.044189453125, |
|
"learning_rate": 9.585783003741314e-07, |
|
"loss": 0.0018, |
|
"reward": 0.5094977915287018, |
|
"reward_std": 0.11581655591726303, |
|
"rewards/iou_timestamp_reward": 0.5094977915287018, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 81.1500015258789, |
|
"epoch": 0.04168336673346693, |
|
"grad_norm": 14.96328353881836, |
|
"kl": 0.0411376953125, |
|
"learning_rate": 9.583110636023516e-07, |
|
"loss": 0.0016, |
|
"reward": 0.39587974548339844, |
|
"reward_std": 0.11371102556586266, |
|
"rewards/iou_timestamp_reward": 0.39587974548339844, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 90.51250076293945, |
|
"epoch": 0.04195056780227121, |
|
"grad_norm": 12.745133399963379, |
|
"kl": 0.03662109375, |
|
"learning_rate": 9.580438268305718e-07, |
|
"loss": 0.0015, |
|
"reward": 0.4054044932126999, |
|
"reward_std": 0.06693214736878872, |
|
"rewards/iou_timestamp_reward": 0.4054044932126999, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 85.22500228881836, |
|
"epoch": 0.042217768871075485, |
|
"grad_norm": 15.326485633850098, |
|
"kl": 0.04296875, |
|
"learning_rate": 9.57776590058792e-07, |
|
"loss": 0.0017, |
|
"reward": 0.517610028386116, |
|
"reward_std": 0.08778567984700203, |
|
"rewards/iou_timestamp_reward": 0.517610028386116, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 85.10000228881836, |
|
"epoch": 0.04248496993987976, |
|
"grad_norm": 15.57522964477539, |
|
"kl": 0.0545654296875, |
|
"learning_rate": 9.575093532870121e-07, |
|
"loss": 0.0022, |
|
"reward": 0.47441329061985016, |
|
"reward_std": 0.11022436246275902, |
|
"rewards/iou_timestamp_reward": 0.47441329061985016, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 80.95000076293945, |
|
"epoch": 0.04275217100868404, |
|
"grad_norm": 14.413505554199219, |
|
"kl": 0.0439453125, |
|
"learning_rate": 9.572421165152324e-07, |
|
"loss": 0.0018, |
|
"reward": 0.38343895971775055, |
|
"reward_std": 0.1145082600414753, |
|
"rewards/iou_timestamp_reward": 0.38343895971775055, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 79.8375015258789, |
|
"epoch": 0.04301937207748831, |
|
"grad_norm": 12.668608665466309, |
|
"kl": 0.037109375, |
|
"learning_rate": 9.569748797434526e-07, |
|
"loss": 0.0015, |
|
"reward": 0.5705928802490234, |
|
"reward_std": 0.10167951509356499, |
|
"rewards/iou_timestamp_reward": 0.5705928802490234, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 77.04999923706055, |
|
"epoch": 0.043286573146292584, |
|
"grad_norm": 17.551441192626953, |
|
"kl": 0.0455322265625, |
|
"learning_rate": 9.567076429716729e-07, |
|
"loss": 0.0018, |
|
"reward": 0.5757200121879578, |
|
"reward_std": 0.12645361945033073, |
|
"rewards/iou_timestamp_reward": 0.5757200121879578, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 76.67500305175781, |
|
"epoch": 0.04355377421509686, |
|
"grad_norm": 12.472587585449219, |
|
"kl": 0.043701171875, |
|
"learning_rate": 9.56440406199893e-07, |
|
"loss": 0.0017, |
|
"reward": 0.3930691331624985, |
|
"reward_std": 0.047056157141923904, |
|
"rewards/iou_timestamp_reward": 0.3930691331624985, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 82.97500228881836, |
|
"epoch": 0.04382097528390114, |
|
"grad_norm": 41.681278228759766, |
|
"kl": 0.050048828125, |
|
"learning_rate": 9.561731694281131e-07, |
|
"loss": 0.002, |
|
"reward": 0.37715357542037964, |
|
"reward_std": 0.14025865495204926, |
|
"rewards/iou_timestamp_reward": 0.37715357542037964, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 81.3499984741211, |
|
"epoch": 0.04408817635270541, |
|
"grad_norm": 13.00397777557373, |
|
"kl": 0.0457763671875, |
|
"learning_rate": 9.559059326563334e-07, |
|
"loss": 0.0018, |
|
"reward": 0.4659877270460129, |
|
"reward_std": 0.07307687401771545, |
|
"rewards/iou_timestamp_reward": 0.4659877270460129, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 79.07500076293945, |
|
"epoch": 0.04435537742150968, |
|
"grad_norm": 42.21580505371094, |
|
"kl": 0.039794921875, |
|
"learning_rate": 9.556386958845536e-07, |
|
"loss": 0.0016, |
|
"reward": 0.45951294898986816, |
|
"reward_std": 0.11066334694623947, |
|
"rewards/iou_timestamp_reward": 0.45951294898986816, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 82.25, |
|
"epoch": 0.04462257849031396, |
|
"grad_norm": 16.60811996459961, |
|
"kl": 0.0499267578125, |
|
"learning_rate": 9.553714591127739e-07, |
|
"loss": 0.002, |
|
"reward": 0.5194784104824066, |
|
"reward_std": 0.09066024795174599, |
|
"rewards/iou_timestamp_reward": 0.5194784104824066, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 79.26250076293945, |
|
"epoch": 0.044889779559118236, |
|
"grad_norm": 43.955142974853516, |
|
"kl": 0.0406494140625, |
|
"learning_rate": 9.551042223409941e-07, |
|
"loss": 0.0016, |
|
"reward": 0.40560464560985565, |
|
"reward_std": 0.07998048886656761, |
|
"rewards/iou_timestamp_reward": 0.40560464560985565, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 78.88750076293945, |
|
"epoch": 0.04515698062792251, |
|
"grad_norm": 10.694327354431152, |
|
"kl": 0.0501708984375, |
|
"learning_rate": 9.548369855692144e-07, |
|
"loss": 0.002, |
|
"reward": 0.5325513929128647, |
|
"reward_std": 0.05170968174934387, |
|
"rewards/iou_timestamp_reward": 0.5325513929128647, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 75.86250305175781, |
|
"epoch": 0.04542418169672679, |
|
"grad_norm": 7.294441223144531, |
|
"kl": 0.03662109375, |
|
"learning_rate": 9.545697487974344e-07, |
|
"loss": 0.0015, |
|
"reward": 0.46570850908756256, |
|
"reward_std": 0.07274461165070534, |
|
"rewards/iou_timestamp_reward": 0.46570850908756256, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 75.23750305175781, |
|
"epoch": 0.045691382765531065, |
|
"grad_norm": 13.37921142578125, |
|
"kl": 0.0406494140625, |
|
"learning_rate": 9.543025120256546e-07, |
|
"loss": 0.0016, |
|
"reward": 0.4209117740392685, |
|
"reward_std": 0.10308519750833511, |
|
"rewards/iou_timestamp_reward": 0.4209117740392685, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 75.0250015258789, |
|
"epoch": 0.045958583834335334, |
|
"grad_norm": 19.120006561279297, |
|
"kl": 0.0513916015625, |
|
"learning_rate": 9.540352752538749e-07, |
|
"loss": 0.0021, |
|
"reward": 0.5029748231172562, |
|
"reward_std": 0.06725198961794376, |
|
"rewards/iou_timestamp_reward": 0.5029748231172562, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 76.5625, |
|
"epoch": 0.04622578490313961, |
|
"grad_norm": 11.126622200012207, |
|
"kl": 0.0367431640625, |
|
"learning_rate": 9.537680384820951e-07, |
|
"loss": 0.0015, |
|
"reward": 0.5426850020885468, |
|
"reward_std": 0.07354238256812096, |
|
"rewards/iou_timestamp_reward": 0.5426850020885468, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 81.98749923706055, |
|
"epoch": 0.04649298597194389, |
|
"grad_norm": 15.441088676452637, |
|
"kl": 0.04541015625, |
|
"learning_rate": 9.535008017103154e-07, |
|
"loss": 0.0018, |
|
"reward": 0.366536945104599, |
|
"reward_std": 0.10370578989386559, |
|
"rewards/iou_timestamp_reward": 0.366536945104599, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 76.9124984741211, |
|
"epoch": 0.04676018704074816, |
|
"grad_norm": 18.660261154174805, |
|
"kl": 0.0594482421875, |
|
"learning_rate": 9.532335649385355e-07, |
|
"loss": 0.0024, |
|
"reward": 0.534787967801094, |
|
"reward_std": 0.10525619983673096, |
|
"rewards/iou_timestamp_reward": 0.534787967801094, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 77.11249923706055, |
|
"epoch": 0.04702738810955244, |
|
"grad_norm": 9.616857528686523, |
|
"kl": 0.0528564453125, |
|
"learning_rate": 9.529663281667558e-07, |
|
"loss": 0.0021, |
|
"reward": 0.46122419834136963, |
|
"reward_std": 0.11353456228971481, |
|
"rewards/iou_timestamp_reward": 0.46122419834136963, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 76.57500457763672, |
|
"epoch": 0.047294589178356716, |
|
"grad_norm": 11.133618354797363, |
|
"kl": 0.0484619140625, |
|
"learning_rate": 9.52699091394976e-07, |
|
"loss": 0.0019, |
|
"reward": 0.4199577569961548, |
|
"reward_std": 0.09651762992143631, |
|
"rewards/iou_timestamp_reward": 0.4199577569961548, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 79.05000305175781, |
|
"epoch": 0.047561790247160986, |
|
"grad_norm": 12.524818420410156, |
|
"kl": 0.049560546875, |
|
"learning_rate": 9.52431854623196e-07, |
|
"loss": 0.002, |
|
"reward": 0.5286326110363007, |
|
"reward_std": 0.12330913916230202, |
|
"rewards/iou_timestamp_reward": 0.5286326110363007, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 83.50000381469727, |
|
"epoch": 0.04782899131596526, |
|
"grad_norm": 12.21944808959961, |
|
"kl": 0.0408935546875, |
|
"learning_rate": 9.521646178514163e-07, |
|
"loss": 0.0016, |
|
"reward": 0.47760356962680817, |
|
"reward_std": 0.08412851020693779, |
|
"rewards/iou_timestamp_reward": 0.47760356962680817, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 74.3375015258789, |
|
"epoch": 0.04809619238476954, |
|
"grad_norm": 15.917021751403809, |
|
"kl": 0.0504150390625, |
|
"learning_rate": 9.518973810796364e-07, |
|
"loss": 0.002, |
|
"reward": 0.4619504362344742, |
|
"reward_std": 0.06471715308725834, |
|
"rewards/iou_timestamp_reward": 0.4619504362344742, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 85.70000076293945, |
|
"epoch": 0.048363393453573815, |
|
"grad_norm": 17.021228790283203, |
|
"kl": 0.0552978515625, |
|
"learning_rate": 9.516301443078567e-07, |
|
"loss": 0.0022, |
|
"reward": 0.3753501623868942, |
|
"reward_std": 0.08277348056435585, |
|
"rewards/iou_timestamp_reward": 0.3753501623868942, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 86.86249923706055, |
|
"epoch": 0.04863059452237809, |
|
"grad_norm": 49.27531051635742, |
|
"kl": 0.073974609375, |
|
"learning_rate": 9.513629075360769e-07, |
|
"loss": 0.003, |
|
"reward": 0.644359678030014, |
|
"reward_std": 0.10561589896678925, |
|
"rewards/iou_timestamp_reward": 0.644359678030014, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 79.38750076293945, |
|
"epoch": 0.04889779559118237, |
|
"grad_norm": 94.31536102294922, |
|
"kl": 0.0479736328125, |
|
"learning_rate": 9.51095670764297e-07, |
|
"loss": 0.0019, |
|
"reward": 0.6270514130592346, |
|
"reward_std": 0.09841878339648247, |
|
"rewards/iou_timestamp_reward": 0.6270514130592346, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 76.35000228881836, |
|
"epoch": 0.04916499665998664, |
|
"grad_norm": 19.030534744262695, |
|
"kl": 0.059814453125, |
|
"learning_rate": 9.508284339925173e-07, |
|
"loss": 0.0024, |
|
"reward": 0.5081672370433807, |
|
"reward_std": 0.10591346770524979, |
|
"rewards/iou_timestamp_reward": 0.5081672370433807, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 76.4124984741211, |
|
"epoch": 0.049432197728790914, |
|
"grad_norm": 18.321247100830078, |
|
"kl": 0.046142578125, |
|
"learning_rate": 9.505611972207375e-07, |
|
"loss": 0.0018, |
|
"reward": 0.46649011969566345, |
|
"reward_std": 0.1195777915418148, |
|
"rewards/iou_timestamp_reward": 0.46649011969566345, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 84.7125015258789, |
|
"epoch": 0.04969939879759519, |
|
"grad_norm": 13.643217086791992, |
|
"kl": 0.047607421875, |
|
"learning_rate": 9.502939604489577e-07, |
|
"loss": 0.0019, |
|
"reward": 0.5010816007852554, |
|
"reward_std": 0.07837207242846489, |
|
"rewards/iou_timestamp_reward": 0.5010816007852554, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 94.1500015258789, |
|
"epoch": 0.049966599866399466, |
|
"grad_norm": 18.298660278320312, |
|
"kl": 0.049560546875, |
|
"learning_rate": 9.500267236771779e-07, |
|
"loss": 0.002, |
|
"reward": 0.44392018020153046, |
|
"reward_std": 0.07208472117781639, |
|
"rewards/iou_timestamp_reward": 0.44392018020153046, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 83.9000015258789, |
|
"epoch": 0.05023380093520374, |
|
"grad_norm": 43.245094299316406, |
|
"kl": 0.064208984375, |
|
"learning_rate": 9.497594869053981e-07, |
|
"loss": 0.0026, |
|
"reward": 0.4399380534887314, |
|
"reward_std": 0.07741782441735268, |
|
"rewards/iou_timestamp_reward": 0.4399380534887314, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 90.23750305175781, |
|
"epoch": 0.05050100200400802, |
|
"grad_norm": 7.059505462646484, |
|
"kl": 0.0338134765625, |
|
"learning_rate": 9.494922501336183e-07, |
|
"loss": 0.0014, |
|
"reward": 0.4319871515035629, |
|
"reward_std": 0.030242225155234337, |
|
"rewards/iou_timestamp_reward": 0.4319871515035629, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 83.70000076293945, |
|
"epoch": 0.05076820307281229, |
|
"grad_norm": 17.03489875793457, |
|
"kl": 0.0411376953125, |
|
"learning_rate": 9.492250133618386e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5623960196971893, |
|
"reward_std": 0.10269278660416603, |
|
"rewards/iou_timestamp_reward": 0.5623960196971893, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 90.26250076293945, |
|
"epoch": 0.051035404141616565, |
|
"grad_norm": 8.389057159423828, |
|
"kl": 0.044677734375, |
|
"learning_rate": 9.489577765900587e-07, |
|
"loss": 0.0018, |
|
"reward": 0.462777778506279, |
|
"reward_std": 0.08947985991835594, |
|
"rewards/iou_timestamp_reward": 0.462777778506279, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 88.85000228881836, |
|
"epoch": 0.05130260521042084, |
|
"grad_norm": 8.912788391113281, |
|
"kl": 0.05419921875, |
|
"learning_rate": 9.486905398182789e-07, |
|
"loss": 0.0022, |
|
"reward": 0.3437252789735794, |
|
"reward_std": 0.1144870612770319, |
|
"rewards/iou_timestamp_reward": 0.3437252789735794, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 80.23750305175781, |
|
"epoch": 0.05156980627922512, |
|
"grad_norm": 16.888032913208008, |
|
"kl": 0.0465087890625, |
|
"learning_rate": 9.484233030464992e-07, |
|
"loss": 0.0019, |
|
"reward": 0.32809334993362427, |
|
"reward_std": 0.08787024766206741, |
|
"rewards/iou_timestamp_reward": 0.32809334993362427, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 82.57500076293945, |
|
"epoch": 0.051837007348029394, |
|
"grad_norm": 122.45707702636719, |
|
"kl": 0.05322265625, |
|
"learning_rate": 9.481560662747193e-07, |
|
"loss": 0.0021, |
|
"reward": 0.39617256820201874, |
|
"reward_std": 0.10278888791799545, |
|
"rewards/iou_timestamp_reward": 0.39617256820201874, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 76.86250305175781, |
|
"epoch": 0.052104208416833664, |
|
"grad_norm": 21.19937515258789, |
|
"kl": 0.0482177734375, |
|
"learning_rate": 9.478888295029396e-07, |
|
"loss": 0.0019, |
|
"reward": 0.4638468474149704, |
|
"reward_std": 0.1405925117433071, |
|
"rewards/iou_timestamp_reward": 0.4638468474149704, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 83.17499923706055, |
|
"epoch": 0.05237140948563794, |
|
"grad_norm": 13.251508712768555, |
|
"kl": 0.0523681640625, |
|
"learning_rate": 9.476215927311597e-07, |
|
"loss": 0.0021, |
|
"reward": 0.485527366399765, |
|
"reward_std": 0.09180990979075432, |
|
"rewards/iou_timestamp_reward": 0.485527366399765, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 85.42499923706055, |
|
"epoch": 0.05263861055444222, |
|
"grad_norm": 9.114768028259277, |
|
"kl": 0.042724609375, |
|
"learning_rate": 9.4735435595938e-07, |
|
"loss": 0.0017, |
|
"reward": 0.4791525900363922, |
|
"reward_std": 0.09207051619887352, |
|
"rewards/iou_timestamp_reward": 0.4791525900363922, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 87.76250457763672, |
|
"epoch": 0.05290581162324649, |
|
"grad_norm": 11.511754035949707, |
|
"kl": 0.05859375, |
|
"learning_rate": 9.470871191876002e-07, |
|
"loss": 0.0023, |
|
"reward": 0.3516260161995888, |
|
"reward_std": 0.12914332374930382, |
|
"rewards/iou_timestamp_reward": 0.3516260161995888, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 77.92500305175781, |
|
"epoch": 0.05317301269205077, |
|
"grad_norm": 11.806519508361816, |
|
"kl": 0.0458984375, |
|
"learning_rate": 9.468198824158203e-07, |
|
"loss": 0.0018, |
|
"reward": 0.4304570257663727, |
|
"reward_std": 0.04932829737663269, |
|
"rewards/iou_timestamp_reward": 0.4304570257663727, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 81.9749984741211, |
|
"epoch": 0.053440213760855046, |
|
"grad_norm": 16.832813262939453, |
|
"kl": 0.0489501953125, |
|
"learning_rate": 9.465526456440406e-07, |
|
"loss": 0.002, |
|
"reward": 0.4915754646062851, |
|
"reward_std": 0.13631991669535637, |
|
"rewards/iou_timestamp_reward": 0.4915754646062851, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 85.57500457763672, |
|
"epoch": 0.053707414829659315, |
|
"grad_norm": 8.950471878051758, |
|
"kl": 0.0538330078125, |
|
"learning_rate": 9.462854088722608e-07, |
|
"loss": 0.0022, |
|
"reward": 0.32636383175849915, |
|
"reward_std": 0.0875160414725542, |
|
"rewards/iou_timestamp_reward": 0.32636383175849915, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 88.1875, |
|
"epoch": 0.05397461589846359, |
|
"grad_norm": 8.32710075378418, |
|
"kl": 0.0411376953125, |
|
"learning_rate": 9.46018172100481e-07, |
|
"loss": 0.0016, |
|
"reward": 0.3660304397344589, |
|
"reward_std": 0.12842414155602455, |
|
"rewards/iou_timestamp_reward": 0.3660304397344589, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 83.18750381469727, |
|
"epoch": 0.05424181696726787, |
|
"grad_norm": 34.32887649536133, |
|
"kl": 0.0579833984375, |
|
"learning_rate": 9.457509353287012e-07, |
|
"loss": 0.0023, |
|
"reward": 0.4675799459218979, |
|
"reward_std": 0.11138150840997696, |
|
"rewards/iou_timestamp_reward": 0.4675799459218979, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 78.57500076293945, |
|
"epoch": 0.054509018036072145, |
|
"grad_norm": 11.615337371826172, |
|
"kl": 0.0482177734375, |
|
"learning_rate": 9.454836985569213e-07, |
|
"loss": 0.0019, |
|
"reward": 0.20401198416948318, |
|
"reward_std": 0.06908228248357773, |
|
"rewards/iou_timestamp_reward": 0.20401198416948318, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 80.95000076293945, |
|
"epoch": 0.05477621910487642, |
|
"grad_norm": 15.079862594604492, |
|
"kl": 0.0557861328125, |
|
"learning_rate": 9.452164617851416e-07, |
|
"loss": 0.0022, |
|
"reward": 0.542988657951355, |
|
"reward_std": 0.1345556452870369, |
|
"rewards/iou_timestamp_reward": 0.542988657951355, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 71.92500305175781, |
|
"epoch": 0.0550434201736807, |
|
"grad_norm": 10.543124198913574, |
|
"kl": 0.0556640625, |
|
"learning_rate": 9.449492250133618e-07, |
|
"loss": 0.0022, |
|
"reward": 0.2738891839981079, |
|
"reward_std": 0.10082607716321945, |
|
"rewards/iou_timestamp_reward": 0.2738891839981079, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 75.97500228881836, |
|
"epoch": 0.05531062124248497, |
|
"grad_norm": 11.855889320373535, |
|
"kl": 0.0440673828125, |
|
"learning_rate": 9.44681988241582e-07, |
|
"loss": 0.0018, |
|
"reward": 0.40028373897075653, |
|
"reward_std": 0.07233771868050098, |
|
"rewards/iou_timestamp_reward": 0.40028373897075653, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 75.36250305175781, |
|
"epoch": 0.05557782231128924, |
|
"grad_norm": 11.384106636047363, |
|
"kl": 0.0484619140625, |
|
"learning_rate": 9.444147514698022e-07, |
|
"loss": 0.0019, |
|
"reward": 0.518831193447113, |
|
"reward_std": 0.04942659102380276, |
|
"rewards/iou_timestamp_reward": 0.518831193447113, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 83.93750381469727, |
|
"epoch": 0.05584502338009352, |
|
"grad_norm": 28.444765090942383, |
|
"kl": 0.0517578125, |
|
"learning_rate": 9.441475146980225e-07, |
|
"loss": 0.0021, |
|
"reward": 0.591288223862648, |
|
"reward_std": 0.13365697488188744, |
|
"rewards/iou_timestamp_reward": 0.591288223862648, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 77.67499923706055, |
|
"epoch": 0.056112224448897796, |
|
"grad_norm": 47.66985321044922, |
|
"kl": 0.060791015625, |
|
"learning_rate": 9.438802779262426e-07, |
|
"loss": 0.0024, |
|
"reward": 0.4540657252073288, |
|
"reward_std": 0.11418235674500465, |
|
"rewards/iou_timestamp_reward": 0.4540657252073288, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 85.98750305175781, |
|
"epoch": 0.05637942551770207, |
|
"grad_norm": 17.33128547668457, |
|
"kl": 0.0565185546875, |
|
"learning_rate": 9.436130411544628e-07, |
|
"loss": 0.0023, |
|
"reward": 0.45703037083148956, |
|
"reward_std": 0.12524070590734482, |
|
"rewards/iou_timestamp_reward": 0.45703037083148956, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 77.5, |
|
"epoch": 0.05664662658650635, |
|
"grad_norm": 9.057528495788574, |
|
"kl": 0.05810546875, |
|
"learning_rate": 9.43345804382683e-07, |
|
"loss": 0.0023, |
|
"reward": 0.4245501309633255, |
|
"reward_std": 0.07445455808192492, |
|
"rewards/iou_timestamp_reward": 0.4245501309633255, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 89.73750305175781, |
|
"epoch": 0.05691382765531062, |
|
"grad_norm": 9.169981956481934, |
|
"kl": 0.08251953125, |
|
"learning_rate": 9.430785676109032e-07, |
|
"loss": 0.0033, |
|
"reward": 0.5322041213512421, |
|
"reward_std": 0.09248698502779007, |
|
"rewards/iou_timestamp_reward": 0.5322041213512421, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 82.36250305175781, |
|
"epoch": 0.057181028724114895, |
|
"grad_norm": 13.932048797607422, |
|
"kl": 0.069091796875, |
|
"learning_rate": 9.428113308391235e-07, |
|
"loss": 0.0028, |
|
"reward": 0.43773867189884186, |
|
"reward_std": 0.08092993497848511, |
|
"rewards/iou_timestamp_reward": 0.43773867189884186, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 83.1624984741211, |
|
"epoch": 0.05744822979291917, |
|
"grad_norm": 19.159921646118164, |
|
"kl": 0.0606689453125, |
|
"learning_rate": 9.425440940673436e-07, |
|
"loss": 0.0024, |
|
"reward": 0.5727377682924271, |
|
"reward_std": 0.07809071242809296, |
|
"rewards/iou_timestamp_reward": 0.5727377682924271, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 75.51250076293945, |
|
"epoch": 0.05771543086172345, |
|
"grad_norm": 7.645944118499756, |
|
"kl": 0.0511474609375, |
|
"learning_rate": 9.422768572955639e-07, |
|
"loss": 0.002, |
|
"reward": 0.5318755507469177, |
|
"reward_std": 0.053884051740169525, |
|
"rewards/iou_timestamp_reward": 0.5318755507469177, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 76.20000076293945, |
|
"epoch": 0.057982631930527724, |
|
"grad_norm": 11.048291206359863, |
|
"kl": 0.0552978515625, |
|
"learning_rate": 9.420096205237841e-07, |
|
"loss": 0.0022, |
|
"reward": 0.39921949803829193, |
|
"reward_std": 0.09128798358142376, |
|
"rewards/iou_timestamp_reward": 0.39921949803829193, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 83.10000228881836, |
|
"epoch": 0.058249832999332, |
|
"grad_norm": 14.645618438720703, |
|
"kl": 0.069091796875, |
|
"learning_rate": 9.417423837520042e-07, |
|
"loss": 0.0028, |
|
"reward": 0.41773444414138794, |
|
"reward_std": 0.06791641190648079, |
|
"rewards/iou_timestamp_reward": 0.41773444414138794, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 80.4625015258789, |
|
"epoch": 0.05851703406813627, |
|
"grad_norm": 13.085001945495605, |
|
"kl": 0.0582275390625, |
|
"learning_rate": 9.414751469802245e-07, |
|
"loss": 0.0023, |
|
"reward": 0.44448421895504, |
|
"reward_std": 0.07502865791320801, |
|
"rewards/iou_timestamp_reward": 0.44448421895504, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 92.125, |
|
"epoch": 0.058784235136940546, |
|
"grad_norm": 16.1396484375, |
|
"kl": 0.0535888671875, |
|
"learning_rate": 9.412079102084446e-07, |
|
"loss": 0.0021, |
|
"reward": 0.48494312167167664, |
|
"reward_std": 0.10971744731068611, |
|
"rewards/iou_timestamp_reward": 0.48494312167167664, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 87.7750015258789, |
|
"epoch": 0.05905143620574482, |
|
"grad_norm": 27.641878128051758, |
|
"kl": 0.06103515625, |
|
"learning_rate": 9.409406734366649e-07, |
|
"loss": 0.0024, |
|
"reward": 0.566541999578476, |
|
"reward_std": 0.07634824328124523, |
|
"rewards/iou_timestamp_reward": 0.566541999578476, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 86.26250076293945, |
|
"epoch": 0.0593186372745491, |
|
"grad_norm": 22.46279525756836, |
|
"kl": 0.0477294921875, |
|
"learning_rate": 9.406734366648851e-07, |
|
"loss": 0.0019, |
|
"reward": 0.4909381568431854, |
|
"reward_std": 0.060065316036343575, |
|
"rewards/iou_timestamp_reward": 0.4909381568431854, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 94.0875015258789, |
|
"epoch": 0.059585838343353376, |
|
"grad_norm": 16.373811721801758, |
|
"kl": 0.0496826171875, |
|
"learning_rate": 9.404061998931053e-07, |
|
"loss": 0.002, |
|
"reward": 0.667538046836853, |
|
"reward_std": 0.0789172612130642, |
|
"rewards/iou_timestamp_reward": 0.667538046836853, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 87.62500381469727, |
|
"epoch": 0.05985303941215765, |
|
"grad_norm": 11.457925796508789, |
|
"kl": 0.054931640625, |
|
"learning_rate": 9.401389631213255e-07, |
|
"loss": 0.0022, |
|
"reward": 0.523018479347229, |
|
"reward_std": 0.11646933481097221, |
|
"rewards/iou_timestamp_reward": 0.523018479347229, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 88.86249923706055, |
|
"epoch": 0.06012024048096192, |
|
"grad_norm": 13.132309913635254, |
|
"kl": 0.0538330078125, |
|
"learning_rate": 9.398717263495457e-07, |
|
"loss": 0.0022, |
|
"reward": 0.39982272684574127, |
|
"reward_std": 0.09530311450362206, |
|
"rewards/iou_timestamp_reward": 0.39982272684574127, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 82.91250228881836, |
|
"epoch": 0.0603874415497662, |
|
"grad_norm": 13.165536880493164, |
|
"kl": 0.0540771484375, |
|
"learning_rate": 9.396044895777659e-07, |
|
"loss": 0.0022, |
|
"reward": 0.4518078416585922, |
|
"reward_std": 0.055653566494584084, |
|
"rewards/iou_timestamp_reward": 0.4518078416585922, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 87.9124984741211, |
|
"epoch": 0.060654642618570474, |
|
"grad_norm": 71.02800750732422, |
|
"kl": 0.0499267578125, |
|
"learning_rate": 9.393372528059861e-07, |
|
"loss": 0.002, |
|
"reward": 0.4856627434492111, |
|
"reward_std": 0.11880143359303474, |
|
"rewards/iou_timestamp_reward": 0.4856627434492111, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 88.2125015258789, |
|
"epoch": 0.06092184368737475, |
|
"grad_norm": 19.84975814819336, |
|
"kl": 0.0556640625, |
|
"learning_rate": 9.390700160342062e-07, |
|
"loss": 0.0022, |
|
"reward": 0.5220415741205215, |
|
"reward_std": 0.09827179461717606, |
|
"rewards/iou_timestamp_reward": 0.5220415741205215, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 85.9375, |
|
"epoch": 0.06118904475617903, |
|
"grad_norm": 8.188985824584961, |
|
"kl": 0.04541015625, |
|
"learning_rate": 9.388027792624264e-07, |
|
"loss": 0.0018, |
|
"reward": 0.4307563751935959, |
|
"reward_std": 0.10000541806221008, |
|
"rewards/iou_timestamp_reward": 0.4307563751935959, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 83.05000305175781, |
|
"epoch": 0.0614562458249833, |
|
"grad_norm": 20.375694274902344, |
|
"kl": 0.0589599609375, |
|
"learning_rate": 9.385355424906467e-07, |
|
"loss": 0.0024, |
|
"reward": 0.38110922276973724, |
|
"reward_std": 0.08979238569736481, |
|
"rewards/iou_timestamp_reward": 0.38110922276973724, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 94.28750228881836, |
|
"epoch": 0.06172344689378757, |
|
"grad_norm": 18.514402389526367, |
|
"kl": 0.05224609375, |
|
"learning_rate": 9.382683057188668e-07, |
|
"loss": 0.0021, |
|
"reward": 0.4332323968410492, |
|
"reward_std": 0.11731500551104546, |
|
"rewards/iou_timestamp_reward": 0.4332323968410492, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 83.4375, |
|
"epoch": 0.06199064796259185, |
|
"grad_norm": 12.424543380737305, |
|
"kl": 0.037841796875, |
|
"learning_rate": 9.38001068947087e-07, |
|
"loss": 0.0015, |
|
"reward": 0.3656405210494995, |
|
"reward_std": 0.0752149373292923, |
|
"rewards/iou_timestamp_reward": 0.3656405210494995, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 99.26250076293945, |
|
"epoch": 0.062257849031396126, |
|
"grad_norm": 9.007403373718262, |
|
"kl": 0.0421142578125, |
|
"learning_rate": 9.377338321753073e-07, |
|
"loss": 0.0017, |
|
"reward": 0.5581249594688416, |
|
"reward_std": 0.10320387035608292, |
|
"rewards/iou_timestamp_reward": 0.5581249594688416, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 92.78750228881836, |
|
"epoch": 0.0625250501002004, |
|
"grad_norm": 19.03189468383789, |
|
"kl": 0.062255859375, |
|
"learning_rate": 9.374665954035274e-07, |
|
"loss": 0.0025, |
|
"reward": 0.3949814885854721, |
|
"reward_std": 0.11542447470128536, |
|
"rewards/iou_timestamp_reward": 0.3949814885854721, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 95.35000228881836, |
|
"epoch": 0.06279225116900468, |
|
"grad_norm": 39.19453430175781, |
|
"kl": 0.0509033203125, |
|
"learning_rate": 9.371993586317477e-07, |
|
"loss": 0.002, |
|
"reward": 0.423181414604187, |
|
"reward_std": 0.1289653405547142, |
|
"rewards/iou_timestamp_reward": 0.423181414604187, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 98.38750076293945, |
|
"epoch": 0.06305945223780896, |
|
"grad_norm": 17.67893409729004, |
|
"kl": 0.0491943359375, |
|
"learning_rate": 9.369321218599678e-07, |
|
"loss": 0.002, |
|
"reward": 0.5305302441120148, |
|
"reward_std": 0.07527987286448479, |
|
"rewards/iou_timestamp_reward": 0.5305302441120148, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 89.3125, |
|
"epoch": 0.06332665330661323, |
|
"grad_norm": 16.596872329711914, |
|
"kl": 0.0501708984375, |
|
"learning_rate": 9.36664885088188e-07, |
|
"loss": 0.002, |
|
"reward": 0.47669273614883423, |
|
"reward_std": 0.0898175835609436, |
|
"rewards/iou_timestamp_reward": 0.47669273614883423, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 80.31250381469727, |
|
"epoch": 0.06359385437541751, |
|
"grad_norm": 13.407699584960938, |
|
"kl": 0.0501708984375, |
|
"learning_rate": 9.363976483164083e-07, |
|
"loss": 0.002, |
|
"reward": 0.48390600085258484, |
|
"reward_std": 0.07993911020457745, |
|
"rewards/iou_timestamp_reward": 0.48390600085258484, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 89.95000076293945, |
|
"epoch": 0.06386105544422177, |
|
"grad_norm": 11.182861328125, |
|
"kl": 0.0474853515625, |
|
"learning_rate": 9.361304115446284e-07, |
|
"loss": 0.0019, |
|
"reward": 0.5368602573871613, |
|
"reward_std": 0.08251865021884441, |
|
"rewards/iou_timestamp_reward": 0.5368602573871613, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 90.91250228881836, |
|
"epoch": 0.06412825651302605, |
|
"grad_norm": 17.626962661743164, |
|
"kl": 0.0478515625, |
|
"learning_rate": 9.358631747728487e-07, |
|
"loss": 0.0019, |
|
"reward": 0.589008092880249, |
|
"reward_std": 0.14458639919757843, |
|
"rewards/iou_timestamp_reward": 0.589008092880249, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 94.0250015258789, |
|
"epoch": 0.06439545758183032, |
|
"grad_norm": 19.81915283203125, |
|
"kl": 0.0458984375, |
|
"learning_rate": 9.355959380010689e-07, |
|
"loss": 0.0018, |
|
"reward": 0.38526751101017, |
|
"reward_std": 0.1254762001335621, |
|
"rewards/iou_timestamp_reward": 0.38526751101017, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 90.56250381469727, |
|
"epoch": 0.0646626586506346, |
|
"grad_norm": 13.259626388549805, |
|
"kl": 0.05224609375, |
|
"learning_rate": 9.353287012292891e-07, |
|
"loss": 0.0021, |
|
"reward": 0.3110887184739113, |
|
"reward_std": 0.1004025787115097, |
|
"rewards/iou_timestamp_reward": 0.3110887184739113, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 90.7125015258789, |
|
"epoch": 0.06492985971943888, |
|
"grad_norm": 43.69268035888672, |
|
"kl": 0.0465087890625, |
|
"learning_rate": 9.350614644575093e-07, |
|
"loss": 0.0019, |
|
"reward": 0.5469205528497696, |
|
"reward_std": 0.09666259959340096, |
|
"rewards/iou_timestamp_reward": 0.5469205528497696, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 91.23749923706055, |
|
"epoch": 0.06519706078824315, |
|
"grad_norm": 36.97768783569336, |
|
"kl": 0.0537109375, |
|
"learning_rate": 9.347942276857294e-07, |
|
"loss": 0.0022, |
|
"reward": 0.45238882303237915, |
|
"reward_std": 0.13411134481430054, |
|
"rewards/iou_timestamp_reward": 0.45238882303237915, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 90.42499923706055, |
|
"epoch": 0.06546426185704743, |
|
"grad_norm": 13.66786003112793, |
|
"kl": 0.05322265625, |
|
"learning_rate": 9.345269909139497e-07, |
|
"loss": 0.0021, |
|
"reward": 0.3465211093425751, |
|
"reward_std": 0.06062236800789833, |
|
"rewards/iou_timestamp_reward": 0.3465211093425751, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 88.97500228881836, |
|
"epoch": 0.0657314629258517, |
|
"grad_norm": 37.682861328125, |
|
"kl": 0.05517578125, |
|
"learning_rate": 9.342597541421699e-07, |
|
"loss": 0.0022, |
|
"reward": 0.6093435287475586, |
|
"reward_std": 0.1017681360244751, |
|
"rewards/iou_timestamp_reward": 0.6093435287475586, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 89.5875015258789, |
|
"epoch": 0.06599866399465598, |
|
"grad_norm": 16.518226623535156, |
|
"kl": 0.0675048828125, |
|
"learning_rate": 9.339925173703901e-07, |
|
"loss": 0.0027, |
|
"reward": 0.372649610042572, |
|
"reward_std": 0.09408055618405342, |
|
"rewards/iou_timestamp_reward": 0.372649610042572, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 96.05000305175781, |
|
"epoch": 0.06626586506346026, |
|
"grad_norm": 15.585586547851562, |
|
"kl": 0.0504150390625, |
|
"learning_rate": 9.337252805986103e-07, |
|
"loss": 0.002, |
|
"reward": 0.4263141304254532, |
|
"reward_std": 0.09799651429057121, |
|
"rewards/iou_timestamp_reward": 0.4263141304254532, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 95.10000228881836, |
|
"epoch": 0.06653306613226453, |
|
"grad_norm": 19.904935836791992, |
|
"kl": 0.0557861328125, |
|
"learning_rate": 9.334580438268306e-07, |
|
"loss": 0.0022, |
|
"reward": 0.49050794541835785, |
|
"reward_std": 0.11211569979786873, |
|
"rewards/iou_timestamp_reward": 0.49050794541835785, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 92.03750228881836, |
|
"epoch": 0.06680026720106881, |
|
"grad_norm": 357.7543640136719, |
|
"kl": 0.0576171875, |
|
"learning_rate": 9.331908070550507e-07, |
|
"loss": 0.0023, |
|
"reward": 0.411429226398468, |
|
"reward_std": 0.15467696636915207, |
|
"rewards/iou_timestamp_reward": 0.411429226398468, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 92.78750228881836, |
|
"epoch": 0.06706746826987307, |
|
"grad_norm": 22.54250717163086, |
|
"kl": 0.0506591796875, |
|
"learning_rate": 9.32923570283271e-07, |
|
"loss": 0.002, |
|
"reward": 0.5628893673419952, |
|
"reward_std": 0.10398486256599426, |
|
"rewards/iou_timestamp_reward": 0.5628893673419952, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 87.8375015258789, |
|
"epoch": 0.06733466933867735, |
|
"grad_norm": 37.64324951171875, |
|
"kl": 0.04833984375, |
|
"learning_rate": 9.326563335114911e-07, |
|
"loss": 0.0019, |
|
"reward": 0.47143837809562683, |
|
"reward_std": 0.1283393558114767, |
|
"rewards/iou_timestamp_reward": 0.47143837809562683, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 90.7750015258789, |
|
"epoch": 0.06760187040748163, |
|
"grad_norm": 11.761100769042969, |
|
"kl": 0.0535888671875, |
|
"learning_rate": 9.323890967397113e-07, |
|
"loss": 0.0021, |
|
"reward": 0.4104880392551422, |
|
"reward_std": 0.12630710378289223, |
|
"rewards/iou_timestamp_reward": 0.4104880392551422, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 89.51250076293945, |
|
"epoch": 0.0678690714762859, |
|
"grad_norm": 17.23011016845703, |
|
"kl": 0.051025390625, |
|
"learning_rate": 9.321218599679316e-07, |
|
"loss": 0.002, |
|
"reward": 0.43392352759838104, |
|
"reward_std": 0.11164272576570511, |
|
"rewards/iou_timestamp_reward": 0.43392352759838104, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 94.2125015258789, |
|
"epoch": 0.06813627254509018, |
|
"grad_norm": 20.68499183654785, |
|
"kl": 0.056640625, |
|
"learning_rate": 9.318546231961517e-07, |
|
"loss": 0.0023, |
|
"reward": 0.4146452210843563, |
|
"reward_std": 0.080996323376894, |
|
"rewards/iou_timestamp_reward": 0.4146452210843563, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 83.43750381469727, |
|
"epoch": 0.06840347361389446, |
|
"grad_norm": 21.620370864868164, |
|
"kl": 0.040283203125, |
|
"learning_rate": 9.31587386424372e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5084521472454071, |
|
"reward_std": 0.11804941296577454, |
|
"rewards/iou_timestamp_reward": 0.5084521472454071, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 86.13750076293945, |
|
"epoch": 0.06867067468269873, |
|
"grad_norm": 35.96171569824219, |
|
"kl": 0.0594482421875, |
|
"learning_rate": 9.313201496525922e-07, |
|
"loss": 0.0024, |
|
"reward": 0.41805142164230347, |
|
"reward_std": 0.10501766204833984, |
|
"rewards/iou_timestamp_reward": 0.41805142164230347, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 84.26250457763672, |
|
"epoch": 0.06893787575150301, |
|
"grad_norm": 13.32675552368164, |
|
"kl": 0.05615234375, |
|
"learning_rate": 9.310529128808123e-07, |
|
"loss": 0.0022, |
|
"reward": 0.3628458231687546, |
|
"reward_std": 0.12849557399749756, |
|
"rewards/iou_timestamp_reward": 0.3628458231687546, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 85.36249923706055, |
|
"epoch": 0.06920507682030728, |
|
"grad_norm": 10.719467163085938, |
|
"kl": 0.057861328125, |
|
"learning_rate": 9.307856761090326e-07, |
|
"loss": 0.0023, |
|
"reward": 0.375208780169487, |
|
"reward_std": 0.07890206202864647, |
|
"rewards/iou_timestamp_reward": 0.375208780169487, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 88.38750076293945, |
|
"epoch": 0.06947227788911156, |
|
"grad_norm": 16.24102783203125, |
|
"kl": 0.0723876953125, |
|
"learning_rate": 9.305184393372527e-07, |
|
"loss": 0.0029, |
|
"reward": 0.46249307692050934, |
|
"reward_std": 0.0996074341237545, |
|
"rewards/iou_timestamp_reward": 0.46249307692050934, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 84.4625015258789, |
|
"epoch": 0.06973947895791584, |
|
"grad_norm": 10.950820922851562, |
|
"kl": 0.05029296875, |
|
"learning_rate": 9.30251202565473e-07, |
|
"loss": 0.002, |
|
"reward": 0.43129654228687286, |
|
"reward_std": 0.09118984453380108, |
|
"rewards/iou_timestamp_reward": 0.43129654228687286, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 78.35000228881836, |
|
"epoch": 0.0700066800267201, |
|
"grad_norm": 23.676605224609375, |
|
"kl": 0.0572509765625, |
|
"learning_rate": 9.299839657936932e-07, |
|
"loss": 0.0023, |
|
"reward": 0.4307858943939209, |
|
"reward_std": 0.0822877585887909, |
|
"rewards/iou_timestamp_reward": 0.4307858943939209, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 84.36250305175781, |
|
"epoch": 0.07027388109552438, |
|
"grad_norm": 30.50290870666504, |
|
"kl": 0.050537109375, |
|
"learning_rate": 9.297167290219134e-07, |
|
"loss": 0.002, |
|
"reward": 0.4225500524044037, |
|
"reward_std": 0.08897064253687859, |
|
"rewards/iou_timestamp_reward": 0.4225500524044037, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 86.12500381469727, |
|
"epoch": 0.07054108216432865, |
|
"grad_norm": 9.197006225585938, |
|
"kl": 0.046142578125, |
|
"learning_rate": 9.294494922501336e-07, |
|
"loss": 0.0018, |
|
"reward": 0.523790642619133, |
|
"reward_std": 0.11769257113337517, |
|
"rewards/iou_timestamp_reward": 0.523790642619133, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 85.04999923706055, |
|
"epoch": 0.07080828323313293, |
|
"grad_norm": 30.433368682861328, |
|
"kl": 0.0567626953125, |
|
"learning_rate": 9.291822554783539e-07, |
|
"loss": 0.0023, |
|
"reward": 0.3194416016340256, |
|
"reward_std": 0.07132554426789284, |
|
"rewards/iou_timestamp_reward": 0.3194416016340256, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 84.7249984741211, |
|
"epoch": 0.0710754843019372, |
|
"grad_norm": 24.936206817626953, |
|
"kl": 0.0491943359375, |
|
"learning_rate": 9.28915018706574e-07, |
|
"loss": 0.002, |
|
"reward": 0.37983009219169617, |
|
"reward_std": 0.08205056935548782, |
|
"rewards/iou_timestamp_reward": 0.37983009219169617, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 82.23750305175781, |
|
"epoch": 0.07134268537074148, |
|
"grad_norm": 36.05076217651367, |
|
"kl": 0.046630859375, |
|
"learning_rate": 9.286477819347942e-07, |
|
"loss": 0.0019, |
|
"reward": 0.5814423114061356, |
|
"reward_std": 0.08882495388388634, |
|
"rewards/iou_timestamp_reward": 0.5814423114061356, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 85.0625, |
|
"epoch": 0.07160988643954576, |
|
"grad_norm": 15.191946983337402, |
|
"kl": 0.0489501953125, |
|
"learning_rate": 9.283805451630144e-07, |
|
"loss": 0.002, |
|
"reward": 0.4915848672389984, |
|
"reward_std": 0.08231451734900475, |
|
"rewards/iou_timestamp_reward": 0.4915848672389984, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 87.07500076293945, |
|
"epoch": 0.07187708750835003, |
|
"grad_norm": 12.557903289794922, |
|
"kl": 0.057861328125, |
|
"learning_rate": 9.281133083912346e-07, |
|
"loss": 0.0023, |
|
"reward": 0.5351927578449249, |
|
"reward_std": 0.06344357319176197, |
|
"rewards/iou_timestamp_reward": 0.5351927578449249, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 84.0875015258789, |
|
"epoch": 0.07214428857715431, |
|
"grad_norm": 14.385464668273926, |
|
"kl": 0.0570068359375, |
|
"learning_rate": 9.278460716194549e-07, |
|
"loss": 0.0023, |
|
"reward": 0.4179234653711319, |
|
"reward_std": 0.10309569910168648, |
|
"rewards/iou_timestamp_reward": 0.4179234653711319, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 89.0374984741211, |
|
"epoch": 0.07241148964595859, |
|
"grad_norm": 9.917317390441895, |
|
"kl": 0.0496826171875, |
|
"learning_rate": 9.27578834847675e-07, |
|
"loss": 0.002, |
|
"reward": 0.3758576214313507, |
|
"reward_std": 0.06827409937977791, |
|
"rewards/iou_timestamp_reward": 0.3758576214313507, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 94.4625015258789, |
|
"epoch": 0.07267869071476286, |
|
"grad_norm": 24.282136917114258, |
|
"kl": 0.048828125, |
|
"learning_rate": 9.273115980758952e-07, |
|
"loss": 0.002, |
|
"reward": 0.472480833530426, |
|
"reward_std": 0.055225247517228127, |
|
"rewards/iou_timestamp_reward": 0.472480833530426, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 86.7125015258789, |
|
"epoch": 0.07294589178356714, |
|
"grad_norm": 11.784228324890137, |
|
"kl": 0.05126953125, |
|
"learning_rate": 9.270443613041155e-07, |
|
"loss": 0.0021, |
|
"reward": 0.505023866891861, |
|
"reward_std": 0.09757990017533302, |
|
"rewards/iou_timestamp_reward": 0.505023866891861, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 89.78750228881836, |
|
"epoch": 0.0732130928523714, |
|
"grad_norm": 45.109092712402344, |
|
"kl": 0.050048828125, |
|
"learning_rate": 9.267771245323356e-07, |
|
"loss": 0.002, |
|
"reward": 0.523750826716423, |
|
"reward_std": 0.17876286804676056, |
|
"rewards/iou_timestamp_reward": 0.523750826716423, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 93.31250381469727, |
|
"epoch": 0.07348029392117568, |
|
"grad_norm": 14.207196235656738, |
|
"kl": 0.048828125, |
|
"learning_rate": 9.265098877605559e-07, |
|
"loss": 0.002, |
|
"reward": 0.4920515716075897, |
|
"reward_std": 0.09994343668222427, |
|
"rewards/iou_timestamp_reward": 0.4920515716075897, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 92.17500305175781, |
|
"epoch": 0.07374749498997996, |
|
"grad_norm": 48.04758834838867, |
|
"kl": 0.0509033203125, |
|
"learning_rate": 9.26242650988776e-07, |
|
"loss": 0.002, |
|
"reward": 0.4091592878103256, |
|
"reward_std": 0.07305929809808731, |
|
"rewards/iou_timestamp_reward": 0.4091592878103256, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 88.2750015258789, |
|
"epoch": 0.07401469605878423, |
|
"grad_norm": 11.50412654876709, |
|
"kl": 0.059326171875, |
|
"learning_rate": 9.259754142169963e-07, |
|
"loss": 0.0024, |
|
"reward": 0.43483787775039673, |
|
"reward_std": 0.13717280700802803, |
|
"rewards/iou_timestamp_reward": 0.43483787775039673, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 92.61250305175781, |
|
"epoch": 0.07428189712758851, |
|
"grad_norm": 37.55918884277344, |
|
"kl": 0.051513671875, |
|
"learning_rate": 9.257081774452165e-07, |
|
"loss": 0.0021, |
|
"reward": 0.3264332413673401, |
|
"reward_std": 0.07515481114387512, |
|
"rewards/iou_timestamp_reward": 0.3264332413673401, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 87.92500305175781, |
|
"epoch": 0.07454909819639279, |
|
"grad_norm": 15.809645652770996, |
|
"kl": 0.0555419921875, |
|
"learning_rate": 9.254409406734365e-07, |
|
"loss": 0.0022, |
|
"reward": 0.4322213977575302, |
|
"reward_std": 0.0831381669268012, |
|
"rewards/iou_timestamp_reward": 0.4322213977575302, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 88.82500076293945, |
|
"epoch": 0.07481629926519706, |
|
"grad_norm": 29.246431350708008, |
|
"kl": 0.0552978515625, |
|
"learning_rate": 9.251737039016568e-07, |
|
"loss": 0.0022, |
|
"reward": 0.6085447072982788, |
|
"reward_std": 0.13383636623620987, |
|
"rewards/iou_timestamp_reward": 0.6085447072982788, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 90.7750015258789, |
|
"epoch": 0.07508350033400134, |
|
"grad_norm": 56.85616683959961, |
|
"kl": 0.05810546875, |
|
"learning_rate": 9.24906467129877e-07, |
|
"loss": 0.0023, |
|
"reward": 0.5065873116254807, |
|
"reward_std": 0.09172694198787212, |
|
"rewards/iou_timestamp_reward": 0.5065873116254807, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 94.125, |
|
"epoch": 0.07535070140280561, |
|
"grad_norm": 153.61770629882812, |
|
"kl": 0.0584716796875, |
|
"learning_rate": 9.246392303580972e-07, |
|
"loss": 0.0023, |
|
"reward": 0.619166225194931, |
|
"reward_std": 0.08932790905237198, |
|
"rewards/iou_timestamp_reward": 0.619166225194931, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 89.7750015258789, |
|
"epoch": 0.07561790247160989, |
|
"grad_norm": 92.22215270996094, |
|
"kl": 0.0556640625, |
|
"learning_rate": 9.243719935863174e-07, |
|
"loss": 0.0022, |
|
"reward": 0.5364521145820618, |
|
"reward_std": 0.0949009507894516, |
|
"rewards/iou_timestamp_reward": 0.5364521145820618, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 90.85000228881836, |
|
"epoch": 0.07588510354041417, |
|
"grad_norm": 11.400144577026367, |
|
"kl": 0.0491943359375, |
|
"learning_rate": 9.241047568145376e-07, |
|
"loss": 0.002, |
|
"reward": 0.3629377633333206, |
|
"reward_std": 0.051846086978912354, |
|
"rewards/iou_timestamp_reward": 0.3629377633333206, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 88.05000305175781, |
|
"epoch": 0.07615230460921844, |
|
"grad_norm": 10.160490036010742, |
|
"kl": 0.0489501953125, |
|
"learning_rate": 9.238375200427578e-07, |
|
"loss": 0.002, |
|
"reward": 0.4853304475545883, |
|
"reward_std": 0.0822471622377634, |
|
"rewards/iou_timestamp_reward": 0.4853304475545883, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 83.75, |
|
"epoch": 0.0764195056780227, |
|
"grad_norm": 29.11613655090332, |
|
"kl": 0.0517578125, |
|
"learning_rate": 9.23570283270978e-07, |
|
"loss": 0.0021, |
|
"reward": 0.5097087770700455, |
|
"reward_std": 0.0779729075729847, |
|
"rewards/iou_timestamp_reward": 0.5097087770700455, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 93.97500228881836, |
|
"epoch": 0.07668670674682698, |
|
"grad_norm": 13.363306045532227, |
|
"kl": 0.0643310546875, |
|
"learning_rate": 9.233030464991982e-07, |
|
"loss": 0.0026, |
|
"reward": 0.3735619783401489, |
|
"reward_std": 0.12772585824131966, |
|
"rewards/iou_timestamp_reward": 0.3735619783401489, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 88.7125015258789, |
|
"epoch": 0.07695390781563126, |
|
"grad_norm": 253.6396026611328, |
|
"kl": 0.0447998046875, |
|
"learning_rate": 9.230358097274184e-07, |
|
"loss": 0.0018, |
|
"reward": 0.5121918767690659, |
|
"reward_std": 0.11110168322920799, |
|
"rewards/iou_timestamp_reward": 0.5121918767690659, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 87.95000076293945, |
|
"epoch": 0.07722110888443554, |
|
"grad_norm": 9.944025039672852, |
|
"kl": 0.0465087890625, |
|
"learning_rate": 9.227685729556387e-07, |
|
"loss": 0.0019, |
|
"reward": 0.5837224572896957, |
|
"reward_std": 0.0957045927643776, |
|
"rewards/iou_timestamp_reward": 0.5837224572896957, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 88.3375015258789, |
|
"epoch": 0.07748830995323981, |
|
"grad_norm": 19.37244415283203, |
|
"kl": 0.0513916015625, |
|
"learning_rate": 9.225013361838588e-07, |
|
"loss": 0.0021, |
|
"reward": 0.4252195358276367, |
|
"reward_std": 0.13499490171670914, |
|
"rewards/iou_timestamp_reward": 0.4252195358276367, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 90.23749923706055, |
|
"epoch": 0.07775551102204409, |
|
"grad_norm": 13.233763694763184, |
|
"kl": 0.050048828125, |
|
"learning_rate": 9.222340994120791e-07, |
|
"loss": 0.002, |
|
"reward": 0.4406256824731827, |
|
"reward_std": 0.10843142122030258, |
|
"rewards/iou_timestamp_reward": 0.4406256824731827, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 94.42499923706055, |
|
"epoch": 0.07802271209084836, |
|
"grad_norm": 13.899524688720703, |
|
"kl": 0.05615234375, |
|
"learning_rate": 9.219668626402992e-07, |
|
"loss": 0.0022, |
|
"reward": 0.4787358343601227, |
|
"reward_std": 0.13102044723927975, |
|
"rewards/iou_timestamp_reward": 0.4787358343601227, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 86.04999923706055, |
|
"epoch": 0.07828991315965264, |
|
"grad_norm": 12.498344421386719, |
|
"kl": 0.048583984375, |
|
"learning_rate": 9.216996258685194e-07, |
|
"loss": 0.0019, |
|
"reward": 0.4275331050157547, |
|
"reward_std": 0.1066926084458828, |
|
"rewards/iou_timestamp_reward": 0.4275331050157547, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 90.42499923706055, |
|
"epoch": 0.07855711422845692, |
|
"grad_norm": 22.13373565673828, |
|
"kl": 0.0589599609375, |
|
"learning_rate": 9.214323890967397e-07, |
|
"loss": 0.0024, |
|
"reward": 0.35555990040302277, |
|
"reward_std": 0.09240290522575378, |
|
"rewards/iou_timestamp_reward": 0.35555990040302277, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 91.51250457763672, |
|
"epoch": 0.0788243152972612, |
|
"grad_norm": 44.67702865600586, |
|
"kl": 0.052490234375, |
|
"learning_rate": 9.211651523249598e-07, |
|
"loss": 0.0021, |
|
"reward": 0.41058990359306335, |
|
"reward_std": 0.09301923587918282, |
|
"rewards/iou_timestamp_reward": 0.41058990359306335, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 88.375, |
|
"epoch": 0.07909151636606547, |
|
"grad_norm": 16.11538314819336, |
|
"kl": 0.065185546875, |
|
"learning_rate": 9.208979155531801e-07, |
|
"loss": 0.0026, |
|
"reward": 0.3699526935815811, |
|
"reward_std": 0.08414144068956375, |
|
"rewards/iou_timestamp_reward": 0.3699526935815811, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 87.55000305175781, |
|
"epoch": 0.07935871743486973, |
|
"grad_norm": 74.45164489746094, |
|
"kl": 0.04736328125, |
|
"learning_rate": 9.206306787814003e-07, |
|
"loss": 0.0019, |
|
"reward": 0.5554560720920563, |
|
"reward_std": 0.11732692644000053, |
|
"rewards/iou_timestamp_reward": 0.5554560720920563, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 84.18750381469727, |
|
"epoch": 0.07962591850367401, |
|
"grad_norm": 107.87571716308594, |
|
"kl": 0.04345703125, |
|
"learning_rate": 9.203634420096205e-07, |
|
"loss": 0.0017, |
|
"reward": 0.3576356768608093, |
|
"reward_std": 0.05249842070043087, |
|
"rewards/iou_timestamp_reward": 0.3576356768608093, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 80.6875, |
|
"epoch": 0.07989311957247829, |
|
"grad_norm": 16.578479766845703, |
|
"kl": 0.0604248046875, |
|
"learning_rate": 9.200962052378407e-07, |
|
"loss": 0.0024, |
|
"reward": 0.5808127671480179, |
|
"reward_std": 0.09874389320611954, |
|
"rewards/iou_timestamp_reward": 0.5808127671480179, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 82.1500015258789, |
|
"epoch": 0.08016032064128256, |
|
"grad_norm": 25.647584915161133, |
|
"kl": 0.071044921875, |
|
"learning_rate": 9.198289684660608e-07, |
|
"loss": 0.0028, |
|
"reward": 0.4414636939764023, |
|
"reward_std": 0.0884641632437706, |
|
"rewards/iou_timestamp_reward": 0.4414636939764023, |
|
"step": 300 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 3742, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|