{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3418803418803419, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017094017094017096, "grad_norm": 35.00202204303521, "learning_rate": 5e-07, "logits/chosen": -2.7455849647521973, "logits/rejected": -2.7442612648010254, "logps/chosen": -164.2725830078125, "logps/rejected": -170.57113647460938, "loss": 0.6934, "rewards/accuracies": 0.23749999701976776, "rewards/chosen": 0.0026612328365445137, "rewards/margins": -0.001539617427624762, "rewards/rejected": 0.004200850613415241, "step": 5 }, { "epoch": 0.03418803418803419, "grad_norm": 36.29266486314593, "learning_rate": 1e-06, "logits/chosen": -2.709902763366699, "logits/rejected": -2.7155404090881348, "logps/chosen": -171.80032348632812, "logps/rejected": -165.20169067382812, "loss": 0.6879, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.012009668163955212, "rewards/margins": 0.0021203968208283186, "rewards/rejected": 0.009889272041618824, "step": 10 }, { "epoch": 0.05128205128205128, "grad_norm": 33.83921269470837, "learning_rate": 9.999177507263144e-07, "logits/chosen": -2.6502068042755127, "logits/rejected": -2.628007411956787, "logps/chosen": -174.082275390625, "logps/rejected": -174.13429260253906, "loss": 0.6698, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.23495244979858398, "rewards/margins": 0.1125468835234642, "rewards/rejected": 0.12240554392337799, "step": 15 }, { "epoch": 0.06837606837606838, "grad_norm": 34.14427373918799, "learning_rate": 9.996710299650301e-07, "logits/chosen": -2.473665714263916, "logits/rejected": -2.4469008445739746, "logps/chosen": -158.2163848876953, "logps/rejected": -158.0710906982422, "loss": 0.661, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.4233472943305969, "rewards/margins": 0.1434161365032196, "rewards/rejected": 0.2799311578273773, "step": 20 }, { "epoch": 0.08547008547008547, "grad_norm": 33.2696083475879, "learning_rate": 9.992599188865604e-07, "logits/chosen": -2.314507007598877, "logits/rejected": -2.3168132305145264, "logps/chosen": -150.67019653320312, "logps/rejected": -156.8417510986328, "loss": 0.6501, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.4975205063819885, "rewards/margins": 0.15743504464626312, "rewards/rejected": 0.3400854766368866, "step": 25 }, { "epoch": 0.10256410256410256, "grad_norm": 34.42253361988952, "learning_rate": 9.98684552745256e-07, "logits/chosen": -2.243194103240967, "logits/rejected": -2.251340866088867, "logps/chosen": -161.2266845703125, "logps/rejected": -161.32298278808594, "loss": 0.6289, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.4243805408477783, "rewards/margins": 0.2635195851325989, "rewards/rejected": 0.16086098551750183, "step": 30 }, { "epoch": 0.11965811965811966, "grad_norm": 31.414296706456245, "learning_rate": 9.979451208349055e-07, "logits/chosen": -2.30315899848938, "logits/rejected": -2.289762496948242, "logps/chosen": -171.71713256835938, "logps/rejected": -174.50900268554688, "loss": 0.6296, "rewards/accuracies": 0.65625, "rewards/chosen": -0.019384615123271942, "rewards/margins": 0.318477988243103, "rewards/rejected": -0.3378625512123108, "step": 35 }, { "epoch": 0.13675213675213677, "grad_norm": 32.071830655862556, "learning_rate": 9.970418664264595e-07, "logits/chosen": -2.3935599327087402, "logits/rejected": -2.3812546730041504, "logps/chosen": -171.0698699951172, "logps/rejected": -176.58578491210938, "loss": 0.5991, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.26089853048324585, "rewards/margins": 0.5235068202018738, "rewards/rejected": -0.7844053506851196, "step": 40 }, { "epoch": 0.15384615384615385, "grad_norm": 36.19466541168301, "learning_rate": 9.95975086687994e-07, "logits/chosen": -2.4914021492004395, "logits/rejected": -2.4973323345184326, "logps/chosen": -163.68099975585938, "logps/rejected": -167.174072265625, "loss": 0.6141, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2962096929550171, "rewards/margins": 0.4588828682899475, "rewards/rejected": -0.7550925016403198, "step": 45 }, { "epoch": 0.17094017094017094, "grad_norm": 31.16276115760231, "learning_rate": 9.947451325869439e-07, "logits/chosen": -2.5575203895568848, "logits/rejected": -2.557717800140381, "logps/chosen": -172.04318237304688, "logps/rejected": -177.67672729492188, "loss": 0.5777, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2069791853427887, "rewards/margins": 0.6018465757369995, "rewards/rejected": -0.808825671672821, "step": 50 }, { "epoch": 0.17094017094017094, "eval_logits/chosen": -2.5221025943756104, "eval_logits/rejected": -2.5152711868286133, "eval_logps/chosen": -163.01820373535156, "eval_logps/rejected": -169.54832458496094, "eval_loss": 0.5812540650367737, "eval_rewards/accuracies": 0.6682692170143127, "eval_rewards/chosen": -0.45408713817596436, "eval_rewards/margins": 0.6127156615257263, "eval_rewards/rejected": -1.0668028593063354, "eval_runtime": 510.3361, "eval_samples_per_second": 16.291, "eval_steps_per_second": 0.255, "step": 50 }, { "epoch": 0.18803418803418803, "grad_norm": 31.575578721339145, "learning_rate": 9.933524087746347e-07, "logits/chosen": -2.490377426147461, "logits/rejected": -2.4825081825256348, "logps/chosen": -168.06161499023438, "logps/rejected": -175.0494384765625, "loss": 0.5706, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5060762763023376, "rewards/margins": 0.7589826583862305, "rewards/rejected": -1.2650587558746338, "step": 55 }, { "epoch": 0.20512820512820512, "grad_norm": 30.171745273288415, "learning_rate": 9.917973734531549e-07, "logits/chosen": -2.48228120803833, "logits/rejected": -2.4833157062530518, "logps/chosen": -159.47142028808594, "logps/rejected": -170.63671875, "loss": 0.5753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.35752761363983154, "rewards/margins": 0.5991309881210327, "rewards/rejected": -0.9566585421562195, "step": 60 }, { "epoch": 0.2222222222222222, "grad_norm": 32.13878319029882, "learning_rate": 9.90080538224607e-07, "logits/chosen": -2.585407018661499, "logits/rejected": -2.5767769813537598, "logps/chosen": -157.43936157226562, "logps/rejected": -166.13589477539062, "loss": 0.566, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.02057185396552086, "rewards/margins": 0.47568243741989136, "rewards/rejected": -0.4962543547153473, "step": 65 }, { "epoch": 0.23931623931623933, "grad_norm": 29.494674721856043, "learning_rate": 9.882024679227938e-07, "logits/chosen": -2.6504979133605957, "logits/rejected": -2.6398470401763916, "logps/chosen": -178.0801239013672, "logps/rejected": -179.46328735351562, "loss": 0.5444, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.43436694145202637, "rewards/margins": 0.8427752256393433, "rewards/rejected": -1.27714204788208, "step": 70 }, { "epoch": 0.2564102564102564, "grad_norm": 28.856733948308104, "learning_rate": 9.861637804273881e-07, "logits/chosen": -2.660489082336426, "logits/rejected": -2.655539035797119, "logps/chosen": -162.1233673095703, "logps/rejected": -170.16131591796875, "loss": 0.5568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4032784402370453, "rewards/margins": 0.6959114074707031, "rewards/rejected": -1.0991899967193604, "step": 75 }, { "epoch": 0.27350427350427353, "grad_norm": 26.646061534818323, "learning_rate": 9.83965146460653e-07, "logits/chosen": -2.6391615867614746, "logits/rejected": -2.628577709197998, "logps/chosen": -168.58099365234375, "logps/rejected": -179.22805786132812, "loss": 0.5448, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6665827035903931, "rewards/margins": 0.8240470886230469, "rewards/rejected": -1.4906299114227295, "step": 80 }, { "epoch": 0.2905982905982906, "grad_norm": 36.04159750418885, "learning_rate": 9.816072893667758e-07, "logits/chosen": -2.6322970390319824, "logits/rejected": -2.6053385734558105, "logps/chosen": -174.82640075683594, "logps/rejected": -186.0735626220703, "loss": 0.5579, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0639268159866333, "rewards/margins": 1.0258175134658813, "rewards/rejected": -2.0897443294525146, "step": 85 }, { "epoch": 0.3076923076923077, "grad_norm": 26.922939193632168, "learning_rate": 9.790909848738904e-07, "logits/chosen": -2.60801362991333, "logits/rejected": -2.6101624965667725, "logps/chosen": -176.20538330078125, "logps/rejected": -184.7812957763672, "loss": 0.5215, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9929834604263306, "rewards/margins": 0.8646324276924133, "rewards/rejected": -1.8576160669326782, "step": 90 }, { "epoch": 0.3247863247863248, "grad_norm": 30.3564450245371, "learning_rate": 9.764170608388647e-07, "logits/chosen": -2.6054036617279053, "logits/rejected": -2.5733799934387207, "logps/chosen": -168.037109375, "logps/rejected": -174.51144409179688, "loss": 0.5197, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6652337312698364, "rewards/margins": 1.060430884361267, "rewards/rejected": -1.725664734840393, "step": 95 }, { "epoch": 0.3418803418803419, "grad_norm": 28.936164680674203, "learning_rate": 9.735863969749371e-07, "logits/chosen": -2.5255179405212402, "logits/rejected": -2.4874520301818848, "logps/chosen": -177.73861694335938, "logps/rejected": -189.82369995117188, "loss": 0.4982, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8511013984680176, "rewards/margins": 1.1354777812957764, "rewards/rejected": -1.986579179763794, "step": 100 }, { "epoch": 0.3418803418803419, "eval_logits/chosen": -2.484687328338623, "eval_logits/rejected": -2.460559368133545, "eval_logps/chosen": -168.28323364257812, "eval_logps/rejected": -180.8539276123047, "eval_loss": 0.5161151885986328, "eval_rewards/accuracies": 0.7211538553237915, "eval_rewards/chosen": -0.9805887937545776, "eval_rewards/margins": 1.2167747020721436, "eval_rewards/rejected": -2.1973636150360107, "eval_runtime": 510.3447, "eval_samples_per_second": 16.291, "eval_steps_per_second": 0.255, "step": 100 } ], "logging_steps": 5, "max_steps": 876, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1178822762299392.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }