{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9919137466307277, "eval_steps": 500, "global_step": 92, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01078167115902965, "grad_norm": 33.0, "learning_rate": 5e-08, "logits/chosen": 152.2467803955078, "logits/rejected": 176.4149169921875, "logps/chosen": -205.79974365234375, "logps/rejected": -305.11883544921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05390835579514825, "grad_norm": 24.75, "learning_rate": 2.5e-07, "logits/chosen": 157.8704376220703, "logits/rejected": 173.8945770263672, "logps/chosen": -220.14161682128906, "logps/rejected": -331.4261474609375, "loss": 0.6999, "rewards/accuracies": 0.40234375, "rewards/chosen": -0.003565180581063032, "rewards/margins": 0.02022523060441017, "rewards/rejected": -0.02379041165113449, "step": 5 }, { "epoch": 0.1078167115902965, "grad_norm": 24.875, "learning_rate": 5e-07, "logits/chosen": 155.61563110351562, "logits/rejected": 172.3813934326172, "logps/chosen": -243.0758819580078, "logps/rejected": -341.87957763671875, "loss": 0.6212, "rewards/accuracies": 0.703125, "rewards/chosen": -0.03515549376606941, "rewards/margins": 0.20575320720672607, "rewards/rejected": -0.2409086972475052, "step": 10 }, { "epoch": 0.16172506738544473, "grad_norm": 16.0, "learning_rate": 4.954270799992138e-07, "logits/chosen": 138.01356506347656, "logits/rejected": 159.1266326904297, "logps/chosen": -206.3141326904297, "logps/rejected": -328.7684326171875, "loss": 0.474, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.21403150260448456, "rewards/margins": 0.6888868808746338, "rewards/rejected": -0.9029184579849243, "step": 15 }, { "epoch": 0.215633423180593, "grad_norm": 13.3125, "learning_rate": 4.818756127755237e-07, "logits/chosen": 130.20480346679688, "logits/rejected": 150.1522979736328, "logps/chosen": -221.1512451171875, "logps/rejected": -350.3003845214844, "loss": 0.3224, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.48178911209106445, "rewards/margins": 1.5005438327789307, "rewards/rejected": -1.9823329448699951, "step": 20 }, { "epoch": 0.2695417789757412, "grad_norm": 11.6875, "learning_rate": 4.598413565329875e-07, "logits/chosen": 116.27034759521484, "logits/rejected": 136.27476501464844, "logps/chosen": -206.4403076171875, "logps/rejected": -367.7535400390625, "loss": 0.2646, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.6159170866012573, "rewards/margins": 2.116407871246338, "rewards/rejected": -2.7323248386383057, "step": 25 }, { "epoch": 0.32345013477088946, "grad_norm": 23.375, "learning_rate": 4.301303984001967e-07, "logits/chosen": 104.18186950683594, "logits/rejected": 124.33646392822266, "logps/chosen": -226.0315399169922, "logps/rejected": -403.89788818359375, "loss": 0.2118, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -0.7898440361022949, "rewards/margins": 2.661341428756714, "rewards/rejected": -3.451185703277588, "step": 30 }, { "epoch": 0.37735849056603776, "grad_norm": 17.5, "learning_rate": 3.9382966505465515e-07, "logits/chosen": 100.61357116699219, "logits/rejected": 123.2223129272461, "logps/chosen": -246.58029174804688, "logps/rejected": -422.08843994140625, "loss": 0.2308, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.8788894414901733, "rewards/margins": 2.963942527770996, "rewards/rejected": -3.84283185005188, "step": 35 }, { "epoch": 0.431266846361186, "grad_norm": 11.6875, "learning_rate": 3.52267159292835e-07, "logits/chosen": 92.08902740478516, "logits/rejected": 119.60585021972656, "logps/chosen": -206.1851806640625, "logps/rejected": -389.8322448730469, "loss": 0.2222, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -0.7538734078407288, "rewards/margins": 2.9801974296569824, "rewards/rejected": -3.7340705394744873, "step": 40 }, { "epoch": 0.48517520215633425, "grad_norm": 9.375, "learning_rate": 3.069633772257844e-07, "logits/chosen": 92.80540466308594, "logits/rejected": 120.6998062133789, "logps/chosen": -197.2289276123047, "logps/rejected": -413.96673583984375, "loss": 0.1888, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -0.8127392530441284, "rewards/margins": 3.4538745880126953, "rewards/rejected": -4.266613960266113, "step": 45 }, { "epoch": 0.5390835579514824, "grad_norm": 7.15625, "learning_rate": 2.5957568342250883e-07, "logits/chosen": 80.5639877319336, "logits/rejected": 109.0477066040039, "logps/chosen": -183.60572814941406, "logps/rejected": -414.2920837402344, "loss": 0.157, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6448982357978821, "rewards/margins": 3.7357356548309326, "rewards/rejected": -4.380634307861328, "step": 50 }, { "epoch": 0.5929919137466307, "grad_norm": 17.125, "learning_rate": 2.1183767894528135e-07, "logits/chosen": 80.83811950683594, "logits/rejected": 109.01493072509766, "logps/chosen": -218.6677703857422, "logps/rejected": -411.70428466796875, "loss": 0.177, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.001700758934021, "rewards/margins": 3.578040361404419, "rewards/rejected": -4.579740524291992, "step": 55 }, { "epoch": 0.6469002695417789, "grad_norm": 7.21875, "learning_rate": 1.6549578039787434e-07, "logits/chosen": 80.84095764160156, "logits/rejected": 104.05110168457031, "logps/chosen": -204.79974365234375, "logps/rejected": -417.6326599121094, "loss": 0.1528, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.9564255475997925, "rewards/margins": 3.9052727222442627, "rewards/rejected": -4.861698150634766, "step": 60 }, { "epoch": 0.7008086253369272, "grad_norm": 9.875, "learning_rate": 1.2224533013822236e-07, "logits/chosen": 82.07149505615234, "logits/rejected": 105.08570861816406, "logps/chosen": -217.42166137695312, "logps/rejected": -414.4361877441406, "loss": 0.1848, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.2266864776611328, "rewards/margins": 3.6652228832244873, "rewards/rejected": -4.891909599304199, "step": 65 }, { "epoch": 0.7547169811320755, "grad_norm": 15.25, "learning_rate": 8.366857495860869e-08, "logits/chosen": 77.88710021972656, "logits/rejected": 102.08468627929688, "logps/chosen": -211.1795654296875, "logps/rejected": -418.01007080078125, "loss": 0.1585, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -1.0707504749298096, "rewards/margins": 4.129985809326172, "rewards/rejected": -5.200736045837402, "step": 70 }, { "epoch": 0.8086253369272237, "grad_norm": 8.1875, "learning_rate": 5.117678218164337e-08, "logits/chosen": 77.63814544677734, "logits/rejected": 105.76668548583984, "logps/chosen": -207.71939086914062, "logps/rejected": -437.01324462890625, "loss": 0.1593, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1950900554656982, "rewards/margins": 3.9784095287323, "rewards/rejected": -5.173499584197998, "step": 75 }, { "epoch": 0.862533692722372, "grad_norm": 6.03125, "learning_rate": 2.5958610759736126e-08, "logits/chosen": 75.05287170410156, "logits/rejected": 102.56523132324219, "logps/chosen": -200.25509643554688, "logps/rejected": -432.19091796875, "loss": 0.1329, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.9279828071594238, "rewards/margins": 4.384153366088867, "rewards/rejected": -5.312136650085449, "step": 80 }, { "epoch": 0.9164420485175202, "grad_norm": 8.9375, "learning_rate": 8.936626136714753e-09, "logits/chosen": 77.63639068603516, "logits/rejected": 104.80506896972656, "logps/chosen": -214.80581665039062, "logps/rejected": -433.83575439453125, "loss": 0.1275, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -1.0722030401229858, "rewards/margins": 4.1656694412231445, "rewards/rejected": -5.237872123718262, "step": 85 }, { "epoch": 0.9703504043126685, "grad_norm": 8.1875, "learning_rate": 7.335497040648897e-10, "logits/chosen": 77.35391998291016, "logits/rejected": 102.33561706542969, "logps/chosen": -231.8318328857422, "logps/rejected": -451.8515625, "loss": 0.1304, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1285860538482666, "rewards/margins": 4.282302379608154, "rewards/rejected": -5.410888671875, "step": 90 }, { "epoch": 0.9919137466307277, "step": 92, "total_flos": 0.0, "train_loss": 0.25287168732155924, "train_runtime": 1304.8057, "train_samples_per_second": 4.549, "train_steps_per_second": 0.071 } ], "logging_steps": 5, "max_steps": 92, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }