{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9158878504672896, "eval_steps": 50, "global_step": 78, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18691588785046728, "grad_norm": 61.25268073995668, "learning_rate": 5e-07, "logits/chosen": -2.7241337299346924, "logits/rejected": -2.6918282508850098, "logps/chosen": -303.90643310546875, "logps/rejected": -234.9805450439453, "loss": 0.6903, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": 0.0160987488925457, "rewards/margins": 0.004555105231702328, "rewards/rejected": 0.011543644592165947, "step": 5 }, { "epoch": 0.37383177570093457, "grad_norm": 47.60898650679123, "learning_rate": 1e-06, "logits/chosen": -2.6647069454193115, "logits/rejected": -2.650399923324585, "logps/chosen": -269.51849365234375, "logps/rejected": -198.7647705078125, "loss": 0.6278, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.5727251768112183, "rewards/margins": 0.25209158658981323, "rewards/rejected": 0.3206337094306946, "step": 10 }, { "epoch": 0.5607476635514018, "grad_norm": 60.527379497145056, "learning_rate": 9.867190271803463e-07, "logits/chosen": -2.4945449829101562, "logits/rejected": -2.4840779304504395, "logps/chosen": -235.19595336914062, "logps/rejected": -213.60946655273438, "loss": 0.5799, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 1.5359665155410767, "rewards/margins": 0.8706293106079102, "rewards/rejected": 0.6653371453285217, "step": 15 }, { "epoch": 0.7476635514018691, "grad_norm": 53.828799786263204, "learning_rate": 9.475816456775312e-07, "logits/chosen": -2.4158647060394287, "logits/rejected": -2.3855679035186768, "logps/chosen": -268.0628662109375, "logps/rejected": -225.652587890625, "loss": 0.5684, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.9082000255584717, "rewards/margins": 1.4608235359191895, "rewards/rejected": 0.4473763406276703, "step": 20 }, { "epoch": 0.9345794392523364, "grad_norm": 34.458805166311905, "learning_rate": 8.846669854914395e-07, "logits/chosen": -2.2879459857940674, "logits/rejected": -2.2494328022003174, "logps/chosen": -231.61703491210938, "logps/rejected": -187.62875366210938, "loss": 0.5395, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.543787956237793, "rewards/margins": 1.2806603908538818, "rewards/rejected": 0.2631274163722992, "step": 25 }, { "epoch": 1.1214953271028036, "grad_norm": 27.103628504062815, "learning_rate": 8.013173181896282e-07, "logits/chosen": -2.277782917022705, "logits/rejected": -2.277600049972534, "logps/chosen": -259.5715637207031, "logps/rejected": -206.3133087158203, "loss": 0.3477, "rewards/accuracies": 0.84375, "rewards/chosen": 2.0770156383514404, "rewards/margins": 2.2314352989196777, "rewards/rejected": -0.15441982448101044, "step": 30 }, { "epoch": 1.308411214953271, "grad_norm": 21.639384944433875, "learning_rate": 7.019605024359474e-07, "logits/chosen": -2.276589870452881, "logits/rejected": -2.2846901416778564, "logps/chosen": -261.1808166503906, "logps/rejected": -219.38858032226562, "loss": 0.2558, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.4890947341918945, "rewards/margins": 3.4098620414733887, "rewards/rejected": -0.9207670092582703, "step": 35 }, { "epoch": 1.4953271028037383, "grad_norm": 22.680670396225757, "learning_rate": 5.918747589082852e-07, "logits/chosen": -2.3969621658325195, "logits/rejected": -2.3342068195343018, "logps/chosen": -248.27743530273438, "logps/rejected": -212.25845336914062, "loss": 0.2736, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 2.3888657093048096, "rewards/margins": 3.1784424781799316, "rewards/rejected": -0.789576530456543, "step": 40 }, { "epoch": 1.6822429906542056, "grad_norm": 24.61733178022545, "learning_rate": 4.769082706771303e-07, "logits/chosen": -2.397773265838623, "logits/rejected": -2.4283571243286133, "logps/chosen": -246.1250762939453, "logps/rejected": -229.49514770507812, "loss": 0.2653, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4959254264831543, "rewards/margins": 3.4453117847442627, "rewards/rejected": -0.9493860006332397, "step": 45 }, { "epoch": 1.8691588785046729, "grad_norm": 21.42871601886689, "learning_rate": 3.6316850496395855e-07, "logits/chosen": -2.4491772651672363, "logits/rejected": -2.4245429039001465, "logps/chosen": -275.09423828125, "logps/rejected": -236.69076538085938, "loss": 0.2912, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.785498857498169, "rewards/margins": 3.59558367729187, "rewards/rejected": -0.8100848197937012, "step": 50 }, { "epoch": 1.8691588785046729, "eval_logits/chosen": -2.4286158084869385, "eval_logits/rejected": -2.4138076305389404, "eval_logps/chosen": -230.1865997314453, "eval_logps/rejected": -191.80255126953125, "eval_loss": 0.5428566336631775, "eval_rewards/accuracies": 0.78125, "eval_rewards/chosen": 1.8078041076660156, "eval_rewards/margins": 1.948243260383606, "eval_rewards/rejected": -0.14043934643268585, "eval_runtime": 50.1691, "eval_samples_per_second": 15.149, "eval_steps_per_second": 0.239, "step": 50 }, { "epoch": 2.05607476635514, "grad_norm": 19.534577835943573, "learning_rate": 2.566977607165719e-07, "logits/chosen": -2.450115442276001, "logits/rejected": -2.419285297393799, "logps/chosen": -247.16586303710938, "logps/rejected": -211.9401092529297, "loss": 0.2287, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7968287467956543, "rewards/margins": 3.6083950996398926, "rewards/rejected": -0.8115667104721069, "step": 55 }, { "epoch": 2.2429906542056073, "grad_norm": 15.96174358986932, "learning_rate": 1.631521781767214e-07, "logits/chosen": -2.4297375679016113, "logits/rejected": -2.407179355621338, "logps/chosen": -232.9438018798828, "logps/rejected": -221.9869384765625, "loss": 0.1645, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 2.696375846862793, "rewards/margins": 3.9043846130371094, "rewards/rejected": -1.2080087661743164, "step": 60 }, { "epoch": 2.4299065420560746, "grad_norm": 16.25190830636716, "learning_rate": 8.75012627008489e-08, "logits/chosen": -2.460448741912842, "logits/rejected": -2.425128221511841, "logps/chosen": -258.5059814453125, "logps/rejected": -223.2895050048828, "loss": 0.1427, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 3.1485841274261475, "rewards/margins": 4.165754795074463, "rewards/rejected": -1.017170786857605, "step": 65 }, { "epoch": 2.616822429906542, "grad_norm": 15.19014195316505, "learning_rate": 3.376388529782215e-08, "logits/chosen": -2.4554247856140137, "logits/rejected": -2.421217679977417, "logps/chosen": -240.77627563476562, "logps/rejected": -223.79129028320312, "loss": 0.1737, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 2.8354218006134033, "rewards/margins": 4.4457106590271, "rewards/rejected": -1.6102889776229858, "step": 70 }, { "epoch": 2.803738317757009, "grad_norm": 20.62451875265718, "learning_rate": 4.794784562397458e-09, "logits/chosen": -2.437767505645752, "logits/rejected": -2.4236741065979004, "logps/chosen": -251.58251953125, "logps/rejected": -232.46157836914062, "loss": 0.159, "rewards/accuracies": 0.9375, "rewards/chosen": 3.0037763118743896, "rewards/margins": 4.042423248291016, "rewards/rejected": -1.038646936416626, "step": 75 } ], "logging_steps": 5, "max_steps": 78, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 919378820333568.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }