{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.0,
  "eval_steps": 500,
  "global_step": 37620,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01594896331738437,
      "grad_norm": 793786.1875,
      "learning_rate": 4.9867091972355135e-05,
      "loss": 4.3231,
      "step": 100
    },
    {
      "epoch": 0.03189792663476874,
      "grad_norm": 705751.0625,
      "learning_rate": 4.973418394471026e-05,
      "loss": 4.1867,
      "step": 200
    },
    {
      "epoch": 0.04784688995215311,
      "grad_norm": 761077.1875,
      "learning_rate": 4.9601275917065395e-05,
      "loss": 4.225,
      "step": 300
    },
    {
      "epoch": 0.06379585326953748,
      "grad_norm": 660073.9375,
      "learning_rate": 4.946836788942053e-05,
      "loss": 4.2529,
      "step": 400
    },
    {
      "epoch": 0.07974481658692185,
      "grad_norm": 701404.4375,
      "learning_rate": 4.9335459861775654e-05,
      "loss": 4.2985,
      "step": 500
    },
    {
      "epoch": 0.09569377990430622,
      "grad_norm": 776949.5625,
      "learning_rate": 4.920255183413078e-05,
      "loss": 4.2124,
      "step": 600
    },
    {
      "epoch": 0.11164274322169059,
      "grad_norm": 703487.375,
      "learning_rate": 4.906964380648591e-05,
      "loss": 4.224,
      "step": 700
    },
    {
      "epoch": 0.12759170653907495,
      "grad_norm": 727441.625,
      "learning_rate": 4.893673577884104e-05,
      "loss": 4.3259,
      "step": 800
    },
    {
      "epoch": 0.14354066985645933,
      "grad_norm": 772143.4375,
      "learning_rate": 4.880382775119617e-05,
      "loss": 4.1641,
      "step": 900
    },
    {
      "epoch": 0.1594896331738437,
      "grad_norm": 834641.6875,
      "learning_rate": 4.8670919723551306e-05,
      "loss": 4.2651,
      "step": 1000
    },
    {
      "epoch": 0.17543859649122806,
      "grad_norm": 743223.0,
      "learning_rate": 4.853801169590643e-05,
      "loss": 4.1867,
      "step": 1100
    },
    {
      "epoch": 0.19138755980861244,
      "grad_norm": 643478.125,
      "learning_rate": 4.8405103668261565e-05,
      "loss": 4.1816,
      "step": 1200
    },
    {
      "epoch": 0.20733652312599682,
      "grad_norm": 656405.6875,
      "learning_rate": 4.82721956406167e-05,
      "loss": 4.1677,
      "step": 1300
    },
    {
      "epoch": 0.22328548644338117,
      "grad_norm": 714701.3125,
      "learning_rate": 4.813928761297183e-05,
      "loss": 4.1595,
      "step": 1400
    },
    {
      "epoch": 0.23923444976076555,
      "grad_norm": 739746.875,
      "learning_rate": 4.800637958532696e-05,
      "loss": 4.1319,
      "step": 1500
    },
    {
      "epoch": 0.2551834130781499,
      "grad_norm": 736049.3125,
      "learning_rate": 4.787347155768209e-05,
      "loss": 4.1543,
      "step": 1600
    },
    {
      "epoch": 0.2711323763955343,
      "grad_norm": 668736.5,
      "learning_rate": 4.7740563530037217e-05,
      "loss": 4.2052,
      "step": 1700
    },
    {
      "epoch": 0.28708133971291866,
      "grad_norm": 655993.0625,
      "learning_rate": 4.760765550239234e-05,
      "loss": 4.1642,
      "step": 1800
    },
    {
      "epoch": 0.30303030303030304,
      "grad_norm": 732162.3125,
      "learning_rate": 4.7474747474747476e-05,
      "loss": 4.1464,
      "step": 1900
    },
    {
      "epoch": 0.3189792663476874,
      "grad_norm": 687130.875,
      "learning_rate": 4.734183944710261e-05,
      "loss": 4.181,
      "step": 2000
    },
    {
      "epoch": 0.3349282296650718,
      "grad_norm": 823373.1875,
      "learning_rate": 4.7208931419457735e-05,
      "loss": 4.1591,
      "step": 2100
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 716504.625,
      "learning_rate": 4.707602339181287e-05,
      "loss": 4.1391,
      "step": 2200
    },
    {
      "epoch": 0.3668261562998405,
      "grad_norm": 714200.6875,
      "learning_rate": 4.6943115364168e-05,
      "loss": 4.0962,
      "step": 2300
    },
    {
      "epoch": 0.3827751196172249,
      "grad_norm": 662553.0,
      "learning_rate": 4.681020733652313e-05,
      "loss": 4.0964,
      "step": 2400
    },
    {
      "epoch": 0.39872408293460926,
      "grad_norm": 767714.875,
      "learning_rate": 4.667729930887826e-05,
      "loss": 4.1174,
      "step": 2500
    },
    {
      "epoch": 0.41467304625199364,
      "grad_norm": 763852.75,
      "learning_rate": 4.6544391281233393e-05,
      "loss": 4.0442,
      "step": 2600
    },
    {
      "epoch": 0.430622009569378,
      "grad_norm": 733045.8125,
      "learning_rate": 4.641148325358852e-05,
      "loss": 4.112,
      "step": 2700
    },
    {
      "epoch": 0.44657097288676234,
      "grad_norm": 752065.875,
      "learning_rate": 4.6278575225943646e-05,
      "loss": 4.1223,
      "step": 2800
    },
    {
      "epoch": 0.4625199362041467,
      "grad_norm": 675739.8125,
      "learning_rate": 4.614566719829878e-05,
      "loss": 4.1608,
      "step": 2900
    },
    {
      "epoch": 0.4784688995215311,
      "grad_norm": 666147.375,
      "learning_rate": 4.6012759170653905e-05,
      "loss": 4.0803,
      "step": 3000
    },
    {
      "epoch": 0.4944178628389155,
      "grad_norm": 594287.9375,
      "learning_rate": 4.587985114300904e-05,
      "loss": 4.045,
      "step": 3100
    },
    {
      "epoch": 0.5103668261562998,
      "grad_norm": 778230.25,
      "learning_rate": 4.574694311536417e-05,
      "loss": 4.1268,
      "step": 3200
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 683312.0625,
      "learning_rate": 4.56140350877193e-05,
      "loss": 4.0424,
      "step": 3300
    },
    {
      "epoch": 0.5422647527910686,
      "grad_norm": 640858.25,
      "learning_rate": 4.548112706007443e-05,
      "loss": 4.0235,
      "step": 3400
    },
    {
      "epoch": 0.5582137161084529,
      "grad_norm": 680217.875,
      "learning_rate": 4.5348219032429564e-05,
      "loss": 4.1423,
      "step": 3500
    },
    {
      "epoch": 0.5741626794258373,
      "grad_norm": 708408.4375,
      "learning_rate": 4.521531100478469e-05,
      "loss": 4.0291,
      "step": 3600
    },
    {
      "epoch": 0.5901116427432217,
      "grad_norm": 663333.0,
      "learning_rate": 4.508240297713982e-05,
      "loss": 3.9998,
      "step": 3700
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 652592.0,
      "learning_rate": 4.494949494949495e-05,
      "loss": 4.0424,
      "step": 3800
    },
    {
      "epoch": 0.6220095693779905,
      "grad_norm": 667416.9375,
      "learning_rate": 4.481658692185008e-05,
      "loss": 4.053,
      "step": 3900
    },
    {
      "epoch": 0.6379585326953748,
      "grad_norm": 674108.4375,
      "learning_rate": 4.468367889420521e-05,
      "loss": 4.0748,
      "step": 4000
    },
    {
      "epoch": 0.6539074960127592,
      "grad_norm": 714003.625,
      "learning_rate": 4.455077086656034e-05,
      "loss": 3.9807,
      "step": 4100
    },
    {
      "epoch": 0.6698564593301436,
      "grad_norm": 693166.4375,
      "learning_rate": 4.4417862838915475e-05,
      "loss": 3.993,
      "step": 4200
    },
    {
      "epoch": 0.6858054226475279,
      "grad_norm": 689997.125,
      "learning_rate": 4.42849548112706e-05,
      "loss": 4.0593,
      "step": 4300
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 741118.5625,
      "learning_rate": 4.4152046783625734e-05,
      "loss": 4.0409,
      "step": 4400
    },
    {
      "epoch": 0.7177033492822966,
      "grad_norm": 668626.9375,
      "learning_rate": 4.401913875598087e-05,
      "loss": 4.06,
      "step": 4500
    },
    {
      "epoch": 0.733652312599681,
      "grad_norm": 771064.125,
      "learning_rate": 4.388623072833599e-05,
      "loss": 4.0122,
      "step": 4600
    },
    {
      "epoch": 0.7496012759170654,
      "grad_norm": 756131.375,
      "learning_rate": 4.3753322700691126e-05,
      "loss": 4.043,
      "step": 4700
    },
    {
      "epoch": 0.7655502392344498,
      "grad_norm": 737817.5625,
      "learning_rate": 4.362041467304626e-05,
      "loss": 4.0213,
      "step": 4800
    },
    {
      "epoch": 0.7814992025518341,
      "grad_norm": 758023.5625,
      "learning_rate": 4.3487506645401385e-05,
      "loss": 3.9778,
      "step": 4900
    },
    {
      "epoch": 0.7974481658692185,
      "grad_norm": 672895.1875,
      "learning_rate": 4.335459861775651e-05,
      "loss": 4.0434,
      "step": 5000
    },
    {
      "epoch": 0.8133971291866029,
      "grad_norm": 710937.375,
      "learning_rate": 4.3221690590111645e-05,
      "loss": 4.0117,
      "step": 5100
    },
    {
      "epoch": 0.8293460925039873,
      "grad_norm": 773563.875,
      "learning_rate": 4.308878256246677e-05,
      "loss": 4.0516,
      "step": 5200
    },
    {
      "epoch": 0.8452950558213717,
      "grad_norm": 750733.4375,
      "learning_rate": 4.2955874534821904e-05,
      "loss": 3.9798,
      "step": 5300
    },
    {
      "epoch": 0.861244019138756,
      "grad_norm": 706317.3125,
      "learning_rate": 4.282296650717704e-05,
      "loss": 3.9845,
      "step": 5400
    },
    {
      "epoch": 0.8771929824561403,
      "grad_norm": 710855.75,
      "learning_rate": 4.269005847953216e-05,
      "loss": 4.0295,
      "step": 5500
    },
    {
      "epoch": 0.8931419457735247,
      "grad_norm": 712988.9375,
      "learning_rate": 4.2557150451887296e-05,
      "loss": 3.9732,
      "step": 5600
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 628492.75,
      "learning_rate": 4.242424242424243e-05,
      "loss": 4.0012,
      "step": 5700
    },
    {
      "epoch": 0.9250398724082934,
      "grad_norm": 821738.375,
      "learning_rate": 4.2291334396597556e-05,
      "loss": 3.9867,
      "step": 5800
    },
    {
      "epoch": 0.9409888357256778,
      "grad_norm": 720818.125,
      "learning_rate": 4.215842636895269e-05,
      "loss": 3.962,
      "step": 5900
    },
    {
      "epoch": 0.9569377990430622,
      "grad_norm": 698428.0625,
      "learning_rate": 4.2025518341307815e-05,
      "loss": 4.0095,
      "step": 6000
    },
    {
      "epoch": 0.9728867623604466,
      "grad_norm": 769185.75,
      "learning_rate": 4.189261031366295e-05,
      "loss": 3.9469,
      "step": 6100
    },
    {
      "epoch": 0.988835725677831,
      "grad_norm": 706422.25,
      "learning_rate": 4.1759702286018074e-05,
      "loss": 3.9466,
      "step": 6200
    },
    {
      "epoch": 1.0,
      "eval_loss": 3.8981289863586426,
      "eval_runtime": 213.136,
      "eval_samples_per_second": 134.295,
      "eval_steps_per_second": 4.199,
      "step": 6270
    },
    {
      "epoch": 1.0047846889952152,
      "grad_norm": 664245.5,
      "learning_rate": 4.162679425837321e-05,
      "loss": 3.9397,
      "step": 6300
    },
    {
      "epoch": 1.0207336523125996,
      "grad_norm": 790251.5625,
      "learning_rate": 4.149388623072834e-05,
      "loss": 3.7918,
      "step": 6400
    },
    {
      "epoch": 1.036682615629984,
      "grad_norm": 813129.1875,
      "learning_rate": 4.1360978203083467e-05,
      "loss": 3.8563,
      "step": 6500
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 695871.5,
      "learning_rate": 4.12280701754386e-05,
      "loss": 3.959,
      "step": 6600
    },
    {
      "epoch": 1.0685805422647527,
      "grad_norm": 640218.3125,
      "learning_rate": 4.109516214779373e-05,
      "loss": 3.9882,
      "step": 6700
    },
    {
      "epoch": 1.0845295055821371,
      "grad_norm": 709886.125,
      "learning_rate": 4.096225412014886e-05,
      "loss": 3.9328,
      "step": 6800
    },
    {
      "epoch": 1.1004784688995215,
      "grad_norm": 663732.1875,
      "learning_rate": 4.082934609250399e-05,
      "loss": 3.947,
      "step": 6900
    },
    {
      "epoch": 1.1164274322169059,
      "grad_norm": 649781.9375,
      "learning_rate": 4.069643806485912e-05,
      "loss": 3.8735,
      "step": 7000
    },
    {
      "epoch": 1.1323763955342903,
      "grad_norm": 788934.4375,
      "learning_rate": 4.056353003721425e-05,
      "loss": 3.8548,
      "step": 7100
    },
    {
      "epoch": 1.1483253588516746,
      "grad_norm": 688279.3125,
      "learning_rate": 4.043062200956938e-05,
      "loss": 3.8844,
      "step": 7200
    },
    {
      "epoch": 1.164274322169059,
      "grad_norm": 698958.0,
      "learning_rate": 4.029771398192451e-05,
      "loss": 3.878,
      "step": 7300
    },
    {
      "epoch": 1.1802232854864434,
      "grad_norm": 777544.8125,
      "learning_rate": 4.016480595427964e-05,
      "loss": 3.8299,
      "step": 7400
    },
    {
      "epoch": 1.1961722488038278,
      "grad_norm": 768888.3125,
      "learning_rate": 4.003189792663477e-05,
      "loss": 3.859,
      "step": 7500
    },
    {
      "epoch": 1.2121212121212122,
      "grad_norm": 703599.8125,
      "learning_rate": 3.98989898989899e-05,
      "loss": 3.9008,
      "step": 7600
    },
    {
      "epoch": 1.2280701754385965,
      "grad_norm": 684616.75,
      "learning_rate": 3.976608187134503e-05,
      "loss": 3.8917,
      "step": 7700
    },
    {
      "epoch": 1.244019138755981,
      "grad_norm": 754650.9375,
      "learning_rate": 3.963317384370016e-05,
      "loss": 3.7987,
      "step": 7800
    },
    {
      "epoch": 1.2599681020733653,
      "grad_norm": 833289.8125,
      "learning_rate": 3.9500265816055295e-05,
      "loss": 3.8097,
      "step": 7900
    },
    {
      "epoch": 1.2759170653907497,
      "grad_norm": 712939.125,
      "learning_rate": 3.936735778841042e-05,
      "loss": 3.91,
      "step": 8000
    },
    {
      "epoch": 1.291866028708134,
      "grad_norm": 691034.4375,
      "learning_rate": 3.9234449760765554e-05,
      "loss": 3.8607,
      "step": 8100
    },
    {
      "epoch": 1.3078149920255182,
      "grad_norm": 777656.1875,
      "learning_rate": 3.910154173312068e-05,
      "loss": 3.8294,
      "step": 8200
    },
    {
      "epoch": 1.3237639553429026,
      "grad_norm": 779268.6875,
      "learning_rate": 3.896863370547581e-05,
      "loss": 3.839,
      "step": 8300
    },
    {
      "epoch": 1.339712918660287,
      "grad_norm": 737519.0,
      "learning_rate": 3.883572567783094e-05,
      "loss": 3.8773,
      "step": 8400
    },
    {
      "epoch": 1.3556618819776713,
      "grad_norm": 722279.3125,
      "learning_rate": 3.870281765018607e-05,
      "loss": 3.8351,
      "step": 8500
    },
    {
      "epoch": 1.3716108452950557,
      "grad_norm": 763525.9375,
      "learning_rate": 3.8569909622541206e-05,
      "loss": 3.8517,
      "step": 8600
    },
    {
      "epoch": 1.38755980861244,
      "grad_norm": 715289.3125,
      "learning_rate": 3.843700159489633e-05,
      "loss": 3.8272,
      "step": 8700
    },
    {
      "epoch": 1.4035087719298245,
      "grad_norm": 656186.625,
      "learning_rate": 3.8304093567251465e-05,
      "loss": 3.8894,
      "step": 8800
    },
    {
      "epoch": 1.4194577352472089,
      "grad_norm": 756528.875,
      "learning_rate": 3.81711855396066e-05,
      "loss": 3.8387,
      "step": 8900
    },
    {
      "epoch": 1.4354066985645932,
      "grad_norm": 716015.0,
      "learning_rate": 3.8038277511961725e-05,
      "loss": 3.7842,
      "step": 9000
    },
    {
      "epoch": 1.4513556618819776,
      "grad_norm": 721006.3125,
      "learning_rate": 3.790536948431686e-05,
      "loss": 3.7769,
      "step": 9100
    },
    {
      "epoch": 1.467304625199362,
      "grad_norm": 800538.1875,
      "learning_rate": 3.7772461456671984e-05,
      "loss": 3.8393,
      "step": 9200
    },
    {
      "epoch": 1.4832535885167464,
      "grad_norm": 699156.0,
      "learning_rate": 3.763955342902711e-05,
      "loss": 3.7527,
      "step": 9300
    },
    {
      "epoch": 1.4992025518341308,
      "grad_norm": 699306.375,
      "learning_rate": 3.750664540138224e-05,
      "loss": 3.8306,
      "step": 9400
    },
    {
      "epoch": 1.5151515151515151,
      "grad_norm": 689892.5,
      "learning_rate": 3.7373737373737376e-05,
      "loss": 3.8582,
      "step": 9500
    },
    {
      "epoch": 1.5311004784688995,
      "grad_norm": 712134.3125,
      "learning_rate": 3.72408293460925e-05,
      "loss": 3.9048,
      "step": 9600
    },
    {
      "epoch": 1.547049441786284,
      "grad_norm": 611158.625,
      "learning_rate": 3.7107921318447635e-05,
      "loss": 3.8315,
      "step": 9700
    },
    {
      "epoch": 1.5629984051036683,
      "grad_norm": 814951.875,
      "learning_rate": 3.697501329080277e-05,
      "loss": 3.8433,
      "step": 9800
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 638232.25,
      "learning_rate": 3.6842105263157895e-05,
      "loss": 3.8697,
      "step": 9900
    },
    {
      "epoch": 1.594896331738437,
      "grad_norm": 717277.8125,
      "learning_rate": 3.670919723551303e-05,
      "loss": 3.8447,
      "step": 10000
    },
    {
      "epoch": 1.6108452950558214,
      "grad_norm": 623000.1875,
      "learning_rate": 3.657628920786816e-05,
      "loss": 3.9104,
      "step": 10100
    },
    {
      "epoch": 1.6267942583732058,
      "grad_norm": 737050.875,
      "learning_rate": 3.644338118022329e-05,
      "loss": 3.8208,
      "step": 10200
    },
    {
      "epoch": 1.6427432216905902,
      "grad_norm": 766050.0,
      "learning_rate": 3.631047315257842e-05,
      "loss": 3.8454,
      "step": 10300
    },
    {
      "epoch": 1.6586921850079746,
      "grad_norm": 681873.25,
      "learning_rate": 3.6177565124933546e-05,
      "loss": 3.8284,
      "step": 10400
    },
    {
      "epoch": 1.674641148325359,
      "grad_norm": 636321.0625,
      "learning_rate": 3.604465709728867e-05,
      "loss": 3.8191,
      "step": 10500
    },
    {
      "epoch": 1.6905901116427433,
      "grad_norm": 751694.875,
      "learning_rate": 3.5911749069643806e-05,
      "loss": 3.8152,
      "step": 10600
    },
    {
      "epoch": 1.7065390749601277,
      "grad_norm": 779390.25,
      "learning_rate": 3.577884104199894e-05,
      "loss": 3.8235,
      "step": 10700
    },
    {
      "epoch": 1.722488038277512,
      "grad_norm": 780193.4375,
      "learning_rate": 3.5645933014354065e-05,
      "loss": 3.8109,
      "step": 10800
    },
    {
      "epoch": 1.7384370015948964,
      "grad_norm": 678532.75,
      "learning_rate": 3.55130249867092e-05,
      "loss": 3.8017,
      "step": 10900
    },
    {
      "epoch": 1.7543859649122808,
      "grad_norm": 733045.375,
      "learning_rate": 3.538011695906433e-05,
      "loss": 3.8518,
      "step": 11000
    },
    {
      "epoch": 1.7703349282296652,
      "grad_norm": 646417.375,
      "learning_rate": 3.5247208931419464e-05,
      "loss": 3.7623,
      "step": 11100
    },
    {
      "epoch": 1.7862838915470496,
      "grad_norm": 641868.25,
      "learning_rate": 3.511430090377459e-05,
      "loss": 3.7536,
      "step": 11200
    },
    {
      "epoch": 1.802232854864434,
      "grad_norm": 743804.375,
      "learning_rate": 3.498139287612972e-05,
      "loss": 3.8056,
      "step": 11300
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 725540.5625,
      "learning_rate": 3.484848484848485e-05,
      "loss": 3.8078,
      "step": 11400
    },
    {
      "epoch": 1.8341307814992025,
      "grad_norm": 735264.125,
      "learning_rate": 3.4715576820839976e-05,
      "loss": 3.8247,
      "step": 11500
    },
    {
      "epoch": 1.8500797448165869,
      "grad_norm": 708785.5625,
      "learning_rate": 3.458266879319511e-05,
      "loss": 3.7898,
      "step": 11600
    },
    {
      "epoch": 1.8660287081339713,
      "grad_norm": 864416.3125,
      "learning_rate": 3.444976076555024e-05,
      "loss": 3.8642,
      "step": 11700
    },
    {
      "epoch": 1.8819776714513556,
      "grad_norm": 693382.5625,
      "learning_rate": 3.431685273790537e-05,
      "loss": 3.7433,
      "step": 11800
    },
    {
      "epoch": 1.89792663476874,
      "grad_norm": 841755.25,
      "learning_rate": 3.41839447102605e-05,
      "loss": 3.8562,
      "step": 11900
    },
    {
      "epoch": 1.9138755980861244,
      "grad_norm": 662765.125,
      "learning_rate": 3.4051036682615634e-05,
      "loss": 3.8703,
      "step": 12000
    },
    {
      "epoch": 1.9298245614035088,
      "grad_norm": 707709.75,
      "learning_rate": 3.391812865497076e-05,
      "loss": 3.8402,
      "step": 12100
    },
    {
      "epoch": 1.9457735247208932,
      "grad_norm": 616722.875,
      "learning_rate": 3.3785220627325893e-05,
      "loss": 3.8432,
      "step": 12200
    },
    {
      "epoch": 1.9617224880382775,
      "grad_norm": 795238.625,
      "learning_rate": 3.3652312599681026e-05,
      "loss": 3.7319,
      "step": 12300
    },
    {
      "epoch": 1.977671451355662,
      "grad_norm": 709130.0,
      "learning_rate": 3.351940457203615e-05,
      "loss": 3.7628,
      "step": 12400
    },
    {
      "epoch": 1.9936204146730463,
      "grad_norm": 789321.375,
      "learning_rate": 3.3386496544391286e-05,
      "loss": 3.7821,
      "step": 12500
    },
    {
      "epoch": 2.0,
      "eval_loss": 3.800079584121704,
      "eval_runtime": 212.9573,
      "eval_samples_per_second": 134.407,
      "eval_steps_per_second": 4.203,
      "step": 12540
    },
    {
      "epoch": 2.0095693779904304,
      "grad_norm": 862786.0,
      "learning_rate": 3.325358851674641e-05,
      "loss": 3.7512,
      "step": 12600
    },
    {
      "epoch": 2.025518341307815,
      "grad_norm": 772642.5625,
      "learning_rate": 3.312068048910154e-05,
      "loss": 3.6611,
      "step": 12700
    },
    {
      "epoch": 2.041467304625199,
      "grad_norm": 706907.1875,
      "learning_rate": 3.298777246145667e-05,
      "loss": 3.7484,
      "step": 12800
    },
    {
      "epoch": 2.0574162679425836,
      "grad_norm": 940896.375,
      "learning_rate": 3.2854864433811804e-05,
      "loss": 3.7545,
      "step": 12900
    },
    {
      "epoch": 2.073365231259968,
      "grad_norm": 670013.875,
      "learning_rate": 3.272195640616693e-05,
      "loss": 3.6504,
      "step": 13000
    },
    {
      "epoch": 2.0893141945773523,
      "grad_norm": 702168.0,
      "learning_rate": 3.2589048378522064e-05,
      "loss": 3.6133,
      "step": 13100
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 741310.75,
      "learning_rate": 3.24561403508772e-05,
      "loss": 3.7202,
      "step": 13200
    },
    {
      "epoch": 2.121212121212121,
      "grad_norm": 630812.625,
      "learning_rate": 3.232323232323233e-05,
      "loss": 3.7131,
      "step": 13300
    },
    {
      "epoch": 2.1371610845295055,
      "grad_norm": 736768.125,
      "learning_rate": 3.2190324295587456e-05,
      "loss": 3.6982,
      "step": 13400
    },
    {
      "epoch": 2.15311004784689,
      "grad_norm": 743122.8125,
      "learning_rate": 3.205741626794259e-05,
      "loss": 3.7196,
      "step": 13500
    },
    {
      "epoch": 2.1690590111642742,
      "grad_norm": 685713.25,
      "learning_rate": 3.1924508240297715e-05,
      "loss": 3.7312,
      "step": 13600
    },
    {
      "epoch": 2.1850079744816586,
      "grad_norm": 829695.75,
      "learning_rate": 3.179160021265284e-05,
      "loss": 3.6914,
      "step": 13700
    },
    {
      "epoch": 2.200956937799043,
      "grad_norm": 640866.6875,
      "learning_rate": 3.1658692185007975e-05,
      "loss": 3.8376,
      "step": 13800
    },
    {
      "epoch": 2.2169059011164274,
      "grad_norm": 705478.25,
      "learning_rate": 3.152578415736311e-05,
      "loss": 3.7294,
      "step": 13900
    },
    {
      "epoch": 2.2328548644338118,
      "grad_norm": 664668.8125,
      "learning_rate": 3.1392876129718234e-05,
      "loss": 3.7046,
      "step": 14000
    },
    {
      "epoch": 2.248803827751196,
      "grad_norm": 653658.25,
      "learning_rate": 3.125996810207337e-05,
      "loss": 3.7142,
      "step": 14100
    },
    {
      "epoch": 2.2647527910685805,
      "grad_norm": 737460.0625,
      "learning_rate": 3.11270600744285e-05,
      "loss": 3.6449,
      "step": 14200
    },
    {
      "epoch": 2.280701754385965,
      "grad_norm": 657366.8125,
      "learning_rate": 3.0994152046783626e-05,
      "loss": 3.7051,
      "step": 14300
    },
    {
      "epoch": 2.2966507177033493,
      "grad_norm": 734922.0625,
      "learning_rate": 3.086124401913876e-05,
      "loss": 3.6793,
      "step": 14400
    },
    {
      "epoch": 2.3125996810207337,
      "grad_norm": 636888.125,
      "learning_rate": 3.072833599149389e-05,
      "loss": 3.764,
      "step": 14500
    },
    {
      "epoch": 2.328548644338118,
      "grad_norm": 776567.125,
      "learning_rate": 3.059542796384902e-05,
      "loss": 3.6952,
      "step": 14600
    },
    {
      "epoch": 2.3444976076555024,
      "grad_norm": 678031.5,
      "learning_rate": 3.0462519936204148e-05,
      "loss": 3.7086,
      "step": 14700
    },
    {
      "epoch": 2.360446570972887,
      "grad_norm": 702883.9375,
      "learning_rate": 3.032961190855928e-05,
      "loss": 3.6974,
      "step": 14800
    },
    {
      "epoch": 2.376395534290271,
      "grad_norm": 640335.375,
      "learning_rate": 3.0196703880914407e-05,
      "loss": 3.6555,
      "step": 14900
    },
    {
      "epoch": 2.3923444976076556,
      "grad_norm": 686070.0625,
      "learning_rate": 3.0063795853269537e-05,
      "loss": 3.7167,
      "step": 15000
    },
    {
      "epoch": 2.40829346092504,
      "grad_norm": 676044.25,
      "learning_rate": 2.993088782562467e-05,
      "loss": 3.6554,
      "step": 15100
    },
    {
      "epoch": 2.4242424242424243,
      "grad_norm": 767818.6875,
      "learning_rate": 2.9797979797979796e-05,
      "loss": 3.693,
      "step": 15200
    },
    {
      "epoch": 2.4401913875598087,
      "grad_norm": 739911.0625,
      "learning_rate": 2.966507177033493e-05,
      "loss": 3.6431,
      "step": 15300
    },
    {
      "epoch": 2.456140350877193,
      "grad_norm": 771256.3125,
      "learning_rate": 2.9532163742690062e-05,
      "loss": 3.6216,
      "step": 15400
    },
    {
      "epoch": 2.4720893141945774,
      "grad_norm": 773173.375,
      "learning_rate": 2.939925571504519e-05,
      "loss": 3.7742,
      "step": 15500
    },
    {
      "epoch": 2.488038277511962,
      "grad_norm": 762888.3125,
      "learning_rate": 2.9266347687400318e-05,
      "loss": 3.7236,
      "step": 15600
    },
    {
      "epoch": 2.503987240829346,
      "grad_norm": 879857.0625,
      "learning_rate": 2.913343965975545e-05,
      "loss": 3.7253,
      "step": 15700
    },
    {
      "epoch": 2.5199362041467306,
      "grad_norm": 733285.0625,
      "learning_rate": 2.9000531632110584e-05,
      "loss": 3.7166,
      "step": 15800
    },
    {
      "epoch": 2.535885167464115,
      "grad_norm": 833014.3125,
      "learning_rate": 2.886762360446571e-05,
      "loss": 3.6705,
      "step": 15900
    },
    {
      "epoch": 2.5518341307814993,
      "grad_norm": 736943.4375,
      "learning_rate": 2.8734715576820844e-05,
      "loss": 3.7253,
      "step": 16000
    },
    {
      "epoch": 2.5677830940988837,
      "grad_norm": 708644.4375,
      "learning_rate": 2.8601807549175973e-05,
      "loss": 3.6672,
      "step": 16100
    },
    {
      "epoch": 2.583732057416268,
      "grad_norm": 795735.8125,
      "learning_rate": 2.84688995215311e-05,
      "loss": 3.728,
      "step": 16200
    },
    {
      "epoch": 2.5996810207336525,
      "grad_norm": 864533.875,
      "learning_rate": 2.8335991493886233e-05,
      "loss": 3.6478,
      "step": 16300
    },
    {
      "epoch": 2.6156299840510364,
      "grad_norm": 835745.0,
      "learning_rate": 2.8203083466241366e-05,
      "loss": 3.6204,
      "step": 16400
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 726415.3125,
      "learning_rate": 2.8070175438596492e-05,
      "loss": 3.6899,
      "step": 16500
    },
    {
      "epoch": 2.647527910685805,
      "grad_norm": 619188.1875,
      "learning_rate": 2.793726741095162e-05,
      "loss": 3.6442,
      "step": 16600
    },
    {
      "epoch": 2.66347687400319,
      "grad_norm": 704278.1875,
      "learning_rate": 2.7804359383306755e-05,
      "loss": 3.7113,
      "step": 16700
    },
    {
      "epoch": 2.679425837320574,
      "grad_norm": 787753.0,
      "learning_rate": 2.767145135566188e-05,
      "loss": 3.7368,
      "step": 16800
    },
    {
      "epoch": 2.6953748006379588,
      "grad_norm": 738743.5625,
      "learning_rate": 2.7538543328017014e-05,
      "loss": 3.7575,
      "step": 16900
    },
    {
      "epoch": 2.7113237639553427,
      "grad_norm": 797812.75,
      "learning_rate": 2.7405635300372147e-05,
      "loss": 3.6586,
      "step": 17000
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 662883.0,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 3.6812,
      "step": 17100
    },
    {
      "epoch": 2.7432216905901115,
      "grad_norm": 708693.8125,
      "learning_rate": 2.7139819245082403e-05,
      "loss": 3.6645,
      "step": 17200
    },
    {
      "epoch": 2.7591706539074963,
      "grad_norm": 750630.625,
      "learning_rate": 2.7006911217437536e-05,
      "loss": 3.659,
      "step": 17300
    },
    {
      "epoch": 2.77511961722488,
      "grad_norm": 743643.5625,
      "learning_rate": 2.6874003189792662e-05,
      "loss": 3.6794,
      "step": 17400
    },
    {
      "epoch": 2.7910685805422646,
      "grad_norm": 683095.9375,
      "learning_rate": 2.6741095162147795e-05,
      "loss": 3.6587,
      "step": 17500
    },
    {
      "epoch": 2.807017543859649,
      "grad_norm": 659289.8125,
      "learning_rate": 2.6608187134502928e-05,
      "loss": 3.7067,
      "step": 17600
    },
    {
      "epoch": 2.8229665071770333,
      "grad_norm": 722875.9375,
      "learning_rate": 2.6475279106858054e-05,
      "loss": 3.6918,
      "step": 17700
    },
    {
      "epoch": 2.8389154704944177,
      "grad_norm": 643060.4375,
      "learning_rate": 2.6342371079213184e-05,
      "loss": 3.694,
      "step": 17800
    },
    {
      "epoch": 2.854864433811802,
      "grad_norm": 811117.3125,
      "learning_rate": 2.6209463051568317e-05,
      "loss": 3.703,
      "step": 17900
    },
    {
      "epoch": 2.8708133971291865,
      "grad_norm": 835076.6875,
      "learning_rate": 2.6076555023923443e-05,
      "loss": 3.6346,
      "step": 18000
    },
    {
      "epoch": 2.886762360446571,
      "grad_norm": 800111.875,
      "learning_rate": 2.5943646996278576e-05,
      "loss": 3.6589,
      "step": 18100
    },
    {
      "epoch": 2.9027113237639552,
      "grad_norm": 726367.5,
      "learning_rate": 2.5810738968633706e-05,
      "loss": 3.6522,
      "step": 18200
    },
    {
      "epoch": 2.9186602870813396,
      "grad_norm": 717922.25,
      "learning_rate": 2.567783094098884e-05,
      "loss": 3.6592,
      "step": 18300
    },
    {
      "epoch": 2.934609250398724,
      "grad_norm": 841302.0625,
      "learning_rate": 2.5544922913343965e-05,
      "loss": 3.688,
      "step": 18400
    },
    {
      "epoch": 2.9505582137161084,
      "grad_norm": 697304.1875,
      "learning_rate": 2.5412014885699098e-05,
      "loss": 3.6017,
      "step": 18500
    },
    {
      "epoch": 2.9665071770334928,
      "grad_norm": 929050.25,
      "learning_rate": 2.527910685805423e-05,
      "loss": 3.6116,
      "step": 18600
    },
    {
      "epoch": 2.982456140350877,
      "grad_norm": 699048.9375,
      "learning_rate": 2.5146198830409358e-05,
      "loss": 3.6474,
      "step": 18700
    },
    {
      "epoch": 2.9984051036682615,
      "grad_norm": 783686.0625,
      "learning_rate": 2.5013290802764487e-05,
      "loss": 3.6783,
      "step": 18800
    },
    {
      "epoch": 3.0,
      "eval_loss": 3.706315755844116,
      "eval_runtime": 212.8533,
      "eval_samples_per_second": 134.473,
      "eval_steps_per_second": 4.205,
      "step": 18810
    },
    {
      "epoch": 3.014354066985646,
      "grad_norm": 709059.3125,
      "learning_rate": 2.4880382775119617e-05,
      "loss": 3.5423,
      "step": 18900
    },
    {
      "epoch": 3.0303030303030303,
      "grad_norm": 655684.75,
      "learning_rate": 2.474747474747475e-05,
      "loss": 3.5606,
      "step": 19000
    },
    {
      "epoch": 3.0462519936204147,
      "grad_norm": 759915.1875,
      "learning_rate": 2.461456671982988e-05,
      "loss": 3.6979,
      "step": 19100
    },
    {
      "epoch": 3.062200956937799,
      "grad_norm": 835860.375,
      "learning_rate": 2.448165869218501e-05,
      "loss": 3.6324,
      "step": 19200
    },
    {
      "epoch": 3.0781499202551834,
      "grad_norm": 675779.5625,
      "learning_rate": 2.434875066454014e-05,
      "loss": 3.5773,
      "step": 19300
    },
    {
      "epoch": 3.094098883572568,
      "grad_norm": 700443.875,
      "learning_rate": 2.421584263689527e-05,
      "loss": 3.6309,
      "step": 19400
    },
    {
      "epoch": 3.110047846889952,
      "grad_norm": 768801.6875,
      "learning_rate": 2.4082934609250398e-05,
      "loss": 3.639,
      "step": 19500
    },
    {
      "epoch": 3.1259968102073366,
      "grad_norm": 705109.6875,
      "learning_rate": 2.395002658160553e-05,
      "loss": 3.6365,
      "step": 19600
    },
    {
      "epoch": 3.141945773524721,
      "grad_norm": 661887.5,
      "learning_rate": 2.381711855396066e-05,
      "loss": 3.5754,
      "step": 19700
    },
    {
      "epoch": 3.1578947368421053,
      "grad_norm": 691131.0,
      "learning_rate": 2.368421052631579e-05,
      "loss": 3.5888,
      "step": 19800
    },
    {
      "epoch": 3.1738437001594897,
      "grad_norm": 738841.625,
      "learning_rate": 2.355130249867092e-05,
      "loss": 3.6433,
      "step": 19900
    },
    {
      "epoch": 3.189792663476874,
      "grad_norm": 663372.1875,
      "learning_rate": 2.341839447102605e-05,
      "loss": 3.5642,
      "step": 20000
    },
    {
      "epoch": 3.2057416267942584,
      "grad_norm": 619548.75,
      "learning_rate": 2.3285486443381183e-05,
      "loss": 3.5864,
      "step": 20100
    },
    {
      "epoch": 3.221690590111643,
      "grad_norm": 773919.6875,
      "learning_rate": 2.3152578415736312e-05,
      "loss": 3.6072,
      "step": 20200
    },
    {
      "epoch": 3.237639553429027,
      "grad_norm": 728946.0625,
      "learning_rate": 2.3019670388091442e-05,
      "loss": 3.6212,
      "step": 20300
    },
    {
      "epoch": 3.2535885167464116,
      "grad_norm": 659326.9375,
      "learning_rate": 2.288676236044657e-05,
      "loss": 3.6068,
      "step": 20400
    },
    {
      "epoch": 3.269537480063796,
      "grad_norm": 759802.25,
      "learning_rate": 2.27538543328017e-05,
      "loss": 3.6783,
      "step": 20500
    },
    {
      "epoch": 3.2854864433811803,
      "grad_norm": 805141.9375,
      "learning_rate": 2.262094630515683e-05,
      "loss": 3.6441,
      "step": 20600
    },
    {
      "epoch": 3.3014354066985647,
      "grad_norm": 716521.875,
      "learning_rate": 2.2488038277511964e-05,
      "loss": 3.5748,
      "step": 20700
    },
    {
      "epoch": 3.317384370015949,
      "grad_norm": 920115.4375,
      "learning_rate": 2.2355130249867094e-05,
      "loss": 3.5985,
      "step": 20800
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 838658.0625,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 3.564,
      "step": 20900
    },
    {
      "epoch": 3.349282296650718,
      "grad_norm": 838474.9375,
      "learning_rate": 2.2089314194577353e-05,
      "loss": 3.5875,
      "step": 21000
    },
    {
      "epoch": 3.3652312599681022,
      "grad_norm": 663992.25,
      "learning_rate": 2.1956406166932483e-05,
      "loss": 3.5681,
      "step": 21100
    },
    {
      "epoch": 3.3811802232854866,
      "grad_norm": 679489.9375,
      "learning_rate": 2.1823498139287616e-05,
      "loss": 3.5276,
      "step": 21200
    },
    {
      "epoch": 3.397129186602871,
      "grad_norm": 721495.6875,
      "learning_rate": 2.1690590111642745e-05,
      "loss": 3.5599,
      "step": 21300
    },
    {
      "epoch": 3.4130781499202554,
      "grad_norm": 667400.9375,
      "learning_rate": 2.1557682083997875e-05,
      "loss": 3.5554,
      "step": 21400
    },
    {
      "epoch": 3.4290271132376393,
      "grad_norm": 706122.0,
      "learning_rate": 2.1424774056353005e-05,
      "loss": 3.6543,
      "step": 21500
    },
    {
      "epoch": 3.444976076555024,
      "grad_norm": 709513.125,
      "learning_rate": 2.1291866028708134e-05,
      "loss": 3.6568,
      "step": 21600
    },
    {
      "epoch": 3.460925039872408,
      "grad_norm": 652669.375,
      "learning_rate": 2.1158958001063264e-05,
      "loss": 3.6423,
      "step": 21700
    },
    {
      "epoch": 3.476874003189793,
      "grad_norm": 713371.625,
      "learning_rate": 2.1026049973418397e-05,
      "loss": 3.6102,
      "step": 21800
    },
    {
      "epoch": 3.492822966507177,
      "grad_norm": 694536.875,
      "learning_rate": 2.0893141945773527e-05,
      "loss": 3.5488,
      "step": 21900
    },
    {
      "epoch": 3.5087719298245617,
      "grad_norm": 668059.9375,
      "learning_rate": 2.0760233918128656e-05,
      "loss": 3.6078,
      "step": 22000
    },
    {
      "epoch": 3.5247208931419456,
      "grad_norm": 731925.875,
      "learning_rate": 2.0627325890483786e-05,
      "loss": 3.5455,
      "step": 22100
    },
    {
      "epoch": 3.5406698564593304,
      "grad_norm": 766777.9375,
      "learning_rate": 2.0494417862838915e-05,
      "loss": 3.5582,
      "step": 22200
    },
    {
      "epoch": 3.5566188197767143,
      "grad_norm": 755381.1875,
      "learning_rate": 2.0361509835194045e-05,
      "loss": 3.5859,
      "step": 22300
    },
    {
      "epoch": 3.5725677830940987,
      "grad_norm": 749873.6875,
      "learning_rate": 2.0228601807549178e-05,
      "loss": 3.7161,
      "step": 22400
    },
    {
      "epoch": 3.588516746411483,
      "grad_norm": 677539.125,
      "learning_rate": 2.0095693779904308e-05,
      "loss": 3.6225,
      "step": 22500
    },
    {
      "epoch": 3.6044657097288675,
      "grad_norm": 744990.8125,
      "learning_rate": 1.9962785752259437e-05,
      "loss": 3.5935,
      "step": 22600
    },
    {
      "epoch": 3.620414673046252,
      "grad_norm": 629363.0,
      "learning_rate": 1.9829877724614567e-05,
      "loss": 3.5828,
      "step": 22700
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 685327.4375,
      "learning_rate": 1.9696969696969697e-05,
      "loss": 3.5937,
      "step": 22800
    },
    {
      "epoch": 3.6523125996810206,
      "grad_norm": 760474.25,
      "learning_rate": 1.956406166932483e-05,
      "loss": 3.5799,
      "step": 22900
    },
    {
      "epoch": 3.668261562998405,
      "grad_norm": 725888.9375,
      "learning_rate": 1.943115364167996e-05,
      "loss": 3.5465,
      "step": 23000
    },
    {
      "epoch": 3.6842105263157894,
      "grad_norm": 752312.0625,
      "learning_rate": 1.929824561403509e-05,
      "loss": 3.6185,
      "step": 23100
    },
    {
      "epoch": 3.7001594896331738,
      "grad_norm": 855498.5625,
      "learning_rate": 1.916533758639022e-05,
      "loss": 3.5792,
      "step": 23200
    },
    {
      "epoch": 3.716108452950558,
      "grad_norm": 679264.5,
      "learning_rate": 1.9032429558745348e-05,
      "loss": 3.5531,
      "step": 23300
    },
    {
      "epoch": 3.7320574162679425,
      "grad_norm": 688482.875,
      "learning_rate": 1.8899521531100478e-05,
      "loss": 3.5495,
      "step": 23400
    },
    {
      "epoch": 3.748006379585327,
      "grad_norm": 756693.75,
      "learning_rate": 1.876661350345561e-05,
      "loss": 3.567,
      "step": 23500
    },
    {
      "epoch": 3.7639553429027113,
      "grad_norm": 854033.0625,
      "learning_rate": 1.863370547581074e-05,
      "loss": 3.5611,
      "step": 23600
    },
    {
      "epoch": 3.7799043062200957,
      "grad_norm": 753585.0625,
      "learning_rate": 1.850079744816587e-05,
      "loss": 3.5439,
      "step": 23700
    },
    {
      "epoch": 3.79585326953748,
      "grad_norm": 685358.25,
      "learning_rate": 1.8367889420521e-05,
      "loss": 3.535,
      "step": 23800
    },
    {
      "epoch": 3.8118022328548644,
      "grad_norm": 813028.4375,
      "learning_rate": 1.823498139287613e-05,
      "loss": 3.566,
      "step": 23900
    },
    {
      "epoch": 3.827751196172249,
      "grad_norm": 676295.9375,
      "learning_rate": 1.8102073365231263e-05,
      "loss": 3.5208,
      "step": 24000
    },
    {
      "epoch": 3.843700159489633,
      "grad_norm": 705614.1875,
      "learning_rate": 1.7969165337586392e-05,
      "loss": 3.5108,
      "step": 24100
    },
    {
      "epoch": 3.8596491228070176,
      "grad_norm": 643356.125,
      "learning_rate": 1.7836257309941522e-05,
      "loss": 3.5416,
      "step": 24200
    },
    {
      "epoch": 3.875598086124402,
      "grad_norm": 783635.625,
      "learning_rate": 1.770334928229665e-05,
      "loss": 3.6022,
      "step": 24300
    },
    {
      "epoch": 3.8915470494417863,
      "grad_norm": 829787.5,
      "learning_rate": 1.757044125465178e-05,
      "loss": 3.6056,
      "step": 24400
    },
    {
      "epoch": 3.9074960127591707,
      "grad_norm": 776633.0625,
      "learning_rate": 1.743753322700691e-05,
      "loss": 3.5857,
      "step": 24500
    },
    {
      "epoch": 3.923444976076555,
      "grad_norm": 708855.1875,
      "learning_rate": 1.7304625199362044e-05,
      "loss": 3.6086,
      "step": 24600
    },
    {
      "epoch": 3.9393939393939394,
      "grad_norm": 906587.0,
      "learning_rate": 1.7171717171717173e-05,
      "loss": 3.5361,
      "step": 24700
    },
    {
      "epoch": 3.955342902711324,
      "grad_norm": 835950.4375,
      "learning_rate": 1.7038809144072303e-05,
      "loss": 3.5976,
      "step": 24800
    },
    {
      "epoch": 3.971291866028708,
      "grad_norm": 816570.6875,
      "learning_rate": 1.6905901116427433e-05,
      "loss": 3.596,
      "step": 24900
    },
    {
      "epoch": 3.9872408293460926,
      "grad_norm": 751378.9375,
      "learning_rate": 1.6772993088782562e-05,
      "loss": 3.4856,
      "step": 25000
    },
    {
      "epoch": 4.0,
      "eval_loss": 3.646216869354248,
      "eval_runtime": 212.927,
      "eval_samples_per_second": 134.426,
      "eval_steps_per_second": 4.203,
      "step": 25080
    },
    {
      "epoch": 4.003189792663477,
      "grad_norm": 735559.375,
      "learning_rate": 1.6640085061137695e-05,
      "loss": 3.5286,
      "step": 25100
    },
    {
      "epoch": 4.019138755980861,
      "grad_norm": 741852.3125,
      "learning_rate": 1.6507177033492825e-05,
      "loss": 3.5367,
      "step": 25200
    },
    {
      "epoch": 4.035087719298246,
      "grad_norm": 650216.625,
      "learning_rate": 1.6374269005847955e-05,
      "loss": 3.5056,
      "step": 25300
    },
    {
      "epoch": 4.05103668261563,
      "grad_norm": 717938.9375,
      "learning_rate": 1.6241360978203084e-05,
      "loss": 3.4996,
      "step": 25400
    },
    {
      "epoch": 4.0669856459330145,
      "grad_norm": 840426.9375,
      "learning_rate": 1.6108452950558214e-05,
      "loss": 3.5293,
      "step": 25500
    },
    {
      "epoch": 4.082934609250398,
      "grad_norm": 699668.5,
      "learning_rate": 1.5975544922913344e-05,
      "loss": 3.5122,
      "step": 25600
    },
    {
      "epoch": 4.098883572567783,
      "grad_norm": 649238.8125,
      "learning_rate": 1.5842636895268477e-05,
      "loss": 3.4949,
      "step": 25700
    },
    {
      "epoch": 4.114832535885167,
      "grad_norm": 669479.3125,
      "learning_rate": 1.5709728867623606e-05,
      "loss": 3.5515,
      "step": 25800
    },
    {
      "epoch": 4.130781499202552,
      "grad_norm": 645182.625,
      "learning_rate": 1.5576820839978733e-05,
      "loss": 3.5113,
      "step": 25900
    },
    {
      "epoch": 4.146730462519936,
      "grad_norm": 913136.6875,
      "learning_rate": 1.5443912812333866e-05,
      "loss": 3.5326,
      "step": 26000
    },
    {
      "epoch": 4.162679425837321,
      "grad_norm": 641233.3125,
      "learning_rate": 1.5311004784688995e-05,
      "loss": 3.4768,
      "step": 26100
    },
    {
      "epoch": 4.178628389154705,
      "grad_norm": 798713.5625,
      "learning_rate": 1.5178096757044127e-05,
      "loss": 3.4977,
      "step": 26200
    },
    {
      "epoch": 4.1945773524720895,
      "grad_norm": 719145.9375,
      "learning_rate": 1.5045188729399256e-05,
      "loss": 3.4549,
      "step": 26300
    },
    {
      "epoch": 4.2105263157894735,
      "grad_norm": 692601.9375,
      "learning_rate": 1.4912280701754386e-05,
      "loss": 3.5228,
      "step": 26400
    },
    {
      "epoch": 4.226475279106858,
      "grad_norm": 631408.4375,
      "learning_rate": 1.4779372674109517e-05,
      "loss": 3.4742,
      "step": 26500
    },
    {
      "epoch": 4.242424242424242,
      "grad_norm": 733137.0,
      "learning_rate": 1.4646464646464647e-05,
      "loss": 3.4242,
      "step": 26600
    },
    {
      "epoch": 4.258373205741627,
      "grad_norm": 741812.9375,
      "learning_rate": 1.4513556618819777e-05,
      "loss": 3.4598,
      "step": 26700
    },
    {
      "epoch": 4.274322169059011,
      "grad_norm": 682720.125,
      "learning_rate": 1.4380648591174908e-05,
      "loss": 3.4827,
      "step": 26800
    },
    {
      "epoch": 4.290271132376396,
      "grad_norm": 784646.125,
      "learning_rate": 1.4247740563530037e-05,
      "loss": 3.4712,
      "step": 26900
    },
    {
      "epoch": 4.30622009569378,
      "grad_norm": 713282.75,
      "learning_rate": 1.4114832535885167e-05,
      "loss": 3.5542,
      "step": 27000
    },
    {
      "epoch": 4.3221690590111645,
      "grad_norm": 699440.4375,
      "learning_rate": 1.3981924508240298e-05,
      "loss": 3.4572,
      "step": 27100
    },
    {
      "epoch": 4.3381180223285485,
      "grad_norm": 733063.875,
      "learning_rate": 1.3849016480595428e-05,
      "loss": 3.4106,
      "step": 27200
    },
    {
      "epoch": 4.354066985645933,
      "grad_norm": 799606.875,
      "learning_rate": 1.371610845295056e-05,
      "loss": 3.4189,
      "step": 27300
    },
    {
      "epoch": 4.370015948963317,
      "grad_norm": 722583.25,
      "learning_rate": 1.3583200425305689e-05,
      "loss": 3.448,
      "step": 27400
    },
    {
      "epoch": 4.385964912280702,
      "grad_norm": 761491.5625,
      "learning_rate": 1.3450292397660819e-05,
      "loss": 3.5248,
      "step": 27500
    },
    {
      "epoch": 4.401913875598086,
      "grad_norm": 633397.6875,
      "learning_rate": 1.331738437001595e-05,
      "loss": 3.5675,
      "step": 27600
    },
    {
      "epoch": 4.417862838915471,
      "grad_norm": 743160.4375,
      "learning_rate": 1.318447634237108e-05,
      "loss": 3.4983,
      "step": 27700
    },
    {
      "epoch": 4.433811802232855,
      "grad_norm": 689363.3125,
      "learning_rate": 1.305156831472621e-05,
      "loss": 3.5689,
      "step": 27800
    },
    {
      "epoch": 4.44976076555024,
      "grad_norm": 634674.0,
      "learning_rate": 1.291866028708134e-05,
      "loss": 3.4763,
      "step": 27900
    },
    {
      "epoch": 4.4657097288676235,
      "grad_norm": 682868.5,
      "learning_rate": 1.278575225943647e-05,
      "loss": 3.4725,
      "step": 28000
    },
    {
      "epoch": 4.481658692185008,
      "grad_norm": 839863.0,
      "learning_rate": 1.26528442317916e-05,
      "loss": 3.5255,
      "step": 28100
    },
    {
      "epoch": 4.497607655502392,
      "grad_norm": 840497.0625,
      "learning_rate": 1.2519936204146731e-05,
      "loss": 3.4914,
      "step": 28200
    },
    {
      "epoch": 4.513556618819777,
      "grad_norm": 798480.8125,
      "learning_rate": 1.2387028176501861e-05,
      "loss": 3.4787,
      "step": 28300
    },
    {
      "epoch": 4.529505582137161,
      "grad_norm": 776783.625,
      "learning_rate": 1.2254120148856992e-05,
      "loss": 3.4817,
      "step": 28400
    },
    {
      "epoch": 4.545454545454545,
      "grad_norm": 860058.625,
      "learning_rate": 1.2121212121212122e-05,
      "loss": 3.515,
      "step": 28500
    },
    {
      "epoch": 4.56140350877193,
      "grad_norm": 797709.0,
      "learning_rate": 1.1988304093567252e-05,
      "loss": 3.4715,
      "step": 28600
    },
    {
      "epoch": 4.577352472089315,
      "grad_norm": 686232.0,
      "learning_rate": 1.1855396065922381e-05,
      "loss": 3.493,
      "step": 28700
    },
    {
      "epoch": 4.5933014354066986,
      "grad_norm": 713671.875,
      "learning_rate": 1.1722488038277513e-05,
      "loss": 3.4767,
      "step": 28800
    },
    {
      "epoch": 4.6092503987240825,
      "grad_norm": 830858.875,
      "learning_rate": 1.1589580010632644e-05,
      "loss": 3.5506,
      "step": 28900
    },
    {
      "epoch": 4.625199362041467,
      "grad_norm": 681684.8125,
      "learning_rate": 1.1456671982987772e-05,
      "loss": 3.468,
      "step": 29000
    },
    {
      "epoch": 4.641148325358852,
      "grad_norm": 693863.25,
      "learning_rate": 1.1323763955342903e-05,
      "loss": 3.4847,
      "step": 29100
    },
    {
      "epoch": 4.657097288676236,
      "grad_norm": 612233.75,
      "learning_rate": 1.1190855927698035e-05,
      "loss": 3.3994,
      "step": 29200
    },
    {
      "epoch": 4.67304625199362,
      "grad_norm": 901251.25,
      "learning_rate": 1.1057947900053164e-05,
      "loss": 3.5144,
      "step": 29300
    },
    {
      "epoch": 4.688995215311005,
      "grad_norm": 742618.4375,
      "learning_rate": 1.0925039872408294e-05,
      "loss": 3.5388,
      "step": 29400
    },
    {
      "epoch": 4.70494417862839,
      "grad_norm": 797654.0,
      "learning_rate": 1.0792131844763423e-05,
      "loss": 3.4722,
      "step": 29500
    },
    {
      "epoch": 4.720893141945774,
      "grad_norm": 763737.9375,
      "learning_rate": 1.0659223817118555e-05,
      "loss": 3.5006,
      "step": 29600
    },
    {
      "epoch": 4.7368421052631575,
      "grad_norm": 702353.875,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 3.4772,
      "step": 29700
    },
    {
      "epoch": 4.752791068580542,
      "grad_norm": 790482.1875,
      "learning_rate": 1.0393407761828814e-05,
      "loss": 3.4507,
      "step": 29800
    },
    {
      "epoch": 4.768740031897926,
      "grad_norm": 706455.9375,
      "learning_rate": 1.0260499734183945e-05,
      "loss": 3.4769,
      "step": 29900
    },
    {
      "epoch": 4.784688995215311,
      "grad_norm": 720554.3125,
      "learning_rate": 1.0127591706539077e-05,
      "loss": 3.4551,
      "step": 30000
    },
    {
      "epoch": 4.800637958532695,
      "grad_norm": 754827.6875,
      "learning_rate": 9.994683678894205e-06,
      "loss": 3.4982,
      "step": 30100
    },
    {
      "epoch": 4.81658692185008,
      "grad_norm": 696089.375,
      "learning_rate": 9.861775651249336e-06,
      "loss": 3.4637,
      "step": 30200
    },
    {
      "epoch": 4.832535885167464,
      "grad_norm": 737095.3125,
      "learning_rate": 9.728867623604466e-06,
      "loss": 3.4621,
      "step": 30300
    },
    {
      "epoch": 4.848484848484849,
      "grad_norm": 768262.125,
      "learning_rate": 9.595959595959595e-06,
      "loss": 3.48,
      "step": 30400
    },
    {
      "epoch": 4.8644338118022326,
      "grad_norm": 767420.5625,
      "learning_rate": 9.463051568314727e-06,
      "loss": 3.4463,
      "step": 30500
    },
    {
      "epoch": 4.880382775119617,
      "grad_norm": 706310.0,
      "learning_rate": 9.330143540669856e-06,
      "loss": 3.4668,
      "step": 30600
    },
    {
      "epoch": 4.896331738437001,
      "grad_norm": 828940.625,
      "learning_rate": 9.197235513024988e-06,
      "loss": 3.4377,
      "step": 30700
    },
    {
      "epoch": 4.912280701754386,
      "grad_norm": 788075.375,
      "learning_rate": 9.064327485380117e-06,
      "loss": 3.4144,
      "step": 30800
    },
    {
      "epoch": 4.92822966507177,
      "grad_norm": 805364.0625,
      "learning_rate": 8.931419457735247e-06,
      "loss": 3.4726,
      "step": 30900
    },
    {
      "epoch": 4.944178628389155,
      "grad_norm": 604472.75,
      "learning_rate": 8.798511430090378e-06,
      "loss": 3.4936,
      "step": 31000
    },
    {
      "epoch": 4.960127591706539,
      "grad_norm": 684835.3125,
      "learning_rate": 8.66560340244551e-06,
      "loss": 3.4637,
      "step": 31100
    },
    {
      "epoch": 4.976076555023924,
      "grad_norm": 684998.5,
      "learning_rate": 8.532695374800638e-06,
      "loss": 3.5321,
      "step": 31200
    },
    {
      "epoch": 4.992025518341308,
      "grad_norm": 711871.5625,
      "learning_rate": 8.399787347155769e-06,
      "loss": 3.4809,
      "step": 31300
    },
    {
      "epoch": 5.0,
      "eval_loss": 3.6019129753112793,
      "eval_runtime": 212.6906,
      "eval_samples_per_second": 134.576,
      "eval_steps_per_second": 4.208,
      "step": 31350
    },
    {
      "epoch": 5.007974481658692,
      "grad_norm": 1041463.125,
      "learning_rate": 8.266879319510899e-06,
      "loss": 3.4055,
      "step": 31400
    },
    {
      "epoch": 5.023923444976076,
      "grad_norm": 623676.3125,
      "learning_rate": 8.133971291866028e-06,
      "loss": 3.5315,
      "step": 31500
    },
    {
      "epoch": 5.039872408293461,
      "grad_norm": 619523.9375,
      "learning_rate": 8.00106326422116e-06,
      "loss": 3.4709,
      "step": 31600
    },
    {
      "epoch": 5.055821371610845,
      "grad_norm": 779379.875,
      "learning_rate": 7.86815523657629e-06,
      "loss": 3.3933,
      "step": 31700
    },
    {
      "epoch": 5.07177033492823,
      "grad_norm": 617667.5625,
      "learning_rate": 7.73524720893142e-06,
      "loss": 3.4738,
      "step": 31800
    },
    {
      "epoch": 5.087719298245614,
      "grad_norm": 691351.5625,
      "learning_rate": 7.602339181286549e-06,
      "loss": 3.4499,
      "step": 31900
    },
    {
      "epoch": 5.103668261562999,
      "grad_norm": 702913.25,
      "learning_rate": 7.469431153641681e-06,
      "loss": 3.4476,
      "step": 32000
    },
    {
      "epoch": 5.119617224880383,
      "grad_norm": 778967.4375,
      "learning_rate": 7.336523125996811e-06,
      "loss": 3.4689,
      "step": 32100
    },
    {
      "epoch": 5.1355661881977674,
      "grad_norm": 693687.3125,
      "learning_rate": 7.20361509835194e-06,
      "loss": 3.381,
      "step": 32200
    },
    {
      "epoch": 5.151515151515151,
      "grad_norm": 843375.5,
      "learning_rate": 7.0707070707070704e-06,
      "loss": 3.438,
      "step": 32300
    },
    {
      "epoch": 5.167464114832536,
      "grad_norm": 734730.5,
      "learning_rate": 6.937799043062202e-06,
      "loss": 3.4564,
      "step": 32400
    },
    {
      "epoch": 5.18341307814992,
      "grad_norm": 716158.375,
      "learning_rate": 6.804891015417332e-06,
      "loss": 3.4529,
      "step": 32500
    },
    {
      "epoch": 5.199362041467305,
      "grad_norm": 703040.625,
      "learning_rate": 6.671982987772461e-06,
      "loss": 3.391,
      "step": 32600
    },
    {
      "epoch": 5.215311004784689,
      "grad_norm": 636837.3125,
      "learning_rate": 6.5390749601275915e-06,
      "loss": 3.4174,
      "step": 32700
    },
    {
      "epoch": 5.231259968102074,
      "grad_norm": 661893.6875,
      "learning_rate": 6.406166932482723e-06,
      "loss": 3.4256,
      "step": 32800
    },
    {
      "epoch": 5.247208931419458,
      "grad_norm": 812641.0625,
      "learning_rate": 6.273258904837853e-06,
      "loss": 3.4981,
      "step": 32900
    },
    {
      "epoch": 5.2631578947368425,
      "grad_norm": 725753.125,
      "learning_rate": 6.140350877192982e-06,
      "loss": 3.5018,
      "step": 33000
    },
    {
      "epoch": 5.279106858054226,
      "grad_norm": 691768.3125,
      "learning_rate": 6.007442849548113e-06,
      "loss": 3.3969,
      "step": 33100
    },
    {
      "epoch": 5.295055821371611,
      "grad_norm": 776131.9375,
      "learning_rate": 5.874534821903243e-06,
      "loss": 3.4489,
      "step": 33200
    },
    {
      "epoch": 5.311004784688995,
      "grad_norm": 747268.25,
      "learning_rate": 5.741626794258374e-06,
      "loss": 3.4075,
      "step": 33300
    },
    {
      "epoch": 5.32695374800638,
      "grad_norm": 864468.1875,
      "learning_rate": 5.608718766613503e-06,
      "loss": 3.4443,
      "step": 33400
    },
    {
      "epoch": 5.342902711323764,
      "grad_norm": 729350.5,
      "learning_rate": 5.475810738968634e-06,
      "loss": 3.4189,
      "step": 33500
    },
    {
      "epoch": 5.358851674641148,
      "grad_norm": 687491.0625,
      "learning_rate": 5.342902711323764e-06,
      "loss": 3.4167,
      "step": 33600
    },
    {
      "epoch": 5.374800637958533,
      "grad_norm": 682089.125,
      "learning_rate": 5.209994683678895e-06,
      "loss": 3.3691,
      "step": 33700
    },
    {
      "epoch": 5.3907496012759175,
      "grad_norm": 698959.25,
      "learning_rate": 5.077086656034024e-06,
      "loss": 3.4449,
      "step": 33800
    },
    {
      "epoch": 5.4066985645933014,
      "grad_norm": 671919.375,
      "learning_rate": 4.944178628389155e-06,
      "loss": 3.4421,
      "step": 33900
    },
    {
      "epoch": 5.422647527910685,
      "grad_norm": 647332.125,
      "learning_rate": 4.811270600744285e-06,
      "loss": 3.4529,
      "step": 34000
    },
    {
      "epoch": 5.43859649122807,
      "grad_norm": 686242.25,
      "learning_rate": 4.678362573099415e-06,
      "loss": 3.3381,
      "step": 34100
    },
    {
      "epoch": 5.454545454545454,
      "grad_norm": 936387.5,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 3.4765,
      "step": 34200
    },
    {
      "epoch": 5.470494417862839,
      "grad_norm": 662910.3125,
      "learning_rate": 4.412546517809676e-06,
      "loss": 3.3729,
      "step": 34300
    },
    {
      "epoch": 5.486443381180223,
      "grad_norm": 671547.0,
      "learning_rate": 4.2796384901648065e-06,
      "loss": 3.4489,
      "step": 34400
    },
    {
      "epoch": 5.502392344497608,
      "grad_norm": 777179.3125,
      "learning_rate": 4.146730462519936e-06,
      "loss": 3.424,
      "step": 34500
    },
    {
      "epoch": 5.518341307814992,
      "grad_norm": 628742.25,
      "learning_rate": 4.013822434875067e-06,
      "loss": 3.4128,
      "step": 34600
    },
    {
      "epoch": 5.5342902711323765,
      "grad_norm": 794101.75,
      "learning_rate": 3.880914407230197e-06,
      "loss": 3.3938,
      "step": 34700
    },
    {
      "epoch": 5.55023923444976,
      "grad_norm": 720820.4375,
      "learning_rate": 3.7480063795853268e-06,
      "loss": 3.3622,
      "step": 34800
    },
    {
      "epoch": 5.566188197767145,
      "grad_norm": 686912.5,
      "learning_rate": 3.6150983519404573e-06,
      "loss": 3.3896,
      "step": 34900
    },
    {
      "epoch": 5.582137161084529,
      "grad_norm": 758477.0,
      "learning_rate": 3.4821903242955873e-06,
      "loss": 3.4494,
      "step": 35000
    },
    {
      "epoch": 5.598086124401914,
      "grad_norm": 828910.875,
      "learning_rate": 3.349282296650718e-06,
      "loss": 3.4204,
      "step": 35100
    },
    {
      "epoch": 5.614035087719298,
      "grad_norm": 650998.8125,
      "learning_rate": 3.216374269005848e-06,
      "loss": 3.4115,
      "step": 35200
    },
    {
      "epoch": 5.629984051036683,
      "grad_norm": 681491.375,
      "learning_rate": 3.0834662413609784e-06,
      "loss": 3.3861,
      "step": 35300
    },
    {
      "epoch": 5.645933014354067,
      "grad_norm": 749556.875,
      "learning_rate": 2.9505582137161084e-06,
      "loss": 3.4175,
      "step": 35400
    },
    {
      "epoch": 5.6618819776714515,
      "grad_norm": 677371.6875,
      "learning_rate": 2.817650186071239e-06,
      "loss": 3.4571,
      "step": 35500
    },
    {
      "epoch": 5.6778309409888355,
      "grad_norm": 730101.5625,
      "learning_rate": 2.684742158426369e-06,
      "loss": 3.3966,
      "step": 35600
    },
    {
      "epoch": 5.69377990430622,
      "grad_norm": 657353.375,
      "learning_rate": 2.5518341307814995e-06,
      "loss": 3.4639,
      "step": 35700
    },
    {
      "epoch": 5.709728867623604,
      "grad_norm": 734038.0,
      "learning_rate": 2.4189261031366296e-06,
      "loss": 3.3982,
      "step": 35800
    },
    {
      "epoch": 5.725677830940989,
      "grad_norm": 719495.625,
      "learning_rate": 2.28601807549176e-06,
      "loss": 3.4221,
      "step": 35900
    },
    {
      "epoch": 5.741626794258373,
      "grad_norm": 686361.0625,
      "learning_rate": 2.15311004784689e-06,
      "loss": 3.4886,
      "step": 36000
    },
    {
      "epoch": 5.757575757575758,
      "grad_norm": 804130.625,
      "learning_rate": 2.0202020202020206e-06,
      "loss": 3.4364,
      "step": 36100
    },
    {
      "epoch": 5.773524720893142,
      "grad_norm": 713737.5625,
      "learning_rate": 1.8872939925571505e-06,
      "loss": 3.391,
      "step": 36200
    },
    {
      "epoch": 5.7894736842105265,
      "grad_norm": 734503.125,
      "learning_rate": 1.7543859649122807e-06,
      "loss": 3.4291,
      "step": 36300
    },
    {
      "epoch": 5.8054226475279105,
      "grad_norm": 673341.3125,
      "learning_rate": 1.621477937267411e-06,
      "loss": 3.4816,
      "step": 36400
    },
    {
      "epoch": 5.821371610845295,
      "grad_norm": 643799.3125,
      "learning_rate": 1.4885699096225413e-06,
      "loss": 3.4221,
      "step": 36500
    },
    {
      "epoch": 5.837320574162679,
      "grad_norm": 641417.125,
      "learning_rate": 1.3556618819776716e-06,
      "loss": 3.4331,
      "step": 36600
    },
    {
      "epoch": 5.853269537480064,
      "grad_norm": 707790.25,
      "learning_rate": 1.2227538543328019e-06,
      "loss": 3.3936,
      "step": 36700
    },
    {
      "epoch": 5.869218500797448,
      "grad_norm": 734012.3125,
      "learning_rate": 1.089845826687932e-06,
      "loss": 3.4258,
      "step": 36800
    },
    {
      "epoch": 5.885167464114833,
      "grad_norm": 672716.0,
      "learning_rate": 9.569377990430622e-07,
      "loss": 3.4688,
      "step": 36900
    },
    {
      "epoch": 5.901116427432217,
      "grad_norm": 691052.75,
      "learning_rate": 8.240297713981925e-07,
      "loss": 3.3553,
      "step": 37000
    },
    {
      "epoch": 5.917065390749602,
      "grad_norm": 714107.875,
      "learning_rate": 6.911217437533228e-07,
      "loss": 3.4735,
      "step": 37100
    },
    {
      "epoch": 5.9330143540669855,
      "grad_norm": 783142.9375,
      "learning_rate": 5.582137161084529e-07,
      "loss": 3.4707,
      "step": 37200
    },
    {
      "epoch": 5.94896331738437,
      "grad_norm": 793443.25,
      "learning_rate": 4.2530568846358327e-07,
      "loss": 3.4279,
      "step": 37300
    },
    {
      "epoch": 5.964912280701754,
      "grad_norm": 828328.5625,
      "learning_rate": 2.9239766081871344e-07,
      "loss": 3.4171,
      "step": 37400
    },
    {
      "epoch": 5.980861244019139,
      "grad_norm": 826741.5625,
      "learning_rate": 1.5948963317384372e-07,
      "loss": 3.5064,
      "step": 37500
    },
    {
      "epoch": 5.996810207336523,
      "grad_norm": 698366.375,
      "learning_rate": 2.6581605528973954e-08,
      "loss": 3.42,
      "step": 37600
    },
    {
      "epoch": 6.0,
      "eval_loss": 3.615572929382324,
      "eval_runtime": 213.1542,
      "eval_samples_per_second": 134.283,
      "eval_steps_per_second": 4.199,
      "step": 37620
    }
  ],
  "logging_steps": 100,
  "max_steps": 37620,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.65721373251031e+16,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}