|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.975144705481783, |
|
"eval_steps": 1000, |
|
"global_step": 3670, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.027238678924072182, |
|
"grad_norm": 94.36308714522585, |
|
"learning_rate": 2.4523160762942784e-07, |
|
"loss": 3.2941, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.054477357848144364, |
|
"grad_norm": 72.09113030659383, |
|
"learning_rate": 5.177111716621253e-07, |
|
"loss": 2.9481, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08171603677221655, |
|
"grad_norm": 18.50935923305202, |
|
"learning_rate": 7.90190735694823e-07, |
|
"loss": 1.921, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10895471569628873, |
|
"grad_norm": 8.59280831709028, |
|
"learning_rate": 1.0626702997275206e-06, |
|
"loss": 1.5096, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13619339462036092, |
|
"grad_norm": 5.941916510585534, |
|
"learning_rate": 1.335149863760218e-06, |
|
"loss": 1.1101, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1634320735444331, |
|
"grad_norm": 4.39962525080334, |
|
"learning_rate": 1.6076294277929156e-06, |
|
"loss": 0.9091, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1906707524685053, |
|
"grad_norm": 4.489642566752767, |
|
"learning_rate": 1.8801089918256133e-06, |
|
"loss": 0.7886, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.21790943139257746, |
|
"grad_norm": 3.8997170693605914, |
|
"learning_rate": 2.152588555858311e-06, |
|
"loss": 0.7292, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.24514811031664965, |
|
"grad_norm": 4.2337949046127115, |
|
"learning_rate": 2.4250681198910083e-06, |
|
"loss": 0.6295, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.27238678924072185, |
|
"grad_norm": 3.887748571626415, |
|
"learning_rate": 2.697547683923706e-06, |
|
"loss": 0.5892, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.299625468164794, |
|
"grad_norm": 4.071563747798551, |
|
"learning_rate": 2.9700272479564033e-06, |
|
"loss": 0.4879, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3268641470888662, |
|
"grad_norm": 4.396940499759756, |
|
"learning_rate": 3.2425068119891012e-06, |
|
"loss": 0.4542, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.35410282601293835, |
|
"grad_norm": 3.784810689454143, |
|
"learning_rate": 3.5149863760217988e-06, |
|
"loss": 0.3211, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3813415049370106, |
|
"grad_norm": 3.7737741184600595, |
|
"learning_rate": 3.7874659400544963e-06, |
|
"loss": 0.2891, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.40858018386108275, |
|
"grad_norm": 3.361771664086218, |
|
"learning_rate": 4.059945504087194e-06, |
|
"loss": 0.1996, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4358188627851549, |
|
"grad_norm": 3.2907408057054237, |
|
"learning_rate": 4.332425068119892e-06, |
|
"loss": 0.133, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4630575417092271, |
|
"grad_norm": 3.040778638316543, |
|
"learning_rate": 4.604904632152589e-06, |
|
"loss": 0.1101, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4902962206332993, |
|
"grad_norm": 3.5750949211792156, |
|
"learning_rate": 4.877384196185287e-06, |
|
"loss": 0.0936, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5175348995573714, |
|
"grad_norm": 3.857830728644118, |
|
"learning_rate": 5.149863760217984e-06, |
|
"loss": 0.0757, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5447735784814437, |
|
"grad_norm": 4.5053152903816915, |
|
"learning_rate": 5.422343324250682e-06, |
|
"loss": 0.0677, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5720122574055159, |
|
"grad_norm": 2.0169258478044423, |
|
"learning_rate": 5.694822888283379e-06, |
|
"loss": 0.0607, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.599250936329588, |
|
"grad_norm": 2.7354337720763566, |
|
"learning_rate": 5.9673024523160776e-06, |
|
"loss": 0.057, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6264896152536602, |
|
"grad_norm": 2.41187775653569, |
|
"learning_rate": 6.239782016348774e-06, |
|
"loss": 0.056, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6537282941777324, |
|
"grad_norm": 2.7992598675350258, |
|
"learning_rate": 6.512261580381472e-06, |
|
"loss": 0.0515, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6809669731018045, |
|
"grad_norm": 3.885137671280064, |
|
"learning_rate": 6.78474114441417e-06, |
|
"loss": 0.0517, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7082056520258767, |
|
"grad_norm": 2.47654168655763, |
|
"learning_rate": 7.057220708446867e-06, |
|
"loss": 0.0462, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7354443309499489, |
|
"grad_norm": 2.5720242451467454, |
|
"learning_rate": 7.329700272479565e-06, |
|
"loss": 0.0454, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7626830098740212, |
|
"grad_norm": 4.695199938458052, |
|
"learning_rate": 7.602179836512263e-06, |
|
"loss": 0.0464, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7899216887980933, |
|
"grad_norm": 2.080104008186552, |
|
"learning_rate": 7.87465940054496e-06, |
|
"loss": 0.0416, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8171603677221655, |
|
"grad_norm": 2.1981255385062517, |
|
"learning_rate": 8.147138964577658e-06, |
|
"loss": 0.0446, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8443990466462377, |
|
"grad_norm": 1.7239470049599528, |
|
"learning_rate": 8.419618528610354e-06, |
|
"loss": 0.0452, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8716377255703098, |
|
"grad_norm": 1.8811480620279255, |
|
"learning_rate": 8.692098092643052e-06, |
|
"loss": 0.033, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.898876404494382, |
|
"grad_norm": 2.4779574516049556, |
|
"learning_rate": 8.964577656675751e-06, |
|
"loss": 0.0438, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9261150834184542, |
|
"grad_norm": 2.42289553821888, |
|
"learning_rate": 9.237057220708447e-06, |
|
"loss": 0.0392, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9533537623425263, |
|
"grad_norm": 2.3174600464823256, |
|
"learning_rate": 9.509536784741146e-06, |
|
"loss": 0.0394, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9805924412665986, |
|
"grad_norm": 2.076316899586314, |
|
"learning_rate": 9.782016348773843e-06, |
|
"loss": 0.0387, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0054477357848144, |
|
"grad_norm": 1.7952293400096322, |
|
"learning_rate": 9.99999095346085e-06, |
|
"loss": 0.0332, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0326864147088866, |
|
"grad_norm": 2.3966331090661925, |
|
"learning_rate": 9.999674328027824e-06, |
|
"loss": 0.0349, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0599250936329587, |
|
"grad_norm": 1.8578054931433101, |
|
"learning_rate": 9.998905408372662e-06, |
|
"loss": 0.0332, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.087163772557031, |
|
"grad_norm": 2.067301033604286, |
|
"learning_rate": 9.997684264055478e-06, |
|
"loss": 0.036, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1144024514811033, |
|
"grad_norm": 1.656723292655721, |
|
"learning_rate": 9.99601100554677e-06, |
|
"loss": 0.0315, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.1416411304051755, |
|
"grad_norm": 1.4305496511087652, |
|
"learning_rate": 9.99388578421743e-06, |
|
"loss": 0.0307, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.1688798093292476, |
|
"grad_norm": 1.4756680048744966, |
|
"learning_rate": 9.991308792325045e-06, |
|
"loss": 0.0279, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.1961184882533198, |
|
"grad_norm": 2.488649223393654, |
|
"learning_rate": 9.988280262996507e-06, |
|
"loss": 0.0263, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.223357167177392, |
|
"grad_norm": 1.4666750141420732, |
|
"learning_rate": 9.98480047020693e-06, |
|
"loss": 0.0268, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.2505958461014641, |
|
"grad_norm": 1.320119597330066, |
|
"learning_rate": 9.980869728754847e-06, |
|
"loss": 0.0298, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.2778345250255363, |
|
"grad_norm": 1.390841137051498, |
|
"learning_rate": 9.976488394233752e-06, |
|
"loss": 0.021, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.3050732039496085, |
|
"grad_norm": 1.4228820783227014, |
|
"learning_rate": 9.971656862999917e-06, |
|
"loss": 0.0276, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.3323118828736806, |
|
"grad_norm": 1.1183882912895564, |
|
"learning_rate": 9.966375572136546e-06, |
|
"loss": 0.0282, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.3595505617977528, |
|
"grad_norm": 1.1616366909466755, |
|
"learning_rate": 9.960644999414226e-06, |
|
"loss": 0.0214, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.386789240721825, |
|
"grad_norm": 1.409120984041759, |
|
"learning_rate": 9.954465663247708e-06, |
|
"loss": 0.0178, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.4140279196458971, |
|
"grad_norm": 0.9803319101168316, |
|
"learning_rate": 9.947838122649014e-06, |
|
"loss": 0.0202, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.4412665985699693, |
|
"grad_norm": 1.8131277665703773, |
|
"learning_rate": 9.94076297717686e-06, |
|
"loss": 0.0208, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.4685052774940415, |
|
"grad_norm": 1.1662863071032854, |
|
"learning_rate": 9.933240866882418e-06, |
|
"loss": 0.0169, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.4957439564181136, |
|
"grad_norm": 1.1465063039077388, |
|
"learning_rate": 9.925272472251415e-06, |
|
"loss": 0.0211, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.5229826353421858, |
|
"grad_norm": 1.193468165618273, |
|
"learning_rate": 9.916858514142575e-06, |
|
"loss": 0.0198, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.550221314266258, |
|
"grad_norm": 1.109324425932322, |
|
"learning_rate": 9.907999753722407e-06, |
|
"loss": 0.021, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.5774599931903301, |
|
"grad_norm": 1.290613695650256, |
|
"learning_rate": 9.898696992396333e-06, |
|
"loss": 0.0171, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.6046986721144023, |
|
"grad_norm": 1.6974083987042456, |
|
"learning_rate": 9.888951071736215e-06, |
|
"loss": 0.0192, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.6319373510384745, |
|
"grad_norm": 1.2163159155887682, |
|
"learning_rate": 9.878762873404197e-06, |
|
"loss": 0.013, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6591760299625467, |
|
"grad_norm": 1.0193875076300951, |
|
"learning_rate": 9.86813331907296e-06, |
|
"loss": 0.0163, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.686414708886619, |
|
"grad_norm": 1.1034023564810655, |
|
"learning_rate": 9.857063370342338e-06, |
|
"loss": 0.0174, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.7136533878106912, |
|
"grad_norm": 1.5309168123244021, |
|
"learning_rate": 9.845554028652331e-06, |
|
"loss": 0.0158, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.7408920667347634, |
|
"grad_norm": 0.5532697619897224, |
|
"learning_rate": 9.833606335192506e-06, |
|
"loss": 0.0108, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.7681307456588355, |
|
"grad_norm": 1.0576521653821709, |
|
"learning_rate": 9.821221370807805e-06, |
|
"loss": 0.0154, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.7953694245829077, |
|
"grad_norm": 1.0783675700881836, |
|
"learning_rate": 9.808400255900772e-06, |
|
"loss": 0.0114, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.8226081035069799, |
|
"grad_norm": 0.599102289097004, |
|
"learning_rate": 9.795144150330194e-06, |
|
"loss": 0.0126, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.849846782431052, |
|
"grad_norm": 1.1075249779167584, |
|
"learning_rate": 9.781454253306169e-06, |
|
"loss": 0.0141, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.8770854613551244, |
|
"grad_norm": 0.69017494598675, |
|
"learning_rate": 9.76733180328163e-06, |
|
"loss": 0.0146, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.9043241402791966, |
|
"grad_norm": 0.9541591518502084, |
|
"learning_rate": 9.752778077840302e-06, |
|
"loss": 0.0097, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.9315628192032688, |
|
"grad_norm": 1.515252524842471, |
|
"learning_rate": 9.737794393581125e-06, |
|
"loss": 0.0102, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.958801498127341, |
|
"grad_norm": 0.6520067186874438, |
|
"learning_rate": 9.722382105999156e-06, |
|
"loss": 0.0121, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.9860401770514131, |
|
"grad_norm": 1.111673947257393, |
|
"learning_rate": 9.706542609362928e-06, |
|
"loss": 0.0101, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.010895471569629, |
|
"grad_norm": 1.2740352672206354, |
|
"learning_rate": 9.690277336588338e-06, |
|
"loss": 0.0093, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.038134150493701, |
|
"grad_norm": 1.2314399048766502, |
|
"learning_rate": 9.673587759109007e-06, |
|
"loss": 0.0082, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.065372829417773, |
|
"grad_norm": 0.6515664977184386, |
|
"learning_rate": 9.656475386743166e-06, |
|
"loss": 0.0078, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.0926115083418453, |
|
"grad_norm": 0.6076512914812847, |
|
"learning_rate": 9.638941767557085e-06, |
|
"loss": 0.0086, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.1198501872659175, |
|
"grad_norm": 0.49805773777686746, |
|
"learning_rate": 9.620988487724999e-06, |
|
"loss": 0.0094, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.1470888661899896, |
|
"grad_norm": 1.1149194861607004, |
|
"learning_rate": 9.602617171385646e-06, |
|
"loss": 0.0075, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.174327545114062, |
|
"grad_norm": 0.33357565062562405, |
|
"learning_rate": 9.583829480495325e-06, |
|
"loss": 0.0062, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.201566224038134, |
|
"grad_norm": 1.2725673141069516, |
|
"learning_rate": 9.564627114677546e-06, |
|
"loss": 0.0074, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.2288049029622066, |
|
"grad_norm": 0.8173124298927218, |
|
"learning_rate": 9.54501181106928e-06, |
|
"loss": 0.0102, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.2560435818862787, |
|
"grad_norm": 0.6950238807140082, |
|
"learning_rate": 9.524985344163801e-06, |
|
"loss": 0.0065, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.283282260810351, |
|
"grad_norm": 0.9944262630317349, |
|
"learning_rate": 9.504549525650173e-06, |
|
"loss": 0.0067, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.310520939734423, |
|
"grad_norm": 0.7645573101281118, |
|
"learning_rate": 9.483706204249332e-06, |
|
"loss": 0.0072, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.3377596186584952, |
|
"grad_norm": 0.5284367702467161, |
|
"learning_rate": 9.462457265546867e-06, |
|
"loss": 0.007, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.3649982975825674, |
|
"grad_norm": 0.5487759594771856, |
|
"learning_rate": 9.440804631822421e-06, |
|
"loss": 0.0073, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.3922369765066396, |
|
"grad_norm": 0.6876376014240485, |
|
"learning_rate": 9.418750261875811e-06, |
|
"loss": 0.006, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.4194756554307117, |
|
"grad_norm": 0.5930505890719068, |
|
"learning_rate": 9.396296150849804e-06, |
|
"loss": 0.0034, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.446714334354784, |
|
"grad_norm": 0.2924797737902157, |
|
"learning_rate": 9.373444330049645e-06, |
|
"loss": 0.0048, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.473953013278856, |
|
"grad_norm": 0.818504316665287, |
|
"learning_rate": 9.350196866759289e-06, |
|
"loss": 0.0065, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.5011916922029283, |
|
"grad_norm": 0.6173390676022147, |
|
"learning_rate": 9.326555864054383e-06, |
|
"loss": 0.0063, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.5284303711270004, |
|
"grad_norm": 0.42627927587804193, |
|
"learning_rate": 9.302523460612015e-06, |
|
"loss": 0.0055, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.5556690500510726, |
|
"grad_norm": 0.5539596964886677, |
|
"learning_rate": 9.278101830517234e-06, |
|
"loss": 0.0047, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.5829077289751448, |
|
"grad_norm": 0.3896212375163213, |
|
"learning_rate": 9.253293183066382e-06, |
|
"loss": 0.0056, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.610146407899217, |
|
"grad_norm": 0.658832029703366, |
|
"learning_rate": 9.228099762567221e-06, |
|
"loss": 0.0053, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.637385086823289, |
|
"grad_norm": 0.2940172656833637, |
|
"learning_rate": 9.202523848135903e-06, |
|
"loss": 0.005, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.6646237657473613, |
|
"grad_norm": 0.8264261097315277, |
|
"learning_rate": 9.176567753490795e-06, |
|
"loss": 0.0088, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.6918624446714334, |
|
"grad_norm": 0.47332662322314495, |
|
"learning_rate": 9.15023382674317e-06, |
|
"loss": 0.0049, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.7191011235955056, |
|
"grad_norm": 0.6043816603050725, |
|
"learning_rate": 9.12352445018478e-06, |
|
"loss": 0.0054, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.7191011235955056, |
|
"eval_loss": 0.009971115738153458, |
|
"eval_runtime": 149.6934, |
|
"eval_samples_per_second": 1.336, |
|
"eval_steps_per_second": 0.167, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.7463398025195778, |
|
"grad_norm": 0.18838425652727245, |
|
"learning_rate": 9.096442040072342e-06, |
|
"loss": 0.0036, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.77357848144365, |
|
"grad_norm": 0.33857822247434793, |
|
"learning_rate": 9.06898904640896e-06, |
|
"loss": 0.0038, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.800817160367722, |
|
"grad_norm": 0.56645910713706, |
|
"learning_rate": 9.04116795272248e-06, |
|
"loss": 0.0055, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.8280558392917943, |
|
"grad_norm": 0.28109810510949723, |
|
"learning_rate": 9.01298127584082e-06, |
|
"loss": 0.0036, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.8552945182158664, |
|
"grad_norm": 0.542296858675033, |
|
"learning_rate": 8.984431565664287e-06, |
|
"loss": 0.0048, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.8825331971399386, |
|
"grad_norm": 0.5373500607763515, |
|
"learning_rate": 8.955521404934895e-06, |
|
"loss": 0.0043, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.9097718760640108, |
|
"grad_norm": 0.5880158628582186, |
|
"learning_rate": 8.926253409002724e-06, |
|
"loss": 0.0045, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.937010554988083, |
|
"grad_norm": 1.0360372633008765, |
|
"learning_rate": 8.896630225589325e-06, |
|
"loss": 0.003, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.964249233912155, |
|
"grad_norm": 0.6664430992854442, |
|
"learning_rate": 8.866654534548188e-06, |
|
"loss": 0.0035, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.9914879128362273, |
|
"grad_norm": 0.3828974814299205, |
|
"learning_rate": 8.836329047622315e-06, |
|
"loss": 0.0051, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.0163432073544434, |
|
"grad_norm": 0.5237696754640224, |
|
"learning_rate": 8.805656508198893e-06, |
|
"loss": 0.0025, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.0435818862785156, |
|
"grad_norm": 0.31976718993311765, |
|
"learning_rate": 8.774639691061133e-06, |
|
"loss": 0.0027, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.0708205652025877, |
|
"grad_norm": 0.8311789315891146, |
|
"learning_rate": 8.743281402137234e-06, |
|
"loss": 0.0043, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.09805924412666, |
|
"grad_norm": 0.5395564063408896, |
|
"learning_rate": 8.711584478246545e-06, |
|
"loss": 0.0037, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.125297923050732, |
|
"grad_norm": 0.4868560419941669, |
|
"learning_rate": 8.679551786842947e-06, |
|
"loss": 0.003, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.1525366019748042, |
|
"grad_norm": 0.09982986237608656, |
|
"learning_rate": 8.647186225755435e-06, |
|
"loss": 0.004, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.1797752808988764, |
|
"grad_norm": 0.2992508205991956, |
|
"learning_rate": 8.614490722925976e-06, |
|
"loss": 0.0025, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.2070139598229486, |
|
"grad_norm": 0.7674760077175568, |
|
"learning_rate": 8.581468236144624e-06, |
|
"loss": 0.0029, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.2342526387470207, |
|
"grad_norm": 0.688415374822658, |
|
"learning_rate": 8.548121752781958e-06, |
|
"loss": 0.0024, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.261491317671093, |
|
"grad_norm": 0.9052988126796538, |
|
"learning_rate": 8.514454289518815e-06, |
|
"loss": 0.0034, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.288729996595165, |
|
"grad_norm": 0.40120185903679884, |
|
"learning_rate": 8.480468892073396e-06, |
|
"loss": 0.002, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 3.3159686755192372, |
|
"grad_norm": 0.5944020494018145, |
|
"learning_rate": 8.446168634925744e-06, |
|
"loss": 0.0027, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.3432073544433094, |
|
"grad_norm": 0.4472237168192407, |
|
"learning_rate": 8.411556621039587e-06, |
|
"loss": 0.0076, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.3704460333673816, |
|
"grad_norm": 0.28045654193358444, |
|
"learning_rate": 8.376635981581652e-06, |
|
"loss": 0.0035, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.3976847122914537, |
|
"grad_norm": 0.17536506030522428, |
|
"learning_rate": 8.341409875638396e-06, |
|
"loss": 0.0038, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.424923391215526, |
|
"grad_norm": 5.902141227200851, |
|
"learning_rate": 8.305881489930224e-06, |
|
"loss": 0.0075, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.452162070139598, |
|
"grad_norm": 1.1226970845212438, |
|
"learning_rate": 8.270054038523194e-06, |
|
"loss": 0.0039, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 3.4794007490636703, |
|
"grad_norm": 0.2305609441856671, |
|
"learning_rate": 8.233930762538271e-06, |
|
"loss": 0.0038, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.506639427987743, |
|
"grad_norm": 0.17972249159415393, |
|
"learning_rate": 8.197514929858108e-06, |
|
"loss": 0.0033, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 3.533878106911815, |
|
"grad_norm": 0.22572705207090887, |
|
"learning_rate": 8.160809834831422e-06, |
|
"loss": 0.0018, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.561116785835887, |
|
"grad_norm": 0.48294663787369996, |
|
"learning_rate": 8.123818797974973e-06, |
|
"loss": 0.001, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 3.5883554647599594, |
|
"grad_norm": 0.0324163008144768, |
|
"learning_rate": 8.08654516567318e-06, |
|
"loss": 0.0017, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 3.6155941436840315, |
|
"grad_norm": 0.2702928606649647, |
|
"learning_rate": 8.04899230987537e-06, |
|
"loss": 0.0007, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 3.6428328226081037, |
|
"grad_norm": 0.02733106163122347, |
|
"learning_rate": 8.011163627790765e-06, |
|
"loss": 0.0015, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 3.670071501532176, |
|
"grad_norm": 0.17653493830017525, |
|
"learning_rate": 7.97306254158113e-06, |
|
"loss": 0.0033, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 3.697310180456248, |
|
"grad_norm": 0.06779788652309977, |
|
"learning_rate": 7.934692498051202e-06, |
|
"loss": 0.0015, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 3.72454885938032, |
|
"grad_norm": 0.04283617369672373, |
|
"learning_rate": 7.896056968336868e-06, |
|
"loss": 0.0016, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 3.7517875383043924, |
|
"grad_norm": 0.4528535210316289, |
|
"learning_rate": 7.857159447591153e-06, |
|
"loss": 0.0015, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.7790262172284645, |
|
"grad_norm": 0.03703542904667366, |
|
"learning_rate": 7.81800345466804e-06, |
|
"loss": 0.0007, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 3.8062648961525367, |
|
"grad_norm": 0.22254648728704393, |
|
"learning_rate": 7.778592531804115e-06, |
|
"loss": 0.0018, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.833503575076609, |
|
"grad_norm": 0.1551875795468263, |
|
"learning_rate": 7.738930244298146e-06, |
|
"loss": 0.003, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 3.860742254000681, |
|
"grad_norm": 0.4498866311661179, |
|
"learning_rate": 7.699020180188533e-06, |
|
"loss": 0.0017, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 3.887980932924753, |
|
"grad_norm": 1.4505936210625598, |
|
"learning_rate": 7.658865949928717e-06, |
|
"loss": 0.0021, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 3.9152196118488254, |
|
"grad_norm": 0.2597247781008665, |
|
"learning_rate": 7.618471186060574e-06, |
|
"loss": 0.0009, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.9424582907728976, |
|
"grad_norm": 0.05461306630022223, |
|
"learning_rate": 7.577839542885783e-06, |
|
"loss": 0.0012, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.9696969696969697, |
|
"grad_norm": 0.2510446146562708, |
|
"learning_rate": 7.5369746961352505e-06, |
|
"loss": 0.0012, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 3.996935648621042, |
|
"grad_norm": 0.027247865962538042, |
|
"learning_rate": 7.495880342636581e-06, |
|
"loss": 0.0016, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 4.021790943139258, |
|
"grad_norm": 0.3181735974801834, |
|
"learning_rate": 7.454560199979647e-06, |
|
"loss": 0.0005, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 4.04902962206333, |
|
"grad_norm": 0.03550322906594092, |
|
"learning_rate": 7.413018006180278e-06, |
|
"loss": 0.0006, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 4.076268300987402, |
|
"grad_norm": 0.7839865968633526, |
|
"learning_rate": 7.371257519342103e-06, |
|
"loss": 0.0023, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.103506979911474, |
|
"grad_norm": 0.27056209178067114, |
|
"learning_rate": 7.329282517316574e-06, |
|
"loss": 0.0013, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 4.130745658835546, |
|
"grad_norm": 0.025007951464925165, |
|
"learning_rate": 7.287096797361197e-06, |
|
"loss": 0.0014, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 4.157984337759618, |
|
"grad_norm": 0.1639293410219479, |
|
"learning_rate": 7.244704175796028e-06, |
|
"loss": 0.002, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 4.185223016683691, |
|
"grad_norm": 0.09539274967398963, |
|
"learning_rate": 7.202108487658416e-06, |
|
"loss": 0.0017, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 4.212461695607763, |
|
"grad_norm": 0.18901065014774657, |
|
"learning_rate": 7.159313586356077e-06, |
|
"loss": 0.002, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 4.239700374531835, |
|
"grad_norm": 0.027388119425054664, |
|
"learning_rate": 7.116323343318495e-06, |
|
"loss": 0.0017, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 4.266939053455907, |
|
"grad_norm": 0.13933358678308516, |
|
"learning_rate": 7.073141647646691e-06, |
|
"loss": 0.0009, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 4.294177732379979, |
|
"grad_norm": 0.011964487372353175, |
|
"learning_rate": 7.029772405761397e-06, |
|
"loss": 0.0005, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 4.321416411304051, |
|
"grad_norm": 0.044579473352322464, |
|
"learning_rate": 6.9862195410496655e-06, |
|
"loss": 0.0007, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 4.348655090228124, |
|
"grad_norm": 0.9421209582985217, |
|
"learning_rate": 6.942486993509941e-06, |
|
"loss": 0.0011, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.375893769152196, |
|
"grad_norm": 0.165168485428461, |
|
"learning_rate": 6.898578719395622e-06, |
|
"loss": 0.0007, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 4.403132448076268, |
|
"grad_norm": 0.012552169908338339, |
|
"learning_rate": 6.854498690857173e-06, |
|
"loss": 0.0024, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 4.43037112700034, |
|
"grad_norm": 0.33669756307886667, |
|
"learning_rate": 6.810250895582773e-06, |
|
"loss": 0.0013, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 4.457609805924413, |
|
"grad_norm": 0.03430019485251983, |
|
"learning_rate": 6.765839336437574e-06, |
|
"loss": 0.001, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 4.484848484848484, |
|
"grad_norm": 0.7568315535654873, |
|
"learning_rate": 6.721268031101586e-06, |
|
"loss": 0.0018, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 4.5120871637725575, |
|
"grad_norm": 0.36926846488952053, |
|
"learning_rate": 6.676541011706212e-06, |
|
"loss": 0.0032, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 4.539325842696629, |
|
"grad_norm": 0.30700403135022064, |
|
"learning_rate": 6.631662324469492e-06, |
|
"loss": 0.0021, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 4.566564521620702, |
|
"grad_norm": 0.07283695365864443, |
|
"learning_rate": 6.586636029330054e-06, |
|
"loss": 0.0015, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 4.593803200544773, |
|
"grad_norm": 0.5098936742183463, |
|
"learning_rate": 6.5414661995798346e-06, |
|
"loss": 0.0026, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 4.621041879468846, |
|
"grad_norm": 0.15016221369549917, |
|
"learning_rate": 6.496156921495594e-06, |
|
"loss": 0.0023, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.648280558392918, |
|
"grad_norm": 0.7027337621531187, |
|
"learning_rate": 6.450712293969251e-06, |
|
"loss": 0.0036, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 4.6755192373169905, |
|
"grad_norm": 0.48920213901161386, |
|
"learning_rate": 6.405136428137072e-06, |
|
"loss": 0.0024, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 4.702757916241063, |
|
"grad_norm": 0.526663059492545, |
|
"learning_rate": 6.359433447007761e-06, |
|
"loss": 0.0037, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 4.729996595165135, |
|
"grad_norm": 0.43485515633750277, |
|
"learning_rate": 6.313607485089479e-06, |
|
"loss": 0.002, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 4.757235274089207, |
|
"grad_norm": 0.1608557350260687, |
|
"learning_rate": 6.267662688015811e-06, |
|
"loss": 0.0011, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 4.784473953013279, |
|
"grad_norm": 0.016233665978459856, |
|
"learning_rate": 6.221603212170727e-06, |
|
"loss": 0.0016, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 4.811712631937351, |
|
"grad_norm": 0.3060301856403388, |
|
"learning_rate": 6.175433224312588e-06, |
|
"loss": 0.0008, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 4.8389513108614235, |
|
"grad_norm": 0.0535023008279656, |
|
"learning_rate": 6.129156901197195e-06, |
|
"loss": 0.0007, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 4.866189989785496, |
|
"grad_norm": 0.22805192659166784, |
|
"learning_rate": 6.082778429199937e-06, |
|
"loss": 0.0011, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 4.893428668709568, |
|
"grad_norm": 0.06801175552041476, |
|
"learning_rate": 6.036302003937076e-06, |
|
"loss": 0.0004, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.92066734763364, |
|
"grad_norm": 0.3046742023784698, |
|
"learning_rate": 5.9897318298861885e-06, |
|
"loss": 0.0007, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 4.947906026557712, |
|
"grad_norm": 0.033986290353038136, |
|
"learning_rate": 5.943072120005816e-06, |
|
"loss": 0.0007, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 4.975144705481784, |
|
"grad_norm": 0.09040671159275827, |
|
"learning_rate": 5.89632709535433e-06, |
|
"loss": 0.0019, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.2530736784412786, |
|
"learning_rate": 5.849500984708082e-06, |
|
"loss": 0.0016, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 5.027238678924072, |
|
"grad_norm": 0.08687153636471827, |
|
"learning_rate": 5.802598024178848e-06, |
|
"loss": 0.0004, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 5.054477357848144, |
|
"grad_norm": 0.45420511928877233, |
|
"learning_rate": 5.755622456830605e-06, |
|
"loss": 0.0008, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 5.0817160367722165, |
|
"grad_norm": 0.1542430485091151, |
|
"learning_rate": 5.708578532295691e-06, |
|
"loss": 0.0016, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 5.108954715696289, |
|
"grad_norm": 1.3515082865895989, |
|
"learning_rate": 5.661470506390354e-06, |
|
"loss": 0.0011, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 5.136193394620361, |
|
"grad_norm": 0.1596903735504535, |
|
"learning_rate": 5.61430264072976e-06, |
|
"loss": 0.0014, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 5.163432073544433, |
|
"grad_norm": 0.19923117752022435, |
|
"learning_rate": 5.5670792023424615e-06, |
|
"loss": 0.0015, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 5.190670752468505, |
|
"grad_norm": 0.302828135997675, |
|
"learning_rate": 5.519804463284382e-06, |
|
"loss": 0.0009, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 5.217909431392577, |
|
"grad_norm": 0.04653233725601432, |
|
"learning_rate": 5.472482700252347e-06, |
|
"loss": 0.0012, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 5.2451481103166495, |
|
"grad_norm": 0.5190292480319276, |
|
"learning_rate": 5.425118194197196e-06, |
|
"loss": 0.0023, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 5.272386789240722, |
|
"grad_norm": 0.007225303260017864, |
|
"learning_rate": 5.3777152299365e-06, |
|
"loss": 0.0005, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 5.299625468164794, |
|
"grad_norm": 0.08842755160666287, |
|
"learning_rate": 5.3302780957669454e-06, |
|
"loss": 0.0006, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 5.326864147088866, |
|
"grad_norm": 0.024822557291839333, |
|
"learning_rate": 5.282811083076388e-06, |
|
"loss": 0.0004, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 5.354102826012938, |
|
"grad_norm": 1.2627706166913806, |
|
"learning_rate": 5.235318485955638e-06, |
|
"loss": 0.0007, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 5.38134150493701, |
|
"grad_norm": 0.03392392848083845, |
|
"learning_rate": 5.187804600809995e-06, |
|
"loss": 0.0014, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 5.4085801838610825, |
|
"grad_norm": 0.009487721281030682, |
|
"learning_rate": 5.140273725970569e-06, |
|
"loss": 0.0019, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 5.435818862785155, |
|
"grad_norm": 0.03169860186889457, |
|
"learning_rate": 5.092730161305444e-06, |
|
"loss": 0.0005, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.435818862785155, |
|
"eval_loss": 0.003610835410654545, |
|
"eval_runtime": 149.9472, |
|
"eval_samples_per_second": 1.334, |
|
"eval_steps_per_second": 0.167, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.463057541709227, |
|
"grad_norm": 0.26681987017023606, |
|
"learning_rate": 5.045178207830687e-06, |
|
"loss": 0.0005, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 5.490296220633299, |
|
"grad_norm": 0.020589913051757485, |
|
"learning_rate": 4.997622167321246e-06, |
|
"loss": 0.0004, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 5.517534899557371, |
|
"grad_norm": 0.005508716128027595, |
|
"learning_rate": 4.950066341921813e-06, |
|
"loss": 0.0002, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 5.544773578481443, |
|
"grad_norm": 0.12421491718000478, |
|
"learning_rate": 4.902515033757617e-06, |
|
"loss": 0.0002, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 5.5720122574055155, |
|
"grad_norm": 0.6904314051957775, |
|
"learning_rate": 4.854972544545231e-06, |
|
"loss": 0.0011, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 5.599250936329588, |
|
"grad_norm": 0.020582980767102473, |
|
"learning_rate": 4.807443175203432e-06, |
|
"loss": 0.0008, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 5.62648961525366, |
|
"grad_norm": 0.26044403454004866, |
|
"learning_rate": 4.759931225464107e-06, |
|
"loss": 0.0011, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 5.653728294177732, |
|
"grad_norm": 0.13198469815528743, |
|
"learning_rate": 4.712440993483281e-06, |
|
"loss": 0.0005, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 5.680966973101804, |
|
"grad_norm": 0.24787019272348249, |
|
"learning_rate": 4.664976775452293e-06, |
|
"loss": 0.0016, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 5.708205652025876, |
|
"grad_norm": 0.06673597677280482, |
|
"learning_rate": 4.617542865209133e-06, |
|
"loss": 0.0001, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.7354443309499485, |
|
"grad_norm": 0.01989819994546987, |
|
"learning_rate": 4.5701435538500065e-06, |
|
"loss": 0.0002, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 5.762683009874021, |
|
"grad_norm": 0.06711337766264915, |
|
"learning_rate": 4.522783129341141e-06, |
|
"loss": 0.0011, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 5.789921688798094, |
|
"grad_norm": 0.0258406372816757, |
|
"learning_rate": 4.475465876130872e-06, |
|
"loss": 0.0002, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 5.817160367722165, |
|
"grad_norm": 0.0069848118261474626, |
|
"learning_rate": 4.428196074762057e-06, |
|
"loss": 0.0001, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 5.844399046646238, |
|
"grad_norm": 0.00465294693967376, |
|
"learning_rate": 4.380978001484836e-06, |
|
"loss": 0.0008, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 5.871637725570309, |
|
"grad_norm": 0.062291913740785465, |
|
"learning_rate": 4.33381592786978e-06, |
|
"loss": 0.0001, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 5.898876404494382, |
|
"grad_norm": 0.00324106094253182, |
|
"learning_rate": 4.286714120421465e-06, |
|
"loss": 0.0001, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 5.926115083418454, |
|
"grad_norm": 0.010405479601857403, |
|
"learning_rate": 4.2396768401925044e-06, |
|
"loss": 0.0001, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 5.953353762342527, |
|
"grad_norm": 0.003621476287705226, |
|
"learning_rate": 4.1927083423980755e-06, |
|
"loss": 0.0001, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 5.980592441266599, |
|
"grad_norm": 0.15175481500791388, |
|
"learning_rate": 4.145812876030965e-06, |
|
"loss": 0.0001, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 6.005447735784815, |
|
"grad_norm": 0.009474114226140813, |
|
"learning_rate": 4.098994683477197e-06, |
|
"loss": 0.0, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 6.032686414708887, |
|
"grad_norm": 0.010773928421478833, |
|
"learning_rate": 4.0522580001322365e-06, |
|
"loss": 0.0, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 6.059925093632959, |
|
"grad_norm": 0.002749306413145174, |
|
"learning_rate": 4.0056070540178425e-06, |
|
"loss": 0.0, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 6.087163772557031, |
|
"grad_norm": 0.001948904462067843, |
|
"learning_rate": 3.959046065399575e-06, |
|
"loss": 0.0, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 6.114402451481103, |
|
"grad_norm": 0.001621993315411648, |
|
"learning_rate": 3.912579246405016e-06, |
|
"loss": 0.0, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 6.1416411304051755, |
|
"grad_norm": 0.001255150538557096, |
|
"learning_rate": 3.8662108006427165e-06, |
|
"loss": 0.0, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 6.168879809329248, |
|
"grad_norm": 0.004272455750195676, |
|
"learning_rate": 3.819944922821914e-06, |
|
"loss": 0.0, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 6.19611848825332, |
|
"grad_norm": 0.00187243023068361, |
|
"learning_rate": 3.773785798373069e-06, |
|
"loss": 0.0, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 6.223357167177392, |
|
"grad_norm": 0.0009374589516518995, |
|
"learning_rate": 3.7277376030692263e-06, |
|
"loss": 0.0, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 6.250595846101464, |
|
"grad_norm": 0.001253252761018019, |
|
"learning_rate": 3.681804502648254e-06, |
|
"loss": 0.0, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 6.277834525025536, |
|
"grad_norm": 0.002173317313832616, |
|
"learning_rate": 3.6359906524359932e-06, |
|
"loss": 0.0, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 6.3050732039496085, |
|
"grad_norm": 0.00122096897637743, |
|
"learning_rate": 3.590300196970341e-06, |
|
"loss": 0.0, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 6.332311882873681, |
|
"grad_norm": 0.0008158321760346172, |
|
"learning_rate": 3.544737269626328e-06, |
|
"loss": 0.0, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 6.359550561797753, |
|
"grad_norm": 0.0011021236484427205, |
|
"learning_rate": 3.4993059922421835e-06, |
|
"loss": 0.0, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 6.386789240721825, |
|
"grad_norm": 0.0007113597066715369, |
|
"learning_rate": 3.4540104747464575e-06, |
|
"loss": 0.0, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 6.414027919645897, |
|
"grad_norm": 0.0007186713752736677, |
|
"learning_rate": 3.408854814786219e-06, |
|
"loss": 0.0, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 6.441266598569969, |
|
"grad_norm": 0.0005956363546399899, |
|
"learning_rate": 3.3638430973563597e-06, |
|
"loss": 0.0, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 6.4685052774940415, |
|
"grad_norm": 0.0013985565483587347, |
|
"learning_rate": 3.318979394430051e-06, |
|
"loss": 0.0, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 6.495743956418114, |
|
"grad_norm": 0.0008815209213564751, |
|
"learning_rate": 3.27426776459037e-06, |
|
"loss": 0.0003, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 6.522982635342186, |
|
"grad_norm": 0.0024178562381092818, |
|
"learning_rate": 3.22971225266314e-06, |
|
"loss": 0.0, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 6.550221314266258, |
|
"grad_norm": 0.0010728069443376714, |
|
"learning_rate": 3.1853168893510223e-06, |
|
"loss": 0.0, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 6.57745999319033, |
|
"grad_norm": 0.0010548514445552371, |
|
"learning_rate": 3.141085690868871e-06, |
|
"loss": 0.0, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 6.604698672114402, |
|
"grad_norm": 0.0021998928664552233, |
|
"learning_rate": 3.0970226585804175e-06, |
|
"loss": 0.0001, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 6.6319373510384745, |
|
"grad_norm": 0.0011062246966726207, |
|
"learning_rate": 3.053131778636278e-06, |
|
"loss": 0.0, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 6.659176029962547, |
|
"grad_norm": 0.0011797120379395778, |
|
"learning_rate": 3.0094170216133545e-06, |
|
"loss": 0.0, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 6.686414708886619, |
|
"grad_norm": 0.007225871987921568, |
|
"learning_rate": 2.965882342155637e-06, |
|
"loss": 0.0, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 6.713653387810691, |
|
"grad_norm": 0.0010544539477212198, |
|
"learning_rate": 2.9225316786164417e-06, |
|
"loss": 0.0, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 6.740892066734763, |
|
"grad_norm": 0.0006976448579587069, |
|
"learning_rate": 2.8793689527021377e-06, |
|
"loss": 0.0, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 6.768130745658835, |
|
"grad_norm": 0.0027697202568828954, |
|
"learning_rate": 2.836398069117362e-06, |
|
"loss": 0.0, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 6.7953694245829075, |
|
"grad_norm": 0.0005279000452700497, |
|
"learning_rate": 2.7936229152117896e-06, |
|
"loss": 0.0, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.82260810350698, |
|
"grad_norm": 0.000826146277516193, |
|
"learning_rate": 2.751047360628458e-06, |
|
"loss": 0.0, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 6.849846782431052, |
|
"grad_norm": 0.0007622689637331964, |
|
"learning_rate": 2.708675256953708e-06, |
|
"loss": 0.0, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 6.877085461355124, |
|
"grad_norm": 0.0011470272220122446, |
|
"learning_rate": 2.6665104373687455e-06, |
|
"loss": 0.0006, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 6.904324140279196, |
|
"grad_norm": 0.0009952201118956113, |
|
"learning_rate": 2.624556716302876e-06, |
|
"loss": 0.0, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 6.931562819203268, |
|
"grad_norm": 0.0011435928941426047, |
|
"learning_rate": 2.582817889088435e-06, |
|
"loss": 0.0, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 6.9588014981273405, |
|
"grad_norm": 0.01166505316182784, |
|
"learning_rate": 2.541297731617437e-06, |
|
"loss": 0.0, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 6.986040177051413, |
|
"grad_norm": 0.0012091660156564501, |
|
"learning_rate": 2.5000000000000015e-06, |
|
"loss": 0.0, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 7.010895471569629, |
|
"grad_norm": 0.0015980505913388763, |
|
"learning_rate": 2.458928430224548e-06, |
|
"loss": 0.0, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 7.038134150493701, |
|
"grad_norm": 0.0009393729565236982, |
|
"learning_rate": 2.4180867378198274e-06, |
|
"loss": 0.0, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 7.065372829417774, |
|
"grad_norm": 0.0008257129118958041, |
|
"learning_rate": 2.3774786175187932e-06, |
|
"loss": 0.0, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 7.092611508341846, |
|
"grad_norm": 0.0018052707906574057, |
|
"learning_rate": 2.337107742924359e-06, |
|
"loss": 0.0, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 7.119850187265918, |
|
"grad_norm": 0.1645764072539005, |
|
"learning_rate": 2.29697776617707e-06, |
|
"loss": 0.0, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 7.14708886618999, |
|
"grad_norm": 0.0007519843719597862, |
|
"learning_rate": 2.25709231762471e-06, |
|
"loss": 0.0, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 7.174327545114062, |
|
"grad_norm": 0.0008477596842611903, |
|
"learning_rate": 2.217455005493884e-06, |
|
"loss": 0.0, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 7.201566224038134, |
|
"grad_norm": 0.004509417330392989, |
|
"learning_rate": 2.1780694155636014e-06, |
|
"loss": 0.0, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 7.228804902962207, |
|
"grad_norm": 0.0005766378225871001, |
|
"learning_rate": 2.138939110840888e-06, |
|
"loss": 0.0, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 7.256043581886279, |
|
"grad_norm": 0.004100067804604992, |
|
"learning_rate": 2.100067631238464e-06, |
|
"loss": 0.0, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 7.283282260810351, |
|
"grad_norm": 0.003930226882351156, |
|
"learning_rate": 2.0614584932544955e-06, |
|
"loss": 0.0, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 7.310520939734423, |
|
"grad_norm": 0.0004427737628504764, |
|
"learning_rate": 2.023115189654491e-06, |
|
"loss": 0.0, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 7.337759618658495, |
|
"grad_norm": 0.0008888326172641632, |
|
"learning_rate": 1.9850411891553186e-06, |
|
"loss": 0.0, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 7.364998297582567, |
|
"grad_norm": 0.0006039128437105985, |
|
"learning_rate": 1.9472399361114126e-06, |
|
"loss": 0.0, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 7.39223697650664, |
|
"grad_norm": 0.0004893946381054215, |
|
"learning_rate": 1.909714850203177e-06, |
|
"loss": 0.0, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 7.419475655430712, |
|
"grad_norm": 0.0004317709133183237, |
|
"learning_rate": 1.8724693261276345e-06, |
|
"loss": 0.0, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 7.446714334354784, |
|
"grad_norm": 0.0006212197750363989, |
|
"learning_rate": 1.8355067332913156e-06, |
|
"loss": 0.0, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 7.473953013278856, |
|
"grad_norm": 0.0007456921468394779, |
|
"learning_rate": 1.7988304155054541e-06, |
|
"loss": 0.0, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 7.501191692202928, |
|
"grad_norm": 0.0008149478511071161, |
|
"learning_rate": 1.7624436906834842e-06, |
|
"loss": 0.0, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 7.528430371127, |
|
"grad_norm": 0.0007225352461383864, |
|
"learning_rate": 1.7263498505408893e-06, |
|
"loss": 0.0, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 7.555669050051073, |
|
"grad_norm": 0.0005959357394697677, |
|
"learning_rate": 1.6905521602974183e-06, |
|
"loss": 0.0, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 7.582907728975145, |
|
"grad_norm": 0.000635015561952419, |
|
"learning_rate": 1.6550538583816967e-06, |
|
"loss": 0.0, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 7.610146407899217, |
|
"grad_norm": 0.0005433753417863713, |
|
"learning_rate": 1.6198581561382643e-06, |
|
"loss": 0.0, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 7.637385086823289, |
|
"grad_norm": 0.00046276580358275586, |
|
"learning_rate": 1.5849682375370601e-06, |
|
"loss": 0.0, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 7.664623765747361, |
|
"grad_norm": 0.0008516896607091807, |
|
"learning_rate": 1.550387258885388e-06, |
|
"loss": 0.0, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 7.691862444671433, |
|
"grad_norm": 0.0005163692295676414, |
|
"learning_rate": 1.5161183485423785e-06, |
|
"loss": 0.0, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 7.719101123595506, |
|
"grad_norm": 0.0011709181420231178, |
|
"learning_rate": 1.482164606635989e-06, |
|
"loss": 0.0, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 7.746339802519578, |
|
"grad_norm": 0.0013056291673384528, |
|
"learning_rate": 1.44852910478254e-06, |
|
"loss": 0.0, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 7.77357848144365, |
|
"grad_norm": 0.0004465783868552864, |
|
"learning_rate": 1.4152148858088554e-06, |
|
"loss": 0.0, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 7.800817160367722, |
|
"grad_norm": 0.0005635784726540899, |
|
"learning_rate": 1.3822249634769864e-06, |
|
"loss": 0.0, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 7.828055839291794, |
|
"grad_norm": 0.0003537589230160337, |
|
"learning_rate": 1.3495623222115735e-06, |
|
"loss": 0.0, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 7.855294518215866, |
|
"grad_norm": 0.0004597321862407293, |
|
"learning_rate": 1.3172299168298614e-06, |
|
"loss": 0.0, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 7.882533197139939, |
|
"grad_norm": 0.00040170519106549137, |
|
"learning_rate": 1.2852306722743934e-06, |
|
"loss": 0.0, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 7.909771876064011, |
|
"grad_norm": 0.0016780869556999771, |
|
"learning_rate": 1.253567483348407e-06, |
|
"loss": 0.0, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 7.937010554988083, |
|
"grad_norm": 0.000593202963073429, |
|
"learning_rate": 1.222243214453951e-06, |
|
"loss": 0.0, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 7.964249233912155, |
|
"grad_norm": 0.0003643083864401578, |
|
"learning_rate": 1.1912606993327685e-06, |
|
"loss": 0.0, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 7.991487912836227, |
|
"grad_norm": 0.0007434796810202931, |
|
"learning_rate": 1.1606227408099347e-06, |
|
"loss": 0.0, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 8.016343207354444, |
|
"grad_norm": 0.00047702794678360533, |
|
"learning_rate": 1.1303321105403026e-06, |
|
"loss": 0.0, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 8.043581886278515, |
|
"grad_norm": 0.0005298663692543725, |
|
"learning_rate": 1.1003915487577683e-06, |
|
"loss": 0.0, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 8.070820565202588, |
|
"grad_norm": 0.004074165794804811, |
|
"learning_rate": 1.0708037640273715e-06, |
|
"loss": 0.0, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 8.09805924412666, |
|
"grad_norm": 0.00040511964399428683, |
|
"learning_rate": 1.0415714330002729e-06, |
|
"loss": 0.0, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 8.125297923050733, |
|
"grad_norm": 0.0012113673680468461, |
|
"learning_rate": 1.0126972001716007e-06, |
|
"loss": 0.0, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 8.152536601974804, |
|
"grad_norm": 0.00041889827396330973, |
|
"learning_rate": 9.841836776412294e-07, |
|
"loss": 0.0, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 8.152536601974804, |
|
"eval_loss": 1.2345339200692251e-05, |
|
"eval_runtime": 149.4558, |
|
"eval_samples_per_second": 1.338, |
|
"eval_steps_per_second": 0.167, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 8.179775280898877, |
|
"grad_norm": 0.00048789814951889245, |
|
"learning_rate": 9.560334448774705e-07, |
|
"loss": 0.0, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 8.207013959822948, |
|
"grad_norm": 0.00047638289528187925, |
|
"learning_rate": 9.282490484837215e-07, |
|
"loss": 0.0, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 8.234252638747021, |
|
"grad_norm": 0.000455674971676968, |
|
"learning_rate": 9.008330019680883e-07, |
|
"loss": 0.0, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 8.261491317671092, |
|
"grad_norm": 0.00041234844635284676, |
|
"learning_rate": 8.737877855160032e-07, |
|
"loss": 0.0, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 8.288729996595166, |
|
"grad_norm": 0.0004512145872248025, |
|
"learning_rate": 8.471158457658546e-07, |
|
"loss": 0.0, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 8.315968675519237, |
|
"grad_norm": 0.0003955500503942679, |
|
"learning_rate": 8.208195955876513e-07, |
|
"loss": 0.0, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 8.34320735444331, |
|
"grad_norm": 0.00037757417965770856, |
|
"learning_rate": 7.949014138647442e-07, |
|
"loss": 0.0, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 8.370446033367381, |
|
"grad_norm": 0.0005928754851042547, |
|
"learning_rate": 7.693636452786213e-07, |
|
"loss": 0.0, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 8.397684712291454, |
|
"grad_norm": 0.0011757489143008526, |
|
"learning_rate": 7.442086000967962e-07, |
|
"loss": 0.0, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 8.424923391215525, |
|
"grad_norm": 0.001091795641988052, |
|
"learning_rate": 7.194385539638099e-07, |
|
"loss": 0.0, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 8.452162070139599, |
|
"grad_norm": 0.0004490144524655851, |
|
"learning_rate": 6.950557476953674e-07, |
|
"loss": 0.0, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 8.47940074906367, |
|
"grad_norm": 0.0005170136229882478, |
|
"learning_rate": 6.710623870756178e-07, |
|
"loss": 0.0, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 8.506639427987743, |
|
"grad_norm": 0.0005606121390368469, |
|
"learning_rate": 6.474606426576157e-07, |
|
"loss": 0.0, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 8.533878106911814, |
|
"grad_norm": 0.0006494691276362171, |
|
"learning_rate": 6.242526495669587e-07, |
|
"loss": 0.0, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 8.561116785835887, |
|
"grad_norm": 0.000705761138730125, |
|
"learning_rate": 6.01440507308631e-07, |
|
"loss": 0.0, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 8.588355464759958, |
|
"grad_norm": 0.00037580712390274833, |
|
"learning_rate": 5.790262795770785e-07, |
|
"loss": 0.0, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 8.615594143684032, |
|
"grad_norm": 0.00037712299260292257, |
|
"learning_rate": 5.570119940695135e-07, |
|
"loss": 0.0, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 8.642832822608103, |
|
"grad_norm": 0.00038488025576517655, |
|
"learning_rate": 5.353996423024804e-07, |
|
"loss": 0.0, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 8.670071501532176, |
|
"grad_norm": 0.0004034621134836744, |
|
"learning_rate": 5.141911794316934e-07, |
|
"loss": 0.0, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 8.697310180456247, |
|
"grad_norm": 0.00038266845732824815, |
|
"learning_rate": 4.93388524075164e-07, |
|
"loss": 0.0, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 8.72454885938032, |
|
"grad_norm": 0.00038362011868780437, |
|
"learning_rate": 4.729935581396328e-07, |
|
"loss": 0.0, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 8.751787538304391, |
|
"grad_norm": 0.00035607543651581207, |
|
"learning_rate": 4.5300812665032557e-07, |
|
"loss": 0.0, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 8.779026217228465, |
|
"grad_norm": 0.0006068778689349405, |
|
"learning_rate": 4.334340375840418e-07, |
|
"loss": 0.0, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 8.806264896152536, |
|
"grad_norm": 0.00033490740677774155, |
|
"learning_rate": 4.1427306170559624e-07, |
|
"loss": 0.0, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 8.833503575076609, |
|
"grad_norm": 0.00046859576305887497, |
|
"learning_rate": 3.955269324076294e-07, |
|
"loss": 0.0, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 8.86074225400068, |
|
"grad_norm": 0.0008093853787619229, |
|
"learning_rate": 3.771973455537936e-07, |
|
"loss": 0.0, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 8.887980932924753, |
|
"grad_norm": 0.0010513228996015518, |
|
"learning_rate": 3.5928595932534005e-07, |
|
"loss": 0.0, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 8.915219611848826, |
|
"grad_norm": 0.0003623450487854711, |
|
"learning_rate": 3.4179439407110714e-07, |
|
"loss": 0.0, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 8.942458290772898, |
|
"grad_norm": 0.000339362832377934, |
|
"learning_rate": 3.247242321609434e-07, |
|
"loss": 0.0, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 8.969696969696969, |
|
"grad_norm": 0.0005334885527562191, |
|
"learning_rate": 3.0807701784255296e-07, |
|
"loss": 0.0, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 8.996935648621042, |
|
"grad_norm": 0.00044685136023717045, |
|
"learning_rate": 2.9185425710179737e-07, |
|
"loss": 0.0, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 9.021790943139258, |
|
"grad_norm": 0.0008018093014633273, |
|
"learning_rate": 2.7605741752645686e-07, |
|
"loss": 0.0, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 9.04902962206333, |
|
"grad_norm": 0.0003794485076320959, |
|
"learning_rate": 2.606879281734659e-07, |
|
"loss": 0.0, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 9.076268300987403, |
|
"grad_norm": 0.000514433360537291, |
|
"learning_rate": 2.457471794396338e-07, |
|
"loss": 0.0, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 9.103506979911474, |
|
"grad_norm": 0.0006110392205499266, |
|
"learning_rate": 2.3123652293586207e-07, |
|
"loss": 0.0, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 9.130745658835547, |
|
"grad_norm": 0.0004051883420581768, |
|
"learning_rate": 2.1715727136487174e-07, |
|
"loss": 0.0, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 9.157984337759618, |
|
"grad_norm": 0.0017022735873803575, |
|
"learning_rate": 2.0351069840244986e-07, |
|
"loss": 0.0, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 9.185223016683691, |
|
"grad_norm": 0.000833252754224706, |
|
"learning_rate": 1.9029803858222896e-07, |
|
"loss": 0.0, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 9.212461695607763, |
|
"grad_norm": 0.0038641354937789443, |
|
"learning_rate": 1.775204871840014e-07, |
|
"loss": 0.0, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 9.239700374531836, |
|
"grad_norm": 0.0003339132774971038, |
|
"learning_rate": 1.6517920012559086e-07, |
|
"loss": 0.0, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 9.266939053455907, |
|
"grad_norm": 0.0003633204131676538, |
|
"learning_rate": 1.5327529385828377e-07, |
|
"loss": 0.0, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 9.29417773237998, |
|
"grad_norm": 0.000322045602551698, |
|
"learning_rate": 1.4180984526582675e-07, |
|
"loss": 0.0, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 9.321416411304051, |
|
"grad_norm": 0.0010744986200642887, |
|
"learning_rate": 1.3078389156700842e-07, |
|
"loss": 0.0, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 9.348655090228124, |
|
"grad_norm": 0.0005973674276566756, |
|
"learning_rate": 1.2019843022182898e-07, |
|
"loss": 0.0, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 9.375893769152196, |
|
"grad_norm": 0.00035574829244011577, |
|
"learning_rate": 1.1005441884126278e-07, |
|
"loss": 0.0, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 9.403132448076269, |
|
"grad_norm": 0.0005334084677271788, |
|
"learning_rate": 1.0035277510062835e-07, |
|
"loss": 0.0, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 9.43037112700034, |
|
"grad_norm": 0.00032947386696114886, |
|
"learning_rate": 9.109437665657473e-08, |
|
"loss": 0.0, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 9.457609805924413, |
|
"grad_norm": 0.0003827344363811748, |
|
"learning_rate": 8.228006106767883e-08, |
|
"loss": 0.0, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 9.484848484848484, |
|
"grad_norm": 0.00030067997503277467, |
|
"learning_rate": 7.391062571868113e-08, |
|
"loss": 0.0, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 9.512087163772557, |
|
"grad_norm": 0.0003243561088926095, |
|
"learning_rate": 6.598682774834775e-08, |
|
"loss": 0.0, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 9.539325842696629, |
|
"grad_norm": 0.0003467443287273098, |
|
"learning_rate": 5.850938398097583e-08, |
|
"loss": 0.0, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 9.566564521620702, |
|
"grad_norm": 0.00035286356515664503, |
|
"learning_rate": 5.1478970861548185e-08, |
|
"loss": 0.0, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 9.593803200544773, |
|
"grad_norm": 0.0009776284074226168, |
|
"learning_rate": 4.4896224394537226e-08, |
|
"loss": 0.0, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 9.621041879468846, |
|
"grad_norm": 0.00039195791750442195, |
|
"learning_rate": 3.8761740086369345e-08, |
|
"loss": 0.0, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 9.648280558392917, |
|
"grad_norm": 0.0005306094646929393, |
|
"learning_rate": 3.307607289155301e-08, |
|
"loss": 0.0, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 9.67551923731699, |
|
"grad_norm": 0.0004368446632988839, |
|
"learning_rate": 2.78397371624739e-08, |
|
"loss": 0.0, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 9.702757916241062, |
|
"grad_norm": 0.0006477180396400463, |
|
"learning_rate": 2.305320660286603e-08, |
|
"loss": 0.0, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 9.729996595165135, |
|
"grad_norm": 0.0005051573397374285, |
|
"learning_rate": 1.8716914224957138e-08, |
|
"loss": 0.0, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 9.757235274089206, |
|
"grad_norm": 0.00035959712658455894, |
|
"learning_rate": 1.4831252310294474e-08, |
|
"loss": 0.0, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 9.78447395301328, |
|
"grad_norm": 0.00047611287827840345, |
|
"learning_rate": 1.1396572374261505e-08, |
|
"loss": 0.0, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 9.81171263193735, |
|
"grad_norm": 0.0009275444049437501, |
|
"learning_rate": 8.413185134273916e-09, |
|
"loss": 0.0, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 9.838951310861423, |
|
"grad_norm": 0.0010032462120567445, |
|
"learning_rate": 5.881360481673759e-09, |
|
"loss": 0.0, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 9.866189989785495, |
|
"grad_norm": 0.0008774671042869975, |
|
"learning_rate": 3.801327457311765e-09, |
|
"loss": 0.0, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 9.893428668709568, |
|
"grad_norm": 0.0007470443454518441, |
|
"learning_rate": 2.173274230827249e-09, |
|
"loss": 0.0, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 9.920667347633639, |
|
"grad_norm": 0.0006590459160652287, |
|
"learning_rate": 9.97348083627836e-10, |
|
"loss": 0.0, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 9.947906026557712, |
|
"grad_norm": 0.0008391834531967672, |
|
"learning_rate": 2.7365539556234444e-10, |
|
"loss": 0.0, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 9.975144705481783, |
|
"grad_norm": 0.00034623849620746836, |
|
"learning_rate": 2.261635299039e-12, |
|
"loss": 0.0, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 9.975144705481783, |
|
"step": 3670, |
|
"total_flos": 3659279013773312.0, |
|
"train_loss": 0.050615286758120444, |
|
"train_runtime": 386241.8476, |
|
"train_samples_per_second": 2.433, |
|
"train_steps_per_second": 0.01 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3670, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3659279013773312.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|