diaenra committed
Commit 0266883 · verified · 1 Parent(s): 767028b

Training in progress, step 337, checkpoint
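For context, the state this commit records can be read straight out of the updated trainer_state.json. A minimal sketch, assuming the repo is cloned locally (the printed values come from the diff below):

```python
import json

# Load the checkpoint's trainer state saved by this commit.
with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print(state["global_step"])      # 337 after this commit (was 239)
print(state["epoch"])            # ~0.9978, i.e. the end of the first epoch
print(state["log_history"][-1])  # last logged step: lr 0.0, loss 0.1176
```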
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2e8037fd1742aafeefe9394eb6eb3056e379cec6904dd227fdd37e6f723d139
+oid sha256:4ff032c2b84a94327628cf41dd74556da8b5604505cb11b3a7fbcd44644cbb23
 size 67126760
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9c117807c6d7ec4eab79023545c24abd813201ccbb0d506a573fa5dc435b019a
+oid sha256:8ab626055373aac8d72bf80d61010fbfe74b3561c5f47454fa675333369173bd
 size 134325882
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b355518e25eb3f8f0a23ea4bbd5bf062b989e7cbba231f8b4b77cf35bb4106d
+oid sha256:3ddb0dc9aaf9cc0dc4a6b0020adae3a2c77f9db62a9fc24dadaa9a18d3a18470
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b0bd0e7decbf7c91ab9fd757f5ce5ae4cb006710e4bd76818c20a5be991c1f90
+oid sha256:de435905b02e6e14253beeb6db9bdb52440d05130914ca4da5e77f7ebf47b817
 size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.7076239822353811,
+  "epoch": 0.997779422649889,
   "eval_steps": 500,
-  "global_step": 239,
+  "global_step": 337,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1680,6 +1680,692 @@
       "learning_rate": 3.657954557919183e-05,
       "loss": 1.6068,
       "step": 239
+    },
+    {
+      "epoch": 0.7105847520355293,
+      "grad_norm": 0.6972988843917847,
+      "learning_rate": 3.5942280889623026e-05,
+      "loss": 0.6496,
+      "step": 240
+    },
+    {
+      "epoch": 0.7135455218356773,
+      "grad_norm": 0.620529055595398,
+      "learning_rate": 3.5307486283103966e-05,
+      "loss": 0.531,
+      "step": 241
+    },
+    {
+      "epoch": 0.7165062916358254,
+      "grad_norm": 0.71197110414505,
+      "learning_rate": 3.467527329945026e-05,
+      "loss": 0.6825,
+      "step": 242
+    },
+    {
+      "epoch": 0.7194670614359734,
+      "grad_norm": 0.6315649151802063,
+      "learning_rate": 3.404575302486039e-05,
+      "loss": 0.4259,
+      "step": 243
+    },
+    {
+      "epoch": 0.7224278312361214,
+      "grad_norm": 0.6250630617141724,
+      "learning_rate": 3.3419036072396616e-05,
+      "loss": 0.2993,
+      "step": 244
+    },
+    {
+      "epoch": 0.7253886010362695,
+      "grad_norm": 0.6370623111724854,
+      "learning_rate": 3.27952325625493e-05,
+      "loss": 0.3333,
+      "step": 245
+    },
+    {
+      "epoch": 0.7283493708364175,
+      "grad_norm": 0.494165301322937,
+      "learning_rate": 3.2174452103887456e-05,
+      "loss": 0.1763,
+      "step": 246
+    },
+    {
+      "epoch": 0.7313101406365655,
+      "grad_norm": 0.4754185676574707,
+      "learning_rate": 3.1556803773799614e-05,
+      "loss": 0.116,
+      "step": 247
+    },
+    {
+      "epoch": 0.7342709104367136,
+      "grad_norm": 0.5947216749191284,
+      "learning_rate": 3.094239609932764e-05,
+      "loss": 0.1401,
+      "step": 248
+    },
+    {
+      "epoch": 0.7372316802368616,
+      "grad_norm": 0.7304718494415283,
+      "learning_rate": 3.0331337038097597e-05,
+      "loss": 0.1343,
+      "step": 249
+    },
+    {
+      "epoch": 0.7401924500370096,
+      "grad_norm": 1.4958913326263428,
+      "learning_rate": 2.9723733959350307e-05,
+      "loss": 0.2833,
+      "step": 250
+    },
+    {
+      "epoch": 0.7431532198371577,
+      "grad_norm": 1.9902845621109009,
+      "learning_rate": 2.911969362507574e-05,
+      "loss": 2.3228,
+      "step": 251
+    },
+    {
+      "epoch": 0.7461139896373057,
+      "grad_norm": 1.4703550338745117,
+      "learning_rate": 2.8519322171253602e-05,
+      "loss": 1.8494,
+      "step": 252
+    },
+    {
+      "epoch": 0.7490747594374537,
+      "grad_norm": 1.096100926399231,
+      "learning_rate": 2.7922725089204426e-05,
+      "loss": 1.3966,
+      "step": 253
+    },
+    {
+      "epoch": 0.7520355292376018,
+      "grad_norm": 1.069754719734192,
+      "learning_rate": 2.733000720705341e-05,
+      "loss": 1.512,
+      "step": 254
+    },
+    {
+      "epoch": 0.7549962990377498,
+      "grad_norm": 1.1707820892333984,
+      "learning_rate": 2.674127267131131e-05,
+      "loss": 1.7061,
+      "step": 255
+    },
+    {
+      "epoch": 0.7579570688378978,
+      "grad_norm": 1.0361454486846924,
+      "learning_rate": 2.6156624928574707e-05,
+      "loss": 1.2677,
+      "step": 256
+    },
+    {
+      "epoch": 0.7609178386380459,
+      "grad_norm": 1.0314826965332031,
+      "learning_rate": 2.5576166707349385e-05,
+      "loss": 1.7485,
+      "step": 257
+    },
+    {
+      "epoch": 0.7638786084381939,
+      "grad_norm": 1.0188485383987427,
+      "learning_rate": 2.500000000000001e-05,
+      "loss": 1.7111,
+      "step": 258
+    },
+    {
+      "epoch": 0.7668393782383419,
+      "grad_norm": 1.0937459468841553,
+      "learning_rate": 2.4428226044828896e-05,
+      "loss": 1.9624,
+      "step": 259
+    },
+    {
+      "epoch": 0.7698001480384901,
+      "grad_norm": 1.023545742034912,
+      "learning_rate": 2.3860945308287552e-05,
+      "loss": 1.3442,
+      "step": 260
+    },
+    {
+      "epoch": 0.7727609178386381,
+      "grad_norm": 1.0937973260879517,
+      "learning_rate": 2.3298257467323604e-05,
+      "loss": 1.9485,
+      "step": 261
+    },
+    {
+      "epoch": 0.7757216876387861,
+      "grad_norm": 1.153558373451233,
+      "learning_rate": 2.2740261391866637e-05,
+      "loss": 1.8108,
+      "step": 262
+    },
+    {
+      "epoch": 0.7786824574389342,
+      "grad_norm": 1.0607295036315918,
+      "learning_rate": 2.2187055127455653e-05,
+      "loss": 1.7313,
+      "step": 263
+    },
+    {
+      "epoch": 0.7816432272390822,
+      "grad_norm": 1.0273983478546143,
+      "learning_rate": 2.16387358780116e-05,
+      "loss": 1.4125,
+      "step": 264
+    },
+    {
+      "epoch": 0.7846039970392302,
+      "grad_norm": 1.010677456855774,
+      "learning_rate": 2.1095399988757574e-05,
+      "loss": 1.6405,
+      "step": 265
+    },
+    {
+      "epoch": 0.7875647668393783,
+      "grad_norm": 1.0559284687042236,
+      "learning_rate": 2.0557142929290023e-05,
+      "loss": 1.9992,
+      "step": 266
+    },
+    {
+      "epoch": 0.7905255366395263,
+      "grad_norm": 1.1667604446411133,
+      "learning_rate": 2.002405927680374e-05,
+      "loss": 1.6065,
+      "step": 267
+    },
+    {
+      "epoch": 0.7934863064396743,
+      "grad_norm": 0.968020498752594,
+      "learning_rate": 1.9496242699473783e-05,
+      "loss": 1.6283,
+      "step": 268
+    },
+    {
+      "epoch": 0.7964470762398224,
+      "grad_norm": 1.1873533725738525,
+      "learning_rate": 1.897378593999693e-05,
+      "loss": 1.622,
+      "step": 269
+    },
+    {
+      "epoch": 0.7994078460399704,
+      "grad_norm": 1.0366755723953247,
+      "learning_rate": 1.8456780799295886e-05,
+      "loss": 1.7705,
+      "step": 270
+    },
+    {
+      "epoch": 0.8023686158401184,
+      "grad_norm": 0.9406031966209412,
+      "learning_rate": 1.794531812038901e-05,
+      "loss": 1.3649,
+      "step": 271
+    },
+    {
+      "epoch": 0.8053293856402665,
+      "grad_norm": 1.1950721740722656,
+      "learning_rate": 1.743948777242814e-05,
+      "loss": 2.0907,
+      "step": 272
+    },
+    {
+      "epoch": 0.8082901554404145,
+      "grad_norm": 0.8741153478622437,
+      "learning_rate": 1.6939378634907815e-05,
+      "loss": 1.3801,
+      "step": 273
+    },
+    {
+      "epoch": 0.8112509252405625,
+      "grad_norm": 0.9927524924278259,
+      "learning_rate": 1.6445078582048155e-05,
+      "loss": 1.6127,
+      "step": 274
+    },
+    {
+      "epoch": 0.8142116950407106,
+      "grad_norm": 1.0479118824005127,
+      "learning_rate": 1.5956674467354537e-05,
+      "loss": 2.0404,
+      "step": 275
+    },
+    {
+      "epoch": 0.8171724648408586,
+      "grad_norm": 0.9169769883155823,
+      "learning_rate": 1.5474252108356474e-05,
+      "loss": 1.5797,
+      "step": 276
+    },
+    {
+      "epoch": 0.8201332346410066,
+      "grad_norm": 0.9381272196769714,
+      "learning_rate": 1.4997896271528739e-05,
+      "loss": 1.0802,
+      "step": 277
+    },
+    {
+      "epoch": 0.8230940044411547,
+      "grad_norm": 0.9167252779006958,
+      "learning_rate": 1.452769065739688e-05,
+      "loss": 1.3707,
+      "step": 278
+    },
+    {
+      "epoch": 0.8260547742413027,
+      "grad_norm": 1.007392168045044,
+      "learning_rate": 1.4063717885830374e-05,
+      "loss": 1.79,
+      "step": 279
+    },
+    {
+      "epoch": 0.8290155440414507,
+      "grad_norm": 1.0070596933364868,
+      "learning_rate": 1.3606059481525296e-05,
+      "loss": 1.7477,
+      "step": 280
+    },
+    {
+      "epoch": 0.8319763138415989,
+      "grad_norm": 1.103658676147461,
+      "learning_rate": 1.315479585967978e-05,
+      "loss": 1.8807,
+      "step": 281
+    },
+    {
+      "epoch": 0.8349370836417469,
+      "grad_norm": 0.742683470249176,
+      "learning_rate": 1.2710006311864104e-05,
+      "loss": 0.99,
+      "step": 282
+    },
+    {
+      "epoch": 0.8378978534418949,
+      "grad_norm": 0.8142567276954651,
+      "learning_rate": 1.2271768992088489e-05,
+      "loss": 1.1157,
+      "step": 283
+    },
+    {
+      "epoch": 0.840858623242043,
+      "grad_norm": 0.8256039023399353,
+      "learning_rate": 1.184016090307059e-05,
+      "loss": 1.2341,
+      "step": 284
+    },
+    {
+      "epoch": 0.843819393042191,
+      "grad_norm": 1.0862884521484375,
+      "learning_rate": 1.1415257882705311e-05,
+      "loss": 1.7456,
+      "step": 285
+    },
+    {
+      "epoch": 0.846780162842339,
+      "grad_norm": 1.1565598249435425,
+      "learning_rate": 1.09971345907394e-05,
+      "loss": 1.6508,
+      "step": 286
+    },
+    {
+      "epoch": 0.8497409326424871,
+      "grad_norm": 1.1448659896850586,
+      "learning_rate": 1.0585864495652897e-05,
+      "loss": 1.9964,
+      "step": 287
+    },
+    {
+      "epoch": 0.8527017024426351,
+      "grad_norm": 0.968397855758667,
+      "learning_rate": 1.0181519861750078e-05,
+      "loss": 1.6825,
+      "step": 288
+    },
+    {
+      "epoch": 0.8556624722427831,
+      "grad_norm": 0.9276019930839539,
+      "learning_rate": 9.784171736461762e-06,
+      "loss": 1.2859,
+      "step": 289
+    },
+    {
+      "epoch": 0.8586232420429312,
+      "grad_norm": 0.7998013496398926,
+      "learning_rate": 9.393889937861694e-06,
+      "loss": 0.6289,
+      "step": 290
+    },
+    {
+      "epoch": 0.8615840118430792,
+      "grad_norm": 0.6690629124641418,
+      "learning_rate": 9.010743042398684e-06,
+      "loss": 0.581,
+      "step": 291
+    },
+    {
+      "epoch": 0.8645447816432272,
+      "grad_norm": 0.6341821551322937,
+      "learning_rate": 8.634798372847148e-06,
+      "loss": 0.3543,
+      "step": 292
+    },
+    {
+      "epoch": 0.8675055514433753,
+      "grad_norm": 0.5753948092460632,
+      "learning_rate": 8.266121986477699e-06,
+      "loss": 0.3731,
+      "step": 293
+    },
+    {
+      "epoch": 0.8704663212435233,
+      "grad_norm": 0.7309054136276245,
+      "learning_rate": 7.904778663450324e-06,
+      "loss": 0.6859,
+      "step": 294
+    },
+    {
+      "epoch": 0.8734270910436713,
+      "grad_norm": 0.5517750382423401,
+      "learning_rate": 7.550831895431798e-06,
+      "loss": 0.2878,
+      "step": 295
+    },
+    {
+      "epoch": 0.8763878608438194,
+      "grad_norm": 0.5744172930717468,
+      "learning_rate": 7.204343874439578e-06,
+      "loss": 0.202,
+      "step": 296
+    },
+    {
+      "epoch": 0.8793486306439674,
+      "grad_norm": 0.4893665909767151,
+      "learning_rate": 6.865375481914016e-06,
+      "loss": 0.1466,
+      "step": 297
+    },
+    {
+      "epoch": 0.8823094004441154,
+      "grad_norm": 0.503653347492218,
+      "learning_rate": 6.533986278020876e-06,
+      "loss": 0.1117,
+      "step": 298
+    },
+    {
+      "epoch": 0.8852701702442635,
+      "grad_norm": 0.754020631313324,
+      "learning_rate": 6.210234491186079e-06,
+      "loss": 0.1614,
+      "step": 299
+    },
+    {
+      "epoch": 0.8882309400444115,
+      "grad_norm": 1.9576098918914795,
+      "learning_rate": 5.894177007864271e-06,
+      "loss": 0.5595,
+      "step": 300
+    },
+    {
+      "epoch": 0.8911917098445595,
+      "grad_norm": 0.9034455418586731,
+      "learning_rate": 5.585869362543416e-06,
+      "loss": 1.3168,
+      "step": 301
+    },
+    {
+      "epoch": 0.8941524796447077,
+      "grad_norm": 1.1294760704040527,
+      "learning_rate": 5.285365727986707e-06,
+      "loss": 1.9353,
+      "step": 302
+    },
+    {
+      "epoch": 0.8971132494448557,
+      "grad_norm": 1.308358073234558,
+      "learning_rate": 4.9927189057139665e-06,
+      "loss": 2.1867,
+      "step": 303
+    },
+    {
+      "epoch": 0.9000740192450037,
+      "grad_norm": 1.1667540073394775,
+      "learning_rate": 4.707980316723837e-06,
+      "loss": 2.2503,
+      "step": 304
+    },
+    {
+      "epoch": 0.9030347890451518,
+      "grad_norm": 1.0320072174072266,
+      "learning_rate": 4.4311999924586065e-06,
+      "loss": 1.5944,
+      "step": 305
+    },
+    {
+      "epoch": 0.9059955588452998,
+      "grad_norm": 1.141965627670288,
+      "learning_rate": 4.16242656601315e-06,
+      "loss": 1.8177,
+      "step": 306
+    },
+    {
+      "epoch": 0.9089563286454478,
+      "grad_norm": 0.9115905165672302,
+      "learning_rate": 3.901707263589671e-06,
+      "loss": 1.436,
+      "step": 307
+    },
+    {
+      "epoch": 0.9119170984455959,
+      "grad_norm": 0.9493201375007629,
+      "learning_rate": 3.6490878961994878e-06,
+      "loss": 1.9393,
+      "step": 308
+    },
+    {
+      "epoch": 0.9148778682457439,
+      "grad_norm": 1.2201560735702515,
+      "learning_rate": 3.4046128516136755e-06,
+      "loss": 2.0329,
+      "step": 309
+    },
+    {
+      "epoch": 0.9178386380458919,
+      "grad_norm": 1.1415530443191528,
+      "learning_rate": 3.1683250865636114e-06,
+      "loss": 1.7484,
+      "step": 310
+    },
+    {
+      "epoch": 0.92079940784604,
+      "grad_norm": 1.055849313735962,
+      "learning_rate": 2.9402661191930804e-06,
+      "loss": 1.7875,
+      "step": 311
+    },
+    {
+      "epoch": 0.923760177646188,
+      "grad_norm": 0.9362370371818542,
+      "learning_rate": 2.7204760217631074e-06,
+      "loss": 1.2803,
+      "step": 312
+    },
+    {
+      "epoch": 0.926720947446336,
+      "grad_norm": 1.1764779090881348,
+      "learning_rate": 2.5089934136108664e-06,
+      "loss": 1.6859,
+      "step": 313
+    },
+    {
+      "epoch": 0.9296817172464841,
+      "grad_norm": 1.2218987941741943,
+      "learning_rate": 2.30585545436387e-06,
+      "loss": 2.025,
+      "step": 314
+    },
+    {
+      "epoch": 0.9326424870466321,
+      "grad_norm": 1.0182489156723022,
+      "learning_rate": 2.1110978374106192e-06,
+      "loss": 1.3178,
+      "step": 315
+    },
+    {
+      "epoch": 0.9356032568467801,
+      "grad_norm": 1.1746158599853516,
+      "learning_rate": 1.9247547836289793e-06,
+      "loss": 2.0641,
+      "step": 316
+    },
+    {
+      "epoch": 0.9385640266469282,
+      "grad_norm": 1.0268217325210571,
+      "learning_rate": 1.7468590353731495e-06,
+      "loss": 1.5677,
+      "step": 317
+    },
+    {
+      "epoch": 0.9415247964470762,
+      "grad_norm": 1.1525732278823853,
+      "learning_rate": 1.5774418507205679e-06,
+      "loss": 1.8057,
+      "step": 318
+    },
+    {
+      "epoch": 0.9444855662472242,
+      "grad_norm": 0.9357332587242126,
+      "learning_rate": 1.4165329979794973e-06,
+      "loss": 1.3016,
+      "step": 319
+    },
+    {
+      "epoch": 0.9474463360473723,
+      "grad_norm": 0.9553401470184326,
+      "learning_rate": 1.2641607504584928e-06,
+      "loss": 1.5646,
+      "step": 320
+    },
+    {
+      "epoch": 0.9504071058475203,
+      "grad_norm": 0.9898755550384521,
+      "learning_rate": 1.1203518814984214e-06,
+      "loss": 1.5453,
+      "step": 321
+    },
+    {
+      "epoch": 0.9533678756476683,
+      "grad_norm": 1.019761085510254,
+      "learning_rate": 9.851316597681958e-07,
+      "loss": 1.5678,
+      "step": 322
+    },
+    {
+      "epoch": 0.9563286454478165,
+      "grad_norm": 1.0154638290405273,
+      "learning_rate": 8.585238448247435e-07,
+      "loss": 1.6164,
+      "step": 323
+    },
+    {
+      "epoch": 0.9592894152479645,
+      "grad_norm": 1.1107819080352783,
+      "learning_rate": 7.405506829382735e-07,
+      "loss": 1.8083,
+      "step": 324
+    },
+    {
+      "epoch": 0.9622501850481125,
+      "grad_norm": 1.2360882759094238,
+      "learning_rate": 6.312329031833319e-07,
+      "loss": 2.0795,
+      "step": 325
+    },
+    {
+      "epoch": 0.9652109548482606,
+      "grad_norm": 0.8985037207603455,
+      "learning_rate": 5.305897137965199e-07,
+      "loss": 1.1981,
+      "step": 326
+    },
+    {
+      "epoch": 0.9681717246484086,
+      "grad_norm": 1.0836095809936523,
+      "learning_rate": 4.386387988014273e-07,
+      "loss": 2.1089,
+      "step": 327
+    },
+    {
+      "epoch": 0.9711324944485566,
+      "grad_norm": 1.0523475408554077,
+      "learning_rate": 3.553963149013295e-07,
+      "loss": 1.8185,
+      "step": 328
+    },
+    {
+      "epoch": 0.9740932642487047,
+      "grad_norm": 1.2510104179382324,
+      "learning_rate": 2.808768886403301e-07,
+      "loss": 2.1252,
+      "step": 329
+    },
+    {
+      "epoch": 0.9770540340488527,
+      "grad_norm": 1.1012005805969238,
+      "learning_rate": 2.1509361383330596e-07,
+      "loss": 1.8081,
+      "step": 330
+    },
+    {
+      "epoch": 0.9800148038490007,
+      "grad_norm": 0.6544106602668762,
+      "learning_rate": 1.580580492652084e-07,
+      "loss": 0.4715,
+      "step": 331
+    },
+    {
+      "epoch": 0.9829755736491488,
+      "grad_norm": 0.6254527568817139,
+      "learning_rate": 1.0978021666005478e-07,
+      "loss": 0.4152,
+      "step": 332
+    },
+    {
+      "epoch": 0.9859363434492968,
+      "grad_norm": 0.6432924866676331,
+      "learning_rate": 7.02685989200258e-08,
+      "loss": 0.3737,
+      "step": 333
+    },
+    {
+      "epoch": 0.9888971132494448,
+      "grad_norm": 0.595024585723877,
+      "learning_rate": 3.953013863490784e-08,
+      "loss": 0.2546,
+      "step": 334
+    },
+    {
+      "epoch": 0.9918578830495929,
+      "grad_norm": 0.4974139332771301,
+      "learning_rate": 1.7570236862241017e-08,
+      "loss": 0.2191,
+      "step": 335
+    },
+    {
+      "epoch": 0.9948186528497409,
+      "grad_norm": 0.5824573636054993,
+      "learning_rate": 4.392752178278281e-09,
+      "loss": 0.1276,
+      "step": 336
+    },
+    {
+      "epoch": 0.997779422649889,
+      "grad_norm": 0.7297351360321045,
+      "learning_rate": 0.0,
+      "loss": 0.1176,
+      "step": 337
     }
   ],
   "logging_steps": 1,
@@ -1694,12 +2380,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 2.142671224386355e+16,
+  "total_flos": 3.0172541179920384e+16,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null