jssky committed
Commit cc9dc89 · verified · Parent: d343f9e

Training in progress, step 150, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:11706a87f4363fd35201e9a40e571ce07265530a124372f0051dff9e3be47838
+ oid sha256:4fb3a1c034c512f3602bb7eb9746a25ce70d3c1c2463811bd29bb2e039a6ac96
  size 47724600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d313f22afb42c637a37e19c640b88d40cdf181bd3eb3d56fac5b29d54cf53a09
+ oid sha256:6a4ee21da39425c292a1b2d46b9c4ec0a440be08be6cebb00065c8e33bd4773e
  size 25331516
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:46690724a684250634c18844184a4bf69054530dc15f342014b5e58c75744ff8
+ oid sha256:3bbf00cc7d26b5ba1f1bfe59564ee6b340d81d2d6e92ca2595dc7bce3ba71015
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:68af516a780edfd16c81c1cfe58920445340f6c2b487a237f423e8a752d36b94
+ oid sha256:ae4f1bd750c09fc9bb727cae976f56e1bbe0dff5c4d4e1a6eec209a810ae59b2
  size 1064
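
The four files above are tracked with Git LFS, so this commit only rewrites their three-line pointers (version, oid, size); the binary payloads live in LFS storage and are fetched with git lfs pull. A minimal sketch, assuming the checkpoint has been pulled locally, of checking that each downloaded file matches the sha256 oid recorded in its updated pointer (the expected digests are copied from this commit; the helper itself is only illustrative):

import hashlib
from pathlib import Path

# sha256 oids taken from the updated LFS pointers in this commit.
EXPECTED = {
    "last-checkpoint/adapter_model.safetensors": "4fb3a1c034c512f3602bb7eb9746a25ce70d3c1c2463811bd29bb2e039a6ac96",
    "last-checkpoint/optimizer.pt": "6a4ee21da39425c292a1b2d46b9c4ec0a440be08be6cebb00065c8e33bd4773e",
    "last-checkpoint/rng_state.pth": "3bbf00cc7d26b5ba1f1bfe59564ee6b340d81d2d6e92ca2595dc7bce3ba71015",
    "last-checkpoint/scheduler.pt": "ae4f1bd750c09fc9bb727cae976f56e1bbe0dff5c4d4e1a6eec209a810ae59b2",
}

def sha256_of(path: Path, chunk: int = 1 << 20) -> str:
    # Stream in chunks so the ~47 MB adapter and ~25 MB optimizer state
    # never have to be read into memory at once.
    h = hashlib.sha256()
    with path.open("rb") as f:
        for block in iter(lambda: f.read(chunk), b""):
            h.update(block)
    return h.hexdigest()

for name, oid in EXPECTED.items():
    ok = sha256_of(Path(name)) == oid
    print(("OK      " if ok else "MISMATCH"), name)

If a file still contains the pointer text rather than the binary payload, the hash will not match; that usually just means git lfs pull has not yet been run for this revision.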
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.5050505050505051,
+ "epoch": 0.7575757575757576,
  "eval_steps": 50,
- "global_step": 100,
+ "global_step": 150,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -723,6 +723,364 @@
  "eval_samples_per_second": 6.997,
  "eval_steps_per_second": 3.498,
  "step": 100
+ },
+ {
+ "epoch": 0.51010101010101,
+ "grad_norm": 11.24027156829834,
+ "learning_rate": 0.00010501108017871192,
+ "loss": 0.4537,
+ "step": 101
+ },
+ {
+ "epoch": 0.5151515151515151,
+ "grad_norm": 14.357158660888672,
+ "learning_rate": 0.00010334149770076747,
+ "loss": 0.4567,
+ "step": 102
+ },
+ {
+ "epoch": 0.5202020202020202,
+ "grad_norm": 12.492400169372559,
+ "learning_rate": 0.00010167098215093009,
+ "loss": 0.4684,
+ "step": 103
+ },
+ {
+ "epoch": 0.5252525252525253,
+ "grad_norm": 11.547194480895996,
+ "learning_rate": 0.0001,
+ "loss": 0.4345,
+ "step": 104
+ },
+ {
+ "epoch": 0.5303030303030303,
+ "grad_norm": 13.756988525390625,
+ "learning_rate": 9.83290178490699e-05,
+ "loss": 0.4538,
+ "step": 105
+ },
+ {
+ "epoch": 0.5353535353535354,
+ "grad_norm": 13.91006088256836,
+ "learning_rate": 9.665850229923258e-05,
+ "loss": 0.3966,
+ "step": 106
+ },
+ {
+ "epoch": 0.5404040404040404,
+ "grad_norm": 13.23215103149414,
+ "learning_rate": 9.498891982128809e-05,
+ "loss": 0.3571,
+ "step": 107
+ },
+ {
+ "epoch": 0.5454545454545454,
+ "grad_norm": 13.87959098815918,
+ "learning_rate": 9.332073662548784e-05,
+ "loss": 0.4389,
+ "step": 108
+ },
+ {
+ "epoch": 0.5505050505050505,
+ "grad_norm": 12.538737297058105,
+ "learning_rate": 9.165441853135104e-05,
+ "loss": 0.4184,
+ "step": 109
+ },
+ {
+ "epoch": 0.5555555555555556,
+ "grad_norm": 23.718013763427734,
+ "learning_rate": 8.999043083759017e-05,
+ "loss": 0.3822,
+ "step": 110
+ },
+ {
+ "epoch": 0.5606060606060606,
+ "grad_norm": 70.29689025878906,
+ "learning_rate": 8.832923819218238e-05,
+ "loss": 0.3815,
+ "step": 111
+ },
+ {
+ "epoch": 0.5656565656565656,
+ "grad_norm": 11.808393478393555,
+ "learning_rate": 8.667130446262214e-05,
+ "loss": 0.362,
+ "step": 112
+ },
+ {
+ "epoch": 0.5707070707070707,
+ "grad_norm": 13.301114082336426,
+ "learning_rate": 8.501709260639186e-05,
+ "loss": 0.5234,
+ "step": 113
+ },
+ {
+ "epoch": 0.5757575757575758,
+ "grad_norm": 11.385530471801758,
+ "learning_rate": 8.336706454168701e-05,
+ "loss": 0.546,
+ "step": 114
+ },
+ {
+ "epoch": 0.5808080808080808,
+ "grad_norm": 17.040523529052734,
+ "learning_rate": 8.172168101843099e-05,
+ "loss": 0.4199,
+ "step": 115
+ },
+ {
+ "epoch": 0.5858585858585859,
+ "grad_norm": 18.70421028137207,
+ "learning_rate": 8.008140148961641e-05,
+ "loss": 0.5603,
+ "step": 116
+ },
+ {
+ "epoch": 0.5909090909090909,
+ "grad_norm": 13.10011100769043,
+ "learning_rate": 7.844668398300865e-05,
+ "loss": 0.6309,
+ "step": 117
+ },
+ {
+ "epoch": 0.5959595959595959,
+ "grad_norm": 13.03792953491211,
+ "learning_rate": 7.681798497324716e-05,
+ "loss": 0.4718,
+ "step": 118
+ },
+ {
+ "epoch": 0.601010101010101,
+ "grad_norm": 14.050755500793457,
+ "learning_rate": 7.519575925438067e-05,
+ "loss": 0.372,
+ "step": 119
+ },
+ {
+ "epoch": 0.6060606060606061,
+ "grad_norm": 13.505302429199219,
+ "learning_rate": 7.358045981287141e-05,
+ "loss": 0.5525,
+ "step": 120
+ },
+ {
+ "epoch": 0.6111111111111112,
+ "grad_norm": 13.572732925415039,
+ "learning_rate": 7.197253770110438e-05,
+ "loss": 0.4607,
+ "step": 121
+ },
+ {
+ "epoch": 0.6161616161616161,
+ "grad_norm": 22.72136878967285,
+ "learning_rate": 7.037244191143661e-05,
+ "loss": 0.632,
+ "step": 122
+ },
+ {
+ "epoch": 0.6212121212121212,
+ "grad_norm": 12.22665786743164,
+ "learning_rate": 6.878061925082137e-05,
+ "loss": 0.429,
+ "step": 123
+ },
+ {
+ "epoch": 0.6262626262626263,
+ "grad_norm": 10.075366973876953,
+ "learning_rate": 6.719751421604309e-05,
+ "loss": 0.4592,
+ "step": 124
+ },
+ {
+ "epoch": 0.6313131313131313,
+ "grad_norm": 16.798250198364258,
+ "learning_rate": 6.562356886959704e-05,
+ "loss": 0.368,
+ "step": 125
+ },
+ {
+ "epoch": 0.6363636363636364,
+ "grad_norm": 10.359375953674316,
+ "learning_rate": 6.405922271624874e-05,
+ "loss": 0.2825,
+ "step": 126
+ },
+ {
+ "epoch": 0.6414141414141414,
+ "grad_norm": 12.54139232635498,
+ "learning_rate": 6.250491258030791e-05,
+ "loss": 0.479,
+ "step": 127
+ },
+ {
+ "epoch": 0.6464646464646465,
+ "grad_norm": 15.544010162353516,
+ "learning_rate": 6.0961072483650526e-05,
+ "loss": 0.4355,
+ "step": 128
+ },
+ {
+ "epoch": 0.6515151515151515,
+ "grad_norm": 13.150715827941895,
+ "learning_rate": 5.9428133524523646e-05,
+ "loss": 0.3998,
+ "step": 129
+ },
+ {
+ "epoch": 0.6565656565656566,
+ "grad_norm": 11.6818265914917,
+ "learning_rate": 5.790652375716652e-05,
+ "loss": 0.4009,
+ "step": 130
+ },
+ {
+ "epoch": 0.6616161616161617,
+ "grad_norm": 16.20279884338379,
+ "learning_rate": 5.639666807228175e-05,
+ "loss": 0.5703,
+ "step": 131
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 15.14217758178711,
+ "learning_rate": 5.48989880783898e-05,
+ "loss": 0.4918,
+ "step": 132
+ },
+ {
+ "epoch": 0.6717171717171717,
+ "grad_norm": 17.63346290588379,
+ "learning_rate": 5.341390198410019e-05,
+ "loss": 0.4146,
+ "step": 133
+ },
+ {
+ "epoch": 0.6767676767676768,
+ "grad_norm": 17.88300323486328,
+ "learning_rate": 5.1941824481331626e-05,
+ "loss": 0.4518,
+ "step": 134
+ },
+ {
+ "epoch": 0.6818181818181818,
+ "grad_norm": 19.94394302368164,
+ "learning_rate": 5.0483166629514654e-05,
+ "loss": 0.6016,
+ "step": 135
+ },
+ {
+ "epoch": 0.6868686868686869,
+ "grad_norm": 14.756821632385254,
+ "learning_rate": 4.903833574080825e-05,
+ "loss": 0.3996,
+ "step": 136
+ },
+ {
+ "epoch": 0.6919191919191919,
+ "grad_norm": 19.392858505249023,
+ "learning_rate": 4.760773526636315e-05,
+ "loss": 0.6053,
+ "step": 137
+ },
+ {
+ "epoch": 0.696969696969697,
+ "grad_norm": 7.531655788421631,
+ "learning_rate": 4.6191764683662744e-05,
+ "loss": 0.2299,
+ "step": 138
+ },
+ {
+ "epoch": 0.702020202020202,
+ "grad_norm": 23.472288131713867,
+ "learning_rate": 4.479081938497435e-05,
+ "loss": 0.5437,
+ "step": 139
+ },
+ {
+ "epoch": 0.7070707070707071,
+ "grad_norm": 17.57956314086914,
+ "learning_rate": 4.340529056694047e-05,
+ "loss": 0.5943,
+ "step": 140
+ },
+ {
+ "epoch": 0.7121212121212122,
+ "grad_norm": 11.99634075164795,
+ "learning_rate": 4.2035565121342246e-05,
+ "loss": 0.2477,
+ "step": 141
+ },
+ {
+ "epoch": 0.7171717171717171,
+ "grad_norm": 34.67549133300781,
+ "learning_rate": 4.0682025527064486e-05,
+ "loss": 0.4986,
+ "step": 142
+ },
+ {
+ "epoch": 0.7222222222222222,
+ "grad_norm": 12.304537773132324,
+ "learning_rate": 3.934504974329326e-05,
+ "loss": 0.5063,
+ "step": 143
+ },
+ {
+ "epoch": 0.7272727272727273,
+ "grad_norm": 14.232462882995605,
+ "learning_rate": 3.802501110397553e-05,
+ "loss": 0.5271,
+ "step": 144
+ },
+ {
+ "epoch": 0.7323232323232324,
+ "grad_norm": 23.888545989990234,
+ "learning_rate": 3.672227821357014e-05,
+ "loss": 0.6568,
+ "step": 145
+ },
+ {
+ "epoch": 0.7373737373737373,
+ "grad_norm": 16.624494552612305,
+ "learning_rate": 3.543721484411976e-05,
+ "loss": 0.4932,
+ "step": 146
+ },
+ {
+ "epoch": 0.7424242424242424,
+ "grad_norm": 15.917621612548828,
+ "learning_rate": 3.4170179833671846e-05,
+ "loss": 0.6934,
+ "step": 147
+ },
+ {
+ "epoch": 0.7474747474747475,
+ "grad_norm": 9.06427001953125,
+ "learning_rate": 3.292152698607768e-05,
+ "loss": 0.3849,
+ "step": 148
+ },
+ {
+ "epoch": 0.7525252525252525,
+ "grad_norm": 17.903718948364258,
+ "learning_rate": 3.169160497219692e-05,
+ "loss": 0.3646,
+ "step": 149
+ },
+ {
+ "epoch": 0.7575757575757576,
+ "grad_norm": 11.98536205291748,
+ "learning_rate": 3.0480757232535772e-05,
+ "loss": 0.46,
+ "step": 150
+ },
+ {
+ "epoch": 0.7575757575757576,
+ "eval_loss": 0.3067511022090912,
+ "eval_runtime": 12.2523,
+ "eval_samples_per_second": 6.856,
+ "eval_steps_per_second": 3.428,
+ "step": 150
  }
  ],
  "logging_steps": 1,
@@ -742,7 +1100,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.42112968409088e+16,
+ "total_flos": 2.13169452613632e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null