msharma95 committed
Commit 74c8f48 · verified · 1 Parent(s): a79adde

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,28 @@
+ {
+ "_name_or_path": "phueb/BabyBERTa-2",
+ "architectures": [
+ "RobertaForMaskedLM"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "bos_token_id": 3,
+ "classifier_dropout": null,
+ "eos_token_id": 4,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 256,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "layer_norm_eps": 1e-05,
+ "max_position_embeddings": 130,
+ "model_type": "roberta",
+ "num_attention_heads": 8,
+ "num_hidden_layers": 8,
+ "pad_token_id": 1,
+ "position_embedding_type": "absolute",
+ "torch_dtype": "float32",
+ "transformers_version": "4.46.3",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 8192
+ }
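
This config describes a small RoBERTa-style masked language model (8 layers, 8 attention heads, hidden size 256, vocabulary 8192), i.e. the BabyBERTa-2 architecture. A minimal sketch of instantiating that architecture from the file with transformers, assuming config.json has been downloaded to the working directory (the snippet is not part of the commit):

from transformers import RobertaConfig, RobertaForMaskedLM

# Build the architecture described by config.json; weights here are randomly
# initialized, the trained weights live in model.safetensors below.
config = RobertaConfig.from_json_file("config.json")
model = RobertaForMaskedLM(config)

# model.safetensors is ~34 MB of float32, i.e. roughly 8.5M parameters.
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")
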
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ade9f0a6408d9d443f6f358408bab44404bd94b73172863d87c9c44d1cb2650a
+ size 34112288
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef1b9e66a36daf090a8132f3d47570f28809c663e135f1ee1112861b733fee83
+ size 68307834
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eef62f1726f7816d16b679bcd0fa5d5e094f95e050cda7edab9ab319cfc28693
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16810444873b484fc4539f7c992208849486053bc0e4c6d3fd5f116f0345171f
+ size 1064
tokenizer_reference.txt ADDED
@@ -0,0 +1 @@
+ Tokenizer available at: msharma95/babyberta-tokenizer
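
tokenizer_reference.txt points to the matching tokenizer repo rather than bundling tokenizer files in this commit. A minimal usage sketch, assuming the files from this commit sit in a local folder ./checkpoint (a hypothetical path) and that msharma95/babyberta-tokenizer loads with AutoTokenizer:

from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("msharma95/babyberta-tokenizer")
model = AutoModelForMaskedLM.from_pretrained("./checkpoint")  # uses config.json + model.safetensors

# Score a masked token; tokenizer.mask_token avoids hardcoding the mask string.
text = f"The baby {tokenizer.mask_token} with the ball."
inputs = tokenizer(text, return_tensors="pt")
logits = model(**inputs).logits
mask_positions = (inputs.input_ids == tokenizer.mask_token_id)[0]
print(tokenizer.decode(logits[0, mask_positions].argmax(-1)))
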
trainer_state.json ADDED
@@ -0,0 +1,1243 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.7811457624923088,
+ "eval_steps": 1000,
+ "global_step": 55000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.016192234204475534,
+ "grad_norm": 2.899235963821411,
+ "learning_rate": 4.9919038828977625e-05,
+ "loss": 7.6546,
+ "step": 500
+ },
+ {
+ "epoch": 0.03238446840895107,
+ "grad_norm": 3.1383473873138428,
+ "learning_rate": 4.983807765795525e-05,
+ "loss": 7.0771,
+ "step": 1000
+ },
+ {
+ "epoch": 0.03238446840895107,
+ "eval_loss": 10.114262580871582,
+ "eval_runtime": 20.3013,
+ "eval_samples_per_second": 2704.02,
+ "eval_steps_per_second": 169.004,
+ "step": 1000
+ },
+ {
+ "epoch": 0.0485767026134266,
+ "grad_norm": 3.265001058578491,
+ "learning_rate": 4.975711648693287e-05,
+ "loss": 6.9745,
+ "step": 1500
+ },
+ {
+ "epoch": 0.06476893681790213,
+ "grad_norm": 4.482234477996826,
+ "learning_rate": 4.967615531591049e-05,
+ "loss": 6.8578,
+ "step": 2000
+ },
+ {
+ "epoch": 0.06476893681790213,
+ "eval_loss": 10.50239086151123,
+ "eval_runtime": 20.2747,
+ "eval_samples_per_second": 2707.566,
+ "eval_steps_per_second": 169.226,
+ "step": 2000
+ },
+ {
+ "epoch": 0.08096117102237767,
+ "grad_norm": 3.455843925476074,
+ "learning_rate": 4.9595194144888116e-05,
+ "loss": 6.8529,
+ "step": 2500
+ },
+ {
+ "epoch": 0.0971534052268532,
+ "grad_norm": 4.888940811157227,
+ "learning_rate": 4.951423297386574e-05,
+ "loss": 6.7865,
+ "step": 3000
+ },
+ {
+ "epoch": 0.0971534052268532,
+ "eval_loss": 11.206120491027832,
+ "eval_runtime": 20.2787,
+ "eval_samples_per_second": 2707.023,
+ "eval_steps_per_second": 169.192,
+ "step": 3000
+ },
+ {
+ "epoch": 0.11334563943132873,
+ "grad_norm": 2.9272000789642334,
+ "learning_rate": 4.9433271802843354e-05,
+ "loss": 6.7582,
+ "step": 3500
+ },
+ {
+ "epoch": 0.12953787363580427,
+ "grad_norm": 3.68345046043396,
+ "learning_rate": 4.9352310631820984e-05,
+ "loss": 6.6886,
+ "step": 4000
+ },
+ {
+ "epoch": 0.12953787363580427,
+ "eval_loss": 11.919740676879883,
+ "eval_runtime": 20.2985,
+ "eval_samples_per_second": 2704.382,
+ "eval_steps_per_second": 169.027,
+ "step": 4000
+ },
+ {
+ "epoch": 0.14573010784027982,
+ "grad_norm": 3.9165866374969482,
+ "learning_rate": 4.927151138314065e-05,
+ "loss": 6.657,
+ "step": 4500
+ },
+ {
+ "epoch": 0.16192234204475534,
+ "grad_norm": 4.2097015380859375,
+ "learning_rate": 4.9190550212118266e-05,
+ "loss": 6.6567,
+ "step": 5000
+ },
+ {
+ "epoch": 0.16192234204475534,
+ "eval_loss": 11.381248474121094,
+ "eval_runtime": 20.3121,
+ "eval_samples_per_second": 2702.582,
+ "eval_steps_per_second": 168.914,
+ "step": 5000
+ },
+ {
+ "epoch": 0.17811457624923088,
+ "grad_norm": 3.525871753692627,
+ "learning_rate": 4.9109589041095895e-05,
+ "loss": 6.6297,
+ "step": 5500
+ },
+ {
+ "epoch": 0.1943068104537064,
+ "grad_norm": 5.512066841125488,
+ "learning_rate": 4.902862787007351e-05,
+ "loss": 6.6049,
+ "step": 6000
+ },
+ {
+ "epoch": 0.1943068104537064,
+ "eval_loss": 11.815380096435547,
+ "eval_runtime": 20.3269,
+ "eval_samples_per_second": 2700.605,
+ "eval_steps_per_second": 168.791,
+ "step": 6000
+ },
+ {
+ "epoch": 0.21049904465818195,
+ "grad_norm": 6.878340244293213,
+ "learning_rate": 4.8947828621393185e-05,
+ "loss": 6.5587,
+ "step": 6500
+ },
+ {
+ "epoch": 0.22669127886265747,
+ "grad_norm": 3.512578248977661,
+ "learning_rate": 4.886686745037081e-05,
+ "loss": 6.5451,
+ "step": 7000
+ },
+ {
+ "epoch": 0.22669127886265747,
+ "eval_loss": 11.993084907531738,
+ "eval_runtime": 20.3217,
+ "eval_samples_per_second": 2701.3,
+ "eval_steps_per_second": 168.834,
+ "step": 7000
+ },
+ {
+ "epoch": 0.24288351306713302,
+ "grad_norm": 4.394603729248047,
+ "learning_rate": 4.8786068201690474e-05,
+ "loss": 6.5283,
+ "step": 7500
+ },
+ {
+ "epoch": 0.25907574727160854,
+ "grad_norm": 3.8106327056884766,
+ "learning_rate": 4.8705107030668096e-05,
+ "loss": 6.5246,
+ "step": 8000
+ },
+ {
+ "epoch": 0.25907574727160854,
+ "eval_loss": 12.192233085632324,
+ "eval_runtime": 20.3069,
+ "eval_samples_per_second": 2703.263,
+ "eval_steps_per_second": 168.957,
+ "step": 8000
+ },
+ {
+ "epoch": 0.27526798147608406,
+ "grad_norm": 4.158311367034912,
+ "learning_rate": 4.862414585964571e-05,
+ "loss": 6.4727,
+ "step": 8500
+ },
+ {
+ "epoch": 0.29146021568055963,
+ "grad_norm": 3.797197103500366,
+ "learning_rate": 4.854318468862334e-05,
+ "loss": 6.4594,
+ "step": 9000
+ },
+ {
+ "epoch": 0.29146021568055963,
+ "eval_loss": 12.250760078430176,
+ "eval_runtime": 20.2943,
+ "eval_samples_per_second": 2704.945,
+ "eval_steps_per_second": 169.062,
+ "step": 9000
+ },
+ {
+ "epoch": 0.30765244988503515,
+ "grad_norm": 4.4851908683776855,
+ "learning_rate": 4.8462223517600964e-05,
+ "loss": 6.4441,
+ "step": 9500
+ },
+ {
+ "epoch": 0.32384468408951067,
+ "grad_norm": 5.158078670501709,
+ "learning_rate": 4.838142426892063e-05,
+ "loss": 6.4537,
+ "step": 10000
+ },
+ {
+ "epoch": 0.32384468408951067,
+ "eval_loss": 12.235699653625488,
+ "eval_runtime": 20.3426,
+ "eval_samples_per_second": 2698.522,
+ "eval_steps_per_second": 168.661,
+ "step": 10000
+ },
+ {
+ "epoch": 0.3400369182939862,
+ "grad_norm": 4.838212966918945,
+ "learning_rate": 4.8300463097898254e-05,
+ "loss": 6.4313,
+ "step": 10500
+ },
+ {
+ "epoch": 0.35622915249846177,
+ "grad_norm": 4.873743534088135,
+ "learning_rate": 4.821950192687587e-05,
+ "loss": 6.4375,
+ "step": 11000
+ },
+ {
+ "epoch": 0.35622915249846177,
+ "eval_loss": 13.167688369750977,
+ "eval_runtime": 20.3858,
+ "eval_samples_per_second": 2692.805,
+ "eval_steps_per_second": 168.303,
+ "step": 11000
+ },
+ {
+ "epoch": 0.3724213867029373,
+ "grad_norm": 5.022457122802734,
+ "learning_rate": 4.813854075585349e-05,
+ "loss": 6.4249,
+ "step": 11500
+ },
+ {
+ "epoch": 0.3886136209074128,
+ "grad_norm": 4.3410964012146,
+ "learning_rate": 4.805774150717316e-05,
+ "loss": 6.3944,
+ "step": 12000
+ },
+ {
+ "epoch": 0.3886136209074128,
+ "eval_loss": 12.753792762756348,
+ "eval_runtime": 20.3234,
+ "eval_samples_per_second": 2701.08,
+ "eval_steps_per_second": 168.821,
+ "step": 12000
+ },
+ {
+ "epoch": 0.4048058551118883,
+ "grad_norm": 4.282736301422119,
+ "learning_rate": 4.797678033615078e-05,
+ "loss": 6.3343,
+ "step": 12500
+ },
+ {
+ "epoch": 0.4209980893163639,
+ "grad_norm": 4.2941484451293945,
+ "learning_rate": 4.789581916512841e-05,
+ "loss": 6.3576,
+ "step": 13000
+ },
+ {
+ "epoch": 0.4209980893163639,
+ "eval_loss": 14.182137489318848,
+ "eval_runtime": 20.2915,
+ "eval_samples_per_second": 2705.322,
+ "eval_steps_per_second": 169.086,
+ "step": 13000
+ },
+ {
+ "epoch": 0.4371903235208394,
+ "grad_norm": 4.584780693054199,
+ "learning_rate": 4.7814857994106027e-05,
+ "loss": 6.3133,
+ "step": 13500
+ },
+ {
+ "epoch": 0.45338255772531494,
+ "grad_norm": 6.5496134757995605,
+ "learning_rate": 4.77340587454257e-05,
+ "loss": 6.3042,
+ "step": 14000
+ },
+ {
+ "epoch": 0.45338255772531494,
+ "eval_loss": 13.6593656539917,
+ "eval_runtime": 20.2751,
+ "eval_samples_per_second": 2707.503,
+ "eval_steps_per_second": 169.222,
+ "step": 14000
+ },
+ {
+ "epoch": 0.46957479192979046,
+ "grad_norm": 4.188933849334717,
+ "learning_rate": 4.7653097574403316e-05,
+ "loss": 6.2764,
+ "step": 14500
+ },
+ {
+ "epoch": 0.48576702613426603,
+ "grad_norm": 4.540693283081055,
+ "learning_rate": 4.757213640338094e-05,
+ "loss": 6.261,
+ "step": 15000
+ },
+ {
+ "epoch": 0.48576702613426603,
+ "eval_loss": 13.340327262878418,
+ "eval_runtime": 20.2957,
+ "eval_samples_per_second": 2704.755,
+ "eval_steps_per_second": 169.05,
+ "step": 15000
+ },
+ {
+ "epoch": 0.5019592603387415,
+ "grad_norm": 4.090181827545166,
+ "learning_rate": 4.749117523235857e-05,
+ "loss": 6.2382,
+ "step": 15500
+ },
+ {
+ "epoch": 0.5181514945432171,
+ "grad_norm": 5.604825973510742,
+ "learning_rate": 4.7410214061336184e-05,
+ "loss": 6.2225,
+ "step": 16000
+ },
+ {
+ "epoch": 0.5181514945432171,
+ "eval_loss": 14.243009567260742,
+ "eval_runtime": 20.3274,
+ "eval_samples_per_second": 2700.537,
+ "eval_steps_per_second": 168.787,
+ "step": 16000
+ },
+ {
+ "epoch": 0.5343437287476926,
+ "grad_norm": 5.334819316864014,
+ "learning_rate": 4.732941481265585e-05,
+ "loss": 6.2051,
+ "step": 16500
+ },
+ {
+ "epoch": 0.5505359629521681,
+ "grad_norm": 5.907509803771973,
+ "learning_rate": 4.724845364163347e-05,
+ "loss": 6.1475,
+ "step": 17000
+ },
+ {
+ "epoch": 0.5505359629521681,
+ "eval_loss": 14.208978652954102,
+ "eval_runtime": 20.3636,
+ "eval_samples_per_second": 2695.742,
+ "eval_steps_per_second": 168.487,
+ "step": 17000
+ },
+ {
+ "epoch": 0.5667281971566437,
+ "grad_norm": 5.113221645355225,
+ "learning_rate": 4.7167492470611096e-05,
+ "loss": 6.1765,
+ "step": 17500
+ },
+ {
+ "epoch": 0.5829204313611193,
+ "grad_norm": 5.8240461349487305,
+ "learning_rate": 4.708653129958872e-05,
+ "loss": 6.1567,
+ "step": 18000
+ },
+ {
+ "epoch": 0.5829204313611193,
+ "eval_loss": 14.480341911315918,
+ "eval_runtime": 20.3098,
+ "eval_samples_per_second": 2702.888,
+ "eval_steps_per_second": 168.934,
+ "step": 18000
+ },
+ {
+ "epoch": 0.5991126655655947,
+ "grad_norm": 4.948818683624268,
+ "learning_rate": 4.7005732050908385e-05,
+ "loss": 6.1198,
+ "step": 18500
+ },
+ {
+ "epoch": 0.6153048997700703,
+ "grad_norm": 5.95670223236084,
+ "learning_rate": 4.692477087988601e-05,
+ "loss": 6.1038,
+ "step": 19000
+ },
+ {
+ "epoch": 0.6153048997700703,
+ "eval_loss": 14.150490760803223,
+ "eval_runtime": 20.2957,
+ "eval_samples_per_second": 2704.759,
+ "eval_steps_per_second": 169.051,
+ "step": 19000
+ },
+ {
+ "epoch": 0.6314971339745458,
+ "grad_norm": 7.957947731018066,
+ "learning_rate": 4.684380970886363e-05,
+ "loss": 6.0658,
+ "step": 19500
+ },
+ {
+ "epoch": 0.6476893681790213,
+ "grad_norm": 6.398547649383545,
+ "learning_rate": 4.676284853784125e-05,
+ "loss": 6.007,
+ "step": 20000
+ },
+ {
+ "epoch": 0.6476893681790213,
+ "eval_loss": 13.682233810424805,
+ "eval_runtime": 20.2919,
+ "eval_samples_per_second": 2705.271,
+ "eval_steps_per_second": 169.083,
+ "step": 20000
+ },
+ {
+ "epoch": 0.6638816023834969,
+ "grad_norm": 7.564326763153076,
+ "learning_rate": 4.6681887366818875e-05,
+ "loss": 6.0063,
+ "step": 20500
+ },
+ {
+ "epoch": 0.6800738365879724,
+ "grad_norm": 6.847624778747559,
+ "learning_rate": 4.66009261957965e-05,
+ "loss": 5.9528,
+ "step": 21000
+ },
+ {
+ "epoch": 0.6800738365879724,
+ "eval_loss": 13.830253601074219,
+ "eval_runtime": 20.3183,
+ "eval_samples_per_second": 2701.748,
+ "eval_steps_per_second": 168.862,
+ "step": 21000
+ },
+ {
+ "epoch": 0.696266070792448,
+ "grad_norm": 6.5545125007629395,
+ "learning_rate": 4.651996502477412e-05,
+ "loss": 5.9257,
+ "step": 21500
+ },
+ {
+ "epoch": 0.7124583049969235,
+ "grad_norm": 8.006608009338379,
+ "learning_rate": 4.643900385375174e-05,
+ "loss": 5.9066,
+ "step": 22000
+ },
+ {
+ "epoch": 0.7124583049969235,
+ "eval_loss": 14.178547859191895,
+ "eval_runtime": 20.3334,
+ "eval_samples_per_second": 2699.748,
+ "eval_steps_per_second": 168.737,
+ "step": 22000
+ },
+ {
+ "epoch": 0.728650539201399,
+ "grad_norm": 6.9318766593933105,
+ "learning_rate": 4.635820460507141e-05,
+ "loss": 5.8562,
+ "step": 22500
+ },
+ {
+ "epoch": 0.7448427734058746,
+ "grad_norm": 6.815021991729736,
+ "learning_rate": 4.627724343404903e-05,
+ "loss": 5.7984,
+ "step": 23000
+ },
+ {
+ "epoch": 0.7448427734058746,
+ "eval_loss": 14.005717277526855,
+ "eval_runtime": 20.3208,
+ "eval_samples_per_second": 2701.413,
+ "eval_steps_per_second": 168.841,
+ "step": 23000
+ },
+ {
+ "epoch": 0.76103500761035,
+ "grad_norm": 7.4206976890563965,
+ "learning_rate": 4.6196282263026655e-05,
+ "loss": 5.7574,
+ "step": 23500
+ },
+ {
+ "epoch": 0.7772272418148256,
+ "grad_norm": 7.273144721984863,
+ "learning_rate": 4.611532109200428e-05,
+ "loss": 5.7247,
+ "step": 24000
+ },
+ {
+ "epoch": 0.7772272418148256,
+ "eval_loss": 13.36209774017334,
+ "eval_runtime": 20.3187,
+ "eval_samples_per_second": 2701.693,
+ "eval_steps_per_second": 168.859,
+ "step": 24000
+ },
+ {
+ "epoch": 0.7934194760193012,
+ "grad_norm": 8.667037963867188,
+ "learning_rate": 4.6034521843323944e-05,
+ "loss": 5.682,
+ "step": 24500
+ },
+ {
+ "epoch": 0.8096117102237766,
+ "grad_norm": 9.467761039733887,
+ "learning_rate": 4.595356067230157e-05,
+ "loss": 5.6262,
+ "step": 25000
+ },
+ {
+ "epoch": 0.8096117102237766,
+ "eval_loss": 14.30241584777832,
+ "eval_runtime": 20.3265,
+ "eval_samples_per_second": 2700.664,
+ "eval_steps_per_second": 168.795,
+ "step": 25000
+ },
+ {
+ "epoch": 0.8258039444282522,
+ "grad_norm": 7.378128528594971,
+ "learning_rate": 4.587259950127919e-05,
+ "loss": 5.5776,
+ "step": 25500
+ },
+ {
+ "epoch": 0.8419961786327278,
+ "grad_norm": 8.246053695678711,
+ "learning_rate": 4.5791638330256806e-05,
+ "loss": 5.5251,
+ "step": 26000
+ },
+ {
+ "epoch": 0.8419961786327278,
+ "eval_loss": 14.098834037780762,
+ "eval_runtime": 20.3189,
+ "eval_samples_per_second": 2701.67,
+ "eval_steps_per_second": 168.857,
+ "step": 26000
+ },
+ {
+ "epoch": 0.8581884128372033,
+ "grad_norm": 6.545131206512451,
+ "learning_rate": 4.571083908157648e-05,
+ "loss": 5.4848,
+ "step": 26500
+ },
+ {
+ "epoch": 0.8743806470416788,
+ "grad_norm": 7.735929489135742,
+ "learning_rate": 4.56298779105541e-05,
+ "loss": 5.4504,
+ "step": 27000
+ },
+ {
+ "epoch": 0.8743806470416788,
+ "eval_loss": 13.383395195007324,
+ "eval_runtime": 20.3058,
+ "eval_samples_per_second": 2703.417,
+ "eval_steps_per_second": 168.967,
+ "step": 27000
+ },
+ {
+ "epoch": 0.8905728812461543,
+ "grad_norm": 7.53142786026001,
+ "learning_rate": 4.5548916739531724e-05,
+ "loss": 5.4094,
+ "step": 27500
+ },
+ {
+ "epoch": 0.9067651154506299,
+ "grad_norm": 6.753902912139893,
+ "learning_rate": 4.546795556850935e-05,
+ "loss": 5.382,
+ "step": 28000
+ },
+ {
+ "epoch": 0.9067651154506299,
+ "eval_loss": 13.9662446975708,
+ "eval_runtime": 20.3511,
+ "eval_samples_per_second": 2697.397,
+ "eval_steps_per_second": 168.59,
+ "step": 28000
+ },
+ {
+ "epoch": 0.9229573496551055,
+ "grad_norm": 7.170286655426025,
+ "learning_rate": 4.538715631982901e-05,
+ "loss": 5.3786,
+ "step": 28500
+ },
+ {
+ "epoch": 0.9391495838595809,
+ "grad_norm": 7.388674736022949,
+ "learning_rate": 4.5306195148806636e-05,
+ "loss": 5.3468,
+ "step": 29000
+ },
+ {
+ "epoch": 0.9391495838595809,
+ "eval_loss": 14.00700855255127,
+ "eval_runtime": 20.2945,
+ "eval_samples_per_second": 2704.924,
+ "eval_steps_per_second": 169.061,
+ "step": 29000
+ },
+ {
+ "epoch": 0.9553418180640565,
+ "grad_norm": 8.680002212524414,
+ "learning_rate": 4.522523397778426e-05,
+ "loss": 5.3159,
+ "step": 29500
+ },
+ {
+ "epoch": 0.9715340522685321,
+ "grad_norm": 6.966347694396973,
+ "learning_rate": 4.514427280676188e-05,
+ "loss": 5.3321,
+ "step": 30000
+ },
+ {
+ "epoch": 0.9715340522685321,
+ "eval_loss": 13.2510986328125,
+ "eval_runtime": 20.2734,
+ "eval_samples_per_second": 2707.742,
+ "eval_steps_per_second": 169.237,
+ "step": 30000
+ },
+ {
+ "epoch": 0.9877262864730075,
+ "grad_norm": 7.675726413726807,
+ "learning_rate": 4.506347355808155e-05,
+ "loss": 5.2547,
+ "step": 30500
+ },
+ {
+ "epoch": 1.003918520677483,
+ "grad_norm": 9.16435432434082,
+ "learning_rate": 4.4982512387059164e-05,
+ "loss": 5.2597,
+ "step": 31000
+ },
+ {
+ "epoch": 1.003918520677483,
+ "eval_loss": 13.722345352172852,
+ "eval_runtime": 20.3123,
+ "eval_samples_per_second": 2702.549,
+ "eval_steps_per_second": 168.912,
+ "step": 31000
+ },
+ {
+ "epoch": 1.0201107548819586,
+ "grad_norm": 8.053180694580078,
+ "learning_rate": 4.490155121603679e-05,
+ "loss": 5.2168,
+ "step": 31500
+ },
+ {
+ "epoch": 1.0363029890864341,
+ "grad_norm": 7.8324432373046875,
+ "learning_rate": 4.4820590045014416e-05,
+ "loss": 5.2229,
+ "step": 32000
+ },
+ {
+ "epoch": 1.0363029890864341,
+ "eval_loss": 13.410867691040039,
+ "eval_runtime": 20.3235,
+ "eval_samples_per_second": 2701.057,
+ "eval_steps_per_second": 168.819,
+ "step": 32000
+ },
+ {
+ "epoch": 1.0524952232909097,
+ "grad_norm": 6.322490692138672,
+ "learning_rate": 4.473979079633408e-05,
+ "loss": 5.1849,
+ "step": 32500
+ },
+ {
+ "epoch": 1.0686874574953853,
+ "grad_norm": 7.511089324951172,
+ "learning_rate": 4.4658829625311705e-05,
+ "loss": 5.1838,
+ "step": 33000
+ },
+ {
+ "epoch": 1.0686874574953853,
+ "eval_loss": 13.21849250793457,
+ "eval_runtime": 20.3207,
+ "eval_samples_per_second": 2701.433,
+ "eval_steps_per_second": 168.843,
+ "step": 33000
+ },
+ {
+ "epoch": 1.0848796916998606,
+ "grad_norm": 7.725784778594971,
+ "learning_rate": 4.457786845428932e-05,
+ "loss": 5.1687,
+ "step": 33500
+ },
+ {
+ "epoch": 1.1010719259043362,
+ "grad_norm": 7.167893886566162,
+ "learning_rate": 4.449690728326695e-05,
+ "loss": 5.157,
+ "step": 34000
+ },
+ {
+ "epoch": 1.1010719259043362,
+ "eval_loss": 13.764806747436523,
+ "eval_runtime": 20.3095,
+ "eval_samples_per_second": 2702.924,
+ "eval_steps_per_second": 168.936,
+ "step": 34000
+ },
+ {
+ "epoch": 1.1172641601088118,
+ "grad_norm": 9.298649787902832,
+ "learning_rate": 4.441610803458661e-05,
+ "loss": 5.1152,
+ "step": 34500
+ },
+ {
+ "epoch": 1.1334563943132874,
+ "grad_norm": 7.694969654083252,
+ "learning_rate": 4.433514686356424e-05,
+ "loss": 5.0736,
+ "step": 35000
+ },
+ {
+ "epoch": 1.1334563943132874,
+ "eval_loss": 13.989611625671387,
+ "eval_runtime": 20.324,
+ "eval_samples_per_second": 2700.988,
+ "eval_steps_per_second": 168.815,
+ "step": 35000
+ },
+ {
+ "epoch": 1.149648628517763,
+ "grad_norm": 7.617096424102783,
+ "learning_rate": 4.425418569254186e-05,
+ "loss": 5.0596,
+ "step": 35500
+ },
+ {
+ "epoch": 1.1658408627222383,
+ "grad_norm": 6.822023391723633,
+ "learning_rate": 4.417322452151948e-05,
+ "loss": 5.0544,
+ "step": 36000
+ },
+ {
+ "epoch": 1.1658408627222383,
+ "eval_loss": 14.167996406555176,
+ "eval_runtime": 20.2953,
+ "eval_samples_per_second": 2704.815,
+ "eval_steps_per_second": 169.054,
+ "step": 36000
+ },
+ {
+ "epoch": 1.1820330969267139,
+ "grad_norm": 6.709229946136475,
+ "learning_rate": 4.40922633504971e-05,
+ "loss": 5.0675,
+ "step": 36500
+ },
+ {
+ "epoch": 1.1982253311311895,
+ "grad_norm": 8.224237442016602,
+ "learning_rate": 4.401162602415881e-05,
+ "loss": 4.9965,
+ "step": 37000
+ },
+ {
+ "epoch": 1.1982253311311895,
+ "eval_loss": 14.570647239685059,
+ "eval_runtime": 20.3304,
+ "eval_samples_per_second": 2700.144,
+ "eval_steps_per_second": 168.762,
+ "step": 37000
+ },
+ {
+ "epoch": 1.214417565335665,
+ "grad_norm": 8.49647045135498,
+ "learning_rate": 4.393066485313644e-05,
+ "loss": 4.9891,
+ "step": 37500
+ },
+ {
+ "epoch": 1.2306097995401406,
+ "grad_norm": 6.622528553009033,
+ "learning_rate": 4.384970368211406e-05,
+ "loss": 4.9896,
+ "step": 38000
+ },
+ {
+ "epoch": 1.2306097995401406,
+ "eval_loss": 14.330415725708008,
+ "eval_runtime": 20.3174,
+ "eval_samples_per_second": 2701.865,
+ "eval_steps_per_second": 168.87,
+ "step": 38000
+ },
+ {
+ "epoch": 1.2468020337446162,
+ "grad_norm": 7.867303371429443,
+ "learning_rate": 4.376874251109168e-05,
+ "loss": 4.9851,
+ "step": 38500
+ },
+ {
+ "epoch": 1.2629942679490918,
+ "grad_norm": 7.329415321350098,
+ "learning_rate": 4.368778134006931e-05,
+ "loss": 4.969,
+ "step": 39000
+ },
+ {
+ "epoch": 1.2629942679490918,
+ "eval_loss": 14.989124298095703,
+ "eval_runtime": 20.3265,
+ "eval_samples_per_second": 2700.663,
+ "eval_steps_per_second": 168.795,
+ "step": 39000
+ },
+ {
+ "epoch": 1.279186502153567,
+ "grad_norm": 8.647744178771973,
+ "learning_rate": 4.3606820169046924e-05,
+ "loss": 4.9547,
+ "step": 39500
+ },
+ {
+ "epoch": 1.2953787363580427,
+ "grad_norm": 8.578520774841309,
+ "learning_rate": 4.352585899802455e-05,
+ "loss": 4.9569,
+ "step": 40000
+ },
+ {
+ "epoch": 1.2953787363580427,
+ "eval_loss": 14.283174514770508,
+ "eval_runtime": 20.3168,
+ "eval_samples_per_second": 2701.954,
+ "eval_steps_per_second": 168.875,
+ "step": 40000
+ },
+ {
+ "epoch": 1.3115709705625183,
+ "grad_norm": 8.312915802001953,
+ "learning_rate": 4.3444897827002176e-05,
+ "loss": 4.9647,
+ "step": 40500
+ },
+ {
+ "epoch": 1.3277632047669938,
+ "grad_norm": 9.59231185913086,
+ "learning_rate": 4.3364098578321836e-05,
+ "loss": 4.9462,
+ "step": 41000
+ },
+ {
+ "epoch": 1.3277632047669938,
+ "eval_loss": 14.299793243408203,
+ "eval_runtime": 20.2951,
+ "eval_samples_per_second": 2704.839,
+ "eval_steps_per_second": 169.055,
+ "step": 41000
+ },
+ {
+ "epoch": 1.3439554389714692,
+ "grad_norm": 7.844625949859619,
+ "learning_rate": 4.328313740729946e-05,
+ "loss": 4.9147,
+ "step": 41500
+ },
+ {
+ "epoch": 1.3601476731759448,
+ "grad_norm": 6.845381736755371,
+ "learning_rate": 4.320217623627708e-05,
+ "loss": 4.914,
+ "step": 42000
+ },
+ {
+ "epoch": 1.3601476731759448,
+ "eval_loss": 14.049765586853027,
+ "eval_runtime": 20.2836,
+ "eval_samples_per_second": 2706.371,
+ "eval_steps_per_second": 169.151,
+ "step": 42000
+ },
+ {
+ "epoch": 1.3763399073804203,
+ "grad_norm": 7.644216537475586,
+ "learning_rate": 4.3121215065254704e-05,
+ "loss": 4.9066,
+ "step": 42500
+ },
+ {
+ "epoch": 1.392532141584896,
+ "grad_norm": 7.950870513916016,
+ "learning_rate": 4.304041581657438e-05,
+ "loss": 4.8716,
+ "step": 43000
+ },
+ {
+ "epoch": 1.392532141584896,
+ "eval_loss": 14.33547306060791,
+ "eval_runtime": 20.3142,
+ "eval_samples_per_second": 2702.295,
+ "eval_steps_per_second": 168.897,
+ "step": 43000
+ },
+ {
+ "epoch": 1.4087243757893715,
+ "grad_norm": 7.638858318328857,
+ "learning_rate": 4.295945464555199e-05,
+ "loss": 4.8557,
+ "step": 43500
+ },
+ {
+ "epoch": 1.424916609993847,
+ "grad_norm": 9.173840522766113,
+ "learning_rate": 4.2878493474529616e-05,
+ "loss": 4.8394,
+ "step": 44000
+ },
+ {
+ "epoch": 1.424916609993847,
+ "eval_loss": 13.498611450195312,
+ "eval_runtime": 20.3548,
+ "eval_samples_per_second": 2696.904,
+ "eval_steps_per_second": 168.56,
+ "step": 44000
+ },
+ {
+ "epoch": 1.4411088441983224,
+ "grad_norm": 8.224470138549805,
+ "learning_rate": 4.279753230350724e-05,
+ "loss": 4.8216,
+ "step": 44500
+ },
+ {
+ "epoch": 1.457301078402798,
+ "grad_norm": 7.497710704803467,
+ "learning_rate": 4.2716733054826905e-05,
+ "loss": 4.8237,
+ "step": 45000
+ },
+ {
+ "epoch": 1.457301078402798,
+ "eval_loss": 13.823565483093262,
+ "eval_runtime": 20.2695,
+ "eval_samples_per_second": 2708.257,
+ "eval_steps_per_second": 169.269,
+ "step": 45000
+ },
+ {
+ "epoch": 1.4734933126072736,
+ "grad_norm": 6.961141586303711,
+ "learning_rate": 4.2635771883804535e-05,
+ "loss": 4.8119,
+ "step": 45500
+ },
+ {
+ "epoch": 1.4896855468117491,
+ "grad_norm": 6.934360504150391,
+ "learning_rate": 4.255481071278215e-05,
+ "loss": 4.7972,
+ "step": 46000
+ },
+ {
+ "epoch": 1.4896855468117491,
+ "eval_loss": 14.0670804977417,
+ "eval_runtime": 20.2732,
+ "eval_samples_per_second": 2707.759,
+ "eval_steps_per_second": 169.238,
+ "step": 46000
+ },
+ {
+ "epoch": 1.5058777810162245,
+ "grad_norm": 7.600714206695557,
+ "learning_rate": 4.247384954175977e-05,
+ "loss": 4.8131,
+ "step": 46500
+ },
+ {
+ "epoch": 1.5220700152207,
+ "grad_norm": 8.766816139221191,
+ "learning_rate": 4.239305029307944e-05,
+ "loss": 4.778,
+ "step": 47000
+ },
+ {
+ "epoch": 1.5220700152207,
+ "eval_loss": 13.546905517578125,
+ "eval_runtime": 20.3029,
+ "eval_samples_per_second": 2703.795,
+ "eval_steps_per_second": 168.99,
+ "step": 47000
+ },
+ {
+ "epoch": 1.5382622494251756,
+ "grad_norm": 8.299079895019531,
+ "learning_rate": 4.231208912205706e-05,
+ "loss": 4.7336,
+ "step": 47500
+ },
+ {
+ "epoch": 1.5544544836296512,
+ "grad_norm": 8.333426475524902,
+ "learning_rate": 4.2231127951034685e-05,
+ "loss": 4.7541,
+ "step": 48000
+ },
+ {
+ "epoch": 1.5544544836296512,
+ "eval_loss": 13.711235046386719,
+ "eval_runtime": 20.3064,
+ "eval_samples_per_second": 2703.338,
+ "eval_steps_per_second": 168.962,
+ "step": 48000
+ },
+ {
+ "epoch": 1.5706467178341268,
+ "grad_norm": 7.705515384674072,
+ "learning_rate": 4.215016678001231e-05,
+ "loss": 4.7901,
+ "step": 48500
+ },
+ {
+ "epoch": 1.5868389520386024,
+ "grad_norm": 6.669991493225098,
+ "learning_rate": 4.206920560898993e-05,
+ "loss": 4.7544,
+ "step": 49000
+ },
+ {
+ "epoch": 1.5868389520386024,
+ "eval_loss": 13.069415092468262,
+ "eval_runtime": 20.3123,
+ "eval_samples_per_second": 2702.548,
+ "eval_steps_per_second": 168.912,
+ "step": 49000
+ },
+ {
+ "epoch": 1.603031186243078,
+ "grad_norm": 6.958492279052734,
+ "learning_rate": 4.19884063603096e-05,
+ "loss": 4.7175,
+ "step": 49500
+ },
+ {
+ "epoch": 1.6192234204475535,
+ "grad_norm": 9.244942665100098,
+ "learning_rate": 4.190744518928722e-05,
+ "loss": 4.7092,
+ "step": 50000
+ },
+ {
+ "epoch": 1.6192234204475535,
+ "eval_loss": 13.0497465133667,
+ "eval_runtime": 20.3211,
+ "eval_samples_per_second": 2701.382,
+ "eval_steps_per_second": 168.839,
+ "step": 50000
+ },
+ {
+ "epoch": 1.6354156546520289,
+ "grad_norm": 8.372320175170898,
+ "learning_rate": 4.182648401826484e-05,
+ "loss": 4.7401,
+ "step": 50500
+ },
+ {
+ "epoch": 1.6516078888565044,
+ "grad_norm": 7.373213768005371,
+ "learning_rate": 4.1745522847242465e-05,
+ "loss": 4.7252,
+ "step": 51000
+ },
+ {
+ "epoch": 1.6516078888565044,
+ "eval_loss": 13.589052200317383,
+ "eval_runtime": 20.3142,
+ "eval_samples_per_second": 2702.302,
+ "eval_steps_per_second": 168.897,
+ "step": 51000
+ },
+ {
+ "epoch": 1.6678001230609798,
+ "grad_norm": 9.22149658203125,
+ "learning_rate": 4.166472359856213e-05,
+ "loss": 4.6795,
+ "step": 51500
+ },
+ {
+ "epoch": 1.6839923572654554,
+ "grad_norm": 5.994777202606201,
+ "learning_rate": 4.1583762427539754e-05,
+ "loss": 4.7021,
+ "step": 52000
+ },
+ {
+ "epoch": 1.6839923572654554,
+ "eval_loss": 13.31000804901123,
+ "eval_runtime": 20.2902,
+ "eval_samples_per_second": 2705.498,
+ "eval_steps_per_second": 169.097,
+ "step": 52000
+ },
+ {
+ "epoch": 1.700184591469931,
+ "grad_norm": 8.134748458862305,
+ "learning_rate": 4.150280125651738e-05,
+ "loss": 4.7013,
+ "step": 52500
+ },
+ {
+ "epoch": 1.7163768256744065,
+ "grad_norm": 8.395650863647461,
+ "learning_rate": 4.1421840085495e-05,
+ "loss": 4.6976,
+ "step": 53000
+ },
+ {
+ "epoch": 1.7163768256744065,
+ "eval_loss": 13.707457542419434,
+ "eval_runtime": 20.3052,
+ "eval_samples_per_second": 2703.49,
+ "eval_steps_per_second": 168.971,
+ "step": 53000
+ },
+ {
+ "epoch": 1.732569059878882,
+ "grad_norm": 9.668176651000977,
+ "learning_rate": 4.1341040836814666e-05,
+ "loss": 4.6889,
+ "step": 53500
+ },
+ {
+ "epoch": 1.7487612940833577,
+ "grad_norm": 7.348442554473877,
+ "learning_rate": 4.126007966579229e-05,
+ "loss": 4.6465,
+ "step": 54000
+ },
+ {
+ "epoch": 1.7487612940833577,
+ "eval_loss": 13.285728454589844,
+ "eval_runtime": 20.3053,
+ "eval_samples_per_second": 2703.483,
+ "eval_steps_per_second": 168.971,
+ "step": 54000
+ },
+ {
+ "epoch": 1.7649535282878333,
+ "grad_norm": 7.7653889656066895,
+ "learning_rate": 4.117911849476991e-05,
+ "loss": 4.6537,
+ "step": 54500
+ },
+ {
+ "epoch": 1.7811457624923088,
+ "grad_norm": 8.290285110473633,
+ "learning_rate": 4.1098157323747534e-05,
+ "loss": 4.6468,
+ "step": 55000
+ },
+ {
+ "epoch": 1.7811457624923088,
+ "eval_loss": 13.134732246398926,
+ "eval_runtime": 20.3013,
+ "eval_samples_per_second": 2704.018,
+ "eval_steps_per_second": 169.004,
+ "step": 55000
+ }
+ ],
+ "logging_steps": 500,
+ "max_steps": 308790,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 5000,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4320645675220992.0,
+ "train_batch_size": 16,
+ "trial_name": null,
+ "trial_params": null
+ }
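
trainer_state.json above is a standard Trainer state dump: log_history interleaves training entries (loss, grad_norm, learning_rate every 500 steps) with evaluation entries (eval_loss every 1000 steps) up to global step 55000 of max_steps 308790. A minimal sketch for pulling the two curves back out, assuming the file has been saved locally (the snippet is not part of the commit):

import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("train:", train[0], "->", train[-1])  # (500, 7.6546) -> (55000, 4.6468)
print("eval: ", evals[0], "->", evals[-1])  # (1000, 10.11...) -> (55000, 13.13...)
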
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:944a39203183756575dda434a2ab60e2e05d73559e20ed8b7c45f952dee1c16d
+ size 5304