Nike-Hanmatheekuna commited on
Commit
f92cdae
·
verified ·
1 Parent(s): b81fe19

Model save

Browse files
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 2.9978431433840766,
3
  "total_flos": 1.9278929080237425e+18,
4
- "train_loss": 0.3391870107895778,
5
- "train_runtime": 31542.9957,
6
  "train_samples": 63982,
7
- "train_samples_per_second": 6.085,
8
  "train_steps_per_second": 0.095
9
  }
 
1
  {
2
  "epoch": 2.9978431433840766,
3
  "total_flos": 1.9278929080237425e+18,
4
+ "train_loss": 0.3418351112304626,
5
+ "train_runtime": 31635.6273,
6
  "train_samples": 63982,
7
+ "train_samples_per_second": 6.067,
8
  "train_steps_per_second": 0.095
9
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a48d694a29804eb9d12b8ced279b542d707186f3daf6cff8c8c5d9200f14e2b4
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e94664ad14df07d014ff7c61583ac9ecf893ed67ff31b8657ed54fcdcfd667d9
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c34e245f8cd6490cc3b360786a826fa9fc1b394bcf65b2261dc6bf1c0ae754bb
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbd7af2e261a5615659698f29ea28cc68f7d338f40cbec3bfe3daa558a1ed97c
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1bfca41224c5e7da9448e00e5bb4cb8cb21aee969e78f1af1e873f2a17dc499
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:390b29ca3b5036d1e837b0029938c275b1c4d41bff8f3b200fb239d2f03db503
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32a2539719eb5119644cea1e70c1291b614162016924ff31584f03183f2a6cb2
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3eb9a2caf299496568bba1836c833f5455ac8f8079bf142a3912ac77eb9c2f75
3
  size 1168138808
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 2.9978431433840766,
3
  "total_flos": 1.9278929080237425e+18,
4
- "train_loss": 0.3391870107895778,
5
- "train_runtime": 31542.9957,
6
  "train_samples": 63982,
7
- "train_samples_per_second": 6.085,
8
  "train_steps_per_second": 0.095
9
  }
 
1
  {
2
  "epoch": 2.9978431433840766,
3
  "total_flos": 1.9278929080237425e+18,
4
+ "train_loss": 0.3418351112304626,
5
+ "train_runtime": 31635.6273,
6
  "train_samples": 63982,
7
+ "train_samples_per_second": 6.067,
8
  "train_steps_per_second": 0.095
9
  }
trainer_state.json CHANGED
@@ -17,424 +17,424 @@
17
  },
18
  {
19
  "epoch": 0.0500140664561908,
20
- "grad_norm": 6.46875,
21
  "learning_rate": 3.3333333333333333e-06,
22
- "loss": 1.2485,
23
  "step": 50
24
  },
25
  {
26
  "epoch": 0.1000281329123816,
27
- "grad_norm": 3.296875,
28
  "learning_rate": 6.666666666666667e-06,
29
- "loss": 0.6295,
30
  "step": 100
31
  },
32
  {
33
  "epoch": 0.1500421993685724,
34
- "grad_norm": 3.40625,
35
  "learning_rate": 1e-05,
36
- "loss": 0.567,
37
  "step": 150
38
  },
39
  {
40
  "epoch": 0.2000562658247632,
41
- "grad_norm": 3.890625,
42
  "learning_rate": 1.3333333333333333e-05,
43
- "loss": 0.6028,
44
  "step": 200
45
  },
46
  {
47
  "epoch": 0.25007033228095404,
48
- "grad_norm": 2.84375,
49
  "learning_rate": 1.6666666666666667e-05,
50
- "loss": 0.6082,
51
  "step": 250
52
  },
53
  {
54
  "epoch": 0.3000843987371448,
55
- "grad_norm": 2.96875,
56
  "learning_rate": 2e-05,
57
- "loss": 0.5956,
58
  "step": 300
59
  },
60
  {
61
  "epoch": 0.35009846519333565,
62
- "grad_norm": 2.203125,
63
  "learning_rate": 1.9983043934122208e-05,
64
- "loss": 0.5838,
65
  "step": 350
66
  },
67
  {
68
  "epoch": 0.4001125316495264,
69
- "grad_norm": 2.5625,
70
  "learning_rate": 1.9932233238122834e-05,
71
- "loss": 0.5634,
72
  "step": 400
73
  },
74
  {
75
  "epoch": 0.45012659810571726,
76
  "grad_norm": 2.0,
77
  "learning_rate": 1.984774022190361e-05,
78
- "loss": 0.5472,
79
  "step": 450
80
  },
81
  {
82
  "epoch": 0.5001406645619081,
83
- "grad_norm": 1.84375,
84
  "learning_rate": 1.972985141929439e-05,
85
- "loss": 0.5359,
86
  "step": 500
87
  },
88
  {
89
  "epoch": 0.5501547310180989,
90
- "grad_norm": 1.9375,
91
  "learning_rate": 1.9578966616355823e-05,
92
- "loss": 0.5243,
93
  "step": 550
94
  },
95
  {
96
  "epoch": 0.6001687974742896,
97
- "grad_norm": 1.9453125,
98
  "learning_rate": 1.9395597495619634e-05,
99
- "loss": 0.5194,
100
  "step": 600
101
  },
102
  {
103
  "epoch": 0.6501828639304804,
104
- "grad_norm": 1.578125,
105
  "learning_rate": 1.918036590086405e-05,
106
- "loss": 0.5046,
107
  "step": 650
108
  },
109
  {
110
  "epoch": 0.7001969303866713,
111
- "grad_norm": 1.7265625,
112
  "learning_rate": 1.8934001728309003e-05,
113
- "loss": 0.5029,
114
  "step": 700
115
  },
116
  {
117
  "epoch": 0.7502109968428621,
118
- "grad_norm": 1.703125,
119
  "learning_rate": 1.865734045138245e-05,
120
- "loss": 0.4931,
121
  "step": 750
122
  },
123
  {
124
  "epoch": 0.8002250632990529,
125
- "grad_norm": 1.5625,
126
  "learning_rate": 1.8351320287451865e-05,
127
- "loss": 0.4897,
128
  "step": 800
129
  },
130
  {
131
  "epoch": 0.8502391297552436,
132
- "grad_norm": 1.390625,
133
  "learning_rate": 1.8016979016129164e-05,
134
- "loss": 0.4807,
135
  "step": 850
136
  },
137
  {
138
  "epoch": 0.9002531962114345,
139
- "grad_norm": 1.515625,
140
  "learning_rate": 1.7655450459938786e-05,
141
- "loss": 0.4724,
142
  "step": 900
143
  },
144
  {
145
  "epoch": 0.9502672626676253,
146
- "grad_norm": 1.5,
147
  "learning_rate": 1.726796063928382e-05,
148
- "loss": 0.4654,
149
  "step": 950
150
  },
151
  {
152
  "epoch": 1.0002813291238162,
153
- "grad_norm": 1.4296875,
154
  "learning_rate": 1.6855823614749474e-05,
155
- "loss": 0.4645,
156
  "step": 1000
157
  },
158
  {
159
  "epoch": 1.050295395580007,
160
- "grad_norm": 1.3359375,
161
  "learning_rate": 1.6420437030843482e-05,
162
- "loss": 0.3193,
163
  "step": 1050
164
  },
165
  {
166
  "epoch": 1.1003094620361977,
167
- "grad_norm": 1.3125,
168
  "learning_rate": 1.5963277376285646e-05,
169
- "loss": 0.3168,
170
  "step": 1100
171
  },
172
  {
173
  "epoch": 1.1503235284923885,
174
- "grad_norm": 1.3046875,
175
  "learning_rate": 1.5485894976919836e-05,
176
- "loss": 0.3189,
177
  "step": 1150
178
  },
179
  {
180
  "epoch": 1.2003375949485793,
181
- "grad_norm": 1.3984375,
182
  "learning_rate": 1.4989908738228567e-05,
183
- "loss": 0.3143,
184
  "step": 1200
185
  },
186
  {
187
  "epoch": 1.25035166140477,
188
- "grad_norm": 1.3515625,
189
  "learning_rate": 1.4477000655279376e-05,
190
- "loss": 0.3163,
191
  "step": 1250
192
  },
193
  {
194
  "epoch": 1.3003657278609608,
195
- "grad_norm": 1.34375,
196
  "learning_rate": 1.394891010872102e-05,
197
- "loss": 0.3129,
198
  "step": 1300
199
  },
200
  {
201
  "epoch": 1.3503797943171518,
202
- "grad_norm": 1.40625,
203
  "learning_rate": 1.3407427966172866e-05,
204
- "loss": 0.3133,
205
  "step": 1350
206
  },
207
  {
208
  "epoch": 1.4003938607733426,
209
- "grad_norm": 1.3515625,
210
  "learning_rate": 1.2854390509011061e-05,
211
- "loss": 0.3103,
212
  "step": 1400
213
  },
214
  {
215
  "epoch": 1.4504079272295334,
216
- "grad_norm": 1.3203125,
217
  "learning_rate": 1.2291673205146908e-05,
218
- "loss": 0.3051,
219
  "step": 1450
220
  },
221
  {
222
  "epoch": 1.5004219936857242,
223
- "grad_norm": 1.3046875,
224
  "learning_rate": 1.1721184348915384e-05,
225
- "loss": 0.3043,
226
  "step": 1500
227
  },
228
  {
229
  "epoch": 1.550436060141915,
230
- "grad_norm": 1.2734375,
231
  "learning_rate": 1.1144858589642251e-05,
232
- "loss": 0.2998,
233
  "step": 1550
234
  },
235
  {
236
  "epoch": 1.6004501265981057,
237
- "grad_norm": 1.3046875,
238
  "learning_rate": 1.0564650370835772e-05,
239
- "loss": 0.2983,
240
  "step": 1600
241
  },
242
  {
243
  "epoch": 1.6504641930542965,
244
- "grad_norm": 1.34375,
245
  "learning_rate": 9.982527302252135e-06,
246
- "loss": 0.297,
247
  "step": 1650
248
  },
249
  {
250
  "epoch": 1.7004782595104873,
251
  "grad_norm": 1.359375,
252
  "learning_rate": 9.40046348731131e-06,
253
- "loss": 0.2918,
254
  "step": 1700
255
  },
256
  {
257
  "epoch": 1.750492325966678,
258
- "grad_norm": 1.40625,
259
  "learning_rate": 8.820432828491542e-06,
260
- "loss": 0.2919,
261
  "step": 1750
262
  },
263
  {
264
  "epoch": 1.8005063924228688,
265
- "grad_norm": 1.34375,
266
  "learning_rate": 8.244402333405252e-06,
267
- "loss": 0.2872,
268
  "step": 1800
269
  },
270
  {
271
  "epoch": 1.8505204588790596,
272
- "grad_norm": 1.328125,
273
  "learning_rate": 7.674325444256899e-06,
274
- "loss": 0.2857,
275
  "step": 1850
276
  },
277
  {
278
  "epoch": 1.9005345253352506,
279
- "grad_norm": 1.390625,
280
  "learning_rate": 7.112135413304042e-06,
281
- "loss": 0.2821,
282
  "step": 1900
283
  },
284
  {
285
  "epoch": 1.9505485917914414,
286
- "grad_norm": 1.2421875,
287
  "learning_rate": 6.55973874678682e-06,
288
- "loss": 0.2812,
289
  "step": 1950
290
  },
291
  {
292
  "epoch": 2.0005626582476324,
293
- "grad_norm": 1.09375,
294
  "learning_rate": 6.0190087395588596e-06,
295
- "loss": 0.2739,
296
  "step": 2000
297
  },
298
  {
299
  "epoch": 2.050576724703823,
300
- "grad_norm": 1.2265625,
301
  "learning_rate": 5.491779122345093e-06,
302
- "loss": 0.1498,
303
  "step": 2050
304
  },
305
  {
306
  "epoch": 2.100590791160014,
307
- "grad_norm": 1.1796875,
308
  "learning_rate": 4.979837843169959e-06,
309
- "loss": 0.1477,
310
  "step": 2100
311
  },
312
  {
313
  "epoch": 2.1506048576162047,
314
- "grad_norm": 1.1953125,
315
  "learning_rate": 4.484921004044509e-06,
316
- "loss": 0.1482,
317
  "step": 2150
318
  },
319
  {
320
  "epoch": 2.2006189240723955,
321
- "grad_norm": 1.140625,
322
  "learning_rate": 4.008706973474391e-06,
323
- "loss": 0.1483,
324
  "step": 2200
325
  },
326
  {
327
  "epoch": 2.2506329905285862,
328
  "grad_norm": 1.1328125,
329
  "learning_rate": 3.5528106947544626e-06,
330
- "loss": 0.1464,
331
  "step": 2250
332
  },
333
  {
334
  "epoch": 2.300647056984777,
335
- "grad_norm": 1.1953125,
336
  "learning_rate": 3.118778209351808e-06,
337
- "loss": 0.1463,
338
  "step": 2300
339
  },
340
  {
341
  "epoch": 2.350661123440968,
342
- "grad_norm": 1.3125,
343
  "learning_rate": 2.7080814139495402e-06,
344
- "loss": 0.1456,
345
  "step": 2350
346
  },
347
  {
348
  "epoch": 2.4006751898971586,
349
- "grad_norm": 1.3046875,
350
  "learning_rate": 2.322113068931391e-06,
351
- "loss": 0.1453,
352
  "step": 2400
353
  },
354
  {
355
  "epoch": 2.4506892563533493,
356
  "grad_norm": 1.1875,
357
  "learning_rate": 1.9621820752343324e-06,
358
- "loss": 0.145,
359
  "step": 2450
360
  },
361
  {
362
  "epoch": 2.50070332280954,
363
- "grad_norm": 1.203125,
364
  "learning_rate": 1.629509035586484e-06,
365
- "loss": 0.1436,
366
  "step": 2500
367
  },
368
  {
369
  "epoch": 2.550717389265731,
370
- "grad_norm": 1.296875,
371
  "learning_rate": 1.3252221151830513e-06,
372
- "loss": 0.1439,
373
  "step": 2550
374
  },
375
  {
376
  "epoch": 2.6007314557219217,
377
- "grad_norm": 1.203125,
378
  "learning_rate": 1.0503532158376584e-06,
379
- "loss": 0.1432,
380
  "step": 2600
381
  },
382
  {
383
  "epoch": 2.6507455221781124,
384
  "grad_norm": 1.203125,
385
  "learning_rate": 8.058344765833171e-07,
386
- "loss": 0.1442,
387
  "step": 2650
388
  },
389
  {
390
  "epoch": 2.7007595886343037,
391
- "grad_norm": 1.2109375,
392
  "learning_rate": 5.924951125902545e-07,
393
- "loss": 0.1443,
394
  "step": 2700
395
  },
396
  {
397
  "epoch": 2.7507736550904944,
398
- "grad_norm": 1.171875,
399
  "learning_rate": 4.11058603120511e-07,
400
- "loss": 0.1428,
401
  "step": 2750
402
  },
403
  {
404
  "epoch": 2.800787721546685,
405
- "grad_norm": 1.1640625,
406
  "learning_rate": 2.6214023805552826e-07,
407
- "loss": 0.1432,
408
  "step": 2800
409
  },
410
  {
411
  "epoch": 2.850801788002876,
412
- "grad_norm": 1.2578125,
413
  "learning_rate": 1.462450313169983e-07,
414
- "loss": 0.1434,
415
  "step": 2850
416
  },
417
  {
418
  "epoch": 2.9008158544590668,
419
- "grad_norm": 1.1484375,
420
  "learning_rate": 6.376600825699463e-08,
421
- "loss": 0.1427,
422
  "step": 2900
423
  },
424
  {
425
  "epoch": 2.9508299209152575,
426
- "grad_norm": 1.1796875,
427
  "learning_rate": 1.49828728252277e-08,
428
- "loss": 0.1438,
429
  "step": 2950
430
  },
431
  {
432
  "epoch": 2.9978431433840766,
433
  "step": 2997,
434
  "total_flos": 1.9278929080237425e+18,
435
- "train_loss": 0.3391870107895778,
436
- "train_runtime": 31542.9957,
437
- "train_samples_per_second": 6.085,
438
  "train_steps_per_second": 0.095
439
  }
440
  ],
 
17
  },
18
  {
19
  "epoch": 0.0500140664561908,
20
+ "grad_norm": 9.0625,
21
  "learning_rate": 3.3333333333333333e-06,
22
+ "loss": 1.2593,
23
  "step": 50
24
  },
25
  {
26
  "epoch": 0.1000281329123816,
27
+ "grad_norm": 4.1875,
28
  "learning_rate": 6.666666666666667e-06,
29
+ "loss": 0.6353,
30
  "step": 100
31
  },
32
  {
33
  "epoch": 0.1500421993685724,
34
+ "grad_norm": 3.4375,
35
  "learning_rate": 1e-05,
36
+ "loss": 0.5783,
37
  "step": 150
38
  },
39
  {
40
  "epoch": 0.2000562658247632,
41
+ "grad_norm": 3.09375,
42
  "learning_rate": 1.3333333333333333e-05,
43
+ "loss": 0.5614,
44
  "step": 200
45
  },
46
  {
47
  "epoch": 0.25007033228095404,
48
+ "grad_norm": 3.421875,
49
  "learning_rate": 1.6666666666666667e-05,
50
+ "loss": 0.6185,
51
  "step": 250
52
  },
53
  {
54
  "epoch": 0.3000843987371448,
55
+ "grad_norm": 7.5,
56
  "learning_rate": 2e-05,
57
+ "loss": 0.6262,
58
  "step": 300
59
  },
60
  {
61
  "epoch": 0.35009846519333565,
62
+ "grad_norm": 2.421875,
63
  "learning_rate": 1.9983043934122208e-05,
64
+ "loss": 0.5878,
65
  "step": 350
66
  },
67
  {
68
  "epoch": 0.4001125316495264,
69
+ "grad_norm": 3.171875,
70
  "learning_rate": 1.9932233238122834e-05,
71
+ "loss": 0.575,
72
  "step": 400
73
  },
74
  {
75
  "epoch": 0.45012659810571726,
76
  "grad_norm": 2.0,
77
  "learning_rate": 1.984774022190361e-05,
78
+ "loss": 0.5526,
79
  "step": 450
80
  },
81
  {
82
  "epoch": 0.5001406645619081,
83
+ "grad_norm": 1.8203125,
84
  "learning_rate": 1.972985141929439e-05,
85
+ "loss": 0.54,
86
  "step": 500
87
  },
88
  {
89
  "epoch": 0.5501547310180989,
90
+ "grad_norm": 2.03125,
91
  "learning_rate": 1.9578966616355823e-05,
92
+ "loss": 0.527,
93
  "step": 550
94
  },
95
  {
96
  "epoch": 0.6001687974742896,
97
+ "grad_norm": 2.046875,
98
  "learning_rate": 1.9395597495619634e-05,
99
+ "loss": 0.5229,
100
  "step": 600
101
  },
102
  {
103
  "epoch": 0.6501828639304804,
104
+ "grad_norm": 1.734375,
105
  "learning_rate": 1.918036590086405e-05,
106
+ "loss": 0.5062,
107
  "step": 650
108
  },
109
  {
110
  "epoch": 0.7001969303866713,
111
+ "grad_norm": 1.578125,
112
  "learning_rate": 1.8934001728309003e-05,
113
+ "loss": 0.5055,
114
  "step": 700
115
  },
116
  {
117
  "epoch": 0.7502109968428621,
118
+ "grad_norm": 1.7265625,
119
  "learning_rate": 1.865734045138245e-05,
120
+ "loss": 0.4947,
121
  "step": 750
122
  },
123
  {
124
  "epoch": 0.8002250632990529,
125
+ "grad_norm": 1.5390625,
126
  "learning_rate": 1.8351320287451865e-05,
127
+ "loss": 0.491,
128
  "step": 800
129
  },
130
  {
131
  "epoch": 0.8502391297552436,
132
+ "grad_norm": 1.4765625,
133
  "learning_rate": 1.8016979016129164e-05,
134
+ "loss": 0.4824,
135
  "step": 850
136
  },
137
  {
138
  "epoch": 0.9002531962114345,
139
+ "grad_norm": 1.53125,
140
  "learning_rate": 1.7655450459938786e-05,
141
+ "loss": 0.4738,
142
  "step": 900
143
  },
144
  {
145
  "epoch": 0.9502672626676253,
146
+ "grad_norm": 1.65625,
147
  "learning_rate": 1.726796063928382e-05,
148
+ "loss": 0.4676,
149
  "step": 950
150
  },
151
  {
152
  "epoch": 1.0002813291238162,
153
+ "grad_norm": 1.40625,
154
  "learning_rate": 1.6855823614749474e-05,
155
+ "loss": 0.4657,
156
  "step": 1000
157
  },
158
  {
159
  "epoch": 1.050295395580007,
160
+ "grad_norm": 1.40625,
161
  "learning_rate": 1.6420437030843482e-05,
162
+ "loss": 0.3223,
163
  "step": 1050
164
  },
165
  {
166
  "epoch": 1.1003094620361977,
167
+ "grad_norm": 1.3359375,
168
  "learning_rate": 1.5963277376285646e-05,
169
+ "loss": 0.3197,
170
  "step": 1100
171
  },
172
  {
173
  "epoch": 1.1503235284923885,
174
+ "grad_norm": 1.2890625,
175
  "learning_rate": 1.5485894976919836e-05,
176
+ "loss": 0.3246,
177
  "step": 1150
178
  },
179
  {
180
  "epoch": 1.2003375949485793,
181
+ "grad_norm": 1.421875,
182
  "learning_rate": 1.4989908738228567e-05,
183
+ "loss": 0.3167,
184
  "step": 1200
185
  },
186
  {
187
  "epoch": 1.25035166140477,
188
+ "grad_norm": 1.3828125,
189
  "learning_rate": 1.4477000655279376e-05,
190
+ "loss": 0.3186,
191
  "step": 1250
192
  },
193
  {
194
  "epoch": 1.3003657278609608,
195
+ "grad_norm": 1.3515625,
196
  "learning_rate": 1.394891010872102e-05,
197
+ "loss": 0.3149,
198
  "step": 1300
199
  },
200
  {
201
  "epoch": 1.3503797943171518,
202
+ "grad_norm": 1.3828125,
203
  "learning_rate": 1.3407427966172866e-05,
204
+ "loss": 0.3164,
205
  "step": 1350
206
  },
207
  {
208
  "epoch": 1.4003938607733426,
209
+ "grad_norm": 1.34375,
210
  "learning_rate": 1.2854390509011061e-05,
211
+ "loss": 0.313,
212
  "step": 1400
213
  },
214
  {
215
  "epoch": 1.4504079272295334,
216
+ "grad_norm": 1.390625,
217
  "learning_rate": 1.2291673205146908e-05,
218
+ "loss": 0.3071,
219
  "step": 1450
220
  },
221
  {
222
  "epoch": 1.5004219936857242,
223
+ "grad_norm": 1.328125,
224
  "learning_rate": 1.1721184348915384e-05,
225
+ "loss": 0.3063,
226
  "step": 1500
227
  },
228
  {
229
  "epoch": 1.550436060141915,
230
+ "grad_norm": 1.2890625,
231
  "learning_rate": 1.1144858589642251e-05,
232
+ "loss": 0.3022,
233
  "step": 1550
234
  },
235
  {
236
  "epoch": 1.6004501265981057,
237
+ "grad_norm": 1.3359375,
238
  "learning_rate": 1.0564650370835772e-05,
239
+ "loss": 0.3006,
240
  "step": 1600
241
  },
242
  {
243
  "epoch": 1.6504641930542965,
244
+ "grad_norm": 1.3359375,
245
  "learning_rate": 9.982527302252135e-06,
246
+ "loss": 0.2998,
247
  "step": 1650
248
  },
249
  {
250
  "epoch": 1.7004782595104873,
251
  "grad_norm": 1.359375,
252
  "learning_rate": 9.40046348731131e-06,
253
+ "loss": 0.2947,
254
  "step": 1700
255
  },
256
  {
257
  "epoch": 1.750492325966678,
258
+ "grad_norm": 1.34375,
259
  "learning_rate": 8.820432828491542e-06,
260
+ "loss": 0.294,
261
  "step": 1750
262
  },
263
  {
264
  "epoch": 1.8005063924228688,
265
+ "grad_norm": 1.375,
266
  "learning_rate": 8.244402333405252e-06,
267
+ "loss": 0.2894,
268
  "step": 1800
269
  },
270
  {
271
  "epoch": 1.8505204588790596,
272
+ "grad_norm": 1.375,
273
  "learning_rate": 7.674325444256899e-06,
274
+ "loss": 0.2879,
275
  "step": 1850
276
  },
277
  {
278
  "epoch": 1.9005345253352506,
279
+ "grad_norm": 1.3671875,
280
  "learning_rate": 7.112135413304042e-06,
281
+ "loss": 0.2842,
282
  "step": 1900
283
  },
284
  {
285
  "epoch": 1.9505485917914414,
286
+ "grad_norm": 1.3125,
287
  "learning_rate": 6.55973874678682e-06,
288
+ "loss": 0.2839,
289
  "step": 1950
290
  },
291
  {
292
  "epoch": 2.0005626582476324,
293
+ "grad_norm": 1.1171875,
294
  "learning_rate": 6.0190087395588596e-06,
295
+ "loss": 0.2766,
296
  "step": 2000
297
  },
298
  {
299
  "epoch": 2.050576724703823,
300
+ "grad_norm": 1.3046875,
301
  "learning_rate": 5.491779122345093e-06,
302
+ "loss": 0.1517,
303
  "step": 2050
304
  },
305
  {
306
  "epoch": 2.100590791160014,
307
+ "grad_norm": 1.1953125,
308
  "learning_rate": 4.979837843169959e-06,
309
+ "loss": 0.1492,
310
  "step": 2100
311
  },
312
  {
313
  "epoch": 2.1506048576162047,
314
+ "grad_norm": 1.203125,
315
  "learning_rate": 4.484921004044509e-06,
316
+ "loss": 0.1494,
317
  "step": 2150
318
  },
319
  {
320
  "epoch": 2.2006189240723955,
321
+ "grad_norm": 1.1484375,
322
  "learning_rate": 4.008706973474391e-06,
323
+ "loss": 0.1498,
324
  "step": 2200
325
  },
326
  {
327
  "epoch": 2.2506329905285862,
328
  "grad_norm": 1.1328125,
329
  "learning_rate": 3.5528106947544626e-06,
330
+ "loss": 0.1482,
331
  "step": 2250
332
  },
333
  {
334
  "epoch": 2.300647056984777,
335
+ "grad_norm": 1.1796875,
336
  "learning_rate": 3.118778209351808e-06,
337
+ "loss": 0.1477,
338
  "step": 2300
339
  },
340
  {
341
  "epoch": 2.350661123440968,
342
+ "grad_norm": 1.234375,
343
  "learning_rate": 2.7080814139495402e-06,
344
+ "loss": 0.1473,
345
  "step": 2350
346
  },
347
  {
348
  "epoch": 2.4006751898971586,
349
+ "grad_norm": 1.2421875,
350
  "learning_rate": 2.322113068931391e-06,
351
+ "loss": 0.147,
352
  "step": 2400
353
  },
354
  {
355
  "epoch": 2.4506892563533493,
356
  "grad_norm": 1.1875,
357
  "learning_rate": 1.9621820752343324e-06,
358
+ "loss": 0.1466,
359
  "step": 2450
360
  },
361
  {
362
  "epoch": 2.50070332280954,
363
+ "grad_norm": 1.21875,
364
  "learning_rate": 1.629509035586484e-06,
365
+ "loss": 0.145,
366
  "step": 2500
367
  },
368
  {
369
  "epoch": 2.550717389265731,
370
+ "grad_norm": 1.4140625,
371
  "learning_rate": 1.3252221151830513e-06,
372
+ "loss": 0.146,
373
  "step": 2550
374
  },
375
  {
376
  "epoch": 2.6007314557219217,
377
+ "grad_norm": 1.15625,
378
  "learning_rate": 1.0503532158376584e-06,
379
+ "loss": 0.1453,
380
  "step": 2600
381
  },
382
  {
383
  "epoch": 2.6507455221781124,
384
  "grad_norm": 1.203125,
385
  "learning_rate": 8.058344765833171e-07,
386
+ "loss": 0.1466,
387
  "step": 2650
388
  },
389
  {
390
  "epoch": 2.7007595886343037,
391
+ "grad_norm": 1.2734375,
392
  "learning_rate": 5.924951125902545e-07,
393
+ "loss": 0.1458,
394
  "step": 2700
395
  },
396
  {
397
  "epoch": 2.7507736550904944,
398
+ "grad_norm": 1.1640625,
399
  "learning_rate": 4.11058603120511e-07,
400
+ "loss": 0.1445,
401
  "step": 2750
402
  },
403
  {
404
  "epoch": 2.800787721546685,
405
+ "grad_norm": 1.1875,
406
  "learning_rate": 2.6214023805552826e-07,
407
+ "loss": 0.1449,
408
  "step": 2800
409
  },
410
  {
411
  "epoch": 2.850801788002876,
412
+ "grad_norm": 1.390625,
413
  "learning_rate": 1.462450313169983e-07,
414
+ "loss": 0.1457,
415
  "step": 2850
416
  },
417
  {
418
  "epoch": 2.9008158544590668,
419
+ "grad_norm": 1.2265625,
420
  "learning_rate": 6.376600825699463e-08,
421
+ "loss": 0.1444,
422
  "step": 2900
423
  },
424
  {
425
  "epoch": 2.9508299209152575,
426
+ "grad_norm": 1.171875,
427
  "learning_rate": 1.49828728252277e-08,
428
+ "loss": 0.1451,
429
  "step": 2950
430
  },
431
  {
432
  "epoch": 2.9978431433840766,
433
  "step": 2997,
434
  "total_flos": 1.9278929080237425e+18,
435
+ "train_loss": 0.3418351112304626,
436
+ "train_runtime": 31635.6273,
437
+ "train_samples_per_second": 6.067,
438
  "train_steps_per_second": 0.095
439
  }
440
  ],