jssky committed · verified
Commit f6f154f · 1 Parent(s): 367e19b

Training in progress, step 198, checkpoint

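The diff below touches the last-checkpoint/ directory: what looks like a PEFT adapter (adapter_model.safetensors), the optimizer, scheduler and RNG state, and trainer_state.json. A minimal sketch for pulling the updated trainer state at exactly this revision — the repo id is a placeholder, since only the committer (jssky) and the commit hash (f6f154f) are visible on this page:

# Sketch only: REPO_ID is hypothetical; the actual model repository is not named on this page.
import json
from huggingface_hub import hf_hub_download

REPO_ID = "jssky/your-model-repo"  # placeholder, substitute the real repo id

state_path = hf_hub_download(
    repo_id=REPO_ID,
    filename="last-checkpoint/trainer_state.json",
    revision="f6f154f",  # the commit shown above (use the full SHA if short hashes are rejected)
)

with open(state_path) as f:
    state = json.load(f)

# After this commit the checkpoint reports step 198 at epoch 1.0.
print(state["global_step"], state["epoch"])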
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4fb3a1c034c512f3602bb7eb9746a25ce70d3c1c2463811bd29bb2e039a6ac96
+oid sha256:78caa2ecb707469c3b9436373b3cfd3d25d6860d81dfdd1fcff1dd3975613d82
 size 47724600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6a4ee21da39425c292a1b2d46b9c4ec0a440be08be6cebb00065c8e33bd4773e
+oid sha256:e1cd66b51dd9ce1e23fb67cd242e2eb29e6b5e6b2e592166f989f3c2e9e97255
 size 25331516
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3bbf00cc7d26b5ba1f1bfe59564ee6b340d81d2d6e92ca2595dc7bce3ba71015
+oid sha256:63afe6b53664eea8898016edb2f2324259424202f671640bd9eb38d5de37a2a7
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ae4f1bd750c09fc9bb727cae976f56e1bbe0dff5c4d4e1a6eec209a810ae59b2
+oid sha256:588b963689e2bc6a644ef6e066b36a07667462b36247fb966e7188944b9c91f2
 size 1064
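
Each pointer file above follows the git-lfs v1 spec: the oid sha256: line is the SHA-256 digest of the real binary and size is its length in bytes. A minimal check of a locally pulled file against its pointer (paths assume the checkpoint has been fetched, e.g. with git lfs pull):

# Verify a git-lfs pointer against the binary it stands for.
import hashlib
import os

def verify_lfs(binary_path: str, expected_oid: str, expected_size: int) -> bool:
    """Return True if the file's SHA-256 digest and byte size match the LFS pointer."""
    digest = hashlib.sha256()
    with open(binary_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid and os.path.getsize(binary_path) == expected_size

# Values taken from the new adapter_model.safetensors pointer in this commit.
print(verify_lfs(
    "last-checkpoint/adapter_model.safetensors",
    "78caa2ecb707469c3b9436373b3cfd3d25d6860d81dfdd1fcff1dd3975613d82",
    47724600,
))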
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.7575757575757576,
+  "epoch": 1.0,
   "eval_steps": 50,
-  "global_step": 150,
+  "global_step": 198,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1081,6 +1081,342 @@
       "eval_samples_per_second": 6.856,
       "eval_steps_per_second": 3.428,
       "step": 150
+    },
+    {
+      "epoch": 0.7626262626262627,
+      "grad_norm": 9.82388687133789,
+      "learning_rate": 2.9289321881345254e-05,
+      "loss": 0.302,
+      "step": 151
+    },
+    {
+      "epoch": 0.7676767676767676,
+      "grad_norm": 12.946221351623535,
+      "learning_rate": 2.8117631612207084e-05,
+      "loss": 0.4361,
+      "step": 152
+    },
+    {
+      "epoch": 0.7727272727272727,
+      "grad_norm": 9.764445304870605,
+      "learning_rate": 2.6966013605133088e-05,
+      "loss": 0.3317,
+      "step": 153
+    },
+    {
+      "epoch": 0.7777777777777778,
+      "grad_norm": 14.5607271194458,
+      "learning_rate": 2.5834789435204243e-05,
+      "loss": 0.3403,
+      "step": 154
+    },
+    {
+      "epoch": 0.7828282828282829,
+      "grad_norm": 11.619050979614258,
+      "learning_rate": 2.4724274982774865e-05,
+      "loss": 0.4053,
+      "step": 155
+    },
+    {
+      "epoch": 0.7878787878787878,
+      "grad_norm": 14.43159008026123,
+      "learning_rate": 2.3634780345266806e-05,
+      "loss": 0.3843,
+      "step": 156
+    },
+    {
+      "epoch": 0.7929292929292929,
+      "grad_norm": 15.145977020263672,
+      "learning_rate": 2.2566609750578673e-05,
+      "loss": 0.4364,
+      "step": 157
+    },
+    {
+      "epoch": 0.797979797979798,
+      "grad_norm": 11.833556175231934,
+      "learning_rate": 2.1520061472133902e-05,
+      "loss": 0.3154,
+      "step": 158
+    },
+    {
+      "epoch": 0.803030303030303,
+      "grad_norm": 12.835911750793457,
+      "learning_rate": 2.04954277455917e-05,
+      "loss": 0.3429,
+      "step": 159
+    },
+    {
+      "epoch": 0.8080808080808081,
+      "grad_norm": 11.492507934570312,
+      "learning_rate": 1.9492994687243714e-05,
+      "loss": 0.3757,
+      "step": 160
+    },
+    {
+      "epoch": 0.8131313131313131,
+      "grad_norm": 13.120503425598145,
+      "learning_rate": 1.851304221411967e-05,
+      "loss": 0.4127,
+      "step": 161
+    },
+    {
+      "epoch": 0.8181818181818182,
+      "grad_norm": 20.865995407104492,
+      "learning_rate": 1.7555843965823992e-05,
+      "loss": 0.4,
+      "step": 162
+    },
+    {
+      "epoch": 0.8232323232323232,
+      "grad_norm": 11.817997932434082,
+      "learning_rate": 1.6621667228125302e-05,
+      "loss": 0.4647,
+      "step": 163
+    },
+    {
+      "epoch": 0.8282828282828283,
+      "grad_norm": 10.320054054260254,
+      "learning_rate": 1.57107728583203e-05,
+      "loss": 0.3413,
+      "step": 164
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 11.233556747436523,
+      "learning_rate": 1.4823415212392377e-05,
+      "loss": 0.2859,
+      "step": 165
+    },
+    {
+      "epoch": 0.8383838383838383,
+      "grad_norm": 10.825108528137207,
+      "learning_rate": 1.3959842073986085e-05,
+      "loss": 0.2771,
+      "step": 166
+    },
+    {
+      "epoch": 0.8434343434343434,
+      "grad_norm": 11.109395027160645,
+      "learning_rate": 1.3120294585216353e-05,
+      "loss": 0.3611,
+      "step": 167
+    },
+    {
+      "epoch": 0.8484848484848485,
+      "grad_norm": 10.571539878845215,
+      "learning_rate": 1.230500717933285e-05,
+      "loss": 0.3888,
+      "step": 168
+    },
+    {
+      "epoch": 0.8535353535353535,
+      "grad_norm": 11.521129608154297,
+      "learning_rate": 1.1514207515257147e-05,
+      "loss": 0.3247,
+      "step": 169
+    },
+    {
+      "epoch": 0.8585858585858586,
+      "grad_norm": 11.530449867248535,
+      "learning_rate": 1.0748116414011888e-05,
+      "loss": 0.2623,
+      "step": 170
+    },
+    {
+      "epoch": 0.8636363636363636,
+      "grad_norm": 10.529777526855469,
+      "learning_rate": 1.0006947797059219e-05,
+      "loss": 0.3313,
+      "step": 171
+    },
+    {
+      "epoch": 0.8686868686868687,
+      "grad_norm": 9.755709648132324,
+      "learning_rate": 9.29090862656593e-06,
+      "loss": 0.3945,
+      "step": 172
+    },
+    {
+      "epoch": 0.8737373737373737,
+      "grad_norm": 10.26276969909668,
+      "learning_rate": 8.600198847611729e-06,
+      "loss": 0.2629,
+      "step": 173
+    },
+    {
+      "epoch": 0.8787878787878788,
+      "grad_norm": 18.38811492919922,
+      "learning_rate": 7.935011332357112e-06,
+      "loss": 0.408,
+      "step": 174
+    },
+    {
+      "epoch": 0.8838383838383839,
+      "grad_norm": 11.992220878601074,
+      "learning_rate": 7.295531826186264e-06,
+      "loss": 0.4615,
+      "step": 175
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 14.000274658203125,
+      "learning_rate": 6.681938895839746e-06,
+      "loss": 0.3583,
+      "step": 176
+    },
+    {
+      "epoch": 0.8939393939393939,
+      "grad_norm": 11.123028755187988,
+      "learning_rate": 6.094403879552213e-06,
+      "loss": 0.2686,
+      "step": 177
+    },
+    {
+      "epoch": 0.898989898989899,
+      "grad_norm": 11.463796615600586,
+      "learning_rate": 5.533090839208133e-06,
+      "loss": 0.3694,
+      "step": 178
+    },
+    {
+      "epoch": 0.9040404040404041,
+      "grad_norm": 14.426941871643066,
+      "learning_rate": 4.998156514529595e-06,
+      "loss": 0.4453,
+      "step": 179
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 11.056812286376953,
+      "learning_rate": 4.489750279308757e-06,
+      "loss": 0.3627,
+      "step": 180
+    },
+    {
+      "epoch": 0.9141414141414141,
+      "grad_norm": 15.307928085327148,
+      "learning_rate": 4.008014099696922e-06,
+      "loss": 0.5187,
+      "step": 181
+    },
+    {
+      "epoch": 0.9191919191919192,
+      "grad_norm": 16.49620246887207,
+      "learning_rate": 3.5530824945623542e-06,
+      "loss": 0.4363,
+      "step": 182
+    },
+    {
+      "epoch": 0.9242424242424242,
+      "grad_norm": 14.4350004196167,
+      "learning_rate": 3.1250824979274675e-06,
+      "loss": 0.422,
+      "step": 183
+    },
+    {
+      "epoch": 0.9292929292929293,
+      "grad_norm": 8.324783325195312,
+      "learning_rate": 2.7241336234962944e-06,
+      "loss": 0.3985,
+      "step": 184
+    },
+    {
+      "epoch": 0.9343434343434344,
+      "grad_norm": 11.766496658325195,
+      "learning_rate": 2.3503478312815298e-06,
+      "loss": 0.3377,
+      "step": 185
+    },
+    {
+      "epoch": 0.9393939393939394,
+      "grad_norm": 15.193984031677246,
+      "learning_rate": 2.003829496341325e-06,
+      "loss": 0.3802,
+      "step": 186
+    },
+    {
+      "epoch": 0.9444444444444444,
+      "grad_norm": 20.59272003173828,
+      "learning_rate": 1.684675379633649e-06,
+      "loss": 0.4727,
+      "step": 187
+    },
+    {
+      "epoch": 0.9494949494949495,
+      "grad_norm": 10.777263641357422,
+      "learning_rate": 1.3929746009971433e-06,
+      "loss": 0.3044,
+      "step": 188
+    },
+    {
+      "epoch": 0.9545454545454546,
+      "grad_norm": 20.804996490478516,
+      "learning_rate": 1.1288086142653864e-06,
+      "loss": 0.3692,
+      "step": 189
+    },
+    {
+      "epoch": 0.9595959595959596,
+      "grad_norm": 19.473365783691406,
+      "learning_rate": 8.922511845219971e-07,
+      "loss": 0.321,
+      "step": 190
+    },
+    {
+      "epoch": 0.9646464646464646,
+      "grad_norm": 10.078756332397461,
+      "learning_rate": 6.833683675025904e-07,
+      "loss": 0.3689,
+      "step": 191
+    },
+    {
+      "epoch": 0.9696969696969697,
+      "grad_norm": 10.66713809967041,
+      "learning_rate": 5.022184911495864e-07,
+      "loss": 0.2831,
+      "step": 192
+    },
+    {
+      "epoch": 0.9747474747474747,
+      "grad_norm": 13.519158363342285,
+      "learning_rate": 3.488521393248401e-07,
+      "loss": 0.3693,
+      "step": 193
+    },
+    {
+      "epoch": 0.9797979797979798,
+      "grad_norm": 18.351518630981445,
+      "learning_rate": 2.2331213768468363e-07,
+      "loss": 0.5692,
+      "step": 194
+    },
+    {
+      "epoch": 0.9848484848484849,
+      "grad_norm": 20.144264221191406,
+      "learning_rate": 1.2563354172142606e-07,
+      "loss": 0.4896,
+      "step": 195
+    },
+    {
+      "epoch": 0.98989898989899,
+      "grad_norm": 21.47068977355957,
+      "learning_rate": 5.584362697453882e-08,
+      "loss": 0.4884,
+      "step": 196
+    },
+    {
+      "epoch": 0.9949494949494949,
+      "grad_norm": 14.894364356994629,
+      "learning_rate": 1.3961881414292778e-08,
+      "loss": 0.4101,
+      "step": 197
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 20.294527053833008,
+      "learning_rate": 0.0,
+      "loss": 0.5037,
+      "step": 198
     }
   ],
   "logging_steps": 1,
@@ -1095,12 +1431,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 2.13169452613632e+16,
+  "total_flos": 2.812060362394829e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null