delarosajav95 committed on
Commit
7ea357f
·
verified ·
1 Parent(s): 5acea40

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoint-10412/config.json +39 -0
  2. checkpoint-10412/merges.txt +0 -0
  3. checkpoint-10412/model.safetensors +3 -0
  4. checkpoint-10412/rng_state.pth +3 -0
  5. checkpoint-10412/scheduler.pt +3 -0
  6. checkpoint-10412/special_tokens_map.json +51 -0
  7. checkpoint-10412/tokenizer.json +0 -0
  8. checkpoint-10412/tokenizer_config.json +58 -0
  9. checkpoint-10412/trainer_state.json +0 -0
  10. checkpoint-10412/training_args.bin +3 -0
  11. checkpoint-10412/vocab.json +0 -0
  12. checkpoint-2603/config.json +39 -0
  13. checkpoint-2603/merges.txt +0 -0
  14. checkpoint-2603/model.safetensors +3 -0
  15. checkpoint-2603/optimizer.pt +3 -0
  16. checkpoint-2603/rng_state.pth +3 -0
  17. checkpoint-2603/scheduler.pt +3 -0
  18. checkpoint-2603/special_tokens_map.json +51 -0
  19. checkpoint-2603/tokenizer.json +0 -0
  20. checkpoint-2603/tokenizer_config.json +58 -0
  21. checkpoint-2603/trainer_state.json +1889 -0
  22. checkpoint-2603/training_args.bin +3 -0
  23. checkpoint-2603/vocab.json +0 -0
  24. checkpoint-5206/config.json +39 -0
  25. checkpoint-5206/merges.txt +0 -0
  26. checkpoint-5206/model.safetensors +3 -0
  27. checkpoint-5206/optimizer.pt +3 -0
  28. checkpoint-5206/rng_state.pth +3 -0
  29. checkpoint-5206/scheduler.pt +3 -0
  30. checkpoint-5206/special_tokens_map.json +51 -0
  31. checkpoint-5206/tokenizer.json +0 -0
  32. checkpoint-5206/tokenizer_config.json +58 -0
  33. checkpoint-5206/trainer_state.json +3736 -0
  34. checkpoint-5206/training_args.bin +3 -0
  35. checkpoint-5206/vocab.json +0 -0
  36. checkpoint-7809/config.json +39 -0
  37. checkpoint-7809/merges.txt +0 -0
  38. checkpoint-7809/model.safetensors +3 -0
  39. checkpoint-7809/optimizer.pt +3 -0
  40. checkpoint-7809/rng_state.pth +3 -0
  41. checkpoint-7809/scheduler.pt +3 -0
  42. checkpoint-7809/special_tokens_map.json +51 -0
  43. checkpoint-7809/tokenizer.json +0 -0
  44. checkpoint-7809/tokenizer_config.json +58 -0
  45. checkpoint-7809/trainer_state.json +0 -0
  46. checkpoint-7809/training_args.bin +3 -0
  47. checkpoint-7809/vocab.json +0 -0
  48. config.json +39 -0
  49. events.out.tfevents.1735229724.0be04a97d1bd.376.0 +3 -0
  50. merges.txt +0 -0
checkpoint-10412/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "LABEL_0",
16
+ "1": "LABEL_1",
17
+ "2": "LABEL_2"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 3072,
21
+ "label2id": {
22
+ "LABEL_0": 0,
23
+ "LABEL_1": 1,
24
+ "LABEL_2": 2
25
+ },
26
+ "layer_norm_eps": 1e-05,
27
+ "max_position_embeddings": 514,
28
+ "model_type": "roberta",
29
+ "num_attention_heads": 12,
30
+ "num_hidden_layers": 12,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "problem_type": "single_label_classification",
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.47.1",
36
+ "type_vocab_size": 1,
37
+ "use_cache": true,
38
+ "vocab_size": 50265
39
+ }
checkpoint-10412/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10412/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da3225b5ef5be3282380193f7ee4c8e74fe130e395e3c11ee904a71f180bf745
3
+ size 498615900
checkpoint-10412/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22b21e3f3b37817700fe1c22335b3435eea4b7c75312bd789258d4c14eef9d4d
3
+ size 14244
checkpoint-10412/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4481cb4e940681df89103ca351322e97f1c9ab46880b69c383d8b915ea323cc
3
+ size 1064
checkpoint-10412/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-10412/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10412/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 1000000000000000019884624838656,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
checkpoint-10412/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10412/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8dba586531be0fd60261c0b3a7e765f40602a7ad1431e5af47acc67f5d677d1
3
+ size 5368
checkpoint-10412/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2603/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "LABEL_0",
16
+ "1": "LABEL_1",
17
+ "2": "LABEL_2"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 3072,
21
+ "label2id": {
22
+ "LABEL_0": 0,
23
+ "LABEL_1": 1,
24
+ "LABEL_2": 2
25
+ },
26
+ "layer_norm_eps": 1e-05,
27
+ "max_position_embeddings": 514,
28
+ "model_type": "roberta",
29
+ "num_attention_heads": 12,
30
+ "num_hidden_layers": 12,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "problem_type": "single_label_classification",
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.47.1",
36
+ "type_vocab_size": 1,
37
+ "use_cache": true,
38
+ "vocab_size": 50265
39
+ }
checkpoint-2603/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2603/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d664c2d6d49d4469b928e966b67bb74723a29cdf527c431d426ac3e201d88343
3
+ size 498615900
checkpoint-2603/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34b78cc6f777ab84c54bd6d47c9161e2e8447c10fa7abc9125736248ea8bd7b8
3
+ size 997351674
checkpoint-2603/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90e799908dfe0d62c119bf5adce0198c751cd4718952c3207619d14a91e5cdbb
3
+ size 14244
checkpoint-2603/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eea724afd1e0e16412f6c05ae3c6f86bda721660c29d67e44b8869eea748c699
3
+ size 1064
checkpoint-2603/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-2603/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2603/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 1000000000000000019884624838656,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
checkpoint-2603/trainer_state.json ADDED
@@ -0,0 +1,1889 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5626052618026733,
3
+ "best_model_checkpoint": "/content/drive/MyDrive/tw-roberta-base-sentiment-FT-v2/checkpoint-2603",
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 2603,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.00384172109104879,
13
+ "grad_norm": 19.648235321044922,
14
+ "learning_rate": 1.2195121951219514e-07,
15
+ "loss": 0.7091,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.00768344218209758,
20
+ "grad_norm": 15.138011932373047,
21
+ "learning_rate": 2.439024390243903e-07,
22
+ "loss": 0.5428,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.01152516327314637,
27
+ "grad_norm": 15.91871452331543,
28
+ "learning_rate": 3.6585365853658536e-07,
29
+ "loss": 0.709,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.01536688436419516,
34
+ "grad_norm": 11.673059463500977,
35
+ "learning_rate": 4.878048780487805e-07,
36
+ "loss": 0.6269,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.01920860545524395,
41
+ "grad_norm": 10.827598571777344,
42
+ "learning_rate": 6.097560975609757e-07,
43
+ "loss": 0.6899,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.02305032654629274,
48
+ "grad_norm": 29.76123809814453,
49
+ "learning_rate": 7.317073170731707e-07,
50
+ "loss": 0.7235,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.02689204763734153,
55
+ "grad_norm": 16.770448684692383,
56
+ "learning_rate": 8.53658536585366e-07,
57
+ "loss": 0.731,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.03073376872839032,
62
+ "grad_norm": 17.501832962036133,
63
+ "learning_rate": 9.75609756097561e-07,
64
+ "loss": 0.7907,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.03457548981943911,
69
+ "grad_norm": 5.903749465942383,
70
+ "learning_rate": 1.0975609756097562e-06,
71
+ "loss": 0.6549,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.0384172109104879,
76
+ "grad_norm": 12.823253631591797,
77
+ "learning_rate": 1.2195121951219514e-06,
78
+ "loss": 0.6833,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.042258932001536686,
83
+ "grad_norm": 14.516304969787598,
84
+ "learning_rate": 1.3414634146341465e-06,
85
+ "loss": 0.7764,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.04610065309258548,
90
+ "grad_norm": 17.775850296020508,
91
+ "learning_rate": 1.4634146341463414e-06,
92
+ "loss": 0.6677,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.049942374183634265,
97
+ "grad_norm": 16.348901748657227,
98
+ "learning_rate": 1.5853658536585368e-06,
99
+ "loss": 0.6708,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.05378409527468306,
104
+ "grad_norm": 20.729867935180664,
105
+ "learning_rate": 1.707317073170732e-06,
106
+ "loss": 0.685,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.05762581636573185,
111
+ "grad_norm": 23.60687255859375,
112
+ "learning_rate": 1.8292682926829268e-06,
113
+ "loss": 0.7048,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.06146753745678064,
118
+ "grad_norm": 24.990461349487305,
119
+ "learning_rate": 1.951219512195122e-06,
120
+ "loss": 0.6849,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.06530925854782943,
125
+ "grad_norm": 21.754741668701172,
126
+ "learning_rate": 2.073170731707317e-06,
127
+ "loss": 0.6801,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.06915097963887822,
132
+ "grad_norm": 19.427799224853516,
133
+ "learning_rate": 2.1951219512195125e-06,
134
+ "loss": 0.6006,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.072992700729927,
139
+ "grad_norm": 11.91465950012207,
140
+ "learning_rate": 2.317073170731708e-06,
141
+ "loss": 0.6498,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.0768344218209758,
146
+ "grad_norm": 24.343521118164062,
147
+ "learning_rate": 2.4390243902439027e-06,
148
+ "loss": 0.8272,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.08067614291202459,
153
+ "grad_norm": 11.623435020446777,
154
+ "learning_rate": 2.5609756097560977e-06,
155
+ "loss": 0.6703,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.08451786400307337,
160
+ "grad_norm": 8.984451293945312,
161
+ "learning_rate": 2.682926829268293e-06,
162
+ "loss": 0.6965,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.08835958509412217,
167
+ "grad_norm": 26.78940773010254,
168
+ "learning_rate": 2.8048780487804884e-06,
169
+ "loss": 0.7135,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.09220130618517096,
174
+ "grad_norm": 17.50589370727539,
175
+ "learning_rate": 2.926829268292683e-06,
176
+ "loss": 0.6637,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.09604302727621974,
181
+ "grad_norm": 13.149524688720703,
182
+ "learning_rate": 3.0487804878048782e-06,
183
+ "loss": 0.5756,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.09988474836726853,
188
+ "grad_norm": 10.727895736694336,
189
+ "learning_rate": 3.1707317073170736e-06,
190
+ "loss": 0.594,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.10372646945831733,
195
+ "grad_norm": 15.031158447265625,
196
+ "learning_rate": 3.292682926829269e-06,
197
+ "loss": 0.7186,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.10756819054936612,
202
+ "grad_norm": 10.520498275756836,
203
+ "learning_rate": 3.414634146341464e-06,
204
+ "loss": 0.5746,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.1114099116404149,
209
+ "grad_norm": 19.504133224487305,
210
+ "learning_rate": 3.5365853658536588e-06,
211
+ "loss": 0.7709,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.1152516327314637,
216
+ "grad_norm": 12.495412826538086,
217
+ "learning_rate": 3.6585365853658537e-06,
218
+ "loss": 0.6508,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.11909335382251249,
223
+ "grad_norm": 9.115503311157227,
224
+ "learning_rate": 3.780487804878049e-06,
225
+ "loss": 0.5847,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.12293507491356127,
230
+ "grad_norm": 12.583561897277832,
231
+ "learning_rate": 3.902439024390244e-06,
232
+ "loss": 0.6114,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.12677679600461006,
237
+ "grad_norm": 16.474716186523438,
238
+ "learning_rate": 4.024390243902439e-06,
239
+ "loss": 0.5819,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.13061851709565886,
244
+ "grad_norm": 12.282912254333496,
245
+ "learning_rate": 4.146341463414634e-06,
246
+ "loss": 0.5974,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.13446023818670763,
251
+ "grad_norm": 14.967294692993164,
252
+ "learning_rate": 4.268292682926829e-06,
253
+ "loss": 0.515,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.13830195927775643,
258
+ "grad_norm": 22.999889373779297,
259
+ "learning_rate": 4.390243902439025e-06,
260
+ "loss": 0.8314,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.14214368036880523,
265
+ "grad_norm": 13.173623085021973,
266
+ "learning_rate": 4.51219512195122e-06,
267
+ "loss": 0.7274,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.145985401459854,
272
+ "grad_norm": 10.259631156921387,
273
+ "learning_rate": 4.634146341463416e-06,
274
+ "loss": 0.5758,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.1498271225509028,
279
+ "grad_norm": 20.109394073486328,
280
+ "learning_rate": 4.75609756097561e-06,
281
+ "loss": 0.6767,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 0.1536688436419516,
286
+ "grad_norm": 12.53744125366211,
287
+ "learning_rate": 4.8780487804878055e-06,
288
+ "loss": 0.6554,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 0.15751056473300037,
293
+ "grad_norm": 32.90926742553711,
294
+ "learning_rate": 5e-06,
295
+ "loss": 0.751,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 0.16135228582404917,
300
+ "grad_norm": 13.864742279052734,
301
+ "learning_rate": 5.121951219512195e-06,
302
+ "loss": 0.5644,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 0.16519400691509797,
307
+ "grad_norm": 16.864086151123047,
308
+ "learning_rate": 5.243902439024391e-06,
309
+ "loss": 0.7212,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 0.16903572800614675,
314
+ "grad_norm": 8.035212516784668,
315
+ "learning_rate": 5.365853658536586e-06,
316
+ "loss": 0.6411,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 0.17287744909719555,
321
+ "grad_norm": 15.027800559997559,
322
+ "learning_rate": 5.487804878048781e-06,
323
+ "loss": 0.7331,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 0.17671917018824435,
328
+ "grad_norm": 9.771449089050293,
329
+ "learning_rate": 5.609756097560977e-06,
330
+ "loss": 0.6245,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 0.18056089127929312,
335
+ "grad_norm": 7.735960960388184,
336
+ "learning_rate": 5.731707317073171e-06,
337
+ "loss": 0.5251,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 0.18440261237034192,
342
+ "grad_norm": 7.348488807678223,
343
+ "learning_rate": 5.853658536585366e-06,
344
+ "loss": 0.5108,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 0.1882443334613907,
349
+ "grad_norm": 30.567018508911133,
350
+ "learning_rate": 5.9756097560975615e-06,
351
+ "loss": 0.5508,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 0.1920860545524395,
356
+ "grad_norm": 19.212230682373047,
357
+ "learning_rate": 6.0975609756097564e-06,
358
+ "loss": 0.5997,
359
+ "step": 500
360
+ },
361
+ {
362
+ "epoch": 0.1959277756434883,
363
+ "grad_norm": 17.70198631286621,
364
+ "learning_rate": 6.219512195121951e-06,
365
+ "loss": 0.8108,
366
+ "step": 510
367
+ },
368
+ {
369
+ "epoch": 0.19976949673453706,
370
+ "grad_norm": 13.471924781799316,
371
+ "learning_rate": 6.341463414634147e-06,
372
+ "loss": 0.6738,
373
+ "step": 520
374
+ },
375
+ {
376
+ "epoch": 0.20361121782558586,
377
+ "grad_norm": 15.494725227355957,
378
+ "learning_rate": 6.463414634146342e-06,
379
+ "loss": 0.8114,
380
+ "step": 530
381
+ },
382
+ {
383
+ "epoch": 0.20745293891663466,
384
+ "grad_norm": 9.314640045166016,
385
+ "learning_rate": 6.585365853658538e-06,
386
+ "loss": 0.5791,
387
+ "step": 540
388
+ },
389
+ {
390
+ "epoch": 0.21129466000768343,
391
+ "grad_norm": 11.314807891845703,
392
+ "learning_rate": 6.707317073170733e-06,
393
+ "loss": 0.6536,
394
+ "step": 550
395
+ },
396
+ {
397
+ "epoch": 0.21513638109873223,
398
+ "grad_norm": 16.462522506713867,
399
+ "learning_rate": 6.829268292682928e-06,
400
+ "loss": 0.5598,
401
+ "step": 560
402
+ },
403
+ {
404
+ "epoch": 0.21897810218978103,
405
+ "grad_norm": 36.564476013183594,
406
+ "learning_rate": 6.951219512195122e-06,
407
+ "loss": 0.5573,
408
+ "step": 570
409
+ },
410
+ {
411
+ "epoch": 0.2228198232808298,
412
+ "grad_norm": 16.07019805908203,
413
+ "learning_rate": 7.0731707317073175e-06,
414
+ "loss": 0.5328,
415
+ "step": 580
416
+ },
417
+ {
418
+ "epoch": 0.2266615443718786,
419
+ "grad_norm": 6.132660388946533,
420
+ "learning_rate": 7.1951219512195125e-06,
421
+ "loss": 0.4499,
422
+ "step": 590
423
+ },
424
+ {
425
+ "epoch": 0.2305032654629274,
426
+ "grad_norm": 21.4416561126709,
427
+ "learning_rate": 7.317073170731707e-06,
428
+ "loss": 0.6584,
429
+ "step": 600
430
+ },
431
+ {
432
+ "epoch": 0.23434498655397618,
433
+ "grad_norm": 18.0037841796875,
434
+ "learning_rate": 7.439024390243903e-06,
435
+ "loss": 0.4844,
436
+ "step": 610
437
+ },
438
+ {
439
+ "epoch": 0.23818670764502498,
440
+ "grad_norm": 17.625553131103516,
441
+ "learning_rate": 7.560975609756098e-06,
442
+ "loss": 0.5122,
443
+ "step": 620
444
+ },
445
+ {
446
+ "epoch": 0.24202842873607375,
447
+ "grad_norm": 10.607305526733398,
448
+ "learning_rate": 7.682926829268293e-06,
449
+ "loss": 0.5188,
450
+ "step": 630
451
+ },
452
+ {
453
+ "epoch": 0.24587014982712255,
454
+ "grad_norm": 26.88294792175293,
455
+ "learning_rate": 7.804878048780489e-06,
456
+ "loss": 0.7002,
457
+ "step": 640
458
+ },
459
+ {
460
+ "epoch": 0.24971187091817135,
461
+ "grad_norm": 38.178287506103516,
462
+ "learning_rate": 7.926829268292685e-06,
463
+ "loss": 0.9407,
464
+ "step": 650
465
+ },
466
+ {
467
+ "epoch": 0.2535535920092201,
468
+ "grad_norm": 8.800226211547852,
469
+ "learning_rate": 8.048780487804879e-06,
470
+ "loss": 0.4968,
471
+ "step": 660
472
+ },
473
+ {
474
+ "epoch": 0.2573953131002689,
475
+ "grad_norm": 30.46478843688965,
476
+ "learning_rate": 8.170731707317073e-06,
477
+ "loss": 0.5293,
478
+ "step": 670
479
+ },
480
+ {
481
+ "epoch": 0.2612370341913177,
482
+ "grad_norm": 20.630985260009766,
483
+ "learning_rate": 8.292682926829268e-06,
484
+ "loss": 0.5382,
485
+ "step": 680
486
+ },
487
+ {
488
+ "epoch": 0.2650787552823665,
489
+ "grad_norm": 19.19484519958496,
490
+ "learning_rate": 8.414634146341464e-06,
491
+ "loss": 0.6311,
492
+ "step": 690
493
+ },
494
+ {
495
+ "epoch": 0.26892047637341526,
496
+ "grad_norm": 26.084064483642578,
497
+ "learning_rate": 8.536585365853658e-06,
498
+ "loss": 0.6893,
499
+ "step": 700
500
+ },
501
+ {
502
+ "epoch": 0.27276219746446406,
503
+ "grad_norm": 11.942285537719727,
504
+ "learning_rate": 8.658536585365854e-06,
505
+ "loss": 0.5938,
506
+ "step": 710
507
+ },
508
+ {
509
+ "epoch": 0.27660391855551286,
510
+ "grad_norm": 25.995960235595703,
511
+ "learning_rate": 8.78048780487805e-06,
512
+ "loss": 0.5693,
513
+ "step": 720
514
+ },
515
+ {
516
+ "epoch": 0.28044563964656166,
517
+ "grad_norm": 10.440145492553711,
518
+ "learning_rate": 8.902439024390244e-06,
519
+ "loss": 0.5937,
520
+ "step": 730
521
+ },
522
+ {
523
+ "epoch": 0.28428736073761046,
524
+ "grad_norm": 21.39019012451172,
525
+ "learning_rate": 9.02439024390244e-06,
526
+ "loss": 0.664,
527
+ "step": 740
528
+ },
529
+ {
530
+ "epoch": 0.28812908182865926,
531
+ "grad_norm": 11.281585693359375,
532
+ "learning_rate": 9.146341463414635e-06,
533
+ "loss": 0.5178,
534
+ "step": 750
535
+ },
536
+ {
537
+ "epoch": 0.291970802919708,
538
+ "grad_norm": 13.577981948852539,
539
+ "learning_rate": 9.268292682926831e-06,
540
+ "loss": 0.5405,
541
+ "step": 760
542
+ },
543
+ {
544
+ "epoch": 0.2958125240107568,
545
+ "grad_norm": 55.213340759277344,
546
+ "learning_rate": 9.390243902439025e-06,
547
+ "loss": 0.5633,
548
+ "step": 770
549
+ },
550
+ {
551
+ "epoch": 0.2996542451018056,
552
+ "grad_norm": 19.74529266357422,
553
+ "learning_rate": 9.51219512195122e-06,
554
+ "loss": 0.7541,
555
+ "step": 780
556
+ },
557
+ {
558
+ "epoch": 0.3034959661928544,
559
+ "grad_norm": 45.70171356201172,
560
+ "learning_rate": 9.634146341463415e-06,
561
+ "loss": 0.6196,
562
+ "step": 790
563
+ },
564
+ {
565
+ "epoch": 0.3073376872839032,
566
+ "grad_norm": 23.01153564453125,
567
+ "learning_rate": 9.756097560975611e-06,
568
+ "loss": 0.7121,
569
+ "step": 800
570
+ },
571
+ {
572
+ "epoch": 0.311179408374952,
573
+ "grad_norm": 17.099529266357422,
574
+ "learning_rate": 9.878048780487805e-06,
575
+ "loss": 0.6021,
576
+ "step": 810
577
+ },
578
+ {
579
+ "epoch": 0.31502112946600075,
580
+ "grad_norm": 11.891583442687988,
581
+ "learning_rate": 1e-05,
582
+ "loss": 0.6071,
583
+ "step": 820
584
+ },
585
+ {
586
+ "epoch": 0.31886285055704955,
587
+ "grad_norm": 16.423572540283203,
588
+ "learning_rate": 9.98957464553795e-06,
589
+ "loss": 0.539,
590
+ "step": 830
591
+ },
592
+ {
593
+ "epoch": 0.32270457164809835,
594
+ "grad_norm": 11.026520729064941,
595
+ "learning_rate": 9.979149291075898e-06,
596
+ "loss": 0.615,
597
+ "step": 840
598
+ },
599
+ {
600
+ "epoch": 0.32654629273914715,
601
+ "grad_norm": 25.40671730041504,
602
+ "learning_rate": 9.968723936613845e-06,
603
+ "loss": 0.4819,
604
+ "step": 850
605
+ },
606
+ {
607
+ "epoch": 0.33038801383019595,
608
+ "grad_norm": 30.425880432128906,
609
+ "learning_rate": 9.958298582151794e-06,
610
+ "loss": 0.7362,
611
+ "step": 860
612
+ },
613
+ {
614
+ "epoch": 0.3342297349212447,
615
+ "grad_norm": 21.710176467895508,
616
+ "learning_rate": 9.947873227689742e-06,
617
+ "loss": 0.5052,
618
+ "step": 870
619
+ },
620
+ {
621
+ "epoch": 0.3380714560122935,
622
+ "grad_norm": 13.537766456604004,
623
+ "learning_rate": 9.93744787322769e-06,
624
+ "loss": 0.6032,
625
+ "step": 880
626
+ },
627
+ {
628
+ "epoch": 0.3419131771033423,
629
+ "grad_norm": 19.592073440551758,
630
+ "learning_rate": 9.927022518765639e-06,
631
+ "loss": 0.5222,
632
+ "step": 890
633
+ },
634
+ {
635
+ "epoch": 0.3457548981943911,
636
+ "grad_norm": 10.528463363647461,
637
+ "learning_rate": 9.916597164303588e-06,
638
+ "loss": 0.596,
639
+ "step": 900
640
+ },
641
+ {
642
+ "epoch": 0.3495966192854399,
643
+ "grad_norm": 15.643308639526367,
644
+ "learning_rate": 9.906171809841536e-06,
645
+ "loss": 0.7414,
646
+ "step": 910
647
+ },
648
+ {
649
+ "epoch": 0.3534383403764887,
650
+ "grad_norm": 22.77689552307129,
651
+ "learning_rate": 9.895746455379483e-06,
652
+ "loss": 0.5151,
653
+ "step": 920
654
+ },
655
+ {
656
+ "epoch": 0.35728006146753744,
657
+ "grad_norm": 23.217538833618164,
658
+ "learning_rate": 9.885321100917432e-06,
659
+ "loss": 0.7501,
660
+ "step": 930
661
+ },
662
+ {
663
+ "epoch": 0.36112178255858624,
664
+ "grad_norm": 21.08719825744629,
665
+ "learning_rate": 9.874895746455382e-06,
666
+ "loss": 0.4944,
667
+ "step": 940
668
+ },
669
+ {
670
+ "epoch": 0.36496350364963503,
671
+ "grad_norm": 19.93057632446289,
672
+ "learning_rate": 9.864470391993327e-06,
673
+ "loss": 0.7289,
674
+ "step": 950
675
+ },
676
+ {
677
+ "epoch": 0.36880522474068383,
678
+ "grad_norm": 14.810643196105957,
679
+ "learning_rate": 9.854045037531277e-06,
680
+ "loss": 0.4956,
681
+ "step": 960
682
+ },
683
+ {
684
+ "epoch": 0.37264694583173263,
685
+ "grad_norm": 19.596710205078125,
686
+ "learning_rate": 9.843619683069226e-06,
687
+ "loss": 0.5528,
688
+ "step": 970
689
+ },
690
+ {
691
+ "epoch": 0.3764886669227814,
692
+ "grad_norm": 23.963172912597656,
693
+ "learning_rate": 9.833194328607173e-06,
694
+ "loss": 0.5904,
695
+ "step": 980
696
+ },
697
+ {
698
+ "epoch": 0.3803303880138302,
699
+ "grad_norm": 14.250468254089355,
700
+ "learning_rate": 9.822768974145121e-06,
701
+ "loss": 0.6762,
702
+ "step": 990
703
+ },
704
+ {
705
+ "epoch": 0.384172109104879,
706
+ "grad_norm": 30.411888122558594,
707
+ "learning_rate": 9.81234361968307e-06,
708
+ "loss": 0.6031,
709
+ "step": 1000
710
+ },
711
+ {
712
+ "epoch": 0.3880138301959278,
713
+ "grad_norm": 4.892022132873535,
714
+ "learning_rate": 9.80191826522102e-06,
715
+ "loss": 0.5832,
716
+ "step": 1010
717
+ },
718
+ {
719
+ "epoch": 0.3918555512869766,
720
+ "grad_norm": 14.026780128479004,
721
+ "learning_rate": 9.791492910758967e-06,
722
+ "loss": 0.4432,
723
+ "step": 1020
724
+ },
725
+ {
726
+ "epoch": 0.3956972723780254,
727
+ "grad_norm": 15.22079086303711,
728
+ "learning_rate": 9.781067556296915e-06,
729
+ "loss": 0.5003,
730
+ "step": 1030
731
+ },
732
+ {
733
+ "epoch": 0.3995389934690741,
734
+ "grad_norm": 38.46358871459961,
735
+ "learning_rate": 9.770642201834864e-06,
736
+ "loss": 0.6863,
737
+ "step": 1040
738
+ },
739
+ {
740
+ "epoch": 0.4033807145601229,
741
+ "grad_norm": 28.509458541870117,
742
+ "learning_rate": 9.760216847372811e-06,
743
+ "loss": 0.6557,
744
+ "step": 1050
745
+ },
746
+ {
747
+ "epoch": 0.4072224356511717,
748
+ "grad_norm": 10.18283748626709,
749
+ "learning_rate": 9.749791492910759e-06,
750
+ "loss": 0.475,
751
+ "step": 1060
752
+ },
753
+ {
754
+ "epoch": 0.4110641567422205,
755
+ "grad_norm": 16.280475616455078,
756
+ "learning_rate": 9.739366138448708e-06,
757
+ "loss": 0.5423,
758
+ "step": 1070
759
+ },
760
+ {
761
+ "epoch": 0.4149058778332693,
762
+ "grad_norm": 27.256633758544922,
763
+ "learning_rate": 9.728940783986657e-06,
764
+ "loss": 0.4638,
765
+ "step": 1080
766
+ },
767
+ {
768
+ "epoch": 0.4187475989243181,
769
+ "grad_norm": 26.906049728393555,
770
+ "learning_rate": 9.718515429524605e-06,
771
+ "loss": 0.6528,
772
+ "step": 1090
773
+ },
774
+ {
775
+ "epoch": 0.42258932001536686,
776
+ "grad_norm": 10.475133895874023,
777
+ "learning_rate": 9.708090075062552e-06,
778
+ "loss": 0.8387,
779
+ "step": 1100
780
+ },
781
+ {
782
+ "epoch": 0.42643104110641566,
783
+ "grad_norm": 14.576977729797363,
784
+ "learning_rate": 9.697664720600502e-06,
785
+ "loss": 0.6325,
786
+ "step": 1110
787
+ },
788
+ {
789
+ "epoch": 0.43027276219746446,
790
+ "grad_norm": 17.823413848876953,
791
+ "learning_rate": 9.68723936613845e-06,
792
+ "loss": 0.613,
793
+ "step": 1120
794
+ },
795
+ {
796
+ "epoch": 0.43411448328851326,
797
+ "grad_norm": 12.499800682067871,
798
+ "learning_rate": 9.676814011676397e-06,
799
+ "loss": 0.5685,
800
+ "step": 1130
801
+ },
802
+ {
803
+ "epoch": 0.43795620437956206,
804
+ "grad_norm": 19.12653923034668,
805
+ "learning_rate": 9.666388657214346e-06,
806
+ "loss": 0.8678,
807
+ "step": 1140
808
+ },
809
+ {
810
+ "epoch": 0.4417979254706108,
811
+ "grad_norm": 5.942495822906494,
812
+ "learning_rate": 9.655963302752295e-06,
813
+ "loss": 0.4594,
814
+ "step": 1150
815
+ },
816
+ {
817
+ "epoch": 0.4456396465616596,
818
+ "grad_norm": 19.233552932739258,
819
+ "learning_rate": 9.645537948290243e-06,
820
+ "loss": 0.5747,
821
+ "step": 1160
822
+ },
823
+ {
824
+ "epoch": 0.4494813676527084,
825
+ "grad_norm": 17.434133529663086,
826
+ "learning_rate": 9.63511259382819e-06,
827
+ "loss": 0.5121,
828
+ "step": 1170
829
+ },
830
+ {
831
+ "epoch": 0.4533230887437572,
832
+ "grad_norm": 14.78231143951416,
833
+ "learning_rate": 9.62468723936614e-06,
834
+ "loss": 0.7225,
835
+ "step": 1180
836
+ },
837
+ {
838
+ "epoch": 0.457164809834806,
839
+ "grad_norm": 23.81663703918457,
840
+ "learning_rate": 9.614261884904089e-06,
841
+ "loss": 0.8527,
842
+ "step": 1190
843
+ },
844
+ {
845
+ "epoch": 0.4610065309258548,
846
+ "grad_norm": 18.266740798950195,
847
+ "learning_rate": 9.603836530442035e-06,
848
+ "loss": 0.4101,
849
+ "step": 1200
850
+ },
851
+ {
852
+ "epoch": 0.46484825201690355,
853
+ "grad_norm": 23.31222152709961,
854
+ "learning_rate": 9.593411175979984e-06,
855
+ "loss": 0.5847,
856
+ "step": 1210
857
+ },
858
+ {
859
+ "epoch": 0.46868997310795235,
860
+ "grad_norm": 11.039971351623535,
861
+ "learning_rate": 9.582985821517933e-06,
862
+ "loss": 0.6515,
863
+ "step": 1220
864
+ },
865
+ {
866
+ "epoch": 0.47253169419900115,
867
+ "grad_norm": 27.05122184753418,
868
+ "learning_rate": 9.57256046705588e-06,
869
+ "loss": 0.5968,
870
+ "step": 1230
871
+ },
872
+ {
873
+ "epoch": 0.47637341529004995,
874
+ "grad_norm": 18.416839599609375,
875
+ "learning_rate": 9.562135112593828e-06,
876
+ "loss": 0.6339,
877
+ "step": 1240
878
+ },
879
+ {
880
+ "epoch": 0.48021513638109875,
881
+ "grad_norm": 18.275182723999023,
882
+ "learning_rate": 9.551709758131778e-06,
883
+ "loss": 0.3595,
884
+ "step": 1250
885
+ },
886
+ {
887
+ "epoch": 0.4840568574721475,
888
+ "grad_norm": 32.038143157958984,
889
+ "learning_rate": 9.541284403669727e-06,
890
+ "loss": 0.8281,
891
+ "step": 1260
892
+ },
893
+ {
894
+ "epoch": 0.4878985785631963,
895
+ "grad_norm": 19.010108947753906,
896
+ "learning_rate": 9.530859049207674e-06,
897
+ "loss": 0.7158,
898
+ "step": 1270
899
+ },
900
+ {
901
+ "epoch": 0.4917402996542451,
902
+ "grad_norm": 9.5922269821167,
903
+ "learning_rate": 9.520433694745622e-06,
904
+ "loss": 0.5315,
905
+ "step": 1280
906
+ },
907
+ {
908
+ "epoch": 0.4955820207452939,
909
+ "grad_norm": 11.005895614624023,
910
+ "learning_rate": 9.510008340283571e-06,
911
+ "loss": 0.3599,
912
+ "step": 1290
913
+ },
914
+ {
915
+ "epoch": 0.4994237418363427,
916
+ "grad_norm": 51.6233024597168,
917
+ "learning_rate": 9.499582985821519e-06,
918
+ "loss": 0.537,
919
+ "step": 1300
920
+ },
921
+ {
922
+ "epoch": 0.5032654629273915,
923
+ "grad_norm": 19.033329010009766,
924
+ "learning_rate": 9.489157631359466e-06,
925
+ "loss": 0.6083,
926
+ "step": 1310
927
+ },
928
+ {
929
+ "epoch": 0.5071071840184402,
930
+ "grad_norm": 16.91973114013672,
931
+ "learning_rate": 9.478732276897415e-06,
932
+ "loss": 0.7693,
933
+ "step": 1320
934
+ },
935
+ {
936
+ "epoch": 0.5109489051094891,
937
+ "grad_norm": 23.38747215270996,
938
+ "learning_rate": 9.468306922435365e-06,
939
+ "loss": 0.6646,
940
+ "step": 1330
941
+ },
942
+ {
943
+ "epoch": 0.5147906262005378,
944
+ "grad_norm": 13.772806167602539,
945
+ "learning_rate": 9.457881567973312e-06,
946
+ "loss": 0.4763,
947
+ "step": 1340
948
+ },
949
+ {
950
+ "epoch": 0.5186323472915866,
951
+ "grad_norm": 8.950833320617676,
952
+ "learning_rate": 9.44745621351126e-06,
953
+ "loss": 0.5793,
954
+ "step": 1350
955
+ },
956
+ {
957
+ "epoch": 0.5224740683826354,
958
+ "grad_norm": 5.6142964363098145,
959
+ "learning_rate": 9.437030859049209e-06,
960
+ "loss": 0.5797,
961
+ "step": 1360
962
+ },
963
+ {
964
+ "epoch": 0.5263157894736842,
965
+ "grad_norm": 18.615188598632812,
966
+ "learning_rate": 9.426605504587157e-06,
967
+ "loss": 0.6041,
968
+ "step": 1370
969
+ },
970
+ {
971
+ "epoch": 0.530157510564733,
972
+ "grad_norm": 10.953849792480469,
973
+ "learning_rate": 9.416180150125104e-06,
974
+ "loss": 0.5933,
975
+ "step": 1380
976
+ },
977
+ {
978
+ "epoch": 0.5339992316557818,
979
+ "grad_norm": 11.613428115844727,
980
+ "learning_rate": 9.405754795663053e-06,
981
+ "loss": 0.5275,
982
+ "step": 1390
983
+ },
984
+ {
985
+ "epoch": 0.5378409527468305,
986
+ "grad_norm": 12.725924491882324,
987
+ "learning_rate": 9.395329441201003e-06,
988
+ "loss": 0.5673,
989
+ "step": 1400
990
+ },
991
+ {
992
+ "epoch": 0.5416826738378794,
993
+ "grad_norm": 34.54634094238281,
994
+ "learning_rate": 9.38490408673895e-06,
995
+ "loss": 0.6717,
996
+ "step": 1410
997
+ },
998
+ {
999
+ "epoch": 0.5455243949289281,
1000
+ "grad_norm": 21.028316497802734,
1001
+ "learning_rate": 9.374478732276898e-06,
1002
+ "loss": 0.5483,
1003
+ "step": 1420
1004
+ },
1005
+ {
1006
+ "epoch": 0.549366116019977,
1007
+ "grad_norm": 30.281667709350586,
1008
+ "learning_rate": 9.364053377814847e-06,
1009
+ "loss": 0.7806,
1010
+ "step": 1430
1011
+ },
1012
+ {
1013
+ "epoch": 0.5532078371110257,
1014
+ "grad_norm": 11.983960151672363,
1015
+ "learning_rate": 9.353628023352795e-06,
1016
+ "loss": 0.5061,
1017
+ "step": 1440
1018
+ },
1019
+ {
1020
+ "epoch": 0.5570495582020746,
1021
+ "grad_norm": 6.99747896194458,
1022
+ "learning_rate": 9.343202668890742e-06,
1023
+ "loss": 0.5623,
1024
+ "step": 1450
1025
+ },
1026
+ {
1027
+ "epoch": 0.5608912792931233,
1028
+ "grad_norm": 11.219843864440918,
1029
+ "learning_rate": 9.332777314428691e-06,
1030
+ "loss": 0.6227,
1031
+ "step": 1460
1032
+ },
1033
+ {
1034
+ "epoch": 0.5647330003841721,
1035
+ "grad_norm": 8.85550308227539,
1036
+ "learning_rate": 9.32235195996664e-06,
1037
+ "loss": 0.5908,
1038
+ "step": 1470
1039
+ },
1040
+ {
1041
+ "epoch": 0.5685747214752209,
1042
+ "grad_norm": 15.55632209777832,
1043
+ "learning_rate": 9.311926605504588e-06,
1044
+ "loss": 0.5888,
1045
+ "step": 1480
1046
+ },
1047
+ {
1048
+ "epoch": 0.5724164425662697,
1049
+ "grad_norm": 5.281271457672119,
1050
+ "learning_rate": 9.301501251042536e-06,
1051
+ "loss": 0.4795,
1052
+ "step": 1490
1053
+ },
1054
+ {
1055
+ "epoch": 0.5762581636573185,
1056
+ "grad_norm": 10.58825397491455,
1057
+ "learning_rate": 9.291075896580485e-06,
1058
+ "loss": 0.4825,
1059
+ "step": 1500
1060
+ },
1061
+ {
1062
+ "epoch": 0.5800998847483673,
1063
+ "grad_norm": 13.970091819763184,
1064
+ "learning_rate": 9.280650542118432e-06,
1065
+ "loss": 0.6107,
1066
+ "step": 1510
1067
+ },
1068
+ {
1069
+ "epoch": 0.583941605839416,
1070
+ "grad_norm": 15.610709190368652,
1071
+ "learning_rate": 9.270225187656382e-06,
1072
+ "loss": 0.454,
1073
+ "step": 1520
1074
+ },
1075
+ {
1076
+ "epoch": 0.5877833269304649,
1077
+ "grad_norm": 9.203128814697266,
1078
+ "learning_rate": 9.25979983319433e-06,
1079
+ "loss": 0.596,
1080
+ "step": 1530
1081
+ },
1082
+ {
1083
+ "epoch": 0.5916250480215136,
1084
+ "grad_norm": 12.340123176574707,
1085
+ "learning_rate": 9.249374478732278e-06,
1086
+ "loss": 0.6622,
1087
+ "step": 1540
1088
+ },
1089
+ {
1090
+ "epoch": 0.5954667691125625,
1091
+ "grad_norm": 6.894665718078613,
1092
+ "learning_rate": 9.238949124270226e-06,
1093
+ "loss": 0.4944,
1094
+ "step": 1550
1095
+ },
1096
+ {
1097
+ "epoch": 0.5993084902036112,
1098
+ "grad_norm": 22.704559326171875,
1099
+ "learning_rate": 9.228523769808174e-06,
1100
+ "loss": 0.6151,
1101
+ "step": 1560
1102
+ },
1103
+ {
1104
+ "epoch": 0.60315021129466,
1105
+ "grad_norm": 6.272796630859375,
1106
+ "learning_rate": 9.218098415346123e-06,
1107
+ "loss": 0.4866,
1108
+ "step": 1570
1109
+ },
1110
+ {
1111
+ "epoch": 0.6069919323857088,
1112
+ "grad_norm": 40.39881134033203,
1113
+ "learning_rate": 9.20767306088407e-06,
1114
+ "loss": 0.5471,
1115
+ "step": 1580
1116
+ },
1117
+ {
1118
+ "epoch": 0.6108336534767576,
1119
+ "grad_norm": 9.417654037475586,
1120
+ "learning_rate": 9.19724770642202e-06,
1121
+ "loss": 0.5702,
1122
+ "step": 1590
1123
+ },
1124
+ {
1125
+ "epoch": 0.6146753745678064,
1126
+ "grad_norm": 8.880293846130371,
1127
+ "learning_rate": 9.186822351959967e-06,
1128
+ "loss": 0.7593,
1129
+ "step": 1600
1130
+ },
1131
+ {
1132
+ "epoch": 0.6185170956588552,
1133
+ "grad_norm": 16.337783813476562,
1134
+ "learning_rate": 9.176396997497916e-06,
1135
+ "loss": 0.3708,
1136
+ "step": 1610
1137
+ },
1138
+ {
1139
+ "epoch": 0.622358816749904,
1140
+ "grad_norm": 15.34815502166748,
1141
+ "learning_rate": 9.165971643035864e-06,
1142
+ "loss": 0.6829,
1143
+ "step": 1620
1144
+ },
1145
+ {
1146
+ "epoch": 0.6262005378409528,
1147
+ "grad_norm": 12.125506401062012,
1148
+ "learning_rate": 9.155546288573811e-06,
1149
+ "loss": 0.5839,
1150
+ "step": 1630
1151
+ },
1152
+ {
1153
+ "epoch": 0.6300422589320015,
1154
+ "grad_norm": 12.340716361999512,
1155
+ "learning_rate": 9.14512093411176e-06,
1156
+ "loss": 0.5855,
1157
+ "step": 1640
1158
+ },
1159
+ {
1160
+ "epoch": 0.6338839800230504,
1161
+ "grad_norm": 17.276071548461914,
1162
+ "learning_rate": 9.134695579649708e-06,
1163
+ "loss": 0.4579,
1164
+ "step": 1650
1165
+ },
1166
+ {
1167
+ "epoch": 0.6377257011140991,
1168
+ "grad_norm": 4.054512977600098,
1169
+ "learning_rate": 9.124270225187658e-06,
1170
+ "loss": 0.3717,
1171
+ "step": 1660
1172
+ },
1173
+ {
1174
+ "epoch": 0.641567422205148,
1175
+ "grad_norm": 26.277875900268555,
1176
+ "learning_rate": 9.113844870725605e-06,
1177
+ "loss": 0.6934,
1178
+ "step": 1670
1179
+ },
1180
+ {
1181
+ "epoch": 0.6454091432961967,
1182
+ "grad_norm": 23.17993927001953,
1183
+ "learning_rate": 9.103419516263554e-06,
1184
+ "loss": 0.8507,
1185
+ "step": 1680
1186
+ },
1187
+ {
1188
+ "epoch": 0.6492508643872454,
1189
+ "grad_norm": 30.25948715209961,
1190
+ "learning_rate": 9.092994161801502e-06,
1191
+ "loss": 0.5851,
1192
+ "step": 1690
1193
+ },
1194
+ {
1195
+ "epoch": 0.6530925854782943,
1196
+ "grad_norm": 12.083464622497559,
1197
+ "learning_rate": 9.08256880733945e-06,
1198
+ "loss": 0.5214,
1199
+ "step": 1700
1200
+ },
1201
+ {
1202
+ "epoch": 0.656934306569343,
1203
+ "grad_norm": 26.132946014404297,
1204
+ "learning_rate": 9.072143452877399e-06,
1205
+ "loss": 0.5715,
1206
+ "step": 1710
1207
+ },
1208
+ {
1209
+ "epoch": 0.6607760276603919,
1210
+ "grad_norm": 13.83061408996582,
1211
+ "learning_rate": 9.061718098415346e-06,
1212
+ "loss": 0.6076,
1213
+ "step": 1720
1214
+ },
1215
+ {
1216
+ "epoch": 0.6646177487514406,
1217
+ "grad_norm": 36.992679595947266,
1218
+ "learning_rate": 9.051292743953295e-06,
1219
+ "loss": 0.5795,
1220
+ "step": 1730
1221
+ },
1222
+ {
1223
+ "epoch": 0.6684594698424894,
1224
+ "grad_norm": 24.426977157592773,
1225
+ "learning_rate": 9.040867389491243e-06,
1226
+ "loss": 0.6913,
1227
+ "step": 1740
1228
+ },
1229
+ {
1230
+ "epoch": 0.6723011909335382,
1231
+ "grad_norm": 15.399202346801758,
1232
+ "learning_rate": 9.030442035029192e-06,
1233
+ "loss": 0.647,
1234
+ "step": 1750
1235
+ },
1236
+ {
1237
+ "epoch": 0.676142912024587,
1238
+ "grad_norm": 36.72813034057617,
1239
+ "learning_rate": 9.02001668056714e-06,
1240
+ "loss": 0.7641,
1241
+ "step": 1760
1242
+ },
1243
+ {
1244
+ "epoch": 0.6799846331156358,
1245
+ "grad_norm": 19.219661712646484,
1246
+ "learning_rate": 9.009591326105089e-06,
1247
+ "loss": 0.7111,
1248
+ "step": 1770
1249
+ },
1250
+ {
1251
+ "epoch": 0.6838263542066846,
1252
+ "grad_norm": 10.353839874267578,
1253
+ "learning_rate": 8.999165971643037e-06,
1254
+ "loss": 0.437,
1255
+ "step": 1780
1256
+ },
1257
+ {
1258
+ "epoch": 0.6876680752977334,
1259
+ "grad_norm": 12.179790496826172,
1260
+ "learning_rate": 8.988740617180984e-06,
1261
+ "loss": 0.6514,
1262
+ "step": 1790
1263
+ },
1264
+ {
1265
+ "epoch": 0.6915097963887822,
1266
+ "grad_norm": 15.036273956298828,
1267
+ "learning_rate": 8.978315262718933e-06,
1268
+ "loss": 0.4611,
1269
+ "step": 1800
1270
+ },
1271
+ {
1272
+ "epoch": 0.6953515174798309,
1273
+ "grad_norm": 12.146955490112305,
1274
+ "learning_rate": 8.967889908256881e-06,
1275
+ "loss": 0.5176,
1276
+ "step": 1810
1277
+ },
1278
+ {
1279
+ "epoch": 0.6991932385708798,
1280
+ "grad_norm": 16.004959106445312,
1281
+ "learning_rate": 8.95746455379483e-06,
1282
+ "loss": 0.3749,
1283
+ "step": 1820
1284
+ },
1285
+ {
1286
+ "epoch": 0.7030349596619285,
1287
+ "grad_norm": 23.500526428222656,
1288
+ "learning_rate": 8.947039199332778e-06,
1289
+ "loss": 0.6124,
1290
+ "step": 1830
1291
+ },
1292
+ {
1293
+ "epoch": 0.7068766807529774,
1294
+ "grad_norm": 11.367331504821777,
1295
+ "learning_rate": 8.936613844870727e-06,
1296
+ "loss": 0.3982,
1297
+ "step": 1840
1298
+ },
1299
+ {
1300
+ "epoch": 0.7107184018440261,
1301
+ "grad_norm": 13.60319709777832,
1302
+ "learning_rate": 8.926188490408674e-06,
1303
+ "loss": 0.4618,
1304
+ "step": 1850
1305
+ },
1306
+ {
1307
+ "epoch": 0.7145601229350749,
1308
+ "grad_norm": 9.807296752929688,
1309
+ "learning_rate": 8.915763135946624e-06,
1310
+ "loss": 0.552,
1311
+ "step": 1860
1312
+ },
1313
+ {
1314
+ "epoch": 0.7184018440261237,
1315
+ "grad_norm": 41.238895416259766,
1316
+ "learning_rate": 8.905337781484571e-06,
1317
+ "loss": 0.738,
1318
+ "step": 1870
1319
+ },
1320
+ {
1321
+ "epoch": 0.7222435651171725,
1322
+ "grad_norm": 8.117176055908203,
1323
+ "learning_rate": 8.894912427022519e-06,
1324
+ "loss": 0.546,
1325
+ "step": 1880
1326
+ },
1327
+ {
1328
+ "epoch": 0.7260852862082213,
1329
+ "grad_norm": 8.292084693908691,
1330
+ "learning_rate": 8.884487072560468e-06,
1331
+ "loss": 0.5098,
1332
+ "step": 1890
1333
+ },
1334
+ {
1335
+ "epoch": 0.7299270072992701,
1336
+ "grad_norm": 16.20579719543457,
1337
+ "learning_rate": 8.874061718098416e-06,
1338
+ "loss": 0.5693,
1339
+ "step": 1900
1340
+ },
1341
+ {
1342
+ "epoch": 0.7337687283903188,
1343
+ "grad_norm": 10.686980247497559,
1344
+ "learning_rate": 8.863636363636365e-06,
1345
+ "loss": 0.6848,
1346
+ "step": 1910
1347
+ },
1348
+ {
1349
+ "epoch": 0.7376104494813677,
1350
+ "grad_norm": 12.386652946472168,
1351
+ "learning_rate": 8.853211009174312e-06,
1352
+ "loss": 0.5282,
1353
+ "step": 1920
1354
+ },
1355
+ {
1356
+ "epoch": 0.7414521705724164,
1357
+ "grad_norm": 11.129962921142578,
1358
+ "learning_rate": 8.842785654712262e-06,
1359
+ "loss": 0.5789,
1360
+ "step": 1930
1361
+ },
1362
+ {
1363
+ "epoch": 0.7452938916634653,
1364
+ "grad_norm": 8.727615356445312,
1365
+ "learning_rate": 8.83236030025021e-06,
1366
+ "loss": 0.5936,
1367
+ "step": 1940
1368
+ },
1369
+ {
1370
+ "epoch": 0.749135612754514,
1371
+ "grad_norm": 11.261787414550781,
1372
+ "learning_rate": 8.821934945788157e-06,
1373
+ "loss": 0.5308,
1374
+ "step": 1950
1375
+ },
1376
+ {
1377
+ "epoch": 0.7529773338455628,
1378
+ "grad_norm": 23.387935638427734,
1379
+ "learning_rate": 8.811509591326106e-06,
1380
+ "loss": 0.5074,
1381
+ "step": 1960
1382
+ },
1383
+ {
1384
+ "epoch": 0.7568190549366116,
1385
+ "grad_norm": 20.772794723510742,
1386
+ "learning_rate": 8.801084236864054e-06,
1387
+ "loss": 0.6157,
1388
+ "step": 1970
1389
+ },
1390
+ {
1391
+ "epoch": 0.7606607760276604,
1392
+ "grad_norm": 23.0604305267334,
1393
+ "learning_rate": 8.790658882402003e-06,
1394
+ "loss": 0.5272,
1395
+ "step": 1980
1396
+ },
1397
+ {
1398
+ "epoch": 0.7645024971187092,
1399
+ "grad_norm": 44.302425384521484,
1400
+ "learning_rate": 8.78023352793995e-06,
1401
+ "loss": 0.6709,
1402
+ "step": 1990
1403
+ },
1404
+ {
1405
+ "epoch": 0.768344218209758,
1406
+ "grad_norm": 16.66979217529297,
1407
+ "learning_rate": 8.7698081734779e-06,
1408
+ "loss": 0.4651,
1409
+ "step": 2000
1410
+ },
1411
+ {
1412
+ "epoch": 0.7721859393008068,
1413
+ "grad_norm": 18.14614486694336,
1414
+ "learning_rate": 8.759382819015847e-06,
1415
+ "loss": 0.5747,
1416
+ "step": 2010
1417
+ },
1418
+ {
1419
+ "epoch": 0.7760276603918556,
1420
+ "grad_norm": 10.635650634765625,
1421
+ "learning_rate": 8.748957464553796e-06,
1422
+ "loss": 0.5169,
1423
+ "step": 2020
1424
+ },
1425
+ {
1426
+ "epoch": 0.7798693814829043,
1427
+ "grad_norm": 13.54704475402832,
1428
+ "learning_rate": 8.738532110091744e-06,
1429
+ "loss": 0.4784,
1430
+ "step": 2030
1431
+ },
1432
+ {
1433
+ "epoch": 0.7837111025739532,
1434
+ "grad_norm": 12.35689926147461,
1435
+ "learning_rate": 8.728106755629691e-06,
1436
+ "loss": 0.5529,
1437
+ "step": 2040
1438
+ },
1439
+ {
1440
+ "epoch": 0.7875528236650019,
1441
+ "grad_norm": 7.250340461730957,
1442
+ "learning_rate": 8.71768140116764e-06,
1443
+ "loss": 0.6229,
1444
+ "step": 2050
1445
+ },
1446
+ {
1447
+ "epoch": 0.7913945447560508,
1448
+ "grad_norm": 16.60529327392578,
1449
+ "learning_rate": 8.707256046705588e-06,
1450
+ "loss": 0.5726,
1451
+ "step": 2060
1452
+ },
1453
+ {
1454
+ "epoch": 0.7952362658470995,
1455
+ "grad_norm": 18.4666805267334,
1456
+ "learning_rate": 8.696830692243537e-06,
1457
+ "loss": 0.5643,
1458
+ "step": 2070
1459
+ },
1460
+ {
1461
+ "epoch": 0.7990779869381482,
1462
+ "grad_norm": 31.986207962036133,
1463
+ "learning_rate": 8.686405337781485e-06,
1464
+ "loss": 0.4759,
1465
+ "step": 2080
1466
+ },
1467
+ {
1468
+ "epoch": 0.8029197080291971,
1469
+ "grad_norm": 30.724218368530273,
1470
+ "learning_rate": 8.675979983319434e-06,
1471
+ "loss": 0.6527,
1472
+ "step": 2090
1473
+ },
1474
+ {
1475
+ "epoch": 0.8067614291202458,
1476
+ "grad_norm": 22.7759952545166,
1477
+ "learning_rate": 8.665554628857382e-06,
1478
+ "loss": 0.6438,
1479
+ "step": 2100
1480
+ },
1481
+ {
1482
+ "epoch": 0.8106031502112947,
1483
+ "grad_norm": 14.61020565032959,
1484
+ "learning_rate": 8.65512927439533e-06,
1485
+ "loss": 0.3962,
1486
+ "step": 2110
1487
+ },
1488
+ {
1489
+ "epoch": 0.8144448713023434,
1490
+ "grad_norm": 20.27998161315918,
1491
+ "learning_rate": 8.644703919933279e-06,
1492
+ "loss": 0.6989,
1493
+ "step": 2120
1494
+ },
1495
+ {
1496
+ "epoch": 0.8182865923933922,
1497
+ "grad_norm": 10.2035493850708,
1498
+ "learning_rate": 8.634278565471226e-06,
1499
+ "loss": 0.5543,
1500
+ "step": 2130
1501
+ },
1502
+ {
1503
+ "epoch": 0.822128313484441,
1504
+ "grad_norm": 16.954448699951172,
1505
+ "learning_rate": 8.623853211009175e-06,
1506
+ "loss": 0.4598,
1507
+ "step": 2140
1508
+ },
1509
+ {
1510
+ "epoch": 0.8259700345754898,
1511
+ "grad_norm": 24.188817977905273,
1512
+ "learning_rate": 8.613427856547123e-06,
1513
+ "loss": 0.674,
1514
+ "step": 2150
1515
+ },
1516
+ {
1517
+ "epoch": 0.8298117556665386,
1518
+ "grad_norm": 8.472796440124512,
1519
+ "learning_rate": 8.603002502085072e-06,
1520
+ "loss": 0.4246,
1521
+ "step": 2160
1522
+ },
1523
+ {
1524
+ "epoch": 0.8336534767575874,
1525
+ "grad_norm": 21.893178939819336,
1526
+ "learning_rate": 8.59257714762302e-06,
1527
+ "loss": 0.5788,
1528
+ "step": 2170
1529
+ },
1530
+ {
1531
+ "epoch": 0.8374951978486362,
1532
+ "grad_norm": 8.200776100158691,
1533
+ "learning_rate": 8.582151793160967e-06,
1534
+ "loss": 0.4215,
1535
+ "step": 2180
1536
+ },
1537
+ {
1538
+ "epoch": 0.841336918939685,
1539
+ "grad_norm": 21.523435592651367,
1540
+ "learning_rate": 8.571726438698917e-06,
1541
+ "loss": 0.4367,
1542
+ "step": 2190
1543
+ },
1544
+ {
1545
+ "epoch": 0.8451786400307337,
1546
+ "grad_norm": 18.608898162841797,
1547
+ "learning_rate": 8.561301084236864e-06,
1548
+ "loss": 0.6324,
1549
+ "step": 2200
1550
+ },
1551
+ {
1552
+ "epoch": 0.8490203611217826,
1553
+ "grad_norm": 19.39713478088379,
1554
+ "learning_rate": 8.550875729774813e-06,
1555
+ "loss": 0.382,
1556
+ "step": 2210
1557
+ },
1558
+ {
1559
+ "epoch": 0.8528620822128313,
1560
+ "grad_norm": 15.368677139282227,
1561
+ "learning_rate": 8.540450375312761e-06,
1562
+ "loss": 0.492,
1563
+ "step": 2220
1564
+ },
1565
+ {
1566
+ "epoch": 0.8567038033038802,
1567
+ "grad_norm": 6.85573673248291,
1568
+ "learning_rate": 8.53002502085071e-06,
1569
+ "loss": 0.6801,
1570
+ "step": 2230
1571
+ },
1572
+ {
1573
+ "epoch": 0.8605455243949289,
1574
+ "grad_norm": 11.223825454711914,
1575
+ "learning_rate": 8.519599666388658e-06,
1576
+ "loss": 0.7763,
1577
+ "step": 2240
1578
+ },
1579
+ {
1580
+ "epoch": 0.8643872454859777,
1581
+ "grad_norm": 11.18885326385498,
1582
+ "learning_rate": 8.509174311926605e-06,
1583
+ "loss": 0.585,
1584
+ "step": 2250
1585
+ },
1586
+ {
1587
+ "epoch": 0.8682289665770265,
1588
+ "grad_norm": 21.877548217773438,
1589
+ "learning_rate": 8.498748957464554e-06,
1590
+ "loss": 0.5873,
1591
+ "step": 2260
1592
+ },
1593
+ {
1594
+ "epoch": 0.8720706876680753,
1595
+ "grad_norm": 25.72136116027832,
1596
+ "learning_rate": 8.488323603002504e-06,
1597
+ "loss": 0.5796,
1598
+ "step": 2270
1599
+ },
1600
+ {
1601
+ "epoch": 0.8759124087591241,
1602
+ "grad_norm": 16.472366333007812,
1603
+ "learning_rate": 8.477898248540451e-06,
1604
+ "loss": 0.4431,
1605
+ "step": 2280
1606
+ },
1607
+ {
1608
+ "epoch": 0.8797541298501729,
1609
+ "grad_norm": 5.752821445465088,
1610
+ "learning_rate": 8.467472894078399e-06,
1611
+ "loss": 0.7004,
1612
+ "step": 2290
1613
+ },
1614
+ {
1615
+ "epoch": 0.8835958509412216,
1616
+ "grad_norm": 13.56191349029541,
1617
+ "learning_rate": 8.457047539616348e-06,
1618
+ "loss": 0.4899,
1619
+ "step": 2300
1620
+ },
1621
+ {
1622
+ "epoch": 0.8874375720322705,
1623
+ "grad_norm": 5.017563343048096,
1624
+ "learning_rate": 8.446622185154296e-06,
1625
+ "loss": 0.7014,
1626
+ "step": 2310
1627
+ },
1628
+ {
1629
+ "epoch": 0.8912792931233192,
1630
+ "grad_norm": 15.450356483459473,
1631
+ "learning_rate": 8.436196830692243e-06,
1632
+ "loss": 0.5414,
1633
+ "step": 2320
1634
+ },
1635
+ {
1636
+ "epoch": 0.8951210142143681,
1637
+ "grad_norm": 16.416250228881836,
1638
+ "learning_rate": 8.425771476230192e-06,
1639
+ "loss": 0.698,
1640
+ "step": 2330
1641
+ },
1642
+ {
1643
+ "epoch": 0.8989627353054168,
1644
+ "grad_norm": 11.185935020446777,
1645
+ "learning_rate": 8.415346121768142e-06,
1646
+ "loss": 0.6264,
1647
+ "step": 2340
1648
+ },
1649
+ {
1650
+ "epoch": 0.9028044563964657,
1651
+ "grad_norm": 22.787181854248047,
1652
+ "learning_rate": 8.40492076730609e-06,
1653
+ "loss": 0.6908,
1654
+ "step": 2350
1655
+ },
1656
+ {
1657
+ "epoch": 0.9066461774875144,
1658
+ "grad_norm": 11.522934913635254,
1659
+ "learning_rate": 8.394495412844037e-06,
1660
+ "loss": 0.5546,
1661
+ "step": 2360
1662
+ },
1663
+ {
1664
+ "epoch": 0.9104878985785632,
1665
+ "grad_norm": 18.260616302490234,
1666
+ "learning_rate": 8.384070058381986e-06,
1667
+ "loss": 0.5925,
1668
+ "step": 2370
1669
+ },
1670
+ {
1671
+ "epoch": 0.914329619669612,
1672
+ "grad_norm": 7.180076599121094,
1673
+ "learning_rate": 8.373644703919933e-06,
1674
+ "loss": 0.6639,
1675
+ "step": 2380
1676
+ },
1677
+ {
1678
+ "epoch": 0.9181713407606608,
1679
+ "grad_norm": 11.107264518737793,
1680
+ "learning_rate": 8.363219349457881e-06,
1681
+ "loss": 0.6762,
1682
+ "step": 2390
1683
+ },
1684
+ {
1685
+ "epoch": 0.9220130618517096,
1686
+ "grad_norm": 12.528190612792969,
1687
+ "learning_rate": 8.35279399499583e-06,
1688
+ "loss": 0.5435,
1689
+ "step": 2400
1690
+ },
1691
+ {
1692
+ "epoch": 0.9258547829427584,
1693
+ "grad_norm": 29.454421997070312,
1694
+ "learning_rate": 8.34236864053378e-06,
1695
+ "loss": 0.5074,
1696
+ "step": 2410
1697
+ },
1698
+ {
1699
+ "epoch": 0.9296965040338071,
1700
+ "grad_norm": 14.677248001098633,
1701
+ "learning_rate": 8.331943286071727e-06,
1702
+ "loss": 0.6161,
1703
+ "step": 2420
1704
+ },
1705
+ {
1706
+ "epoch": 0.933538225124856,
1707
+ "grad_norm": 8.907113075256348,
1708
+ "learning_rate": 8.321517931609675e-06,
1709
+ "loss": 0.353,
1710
+ "step": 2430
1711
+ },
1712
+ {
1713
+ "epoch": 0.9373799462159047,
1714
+ "grad_norm": 11.691315650939941,
1715
+ "learning_rate": 8.311092577147624e-06,
1716
+ "loss": 0.4516,
1717
+ "step": 2440
1718
+ },
1719
+ {
1720
+ "epoch": 0.9412216673069536,
1721
+ "grad_norm": 2.8710756301879883,
1722
+ "learning_rate": 8.300667222685571e-06,
1723
+ "loss": 0.6726,
1724
+ "step": 2450
1725
+ },
1726
+ {
1727
+ "epoch": 0.9450633883980023,
1728
+ "grad_norm": 11.67735481262207,
1729
+ "learning_rate": 8.290241868223519e-06,
1730
+ "loss": 0.6797,
1731
+ "step": 2460
1732
+ },
1733
+ {
1734
+ "epoch": 0.948905109489051,
1735
+ "grad_norm": 17.547286987304688,
1736
+ "learning_rate": 8.279816513761468e-06,
1737
+ "loss": 0.7029,
1738
+ "step": 2470
1739
+ },
1740
+ {
1741
+ "epoch": 0.9527468305800999,
1742
+ "grad_norm": 11.663725852966309,
1743
+ "learning_rate": 8.269391159299417e-06,
1744
+ "loss": 0.5016,
1745
+ "step": 2480
1746
+ },
1747
+ {
1748
+ "epoch": 0.9565885516711486,
1749
+ "grad_norm": 9.743104934692383,
1750
+ "learning_rate": 8.258965804837365e-06,
1751
+ "loss": 0.5489,
1752
+ "step": 2490
1753
+ },
1754
+ {
1755
+ "epoch": 0.9604302727621975,
1756
+ "grad_norm": 9.579474449157715,
1757
+ "learning_rate": 8.248540450375313e-06,
1758
+ "loss": 0.5867,
1759
+ "step": 2500
1760
+ },
1761
+ {
1762
+ "epoch": 0.9642719938532462,
1763
+ "grad_norm": 13.63699722290039,
1764
+ "learning_rate": 8.238115095913262e-06,
1765
+ "loss": 0.5191,
1766
+ "step": 2510
1767
+ },
1768
+ {
1769
+ "epoch": 0.968113714944295,
1770
+ "grad_norm": 10.331293106079102,
1771
+ "learning_rate": 8.227689741451211e-06,
1772
+ "loss": 0.6654,
1773
+ "step": 2520
1774
+ },
1775
+ {
1776
+ "epoch": 0.9719554360353438,
1777
+ "grad_norm": 10.614498138427734,
1778
+ "learning_rate": 8.217264386989159e-06,
1779
+ "loss": 0.5947,
1780
+ "step": 2530
1781
+ },
1782
+ {
1783
+ "epoch": 0.9757971571263926,
1784
+ "grad_norm": 10.182368278503418,
1785
+ "learning_rate": 8.206839032527106e-06,
1786
+ "loss": 0.5482,
1787
+ "step": 2540
1788
+ },
1789
+ {
1790
+ "epoch": 0.9796388782174414,
1791
+ "grad_norm": 15.42397403717041,
1792
+ "learning_rate": 8.196413678065055e-06,
1793
+ "loss": 0.5144,
1794
+ "step": 2550
1795
+ },
1796
+ {
1797
+ "epoch": 0.9834805993084902,
1798
+ "grad_norm": 8.317682266235352,
1799
+ "learning_rate": 8.185988323603003e-06,
1800
+ "loss": 0.4518,
1801
+ "step": 2560
1802
+ },
1803
+ {
1804
+ "epoch": 0.987322320399539,
1805
+ "grad_norm": 43.10714340209961,
1806
+ "learning_rate": 8.17556296914095e-06,
1807
+ "loss": 0.5974,
1808
+ "step": 2570
1809
+ },
1810
+ {
1811
+ "epoch": 0.9911640414905878,
1812
+ "grad_norm": 7.906277656555176,
1813
+ "learning_rate": 8.1651376146789e-06,
1814
+ "loss": 0.5745,
1815
+ "step": 2580
1816
+ },
1817
+ {
1818
+ "epoch": 0.9950057625816365,
1819
+ "grad_norm": 10.229177474975586,
1820
+ "learning_rate": 8.154712260216849e-06,
1821
+ "loss": 0.4322,
1822
+ "step": 2590
1823
+ },
1824
+ {
1825
+ "epoch": 0.9988474836726854,
1826
+ "grad_norm": 12.773459434509277,
1827
+ "learning_rate": 8.144286905754796e-06,
1828
+ "loss": 0.5947,
1829
+ "step": 2600
1830
+ },
1831
+ {
1832
+ "epoch": 1.0,
1833
+ "eval_accuracy": 0.7737969455383729,
1834
+ "eval_f1_per_label": [
1835
+ 0.778160354156419,
1836
+ 0.7230364524614806,
1837
+ 0.8299703264094955
1838
+ ],
1839
+ "eval_f1_weighted": 0.774424228670159,
1840
+ "eval_loss": 0.5626052618026733,
1841
+ "eval_precision_per_label": [
1842
+ 0.7752368507023848,
1843
+ 0.7131208302446257,
1844
+ 0.8468059339993945
1845
+ ],
1846
+ "eval_precision_weighted": 0.7753803799282116,
1847
+ "eval_recall_per_label": [
1848
+ 0.7811059907834101,
1849
+ 0.7332317073170732,
1850
+ 0.8137910968868199
1851
+ ],
1852
+ "eval_recall_weighted": 0.7737969455383729,
1853
+ "eval_runtime": 38.6689,
1854
+ "eval_samples_per_second": 269.235,
1855
+ "eval_steps_per_second": 33.67,
1856
+ "step": 2603
1857
+ }
1858
+ ],
1859
+ "logging_steps": 10,
1860
+ "max_steps": 10412,
1861
+ "num_input_tokens_seen": 0,
1862
+ "num_train_epochs": 4,
1863
+ "save_steps": 500,
1864
+ "stateful_callbacks": {
1865
+ "EarlyStoppingCallback": {
1866
+ "args": {
1867
+ "early_stopping_patience": 3,
1868
+ "early_stopping_threshold": 0.0
1869
+ },
1870
+ "attributes": {
1871
+ "early_stopping_patience_counter": 0
1872
+ }
1873
+ },
1874
+ "TrainerControl": {
1875
+ "args": {
1876
+ "should_epoch_stop": false,
1877
+ "should_evaluate": false,
1878
+ "should_log": false,
1879
+ "should_save": true,
1880
+ "should_training_stop": false
1881
+ },
1882
+ "attributes": {}
1883
+ }
1884
+ },
1885
+ "total_flos": 656244433604970.0,
1886
+ "train_batch_size": 8,
1887
+ "trial_name": null,
1888
+ "trial_params": null
1889
+ }
checkpoint-2603/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8dba586531be0fd60261c0b3a7e765f40602a7ad1431e5af47acc67f5d677d1
3
+ size 5368
checkpoint-2603/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-5206/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "LABEL_0",
16
+ "1": "LABEL_1",
17
+ "2": "LABEL_2"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 3072,
21
+ "label2id": {
22
+ "LABEL_0": 0,
23
+ "LABEL_1": 1,
24
+ "LABEL_2": 2
25
+ },
26
+ "layer_norm_eps": 1e-05,
27
+ "max_position_embeddings": 514,
28
+ "model_type": "roberta",
29
+ "num_attention_heads": 12,
30
+ "num_hidden_layers": 12,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "problem_type": "single_label_classification",
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.47.1",
36
+ "type_vocab_size": 1,
37
+ "use_cache": true,
38
+ "vocab_size": 50265
39
+ }
checkpoint-5206/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-5206/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52f8c020fc0c4d6020e4b3d08dd3f777d3cccaed066d062d62f62c879d2fb193
3
+ size 498615900
checkpoint-5206/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:723f7d1f0958f7c0f9e28be29371d1b68b43ba9ef66f4db9a5be2c98ec62feaa
3
+ size 997351674
checkpoint-5206/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e4ec070bd0cbaa207694c09a2dda96fee2d5dcea1cbe6564605e7d0ca5fd9bf
3
+ size 14244
checkpoint-5206/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8691e7592be263b424954219f3f8cfa62a63b9b797ffb500ca3a5b61f484949
3
+ size 1064
checkpoint-5206/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-5206/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-5206/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 1000000000000000019884624838656,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
checkpoint-5206/trainer_state.json ADDED
@@ -0,0 +1,3736 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5626052618026733,
3
+ "best_model_checkpoint": "/content/drive/MyDrive/tw-roberta-base-sentiment-FT-v2/checkpoint-2603",
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 5206,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.00384172109104879,
13
+ "grad_norm": 19.648235321044922,
14
+ "learning_rate": 1.2195121951219514e-07,
15
+ "loss": 0.7091,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.00768344218209758,
20
+ "grad_norm": 15.138011932373047,
21
+ "learning_rate": 2.439024390243903e-07,
22
+ "loss": 0.5428,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.01152516327314637,
27
+ "grad_norm": 15.91871452331543,
28
+ "learning_rate": 3.6585365853658536e-07,
29
+ "loss": 0.709,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.01536688436419516,
34
+ "grad_norm": 11.673059463500977,
35
+ "learning_rate": 4.878048780487805e-07,
36
+ "loss": 0.6269,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.01920860545524395,
41
+ "grad_norm": 10.827598571777344,
42
+ "learning_rate": 6.097560975609757e-07,
43
+ "loss": 0.6899,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.02305032654629274,
48
+ "grad_norm": 29.76123809814453,
49
+ "learning_rate": 7.317073170731707e-07,
50
+ "loss": 0.7235,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.02689204763734153,
55
+ "grad_norm": 16.770448684692383,
56
+ "learning_rate": 8.53658536585366e-07,
57
+ "loss": 0.731,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.03073376872839032,
62
+ "grad_norm": 17.501832962036133,
63
+ "learning_rate": 9.75609756097561e-07,
64
+ "loss": 0.7907,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.03457548981943911,
69
+ "grad_norm": 5.903749465942383,
70
+ "learning_rate": 1.0975609756097562e-06,
71
+ "loss": 0.6549,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.0384172109104879,
76
+ "grad_norm": 12.823253631591797,
77
+ "learning_rate": 1.2195121951219514e-06,
78
+ "loss": 0.6833,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.042258932001536686,
83
+ "grad_norm": 14.516304969787598,
84
+ "learning_rate": 1.3414634146341465e-06,
85
+ "loss": 0.7764,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.04610065309258548,
90
+ "grad_norm": 17.775850296020508,
91
+ "learning_rate": 1.4634146341463414e-06,
92
+ "loss": 0.6677,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.049942374183634265,
97
+ "grad_norm": 16.348901748657227,
98
+ "learning_rate": 1.5853658536585368e-06,
99
+ "loss": 0.6708,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.05378409527468306,
104
+ "grad_norm": 20.729867935180664,
105
+ "learning_rate": 1.707317073170732e-06,
106
+ "loss": 0.685,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.05762581636573185,
111
+ "grad_norm": 23.60687255859375,
112
+ "learning_rate": 1.8292682926829268e-06,
113
+ "loss": 0.7048,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.06146753745678064,
118
+ "grad_norm": 24.990461349487305,
119
+ "learning_rate": 1.951219512195122e-06,
120
+ "loss": 0.6849,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.06530925854782943,
125
+ "grad_norm": 21.754741668701172,
126
+ "learning_rate": 2.073170731707317e-06,
127
+ "loss": 0.6801,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.06915097963887822,
132
+ "grad_norm": 19.427799224853516,
133
+ "learning_rate": 2.1951219512195125e-06,
134
+ "loss": 0.6006,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.072992700729927,
139
+ "grad_norm": 11.91465950012207,
140
+ "learning_rate": 2.317073170731708e-06,
141
+ "loss": 0.6498,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.0768344218209758,
146
+ "grad_norm": 24.343521118164062,
147
+ "learning_rate": 2.4390243902439027e-06,
148
+ "loss": 0.8272,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.08067614291202459,
153
+ "grad_norm": 11.623435020446777,
154
+ "learning_rate": 2.5609756097560977e-06,
155
+ "loss": 0.6703,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.08451786400307337,
160
+ "grad_norm": 8.984451293945312,
161
+ "learning_rate": 2.682926829268293e-06,
162
+ "loss": 0.6965,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.08835958509412217,
167
+ "grad_norm": 26.78940773010254,
168
+ "learning_rate": 2.8048780487804884e-06,
169
+ "loss": 0.7135,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.09220130618517096,
174
+ "grad_norm": 17.50589370727539,
175
+ "learning_rate": 2.926829268292683e-06,
176
+ "loss": 0.6637,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.09604302727621974,
181
+ "grad_norm": 13.149524688720703,
182
+ "learning_rate": 3.0487804878048782e-06,
183
+ "loss": 0.5756,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.09988474836726853,
188
+ "grad_norm": 10.727895736694336,
189
+ "learning_rate": 3.1707317073170736e-06,
190
+ "loss": 0.594,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.10372646945831733,
195
+ "grad_norm": 15.031158447265625,
196
+ "learning_rate": 3.292682926829269e-06,
197
+ "loss": 0.7186,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.10756819054936612,
202
+ "grad_norm": 10.520498275756836,
203
+ "learning_rate": 3.414634146341464e-06,
204
+ "loss": 0.5746,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.1114099116404149,
209
+ "grad_norm": 19.504133224487305,
210
+ "learning_rate": 3.5365853658536588e-06,
211
+ "loss": 0.7709,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.1152516327314637,
216
+ "grad_norm": 12.495412826538086,
217
+ "learning_rate": 3.6585365853658537e-06,
218
+ "loss": 0.6508,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.11909335382251249,
223
+ "grad_norm": 9.115503311157227,
224
+ "learning_rate": 3.780487804878049e-06,
225
+ "loss": 0.5847,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.12293507491356127,
230
+ "grad_norm": 12.583561897277832,
231
+ "learning_rate": 3.902439024390244e-06,
232
+ "loss": 0.6114,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.12677679600461006,
237
+ "grad_norm": 16.474716186523438,
238
+ "learning_rate": 4.024390243902439e-06,
239
+ "loss": 0.5819,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.13061851709565886,
244
+ "grad_norm": 12.282912254333496,
245
+ "learning_rate": 4.146341463414634e-06,
246
+ "loss": 0.5974,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.13446023818670763,
251
+ "grad_norm": 14.967294692993164,
252
+ "learning_rate": 4.268292682926829e-06,
253
+ "loss": 0.515,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.13830195927775643,
258
+ "grad_norm": 22.999889373779297,
259
+ "learning_rate": 4.390243902439025e-06,
260
+ "loss": 0.8314,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.14214368036880523,
265
+ "grad_norm": 13.173623085021973,
266
+ "learning_rate": 4.51219512195122e-06,
267
+ "loss": 0.7274,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.145985401459854,
272
+ "grad_norm": 10.259631156921387,
273
+ "learning_rate": 4.634146341463416e-06,
274
+ "loss": 0.5758,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.1498271225509028,
279
+ "grad_norm": 20.109394073486328,
280
+ "learning_rate": 4.75609756097561e-06,
281
+ "loss": 0.6767,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 0.1536688436419516,
286
+ "grad_norm": 12.53744125366211,
287
+ "learning_rate": 4.8780487804878055e-06,
288
+ "loss": 0.6554,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 0.15751056473300037,
293
+ "grad_norm": 32.90926742553711,
294
+ "learning_rate": 5e-06,
295
+ "loss": 0.751,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 0.16135228582404917,
300
+ "grad_norm": 13.864742279052734,
301
+ "learning_rate": 5.121951219512195e-06,
302
+ "loss": 0.5644,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 0.16519400691509797,
307
+ "grad_norm": 16.864086151123047,
308
+ "learning_rate": 5.243902439024391e-06,
309
+ "loss": 0.7212,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 0.16903572800614675,
314
+ "grad_norm": 8.035212516784668,
315
+ "learning_rate": 5.365853658536586e-06,
316
+ "loss": 0.6411,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 0.17287744909719555,
321
+ "grad_norm": 15.027800559997559,
322
+ "learning_rate": 5.487804878048781e-06,
323
+ "loss": 0.7331,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 0.17671917018824435,
328
+ "grad_norm": 9.771449089050293,
329
+ "learning_rate": 5.609756097560977e-06,
330
+ "loss": 0.6245,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 0.18056089127929312,
335
+ "grad_norm": 7.735960960388184,
336
+ "learning_rate": 5.731707317073171e-06,
337
+ "loss": 0.5251,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 0.18440261237034192,
342
+ "grad_norm": 7.348488807678223,
343
+ "learning_rate": 5.853658536585366e-06,
344
+ "loss": 0.5108,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 0.1882443334613907,
349
+ "grad_norm": 30.567018508911133,
350
+ "learning_rate": 5.9756097560975615e-06,
351
+ "loss": 0.5508,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 0.1920860545524395,
356
+ "grad_norm": 19.212230682373047,
357
+ "learning_rate": 6.0975609756097564e-06,
358
+ "loss": 0.5997,
359
+ "step": 500
360
+ },
361
+ {
362
+ "epoch": 0.1959277756434883,
363
+ "grad_norm": 17.70198631286621,
364
+ "learning_rate": 6.219512195121951e-06,
365
+ "loss": 0.8108,
366
+ "step": 510
367
+ },
368
+ {
369
+ "epoch": 0.19976949673453706,
370
+ "grad_norm": 13.471924781799316,
371
+ "learning_rate": 6.341463414634147e-06,
372
+ "loss": 0.6738,
373
+ "step": 520
374
+ },
375
+ {
376
+ "epoch": 0.20361121782558586,
377
+ "grad_norm": 15.494725227355957,
378
+ "learning_rate": 6.463414634146342e-06,
379
+ "loss": 0.8114,
380
+ "step": 530
381
+ },
382
+ {
383
+ "epoch": 0.20745293891663466,
384
+ "grad_norm": 9.314640045166016,
385
+ "learning_rate": 6.585365853658538e-06,
386
+ "loss": 0.5791,
387
+ "step": 540
388
+ },
389
+ {
390
+ "epoch": 0.21129466000768343,
391
+ "grad_norm": 11.314807891845703,
392
+ "learning_rate": 6.707317073170733e-06,
393
+ "loss": 0.6536,
394
+ "step": 550
395
+ },
396
+ {
397
+ "epoch": 0.21513638109873223,
398
+ "grad_norm": 16.462522506713867,
399
+ "learning_rate": 6.829268292682928e-06,
400
+ "loss": 0.5598,
401
+ "step": 560
402
+ },
403
+ {
404
+ "epoch": 0.21897810218978103,
405
+ "grad_norm": 36.564476013183594,
406
+ "learning_rate": 6.951219512195122e-06,
407
+ "loss": 0.5573,
408
+ "step": 570
409
+ },
410
+ {
411
+ "epoch": 0.2228198232808298,
412
+ "grad_norm": 16.07019805908203,
413
+ "learning_rate": 7.0731707317073175e-06,
414
+ "loss": 0.5328,
415
+ "step": 580
416
+ },
417
+ {
418
+ "epoch": 0.2266615443718786,
419
+ "grad_norm": 6.132660388946533,
420
+ "learning_rate": 7.1951219512195125e-06,
421
+ "loss": 0.4499,
422
+ "step": 590
423
+ },
424
+ {
425
+ "epoch": 0.2305032654629274,
426
+ "grad_norm": 21.4416561126709,
427
+ "learning_rate": 7.317073170731707e-06,
428
+ "loss": 0.6584,
429
+ "step": 600
430
+ },
431
+ {
432
+ "epoch": 0.23434498655397618,
433
+ "grad_norm": 18.0037841796875,
434
+ "learning_rate": 7.439024390243903e-06,
435
+ "loss": 0.4844,
436
+ "step": 610
437
+ },
438
+ {
439
+ "epoch": 0.23818670764502498,
440
+ "grad_norm": 17.625553131103516,
441
+ "learning_rate": 7.560975609756098e-06,
442
+ "loss": 0.5122,
443
+ "step": 620
444
+ },
445
+ {
446
+ "epoch": 0.24202842873607375,
447
+ "grad_norm": 10.607305526733398,
448
+ "learning_rate": 7.682926829268293e-06,
449
+ "loss": 0.5188,
450
+ "step": 630
451
+ },
452
+ {
453
+ "epoch": 0.24587014982712255,
454
+ "grad_norm": 26.88294792175293,
455
+ "learning_rate": 7.804878048780489e-06,
456
+ "loss": 0.7002,
457
+ "step": 640
458
+ },
459
+ {
460
+ "epoch": 0.24971187091817135,
461
+ "grad_norm": 38.178287506103516,
462
+ "learning_rate": 7.926829268292685e-06,
463
+ "loss": 0.9407,
464
+ "step": 650
465
+ },
466
+ {
467
+ "epoch": 0.2535535920092201,
468
+ "grad_norm": 8.800226211547852,
469
+ "learning_rate": 8.048780487804879e-06,
470
+ "loss": 0.4968,
471
+ "step": 660
472
+ },
473
+ {
474
+ "epoch": 0.2573953131002689,
475
+ "grad_norm": 30.46478843688965,
476
+ "learning_rate": 8.170731707317073e-06,
477
+ "loss": 0.5293,
478
+ "step": 670
479
+ },
480
+ {
481
+ "epoch": 0.2612370341913177,
482
+ "grad_norm": 20.630985260009766,
483
+ "learning_rate": 8.292682926829268e-06,
484
+ "loss": 0.5382,
485
+ "step": 680
486
+ },
487
+ {
488
+ "epoch": 0.2650787552823665,
489
+ "grad_norm": 19.19484519958496,
490
+ "learning_rate": 8.414634146341464e-06,
491
+ "loss": 0.6311,
492
+ "step": 690
493
+ },
494
+ {
495
+ "epoch": 0.26892047637341526,
496
+ "grad_norm": 26.084064483642578,
497
+ "learning_rate": 8.536585365853658e-06,
498
+ "loss": 0.6893,
499
+ "step": 700
500
+ },
501
+ {
502
+ "epoch": 0.27276219746446406,
503
+ "grad_norm": 11.942285537719727,
504
+ "learning_rate": 8.658536585365854e-06,
505
+ "loss": 0.5938,
506
+ "step": 710
507
+ },
508
+ {
509
+ "epoch": 0.27660391855551286,
510
+ "grad_norm": 25.995960235595703,
511
+ "learning_rate": 8.78048780487805e-06,
512
+ "loss": 0.5693,
513
+ "step": 720
514
+ },
515
+ {
516
+ "epoch": 0.28044563964656166,
517
+ "grad_norm": 10.440145492553711,
518
+ "learning_rate": 8.902439024390244e-06,
519
+ "loss": 0.5937,
520
+ "step": 730
521
+ },
522
+ {
523
+ "epoch": 0.28428736073761046,
524
+ "grad_norm": 21.39019012451172,
525
+ "learning_rate": 9.02439024390244e-06,
526
+ "loss": 0.664,
527
+ "step": 740
528
+ },
529
+ {
530
+ "epoch": 0.28812908182865926,
531
+ "grad_norm": 11.281585693359375,
532
+ "learning_rate": 9.146341463414635e-06,
533
+ "loss": 0.5178,
534
+ "step": 750
535
+ },
536
+ {
537
+ "epoch": 0.291970802919708,
538
+ "grad_norm": 13.577981948852539,
539
+ "learning_rate": 9.268292682926831e-06,
540
+ "loss": 0.5405,
541
+ "step": 760
542
+ },
543
+ {
544
+ "epoch": 0.2958125240107568,
545
+ "grad_norm": 55.213340759277344,
546
+ "learning_rate": 9.390243902439025e-06,
547
+ "loss": 0.5633,
548
+ "step": 770
549
+ },
550
+ {
551
+ "epoch": 0.2996542451018056,
552
+ "grad_norm": 19.74529266357422,
553
+ "learning_rate": 9.51219512195122e-06,
554
+ "loss": 0.7541,
555
+ "step": 780
556
+ },
557
+ {
558
+ "epoch": 0.3034959661928544,
559
+ "grad_norm": 45.70171356201172,
560
+ "learning_rate": 9.634146341463415e-06,
561
+ "loss": 0.6196,
562
+ "step": 790
563
+ },
564
+ {
565
+ "epoch": 0.3073376872839032,
566
+ "grad_norm": 23.01153564453125,
567
+ "learning_rate": 9.756097560975611e-06,
568
+ "loss": 0.7121,
569
+ "step": 800
570
+ },
571
+ {
572
+ "epoch": 0.311179408374952,
573
+ "grad_norm": 17.099529266357422,
574
+ "learning_rate": 9.878048780487805e-06,
575
+ "loss": 0.6021,
576
+ "step": 810
577
+ },
578
+ {
579
+ "epoch": 0.31502112946600075,
580
+ "grad_norm": 11.891583442687988,
581
+ "learning_rate": 1e-05,
582
+ "loss": 0.6071,
583
+ "step": 820
584
+ },
585
+ {
586
+ "epoch": 0.31886285055704955,
587
+ "grad_norm": 16.423572540283203,
588
+ "learning_rate": 9.98957464553795e-06,
589
+ "loss": 0.539,
590
+ "step": 830
591
+ },
592
+ {
593
+ "epoch": 0.32270457164809835,
594
+ "grad_norm": 11.026520729064941,
595
+ "learning_rate": 9.979149291075898e-06,
596
+ "loss": 0.615,
597
+ "step": 840
598
+ },
599
+ {
600
+ "epoch": 0.32654629273914715,
601
+ "grad_norm": 25.40671730041504,
602
+ "learning_rate": 9.968723936613845e-06,
603
+ "loss": 0.4819,
604
+ "step": 850
605
+ },
606
+ {
607
+ "epoch": 0.33038801383019595,
608
+ "grad_norm": 30.425880432128906,
609
+ "learning_rate": 9.958298582151794e-06,
610
+ "loss": 0.7362,
611
+ "step": 860
612
+ },
613
+ {
614
+ "epoch": 0.3342297349212447,
615
+ "grad_norm": 21.710176467895508,
616
+ "learning_rate": 9.947873227689742e-06,
617
+ "loss": 0.5052,
618
+ "step": 870
619
+ },
620
+ {
621
+ "epoch": 0.3380714560122935,
622
+ "grad_norm": 13.537766456604004,
623
+ "learning_rate": 9.93744787322769e-06,
624
+ "loss": 0.6032,
625
+ "step": 880
626
+ },
627
+ {
628
+ "epoch": 0.3419131771033423,
629
+ "grad_norm": 19.592073440551758,
630
+ "learning_rate": 9.927022518765639e-06,
631
+ "loss": 0.5222,
632
+ "step": 890
633
+ },
634
+ {
635
+ "epoch": 0.3457548981943911,
636
+ "grad_norm": 10.528463363647461,
637
+ "learning_rate": 9.916597164303588e-06,
638
+ "loss": 0.596,
639
+ "step": 900
640
+ },
641
+ {
642
+ "epoch": 0.3495966192854399,
643
+ "grad_norm": 15.643308639526367,
644
+ "learning_rate": 9.906171809841536e-06,
645
+ "loss": 0.7414,
646
+ "step": 910
647
+ },
648
+ {
649
+ "epoch": 0.3534383403764887,
650
+ "grad_norm": 22.77689552307129,
651
+ "learning_rate": 9.895746455379483e-06,
652
+ "loss": 0.5151,
653
+ "step": 920
654
+ },
655
+ {
656
+ "epoch": 0.35728006146753744,
657
+ "grad_norm": 23.217538833618164,
658
+ "learning_rate": 9.885321100917432e-06,
659
+ "loss": 0.7501,
660
+ "step": 930
661
+ },
662
+ {
663
+ "epoch": 0.36112178255858624,
664
+ "grad_norm": 21.08719825744629,
665
+ "learning_rate": 9.874895746455382e-06,
666
+ "loss": 0.4944,
667
+ "step": 940
668
+ },
669
+ {
670
+ "epoch": 0.36496350364963503,
671
+ "grad_norm": 19.93057632446289,
672
+ "learning_rate": 9.864470391993327e-06,
673
+ "loss": 0.7289,
674
+ "step": 950
675
+ },
676
+ {
677
+ "epoch": 0.36880522474068383,
678
+ "grad_norm": 14.810643196105957,
679
+ "learning_rate": 9.854045037531277e-06,
680
+ "loss": 0.4956,
681
+ "step": 960
682
+ },
683
+ {
684
+ "epoch": 0.37264694583173263,
685
+ "grad_norm": 19.596710205078125,
686
+ "learning_rate": 9.843619683069226e-06,
687
+ "loss": 0.5528,
688
+ "step": 970
689
+ },
690
+ {
691
+ "epoch": 0.3764886669227814,
692
+ "grad_norm": 23.963172912597656,
693
+ "learning_rate": 9.833194328607173e-06,
694
+ "loss": 0.5904,
695
+ "step": 980
696
+ },
697
+ {
698
+ "epoch": 0.3803303880138302,
699
+ "grad_norm": 14.250468254089355,
700
+ "learning_rate": 9.822768974145121e-06,
701
+ "loss": 0.6762,
702
+ "step": 990
703
+ },
704
+ {
705
+ "epoch": 0.384172109104879,
706
+ "grad_norm": 30.411888122558594,
707
+ "learning_rate": 9.81234361968307e-06,
708
+ "loss": 0.6031,
709
+ "step": 1000
710
+ },
711
+ {
712
+ "epoch": 0.3880138301959278,
713
+ "grad_norm": 4.892022132873535,
714
+ "learning_rate": 9.80191826522102e-06,
715
+ "loss": 0.5832,
716
+ "step": 1010
717
+ },
718
+ {
719
+ "epoch": 0.3918555512869766,
720
+ "grad_norm": 14.026780128479004,
721
+ "learning_rate": 9.791492910758967e-06,
722
+ "loss": 0.4432,
723
+ "step": 1020
724
+ },
725
+ {
726
+ "epoch": 0.3956972723780254,
727
+ "grad_norm": 15.22079086303711,
728
+ "learning_rate": 9.781067556296915e-06,
729
+ "loss": 0.5003,
730
+ "step": 1030
731
+ },
732
+ {
733
+ "epoch": 0.3995389934690741,
734
+ "grad_norm": 38.46358871459961,
735
+ "learning_rate": 9.770642201834864e-06,
736
+ "loss": 0.6863,
737
+ "step": 1040
738
+ },
739
+ {
740
+ "epoch": 0.4033807145601229,
741
+ "grad_norm": 28.509458541870117,
742
+ "learning_rate": 9.760216847372811e-06,
743
+ "loss": 0.6557,
744
+ "step": 1050
745
+ },
746
+ {
747
+ "epoch": 0.4072224356511717,
748
+ "grad_norm": 10.18283748626709,
749
+ "learning_rate": 9.749791492910759e-06,
750
+ "loss": 0.475,
751
+ "step": 1060
752
+ },
753
+ {
754
+ "epoch": 0.4110641567422205,
755
+ "grad_norm": 16.280475616455078,
756
+ "learning_rate": 9.739366138448708e-06,
757
+ "loss": 0.5423,
758
+ "step": 1070
759
+ },
760
+ {
761
+ "epoch": 0.4149058778332693,
762
+ "grad_norm": 27.256633758544922,
763
+ "learning_rate": 9.728940783986657e-06,
764
+ "loss": 0.4638,
765
+ "step": 1080
766
+ },
767
+ {
768
+ "epoch": 0.4187475989243181,
769
+ "grad_norm": 26.906049728393555,
770
+ "learning_rate": 9.718515429524605e-06,
771
+ "loss": 0.6528,
772
+ "step": 1090
773
+ },
774
+ {
775
+ "epoch": 0.42258932001536686,
776
+ "grad_norm": 10.475133895874023,
777
+ "learning_rate": 9.708090075062552e-06,
778
+ "loss": 0.8387,
779
+ "step": 1100
780
+ },
781
+ {
782
+ "epoch": 0.42643104110641566,
783
+ "grad_norm": 14.576977729797363,
784
+ "learning_rate": 9.697664720600502e-06,
785
+ "loss": 0.6325,
786
+ "step": 1110
787
+ },
788
+ {
789
+ "epoch": 0.43027276219746446,
790
+ "grad_norm": 17.823413848876953,
791
+ "learning_rate": 9.68723936613845e-06,
792
+ "loss": 0.613,
793
+ "step": 1120
794
+ },
795
+ {
796
+ "epoch": 0.43411448328851326,
797
+ "grad_norm": 12.499800682067871,
798
+ "learning_rate": 9.676814011676397e-06,
799
+ "loss": 0.5685,
800
+ "step": 1130
801
+ },
802
+ {
803
+ "epoch": 0.43795620437956206,
804
+ "grad_norm": 19.12653923034668,
805
+ "learning_rate": 9.666388657214346e-06,
806
+ "loss": 0.8678,
807
+ "step": 1140
808
+ },
809
+ {
810
+ "epoch": 0.4417979254706108,
811
+ "grad_norm": 5.942495822906494,
812
+ "learning_rate": 9.655963302752295e-06,
813
+ "loss": 0.4594,
814
+ "step": 1150
815
+ },
816
+ {
817
+ "epoch": 0.4456396465616596,
818
+ "grad_norm": 19.233552932739258,
819
+ "learning_rate": 9.645537948290243e-06,
820
+ "loss": 0.5747,
821
+ "step": 1160
822
+ },
823
+ {
824
+ "epoch": 0.4494813676527084,
825
+ "grad_norm": 17.434133529663086,
826
+ "learning_rate": 9.63511259382819e-06,
827
+ "loss": 0.5121,
828
+ "step": 1170
829
+ },
830
+ {
831
+ "epoch": 0.4533230887437572,
832
+ "grad_norm": 14.78231143951416,
833
+ "learning_rate": 9.62468723936614e-06,
834
+ "loss": 0.7225,
835
+ "step": 1180
836
+ },
837
+ {
838
+ "epoch": 0.457164809834806,
839
+ "grad_norm": 23.81663703918457,
840
+ "learning_rate": 9.614261884904089e-06,
841
+ "loss": 0.8527,
842
+ "step": 1190
843
+ },
844
+ {
845
+ "epoch": 0.4610065309258548,
846
+ "grad_norm": 18.266740798950195,
847
+ "learning_rate": 9.603836530442035e-06,
848
+ "loss": 0.4101,
849
+ "step": 1200
850
+ },
851
+ {
852
+ "epoch": 0.46484825201690355,
853
+ "grad_norm": 23.31222152709961,
854
+ "learning_rate": 9.593411175979984e-06,
855
+ "loss": 0.5847,
856
+ "step": 1210
857
+ },
858
+ {
859
+ "epoch": 0.46868997310795235,
860
+ "grad_norm": 11.039971351623535,
861
+ "learning_rate": 9.582985821517933e-06,
862
+ "loss": 0.6515,
863
+ "step": 1220
864
+ },
865
+ {
866
+ "epoch": 0.47253169419900115,
867
+ "grad_norm": 27.05122184753418,
868
+ "learning_rate": 9.57256046705588e-06,
869
+ "loss": 0.5968,
870
+ "step": 1230
871
+ },
872
+ {
873
+ "epoch": 0.47637341529004995,
874
+ "grad_norm": 18.416839599609375,
875
+ "learning_rate": 9.562135112593828e-06,
876
+ "loss": 0.6339,
877
+ "step": 1240
878
+ },
879
+ {
880
+ "epoch": 0.48021513638109875,
881
+ "grad_norm": 18.275182723999023,
882
+ "learning_rate": 9.551709758131778e-06,
883
+ "loss": 0.3595,
884
+ "step": 1250
885
+ },
886
+ {
887
+ "epoch": 0.4840568574721475,
888
+ "grad_norm": 32.038143157958984,
889
+ "learning_rate": 9.541284403669727e-06,
890
+ "loss": 0.8281,
891
+ "step": 1260
892
+ },
893
+ {
894
+ "epoch": 0.4878985785631963,
895
+ "grad_norm": 19.010108947753906,
896
+ "learning_rate": 9.530859049207674e-06,
897
+ "loss": 0.7158,
898
+ "step": 1270
899
+ },
900
+ {
901
+ "epoch": 0.4917402996542451,
902
+ "grad_norm": 9.5922269821167,
903
+ "learning_rate": 9.520433694745622e-06,
904
+ "loss": 0.5315,
905
+ "step": 1280
906
+ },
907
+ {
908
+ "epoch": 0.4955820207452939,
909
+ "grad_norm": 11.005895614624023,
910
+ "learning_rate": 9.510008340283571e-06,
911
+ "loss": 0.3599,
912
+ "step": 1290
913
+ },
914
+ {
915
+ "epoch": 0.4994237418363427,
916
+ "grad_norm": 51.6233024597168,
917
+ "learning_rate": 9.499582985821519e-06,
918
+ "loss": 0.537,
919
+ "step": 1300
920
+ },
921
+ {
922
+ "epoch": 0.5032654629273915,
923
+ "grad_norm": 19.033329010009766,
924
+ "learning_rate": 9.489157631359466e-06,
925
+ "loss": 0.6083,
926
+ "step": 1310
927
+ },
928
+ {
929
+ "epoch": 0.5071071840184402,
930
+ "grad_norm": 16.91973114013672,
931
+ "learning_rate": 9.478732276897415e-06,
932
+ "loss": 0.7693,
933
+ "step": 1320
934
+ },
935
+ {
936
+ "epoch": 0.5109489051094891,
937
+ "grad_norm": 23.38747215270996,
938
+ "learning_rate": 9.468306922435365e-06,
939
+ "loss": 0.6646,
940
+ "step": 1330
941
+ },
942
+ {
943
+ "epoch": 0.5147906262005378,
944
+ "grad_norm": 13.772806167602539,
945
+ "learning_rate": 9.457881567973312e-06,
946
+ "loss": 0.4763,
947
+ "step": 1340
948
+ },
949
+ {
950
+ "epoch": 0.5186323472915866,
951
+ "grad_norm": 8.950833320617676,
952
+ "learning_rate": 9.44745621351126e-06,
953
+ "loss": 0.5793,
954
+ "step": 1350
955
+ },
956
+ {
957
+ "epoch": 0.5224740683826354,
958
+ "grad_norm": 5.6142964363098145,
959
+ "learning_rate": 9.437030859049209e-06,
960
+ "loss": 0.5797,
961
+ "step": 1360
962
+ },
963
+ {
964
+ "epoch": 0.5263157894736842,
965
+ "grad_norm": 18.615188598632812,
966
+ "learning_rate": 9.426605504587157e-06,
967
+ "loss": 0.6041,
968
+ "step": 1370
969
+ },
970
+ {
971
+ "epoch": 0.530157510564733,
972
+ "grad_norm": 10.953849792480469,
973
+ "learning_rate": 9.416180150125104e-06,
974
+ "loss": 0.5933,
975
+ "step": 1380
976
+ },
977
+ {
978
+ "epoch": 0.5339992316557818,
979
+ "grad_norm": 11.613428115844727,
980
+ "learning_rate": 9.405754795663053e-06,
981
+ "loss": 0.5275,
982
+ "step": 1390
983
+ },
984
+ {
985
+ "epoch": 0.5378409527468305,
986
+ "grad_norm": 12.725924491882324,
987
+ "learning_rate": 9.395329441201003e-06,
988
+ "loss": 0.5673,
989
+ "step": 1400
990
+ },
991
+ {
992
+ "epoch": 0.5416826738378794,
993
+ "grad_norm": 34.54634094238281,
994
+ "learning_rate": 9.38490408673895e-06,
995
+ "loss": 0.6717,
996
+ "step": 1410
997
+ },
998
+ {
999
+ "epoch": 0.5455243949289281,
1000
+ "grad_norm": 21.028316497802734,
1001
+ "learning_rate": 9.374478732276898e-06,
1002
+ "loss": 0.5483,
1003
+ "step": 1420
1004
+ },
1005
+ {
1006
+ "epoch": 0.549366116019977,
1007
+ "grad_norm": 30.281667709350586,
1008
+ "learning_rate": 9.364053377814847e-06,
1009
+ "loss": 0.7806,
1010
+ "step": 1430
1011
+ },
1012
+ {
1013
+ "epoch": 0.5532078371110257,
1014
+ "grad_norm": 11.983960151672363,
1015
+ "learning_rate": 9.353628023352795e-06,
1016
+ "loss": 0.5061,
1017
+ "step": 1440
1018
+ },
1019
+ {
1020
+ "epoch": 0.5570495582020746,
1021
+ "grad_norm": 6.99747896194458,
1022
+ "learning_rate": 9.343202668890742e-06,
1023
+ "loss": 0.5623,
1024
+ "step": 1450
1025
+ },
1026
+ {
1027
+ "epoch": 0.5608912792931233,
1028
+ "grad_norm": 11.219843864440918,
1029
+ "learning_rate": 9.332777314428691e-06,
1030
+ "loss": 0.6227,
1031
+ "step": 1460
1032
+ },
1033
+ {
1034
+ "epoch": 0.5647330003841721,
1035
+ "grad_norm": 8.85550308227539,
1036
+ "learning_rate": 9.32235195996664e-06,
1037
+ "loss": 0.5908,
1038
+ "step": 1470
1039
+ },
1040
+ {
1041
+ "epoch": 0.5685747214752209,
1042
+ "grad_norm": 15.55632209777832,
1043
+ "learning_rate": 9.311926605504588e-06,
1044
+ "loss": 0.5888,
1045
+ "step": 1480
1046
+ },
1047
+ {
1048
+ "epoch": 0.5724164425662697,
1049
+ "grad_norm": 5.281271457672119,
1050
+ "learning_rate": 9.301501251042536e-06,
1051
+ "loss": 0.4795,
1052
+ "step": 1490
1053
+ },
1054
+ {
1055
+ "epoch": 0.5762581636573185,
1056
+ "grad_norm": 10.58825397491455,
1057
+ "learning_rate": 9.291075896580485e-06,
1058
+ "loss": 0.4825,
1059
+ "step": 1500
1060
+ },
1061
+ {
1062
+ "epoch": 0.5800998847483673,
1063
+ "grad_norm": 13.970091819763184,
1064
+ "learning_rate": 9.280650542118432e-06,
1065
+ "loss": 0.6107,
1066
+ "step": 1510
1067
+ },
1068
+ {
1069
+ "epoch": 0.583941605839416,
1070
+ "grad_norm": 15.610709190368652,
1071
+ "learning_rate": 9.270225187656382e-06,
1072
+ "loss": 0.454,
1073
+ "step": 1520
1074
+ },
1075
+ {
1076
+ "epoch": 0.5877833269304649,
1077
+ "grad_norm": 9.203128814697266,
1078
+ "learning_rate": 9.25979983319433e-06,
1079
+ "loss": 0.596,
1080
+ "step": 1530
1081
+ },
1082
+ {
1083
+ "epoch": 0.5916250480215136,
1084
+ "grad_norm": 12.340123176574707,
1085
+ "learning_rate": 9.249374478732278e-06,
1086
+ "loss": 0.6622,
1087
+ "step": 1540
1088
+ },
1089
+ {
1090
+ "epoch": 0.5954667691125625,
1091
+ "grad_norm": 6.894665718078613,
1092
+ "learning_rate": 9.238949124270226e-06,
1093
+ "loss": 0.4944,
1094
+ "step": 1550
1095
+ },
1096
+ {
1097
+ "epoch": 0.5993084902036112,
1098
+ "grad_norm": 22.704559326171875,
1099
+ "learning_rate": 9.228523769808174e-06,
1100
+ "loss": 0.6151,
1101
+ "step": 1560
1102
+ },
1103
+ {
1104
+ "epoch": 0.60315021129466,
1105
+ "grad_norm": 6.272796630859375,
1106
+ "learning_rate": 9.218098415346123e-06,
1107
+ "loss": 0.4866,
1108
+ "step": 1570
1109
+ },
1110
+ {
1111
+ "epoch": 0.6069919323857088,
1112
+ "grad_norm": 40.39881134033203,
1113
+ "learning_rate": 9.20767306088407e-06,
1114
+ "loss": 0.5471,
1115
+ "step": 1580
1116
+ },
1117
+ {
1118
+ "epoch": 0.6108336534767576,
1119
+ "grad_norm": 9.417654037475586,
1120
+ "learning_rate": 9.19724770642202e-06,
1121
+ "loss": 0.5702,
1122
+ "step": 1590
1123
+ },
1124
+ {
1125
+ "epoch": 0.6146753745678064,
1126
+ "grad_norm": 8.880293846130371,
1127
+ "learning_rate": 9.186822351959967e-06,
1128
+ "loss": 0.7593,
1129
+ "step": 1600
1130
+ },
1131
+ {
1132
+ "epoch": 0.6185170956588552,
1133
+ "grad_norm": 16.337783813476562,
1134
+ "learning_rate": 9.176396997497916e-06,
1135
+ "loss": 0.3708,
1136
+ "step": 1610
1137
+ },
1138
+ {
1139
+ "epoch": 0.622358816749904,
1140
+ "grad_norm": 15.34815502166748,
1141
+ "learning_rate": 9.165971643035864e-06,
1142
+ "loss": 0.6829,
1143
+ "step": 1620
1144
+ },
1145
+ {
1146
+ "epoch": 0.6262005378409528,
1147
+ "grad_norm": 12.125506401062012,
1148
+ "learning_rate": 9.155546288573811e-06,
1149
+ "loss": 0.5839,
1150
+ "step": 1630
1151
+ },
1152
+ {
1153
+ "epoch": 0.6300422589320015,
1154
+ "grad_norm": 12.340716361999512,
1155
+ "learning_rate": 9.14512093411176e-06,
1156
+ "loss": 0.5855,
1157
+ "step": 1640
1158
+ },
1159
+ {
1160
+ "epoch": 0.6338839800230504,
1161
+ "grad_norm": 17.276071548461914,
1162
+ "learning_rate": 9.134695579649708e-06,
1163
+ "loss": 0.4579,
1164
+ "step": 1650
1165
+ },
1166
+ {
1167
+ "epoch": 0.6377257011140991,
1168
+ "grad_norm": 4.054512977600098,
1169
+ "learning_rate": 9.124270225187658e-06,
1170
+ "loss": 0.3717,
1171
+ "step": 1660
1172
+ },
1173
+ {
1174
+ "epoch": 0.641567422205148,
1175
+ "grad_norm": 26.277875900268555,
1176
+ "learning_rate": 9.113844870725605e-06,
1177
+ "loss": 0.6934,
1178
+ "step": 1670
1179
+ },
1180
+ {
1181
+ "epoch": 0.6454091432961967,
1182
+ "grad_norm": 23.17993927001953,
1183
+ "learning_rate": 9.103419516263554e-06,
1184
+ "loss": 0.8507,
1185
+ "step": 1680
1186
+ },
1187
+ {
1188
+ "epoch": 0.6492508643872454,
1189
+ "grad_norm": 30.25948715209961,
1190
+ "learning_rate": 9.092994161801502e-06,
1191
+ "loss": 0.5851,
1192
+ "step": 1690
1193
+ },
1194
+ {
1195
+ "epoch": 0.6530925854782943,
1196
+ "grad_norm": 12.083464622497559,
1197
+ "learning_rate": 9.08256880733945e-06,
1198
+ "loss": 0.5214,
1199
+ "step": 1700
1200
+ },
1201
+ {
1202
+ "epoch": 0.656934306569343,
1203
+ "grad_norm": 26.132946014404297,
1204
+ "learning_rate": 9.072143452877399e-06,
1205
+ "loss": 0.5715,
1206
+ "step": 1710
1207
+ },
1208
+ {
1209
+ "epoch": 0.6607760276603919,
1210
+ "grad_norm": 13.83061408996582,
1211
+ "learning_rate": 9.061718098415346e-06,
1212
+ "loss": 0.6076,
1213
+ "step": 1720
1214
+ },
1215
+ {
1216
+ "epoch": 0.6646177487514406,
1217
+ "grad_norm": 36.992679595947266,
1218
+ "learning_rate": 9.051292743953295e-06,
1219
+ "loss": 0.5795,
1220
+ "step": 1730
1221
+ },
1222
+ {
1223
+ "epoch": 0.6684594698424894,
1224
+ "grad_norm": 24.426977157592773,
1225
+ "learning_rate": 9.040867389491243e-06,
1226
+ "loss": 0.6913,
1227
+ "step": 1740
1228
+ },
1229
+ {
1230
+ "epoch": 0.6723011909335382,
1231
+ "grad_norm": 15.399202346801758,
1232
+ "learning_rate": 9.030442035029192e-06,
1233
+ "loss": 0.647,
1234
+ "step": 1750
1235
+ },
1236
+ {
1237
+ "epoch": 0.676142912024587,
1238
+ "grad_norm": 36.72813034057617,
1239
+ "learning_rate": 9.02001668056714e-06,
1240
+ "loss": 0.7641,
1241
+ "step": 1760
1242
+ },
1243
+ {
1244
+ "epoch": 0.6799846331156358,
1245
+ "grad_norm": 19.219661712646484,
1246
+ "learning_rate": 9.009591326105089e-06,
1247
+ "loss": 0.7111,
1248
+ "step": 1770
1249
+ },
1250
+ {
1251
+ "epoch": 0.6838263542066846,
1252
+ "grad_norm": 10.353839874267578,
1253
+ "learning_rate": 8.999165971643037e-06,
1254
+ "loss": 0.437,
1255
+ "step": 1780
1256
+ },
1257
+ {
1258
+ "epoch": 0.6876680752977334,
1259
+ "grad_norm": 12.179790496826172,
1260
+ "learning_rate": 8.988740617180984e-06,
1261
+ "loss": 0.6514,
1262
+ "step": 1790
1263
+ },
1264
+ {
1265
+ "epoch": 0.6915097963887822,
1266
+ "grad_norm": 15.036273956298828,
1267
+ "learning_rate": 8.978315262718933e-06,
1268
+ "loss": 0.4611,
1269
+ "step": 1800
1270
+ },
1271
+ {
1272
+ "epoch": 0.6953515174798309,
1273
+ "grad_norm": 12.146955490112305,
1274
+ "learning_rate": 8.967889908256881e-06,
1275
+ "loss": 0.5176,
1276
+ "step": 1810
1277
+ },
1278
+ {
1279
+ "epoch": 0.6991932385708798,
1280
+ "grad_norm": 16.004959106445312,
1281
+ "learning_rate": 8.95746455379483e-06,
1282
+ "loss": 0.3749,
1283
+ "step": 1820
1284
+ },
1285
+ {
1286
+ "epoch": 0.7030349596619285,
1287
+ "grad_norm": 23.500526428222656,
1288
+ "learning_rate": 8.947039199332778e-06,
1289
+ "loss": 0.6124,
1290
+ "step": 1830
1291
+ },
1292
+ {
1293
+ "epoch": 0.7068766807529774,
1294
+ "grad_norm": 11.367331504821777,
1295
+ "learning_rate": 8.936613844870727e-06,
1296
+ "loss": 0.3982,
1297
+ "step": 1840
1298
+ },
1299
+ {
1300
+ "epoch": 0.7107184018440261,
1301
+ "grad_norm": 13.60319709777832,
1302
+ "learning_rate": 8.926188490408674e-06,
1303
+ "loss": 0.4618,
1304
+ "step": 1850
1305
+ },
1306
+ {
1307
+ "epoch": 0.7145601229350749,
1308
+ "grad_norm": 9.807296752929688,
1309
+ "learning_rate": 8.915763135946624e-06,
1310
+ "loss": 0.552,
1311
+ "step": 1860
1312
+ },
1313
+ {
1314
+ "epoch": 0.7184018440261237,
1315
+ "grad_norm": 41.238895416259766,
1316
+ "learning_rate": 8.905337781484571e-06,
1317
+ "loss": 0.738,
1318
+ "step": 1870
1319
+ },
1320
+ {
1321
+ "epoch": 0.7222435651171725,
1322
+ "grad_norm": 8.117176055908203,
1323
+ "learning_rate": 8.894912427022519e-06,
1324
+ "loss": 0.546,
1325
+ "step": 1880
1326
+ },
1327
+ {
1328
+ "epoch": 0.7260852862082213,
1329
+ "grad_norm": 8.292084693908691,
1330
+ "learning_rate": 8.884487072560468e-06,
1331
+ "loss": 0.5098,
1332
+ "step": 1890
1333
+ },
1334
+ {
1335
+ "epoch": 0.7299270072992701,
1336
+ "grad_norm": 16.20579719543457,
1337
+ "learning_rate": 8.874061718098416e-06,
1338
+ "loss": 0.5693,
1339
+ "step": 1900
1340
+ },
1341
+ {
1342
+ "epoch": 0.7337687283903188,
1343
+ "grad_norm": 10.686980247497559,
1344
+ "learning_rate": 8.863636363636365e-06,
1345
+ "loss": 0.6848,
1346
+ "step": 1910
1347
+ },
1348
+ {
1349
+ "epoch": 0.7376104494813677,
1350
+ "grad_norm": 12.386652946472168,
1351
+ "learning_rate": 8.853211009174312e-06,
1352
+ "loss": 0.5282,
1353
+ "step": 1920
1354
+ },
1355
+ {
1356
+ "epoch": 0.7414521705724164,
1357
+ "grad_norm": 11.129962921142578,
1358
+ "learning_rate": 8.842785654712262e-06,
1359
+ "loss": 0.5789,
1360
+ "step": 1930
1361
+ },
1362
+ {
1363
+ "epoch": 0.7452938916634653,
1364
+ "grad_norm": 8.727615356445312,
1365
+ "learning_rate": 8.83236030025021e-06,
1366
+ "loss": 0.5936,
1367
+ "step": 1940
1368
+ },
1369
+ {
1370
+ "epoch": 0.749135612754514,
1371
+ "grad_norm": 11.261787414550781,
1372
+ "learning_rate": 8.821934945788157e-06,
1373
+ "loss": 0.5308,
1374
+ "step": 1950
1375
+ },
1376
+ {
1377
+ "epoch": 0.7529773338455628,
1378
+ "grad_norm": 23.387935638427734,
1379
+ "learning_rate": 8.811509591326106e-06,
1380
+ "loss": 0.5074,
1381
+ "step": 1960
1382
+ },
1383
+ {
1384
+ "epoch": 0.7568190549366116,
1385
+ "grad_norm": 20.772794723510742,
1386
+ "learning_rate": 8.801084236864054e-06,
1387
+ "loss": 0.6157,
1388
+ "step": 1970
1389
+ },
1390
+ {
1391
+ "epoch": 0.7606607760276604,
1392
+ "grad_norm": 23.0604305267334,
1393
+ "learning_rate": 8.790658882402003e-06,
1394
+ "loss": 0.5272,
1395
+ "step": 1980
1396
+ },
1397
+ {
1398
+ "epoch": 0.7645024971187092,
1399
+ "grad_norm": 44.302425384521484,
1400
+ "learning_rate": 8.78023352793995e-06,
1401
+ "loss": 0.6709,
1402
+ "step": 1990
1403
+ },
1404
+ {
1405
+ "epoch": 0.768344218209758,
1406
+ "grad_norm": 16.66979217529297,
1407
+ "learning_rate": 8.7698081734779e-06,
1408
+ "loss": 0.4651,
1409
+ "step": 2000
1410
+ },
1411
+ {
1412
+ "epoch": 0.7721859393008068,
1413
+ "grad_norm": 18.14614486694336,
1414
+ "learning_rate": 8.759382819015847e-06,
1415
+ "loss": 0.5747,
1416
+ "step": 2010
1417
+ },
1418
+ {
1419
+ "epoch": 0.7760276603918556,
1420
+ "grad_norm": 10.635650634765625,
1421
+ "learning_rate": 8.748957464553796e-06,
1422
+ "loss": 0.5169,
1423
+ "step": 2020
1424
+ },
1425
+ {
1426
+ "epoch": 0.7798693814829043,
1427
+ "grad_norm": 13.54704475402832,
1428
+ "learning_rate": 8.738532110091744e-06,
1429
+ "loss": 0.4784,
1430
+ "step": 2030
1431
+ },
1432
+ {
1433
+ "epoch": 0.7837111025739532,
1434
+ "grad_norm": 12.35689926147461,
1435
+ "learning_rate": 8.728106755629691e-06,
1436
+ "loss": 0.5529,
1437
+ "step": 2040
1438
+ },
1439
+ {
1440
+ "epoch": 0.7875528236650019,
1441
+ "grad_norm": 7.250340461730957,
1442
+ "learning_rate": 8.71768140116764e-06,
1443
+ "loss": 0.6229,
1444
+ "step": 2050
1445
+ },
1446
+ {
1447
+ "epoch": 0.7913945447560508,
1448
+ "grad_norm": 16.60529327392578,
1449
+ "learning_rate": 8.707256046705588e-06,
1450
+ "loss": 0.5726,
1451
+ "step": 2060
1452
+ },
1453
+ {
1454
+ "epoch": 0.7952362658470995,
1455
+ "grad_norm": 18.4666805267334,
1456
+ "learning_rate": 8.696830692243537e-06,
1457
+ "loss": 0.5643,
1458
+ "step": 2070
1459
+ },
1460
+ {
1461
+ "epoch": 0.7990779869381482,
1462
+ "grad_norm": 31.986207962036133,
1463
+ "learning_rate": 8.686405337781485e-06,
1464
+ "loss": 0.4759,
1465
+ "step": 2080
1466
+ },
1467
+ {
1468
+ "epoch": 0.8029197080291971,
1469
+ "grad_norm": 30.724218368530273,
1470
+ "learning_rate": 8.675979983319434e-06,
1471
+ "loss": 0.6527,
1472
+ "step": 2090
1473
+ },
1474
+ {
1475
+ "epoch": 0.8067614291202458,
1476
+ "grad_norm": 22.7759952545166,
1477
+ "learning_rate": 8.665554628857382e-06,
1478
+ "loss": 0.6438,
1479
+ "step": 2100
1480
+ },
1481
+ {
1482
+ "epoch": 0.8106031502112947,
1483
+ "grad_norm": 14.61020565032959,
1484
+ "learning_rate": 8.65512927439533e-06,
1485
+ "loss": 0.3962,
1486
+ "step": 2110
1487
+ },
1488
+ {
1489
+ "epoch": 0.8144448713023434,
1490
+ "grad_norm": 20.27998161315918,
1491
+ "learning_rate": 8.644703919933279e-06,
1492
+ "loss": 0.6989,
1493
+ "step": 2120
1494
+ },
1495
+ {
1496
+ "epoch": 0.8182865923933922,
1497
+ "grad_norm": 10.2035493850708,
1498
+ "learning_rate": 8.634278565471226e-06,
1499
+ "loss": 0.5543,
1500
+ "step": 2130
1501
+ },
1502
+ {
1503
+ "epoch": 0.822128313484441,
1504
+ "grad_norm": 16.954448699951172,
1505
+ "learning_rate": 8.623853211009175e-06,
1506
+ "loss": 0.4598,
1507
+ "step": 2140
1508
+ },
1509
+ {
1510
+ "epoch": 0.8259700345754898,
1511
+ "grad_norm": 24.188817977905273,
1512
+ "learning_rate": 8.613427856547123e-06,
1513
+ "loss": 0.674,
1514
+ "step": 2150
1515
+ },
1516
+ {
1517
+ "epoch": 0.8298117556665386,
1518
+ "grad_norm": 8.472796440124512,
1519
+ "learning_rate": 8.603002502085072e-06,
1520
+ "loss": 0.4246,
1521
+ "step": 2160
1522
+ },
1523
+ {
1524
+ "epoch": 0.8336534767575874,
1525
+ "grad_norm": 21.893178939819336,
1526
+ "learning_rate": 8.59257714762302e-06,
1527
+ "loss": 0.5788,
1528
+ "step": 2170
1529
+ },
1530
+ {
1531
+ "epoch": 0.8374951978486362,
1532
+ "grad_norm": 8.200776100158691,
1533
+ "learning_rate": 8.582151793160967e-06,
1534
+ "loss": 0.4215,
1535
+ "step": 2180
1536
+ },
1537
+ {
1538
+ "epoch": 0.841336918939685,
1539
+ "grad_norm": 21.523435592651367,
1540
+ "learning_rate": 8.571726438698917e-06,
1541
+ "loss": 0.4367,
1542
+ "step": 2190
1543
+ },
1544
+ {
1545
+ "epoch": 0.8451786400307337,
1546
+ "grad_norm": 18.608898162841797,
1547
+ "learning_rate": 8.561301084236864e-06,
1548
+ "loss": 0.6324,
1549
+ "step": 2200
1550
+ },
1551
+ {
1552
+ "epoch": 0.8490203611217826,
1553
+ "grad_norm": 19.39713478088379,
1554
+ "learning_rate": 8.550875729774813e-06,
1555
+ "loss": 0.382,
1556
+ "step": 2210
1557
+ },
1558
+ {
1559
+ "epoch": 0.8528620822128313,
1560
+ "grad_norm": 15.368677139282227,
1561
+ "learning_rate": 8.540450375312761e-06,
1562
+ "loss": 0.492,
1563
+ "step": 2220
1564
+ },
1565
+ {
1566
+ "epoch": 0.8567038033038802,
1567
+ "grad_norm": 6.85573673248291,
1568
+ "learning_rate": 8.53002502085071e-06,
1569
+ "loss": 0.6801,
1570
+ "step": 2230
1571
+ },
1572
+ {
1573
+ "epoch": 0.8605455243949289,
1574
+ "grad_norm": 11.223825454711914,
1575
+ "learning_rate": 8.519599666388658e-06,
1576
+ "loss": 0.7763,
1577
+ "step": 2240
1578
+ },
1579
+ {
1580
+ "epoch": 0.8643872454859777,
1581
+ "grad_norm": 11.18885326385498,
1582
+ "learning_rate": 8.509174311926605e-06,
1583
+ "loss": 0.585,
1584
+ "step": 2250
1585
+ },
1586
+ {
1587
+ "epoch": 0.8682289665770265,
1588
+ "grad_norm": 21.877548217773438,
1589
+ "learning_rate": 8.498748957464554e-06,
1590
+ "loss": 0.5873,
1591
+ "step": 2260
1592
+ },
1593
+ {
1594
+ "epoch": 0.8720706876680753,
1595
+ "grad_norm": 25.72136116027832,
1596
+ "learning_rate": 8.488323603002504e-06,
1597
+ "loss": 0.5796,
1598
+ "step": 2270
1599
+ },
1600
+ {
1601
+ "epoch": 0.8759124087591241,
1602
+ "grad_norm": 16.472366333007812,
1603
+ "learning_rate": 8.477898248540451e-06,
1604
+ "loss": 0.4431,
1605
+ "step": 2280
1606
+ },
1607
+ {
1608
+ "epoch": 0.8797541298501729,
1609
+ "grad_norm": 5.752821445465088,
1610
+ "learning_rate": 8.467472894078399e-06,
1611
+ "loss": 0.7004,
1612
+ "step": 2290
1613
+ },
1614
+ {
1615
+ "epoch": 0.8835958509412216,
1616
+ "grad_norm": 13.56191349029541,
1617
+ "learning_rate": 8.457047539616348e-06,
1618
+ "loss": 0.4899,
1619
+ "step": 2300
1620
+ },
1621
+ {
1622
+ "epoch": 0.8874375720322705,
1623
+ "grad_norm": 5.017563343048096,
1624
+ "learning_rate": 8.446622185154296e-06,
1625
+ "loss": 0.7014,
1626
+ "step": 2310
1627
+ },
1628
+ {
1629
+ "epoch": 0.8912792931233192,
1630
+ "grad_norm": 15.450356483459473,
1631
+ "learning_rate": 8.436196830692243e-06,
1632
+ "loss": 0.5414,
1633
+ "step": 2320
1634
+ },
1635
+ {
1636
+ "epoch": 0.8951210142143681,
1637
+ "grad_norm": 16.416250228881836,
1638
+ "learning_rate": 8.425771476230192e-06,
1639
+ "loss": 0.698,
1640
+ "step": 2330
1641
+ },
1642
+ {
1643
+ "epoch": 0.8989627353054168,
1644
+ "grad_norm": 11.185935020446777,
1645
+ "learning_rate": 8.415346121768142e-06,
1646
+ "loss": 0.6264,
1647
+ "step": 2340
1648
+ },
1649
+ {
1650
+ "epoch": 0.9028044563964657,
1651
+ "grad_norm": 22.787181854248047,
1652
+ "learning_rate": 8.40492076730609e-06,
1653
+ "loss": 0.6908,
1654
+ "step": 2350
1655
+ },
1656
+ {
1657
+ "epoch": 0.9066461774875144,
1658
+ "grad_norm": 11.522934913635254,
1659
+ "learning_rate": 8.394495412844037e-06,
1660
+ "loss": 0.5546,
1661
+ "step": 2360
1662
+ },
1663
+ {
1664
+ "epoch": 0.9104878985785632,
1665
+ "grad_norm": 18.260616302490234,
1666
+ "learning_rate": 8.384070058381986e-06,
1667
+ "loss": 0.5925,
1668
+ "step": 2370
1669
+ },
1670
+ {
1671
+ "epoch": 0.914329619669612,
1672
+ "grad_norm": 7.180076599121094,
1673
+ "learning_rate": 8.373644703919933e-06,
1674
+ "loss": 0.6639,
1675
+ "step": 2380
1676
+ },
1677
+ {
1678
+ "epoch": 0.9181713407606608,
1679
+ "grad_norm": 11.107264518737793,
1680
+ "learning_rate": 8.363219349457881e-06,
1681
+ "loss": 0.6762,
1682
+ "step": 2390
1683
+ },
1684
+ {
1685
+ "epoch": 0.9220130618517096,
1686
+ "grad_norm": 12.528190612792969,
1687
+ "learning_rate": 8.35279399499583e-06,
1688
+ "loss": 0.5435,
1689
+ "step": 2400
1690
+ },
1691
+ {
1692
+ "epoch": 0.9258547829427584,
1693
+ "grad_norm": 29.454421997070312,
1694
+ "learning_rate": 8.34236864053378e-06,
1695
+ "loss": 0.5074,
1696
+ "step": 2410
1697
+ },
1698
+ {
1699
+ "epoch": 0.9296965040338071,
1700
+ "grad_norm": 14.677248001098633,
1701
+ "learning_rate": 8.331943286071727e-06,
1702
+ "loss": 0.6161,
1703
+ "step": 2420
1704
+ },
1705
+ {
1706
+ "epoch": 0.933538225124856,
1707
+ "grad_norm": 8.907113075256348,
1708
+ "learning_rate": 8.321517931609675e-06,
1709
+ "loss": 0.353,
1710
+ "step": 2430
1711
+ },
1712
+ {
1713
+ "epoch": 0.9373799462159047,
1714
+ "grad_norm": 11.691315650939941,
1715
+ "learning_rate": 8.311092577147624e-06,
1716
+ "loss": 0.4516,
1717
+ "step": 2440
1718
+ },
1719
+ {
1720
+ "epoch": 0.9412216673069536,
1721
+ "grad_norm": 2.8710756301879883,
1722
+ "learning_rate": 8.300667222685571e-06,
1723
+ "loss": 0.6726,
1724
+ "step": 2450
1725
+ },
1726
+ {
1727
+ "epoch": 0.9450633883980023,
1728
+ "grad_norm": 11.67735481262207,
1729
+ "learning_rate": 8.290241868223519e-06,
1730
+ "loss": 0.6797,
1731
+ "step": 2460
1732
+ },
1733
+ {
1734
+ "epoch": 0.948905109489051,
1735
+ "grad_norm": 17.547286987304688,
1736
+ "learning_rate": 8.279816513761468e-06,
1737
+ "loss": 0.7029,
1738
+ "step": 2470
1739
+ },
1740
+ {
1741
+ "epoch": 0.9527468305800999,
1742
+ "grad_norm": 11.663725852966309,
1743
+ "learning_rate": 8.269391159299417e-06,
1744
+ "loss": 0.5016,
1745
+ "step": 2480
1746
+ },
1747
+ {
1748
+ "epoch": 0.9565885516711486,
1749
+ "grad_norm": 9.743104934692383,
1750
+ "learning_rate": 8.258965804837365e-06,
1751
+ "loss": 0.5489,
1752
+ "step": 2490
1753
+ },
1754
+ {
1755
+ "epoch": 0.9604302727621975,
1756
+ "grad_norm": 9.579474449157715,
1757
+ "learning_rate": 8.248540450375313e-06,
1758
+ "loss": 0.5867,
1759
+ "step": 2500
1760
+ },
1761
+ {
1762
+ "epoch": 0.9642719938532462,
1763
+ "grad_norm": 13.63699722290039,
1764
+ "learning_rate": 8.238115095913262e-06,
1765
+ "loss": 0.5191,
1766
+ "step": 2510
1767
+ },
1768
+ {
1769
+ "epoch": 0.968113714944295,
1770
+ "grad_norm": 10.331293106079102,
1771
+ "learning_rate": 8.227689741451211e-06,
1772
+ "loss": 0.6654,
1773
+ "step": 2520
1774
+ },
1775
+ {
1776
+ "epoch": 0.9719554360353438,
1777
+ "grad_norm": 10.614498138427734,
1778
+ "learning_rate": 8.217264386989159e-06,
1779
+ "loss": 0.5947,
1780
+ "step": 2530
1781
+ },
1782
+ {
1783
+ "epoch": 0.9757971571263926,
1784
+ "grad_norm": 10.182368278503418,
1785
+ "learning_rate": 8.206839032527106e-06,
1786
+ "loss": 0.5482,
1787
+ "step": 2540
1788
+ },
1789
+ {
1790
+ "epoch": 0.9796388782174414,
1791
+ "grad_norm": 15.42397403717041,
1792
+ "learning_rate": 8.196413678065055e-06,
1793
+ "loss": 0.5144,
1794
+ "step": 2550
1795
+ },
1796
+ {
1797
+ "epoch": 0.9834805993084902,
1798
+ "grad_norm": 8.317682266235352,
1799
+ "learning_rate": 8.185988323603003e-06,
1800
+ "loss": 0.4518,
1801
+ "step": 2560
1802
+ },
1803
+ {
1804
+ "epoch": 0.987322320399539,
1805
+ "grad_norm": 43.10714340209961,
1806
+ "learning_rate": 8.17556296914095e-06,
1807
+ "loss": 0.5974,
1808
+ "step": 2570
1809
+ },
1810
+ {
1811
+ "epoch": 0.9911640414905878,
1812
+ "grad_norm": 7.906277656555176,
1813
+ "learning_rate": 8.1651376146789e-06,
1814
+ "loss": 0.5745,
1815
+ "step": 2580
1816
+ },
1817
+ {
1818
+ "epoch": 0.9950057625816365,
1819
+ "grad_norm": 10.229177474975586,
1820
+ "learning_rate": 8.154712260216849e-06,
1821
+ "loss": 0.4322,
1822
+ "step": 2590
1823
+ },
1824
+ {
1825
+ "epoch": 0.9988474836726854,
1826
+ "grad_norm": 12.773459434509277,
1827
+ "learning_rate": 8.144286905754796e-06,
1828
+ "loss": 0.5947,
1829
+ "step": 2600
1830
+ },
1831
+ {
1832
+ "epoch": 1.0,
1833
+ "eval_accuracy": 0.7737969455383729,
1834
+ "eval_f1_per_label": [
1835
+ 0.778160354156419,
1836
+ 0.7230364524614806,
1837
+ 0.8299703264094955
1838
+ ],
1839
+ "eval_f1_weighted": 0.774424228670159,
1840
+ "eval_loss": 0.5626052618026733,
1841
+ "eval_precision_per_label": [
1842
+ 0.7752368507023848,
1843
+ 0.7131208302446257,
1844
+ 0.8468059339993945
1845
+ ],
1846
+ "eval_precision_weighted": 0.7753803799282116,
1847
+ "eval_recall_per_label": [
1848
+ 0.7811059907834101,
1849
+ 0.7332317073170732,
1850
+ 0.8137910968868199
1851
+ ],
1852
+ "eval_recall_weighted": 0.7737969455383729,
1853
+ "eval_runtime": 38.6689,
1854
+ "eval_samples_per_second": 269.235,
1855
+ "eval_steps_per_second": 33.67,
1856
+ "step": 2603
1857
+ },
1858
+ {
1859
+ "epoch": 1.0026892047637341,
1860
+ "grad_norm": 10.15665340423584,
1861
+ "learning_rate": 8.133861551292744e-06,
1862
+ "loss": 0.4549,
1863
+ "step": 2610
1864
+ },
1865
+ {
1866
+ "epoch": 1.006530925854783,
1867
+ "grad_norm": 15.496761322021484,
1868
+ "learning_rate": 8.123436196830693e-06,
1869
+ "loss": 0.3519,
1870
+ "step": 2620
1871
+ },
1872
+ {
1873
+ "epoch": 1.0103726469458318,
1874
+ "grad_norm": 15.590469360351562,
1875
+ "learning_rate": 8.11301084236864e-06,
1876
+ "loss": 0.5326,
1877
+ "step": 2630
1878
+ },
1879
+ {
1880
+ "epoch": 1.0142143680368805,
1881
+ "grad_norm": 23.553977966308594,
1882
+ "learning_rate": 8.102585487906588e-06,
1883
+ "loss": 0.448,
1884
+ "step": 2640
1885
+ },
1886
+ {
1887
+ "epoch": 1.0180560891279293,
1888
+ "grad_norm": 12.566988945007324,
1889
+ "learning_rate": 8.092160133444538e-06,
1890
+ "loss": 0.4067,
1891
+ "step": 2650
1892
+ },
1893
+ {
1894
+ "epoch": 1.0218978102189782,
1895
+ "grad_norm": 18.021800994873047,
1896
+ "learning_rate": 8.081734778982487e-06,
1897
+ "loss": 0.6817,
1898
+ "step": 2660
1899
+ },
1900
+ {
1901
+ "epoch": 1.0257395313100268,
1902
+ "grad_norm": 2.4938793182373047,
1903
+ "learning_rate": 8.071309424520434e-06,
1904
+ "loss": 0.4567,
1905
+ "step": 2670
1906
+ },
1907
+ {
1908
+ "epoch": 1.0295812524010757,
1909
+ "grad_norm": 19.75528335571289,
1910
+ "learning_rate": 8.060884070058382e-06,
1911
+ "loss": 0.5433,
1912
+ "step": 2680
1913
+ },
1914
+ {
1915
+ "epoch": 1.0334229734921245,
1916
+ "grad_norm": 20.85611343383789,
1917
+ "learning_rate": 8.050458715596331e-06,
1918
+ "loss": 0.4946,
1919
+ "step": 2690
1920
+ },
1921
+ {
1922
+ "epoch": 1.0372646945831732,
1923
+ "grad_norm": 11.405384063720703,
1924
+ "learning_rate": 8.040033361134279e-06,
1925
+ "loss": 0.4871,
1926
+ "step": 2700
1927
+ },
1928
+ {
1929
+ "epoch": 1.041106415674222,
1930
+ "grad_norm": 20.596933364868164,
1931
+ "learning_rate": 8.029608006672226e-06,
1932
+ "loss": 0.5361,
1933
+ "step": 2710
1934
+ },
1935
+ {
1936
+ "epoch": 1.0449481367652709,
1937
+ "grad_norm": 19.30833625793457,
1938
+ "learning_rate": 8.019182652210176e-06,
1939
+ "loss": 0.5362,
1940
+ "step": 2720
1941
+ },
1942
+ {
1943
+ "epoch": 1.0487898578563197,
1944
+ "grad_norm": 29.342666625976562,
1945
+ "learning_rate": 8.008757297748125e-06,
1946
+ "loss": 0.4175,
1947
+ "step": 2730
1948
+ },
1949
+ {
1950
+ "epoch": 1.0526315789473684,
1951
+ "grad_norm": 31.298583984375,
1952
+ "learning_rate": 7.998331943286072e-06,
1953
+ "loss": 0.4556,
1954
+ "step": 2740
1955
+ },
1956
+ {
1957
+ "epoch": 1.0564733000384172,
1958
+ "grad_norm": 22.037307739257812,
1959
+ "learning_rate": 7.98790658882402e-06,
1960
+ "loss": 0.4966,
1961
+ "step": 2750
1962
+ },
1963
+ {
1964
+ "epoch": 1.060315021129466,
1965
+ "grad_norm": 29.734373092651367,
1966
+ "learning_rate": 7.977481234361969e-06,
1967
+ "loss": 0.4146,
1968
+ "step": 2760
1969
+ },
1970
+ {
1971
+ "epoch": 1.0641567422205147,
1972
+ "grad_norm": 26.209257125854492,
1973
+ "learning_rate": 7.967055879899918e-06,
1974
+ "loss": 0.5298,
1975
+ "step": 2770
1976
+ },
1977
+ {
1978
+ "epoch": 1.0679984633115636,
1979
+ "grad_norm": 28.883424758911133,
1980
+ "learning_rate": 7.956630525437866e-06,
1981
+ "loss": 0.4872,
1982
+ "step": 2780
1983
+ },
1984
+ {
1985
+ "epoch": 1.0718401844026124,
1986
+ "grad_norm": 20.605607986450195,
1987
+ "learning_rate": 7.946205170975813e-06,
1988
+ "loss": 0.3929,
1989
+ "step": 2790
1990
+ },
1991
+ {
1992
+ "epoch": 1.0756819054936613,
1993
+ "grad_norm": 4.712803363800049,
1994
+ "learning_rate": 7.935779816513763e-06,
1995
+ "loss": 0.429,
1996
+ "step": 2800
1997
+ },
1998
+ {
1999
+ "epoch": 1.07952362658471,
2000
+ "grad_norm": 24.59100341796875,
2001
+ "learning_rate": 7.92535446205171e-06,
2002
+ "loss": 0.4387,
2003
+ "step": 2810
2004
+ },
2005
+ {
2006
+ "epoch": 1.0833653476757588,
2007
+ "grad_norm": 41.113101959228516,
2008
+ "learning_rate": 7.914929107589658e-06,
2009
+ "loss": 0.3661,
2010
+ "step": 2820
2011
+ },
2012
+ {
2013
+ "epoch": 1.0872070687668076,
2014
+ "grad_norm": 19.407039642333984,
2015
+ "learning_rate": 7.904503753127607e-06,
2016
+ "loss": 0.3685,
2017
+ "step": 2830
2018
+ },
2019
+ {
2020
+ "epoch": 1.0910487898578562,
2021
+ "grad_norm": 17.455015182495117,
2022
+ "learning_rate": 7.894078398665556e-06,
2023
+ "loss": 0.3471,
2024
+ "step": 2840
2025
+ },
2026
+ {
2027
+ "epoch": 1.094890510948905,
2028
+ "grad_norm": 31.661861419677734,
2029
+ "learning_rate": 7.883653044203504e-06,
2030
+ "loss": 0.4217,
2031
+ "step": 2850
2032
+ },
2033
+ {
2034
+ "epoch": 1.098732232039954,
2035
+ "grad_norm": 25.433256149291992,
2036
+ "learning_rate": 7.873227689741451e-06,
2037
+ "loss": 0.563,
2038
+ "step": 2860
2039
+ },
2040
+ {
2041
+ "epoch": 1.1025739531310026,
2042
+ "grad_norm": 17.527708053588867,
2043
+ "learning_rate": 7.8628023352794e-06,
2044
+ "loss": 0.6654,
2045
+ "step": 2870
2046
+ },
2047
+ {
2048
+ "epoch": 1.1064156742220514,
2049
+ "grad_norm": 16.530296325683594,
2050
+ "learning_rate": 7.852376980817348e-06,
2051
+ "loss": 0.397,
2052
+ "step": 2880
2053
+ },
2054
+ {
2055
+ "epoch": 1.1102573953131003,
2056
+ "grad_norm": 20.510169982910156,
2057
+ "learning_rate": 7.841951626355296e-06,
2058
+ "loss": 0.5602,
2059
+ "step": 2890
2060
+ },
2061
+ {
2062
+ "epoch": 1.1140991164041492,
2063
+ "grad_norm": 17.688804626464844,
2064
+ "learning_rate": 7.831526271893245e-06,
2065
+ "loss": 0.4872,
2066
+ "step": 2900
2067
+ },
2068
+ {
2069
+ "epoch": 1.1179408374951978,
2070
+ "grad_norm": 14.219194412231445,
2071
+ "learning_rate": 7.821100917431194e-06,
2072
+ "loss": 0.4509,
2073
+ "step": 2910
2074
+ },
2075
+ {
2076
+ "epoch": 1.1217825585862466,
2077
+ "grad_norm": 19.561573028564453,
2078
+ "learning_rate": 7.810675562969142e-06,
2079
+ "loss": 0.516,
2080
+ "step": 2920
2081
+ },
2082
+ {
2083
+ "epoch": 1.1256242796772955,
2084
+ "grad_norm": 4.555116176605225,
2085
+ "learning_rate": 7.80025020850709e-06,
2086
+ "loss": 0.3659,
2087
+ "step": 2930
2088
+ },
2089
+ {
2090
+ "epoch": 1.1294660007683441,
2091
+ "grad_norm": 21.258682250976562,
2092
+ "learning_rate": 7.789824854045039e-06,
2093
+ "loss": 0.7407,
2094
+ "step": 2940
2095
+ },
2096
+ {
2097
+ "epoch": 1.133307721859393,
2098
+ "grad_norm": 23.416915893554688,
2099
+ "learning_rate": 7.779399499582986e-06,
2100
+ "loss": 0.5668,
2101
+ "step": 2950
2102
+ },
2103
+ {
2104
+ "epoch": 1.1371494429504418,
2105
+ "grad_norm": 15.231550216674805,
2106
+ "learning_rate": 7.768974145120934e-06,
2107
+ "loss": 0.7209,
2108
+ "step": 2960
2109
+ },
2110
+ {
2111
+ "epoch": 1.1409911640414907,
2112
+ "grad_norm": 13.552851676940918,
2113
+ "learning_rate": 7.758548790658883e-06,
2114
+ "loss": 0.3603,
2115
+ "step": 2970
2116
+ },
2117
+ {
2118
+ "epoch": 1.1448328851325393,
2119
+ "grad_norm": 13.771663665771484,
2120
+ "learning_rate": 7.748123436196832e-06,
2121
+ "loss": 0.4201,
2122
+ "step": 2980
2123
+ },
2124
+ {
2125
+ "epoch": 1.1486746062235882,
2126
+ "grad_norm": 10.52929973602295,
2127
+ "learning_rate": 7.73769808173478e-06,
2128
+ "loss": 0.4927,
2129
+ "step": 2990
2130
+ },
2131
+ {
2132
+ "epoch": 1.152516327314637,
2133
+ "grad_norm": 10.99349594116211,
2134
+ "learning_rate": 7.727272727272727e-06,
2135
+ "loss": 0.4471,
2136
+ "step": 3000
2137
+ },
2138
+ {
2139
+ "epoch": 1.1563580484056857,
2140
+ "grad_norm": 28.14784812927246,
2141
+ "learning_rate": 7.716847372810676e-06,
2142
+ "loss": 0.4885,
2143
+ "step": 3010
2144
+ },
2145
+ {
2146
+ "epoch": 1.1601997694967345,
2147
+ "grad_norm": 14.829123497009277,
2148
+ "learning_rate": 7.706422018348626e-06,
2149
+ "loss": 0.4161,
2150
+ "step": 3020
2151
+ },
2152
+ {
2153
+ "epoch": 1.1640414905877834,
2154
+ "grad_norm": 16.16433334350586,
2155
+ "learning_rate": 7.695996663886573e-06,
2156
+ "loss": 0.5096,
2157
+ "step": 3030
2158
+ },
2159
+ {
2160
+ "epoch": 1.167883211678832,
2161
+ "grad_norm": 22.093318939208984,
2162
+ "learning_rate": 7.68557130942452e-06,
2163
+ "loss": 0.4168,
2164
+ "step": 3040
2165
+ },
2166
+ {
2167
+ "epoch": 1.1717249327698809,
2168
+ "grad_norm": 13.417600631713867,
2169
+ "learning_rate": 7.67514595496247e-06,
2170
+ "loss": 0.5085,
2171
+ "step": 3050
2172
+ },
2173
+ {
2174
+ "epoch": 1.1755666538609297,
2175
+ "grad_norm": 18.72173309326172,
2176
+ "learning_rate": 7.664720600500418e-06,
2177
+ "loss": 0.3777,
2178
+ "step": 3060
2179
+ },
2180
+ {
2181
+ "epoch": 1.1794083749519784,
2182
+ "grad_norm": 30.103759765625,
2183
+ "learning_rate": 7.654295246038365e-06,
2184
+ "loss": 0.6035,
2185
+ "step": 3070
2186
+ },
2187
+ {
2188
+ "epoch": 1.1832500960430272,
2189
+ "grad_norm": 2.8469719886779785,
2190
+ "learning_rate": 7.643869891576314e-06,
2191
+ "loss": 0.2965,
2192
+ "step": 3080
2193
+ },
2194
+ {
2195
+ "epoch": 1.187091817134076,
2196
+ "grad_norm": 20.917043685913086,
2197
+ "learning_rate": 7.633444537114264e-06,
2198
+ "loss": 0.3834,
2199
+ "step": 3090
2200
+ },
2201
+ {
2202
+ "epoch": 1.190933538225125,
2203
+ "grad_norm": 30.07198715209961,
2204
+ "learning_rate": 7.62301918265221e-06,
2205
+ "loss": 0.4808,
2206
+ "step": 3100
2207
+ },
2208
+ {
2209
+ "epoch": 1.1947752593161736,
2210
+ "grad_norm": 17.897123336791992,
2211
+ "learning_rate": 7.612593828190159e-06,
2212
+ "loss": 0.5116,
2213
+ "step": 3110
2214
+ },
2215
+ {
2216
+ "epoch": 1.1986169804072224,
2217
+ "grad_norm": 20.788223266601562,
2218
+ "learning_rate": 7.602168473728108e-06,
2219
+ "loss": 0.5371,
2220
+ "step": 3120
2221
+ },
2222
+ {
2223
+ "epoch": 1.2024587014982713,
2224
+ "grad_norm": 17.470685958862305,
2225
+ "learning_rate": 7.5917431192660555e-06,
2226
+ "loss": 0.6223,
2227
+ "step": 3130
2228
+ },
2229
+ {
2230
+ "epoch": 1.2063004225893201,
2231
+ "grad_norm": 5.824326515197754,
2232
+ "learning_rate": 7.581317764804004e-06,
2233
+ "loss": 0.3919,
2234
+ "step": 3140
2235
+ },
2236
+ {
2237
+ "epoch": 1.2101421436803688,
2238
+ "grad_norm": 26.392175674438477,
2239
+ "learning_rate": 7.570892410341952e-06,
2240
+ "loss": 0.499,
2241
+ "step": 3150
2242
+ },
2243
+ {
2244
+ "epoch": 1.2139838647714176,
2245
+ "grad_norm": 8.42440414428711,
2246
+ "learning_rate": 7.560467055879901e-06,
2247
+ "loss": 0.3343,
2248
+ "step": 3160
2249
+ },
2250
+ {
2251
+ "epoch": 1.2178255858624665,
2252
+ "grad_norm": 10.132403373718262,
2253
+ "learning_rate": 7.550041701417848e-06,
2254
+ "loss": 0.5502,
2255
+ "step": 3170
2256
+ },
2257
+ {
2258
+ "epoch": 1.221667306953515,
2259
+ "grad_norm": 8.304975509643555,
2260
+ "learning_rate": 7.539616346955797e-06,
2261
+ "loss": 0.5273,
2262
+ "step": 3180
2263
+ },
2264
+ {
2265
+ "epoch": 1.225509028044564,
2266
+ "grad_norm": 8.726995468139648,
2267
+ "learning_rate": 7.529190992493746e-06,
2268
+ "loss": 0.5491,
2269
+ "step": 3190
2270
+ },
2271
+ {
2272
+ "epoch": 1.2293507491356128,
2273
+ "grad_norm": 21.480060577392578,
2274
+ "learning_rate": 7.518765638031694e-06,
2275
+ "loss": 0.4478,
2276
+ "step": 3200
2277
+ },
2278
+ {
2279
+ "epoch": 1.2331924702266615,
2280
+ "grad_norm": 19.07476806640625,
2281
+ "learning_rate": 7.508340283569642e-06,
2282
+ "loss": 0.6649,
2283
+ "step": 3210
2284
+ },
2285
+ {
2286
+ "epoch": 1.2370341913177103,
2287
+ "grad_norm": 22.936288833618164,
2288
+ "learning_rate": 7.49791492910759e-06,
2289
+ "loss": 0.5483,
2290
+ "step": 3220
2291
+ },
2292
+ {
2293
+ "epoch": 1.2408759124087592,
2294
+ "grad_norm": 11.86825942993164,
2295
+ "learning_rate": 7.487489574645539e-06,
2296
+ "loss": 0.5724,
2297
+ "step": 3230
2298
+ },
2299
+ {
2300
+ "epoch": 1.2447176334998078,
2301
+ "grad_norm": 3.569042921066284,
2302
+ "learning_rate": 7.477064220183486e-06,
2303
+ "loss": 0.2133,
2304
+ "step": 3240
2305
+ },
2306
+ {
2307
+ "epoch": 1.2485593545908567,
2308
+ "grad_norm": 21.424320220947266,
2309
+ "learning_rate": 7.466638865721435e-06,
2310
+ "loss": 0.7125,
2311
+ "step": 3250
2312
+ },
2313
+ {
2314
+ "epoch": 1.2524010756819055,
2315
+ "grad_norm": 10.503767967224121,
2316
+ "learning_rate": 7.456213511259384e-06,
2317
+ "loss": 0.3605,
2318
+ "step": 3260
2319
+ },
2320
+ {
2321
+ "epoch": 1.2562427967729544,
2322
+ "grad_norm": 14.187332153320312,
2323
+ "learning_rate": 7.445788156797332e-06,
2324
+ "loss": 0.337,
2325
+ "step": 3270
2326
+ },
2327
+ {
2328
+ "epoch": 1.260084517864003,
2329
+ "grad_norm": 39.91868591308594,
2330
+ "learning_rate": 7.43536280233528e-06,
2331
+ "loss": 0.6577,
2332
+ "step": 3280
2333
+ },
2334
+ {
2335
+ "epoch": 1.2639262389550519,
2336
+ "grad_norm": 22.75690269470215,
2337
+ "learning_rate": 7.424937447873228e-06,
2338
+ "loss": 0.514,
2339
+ "step": 3290
2340
+ },
2341
+ {
2342
+ "epoch": 1.2677679600461007,
2343
+ "grad_norm": 7.429053783416748,
2344
+ "learning_rate": 7.414512093411177e-06,
2345
+ "loss": 0.4178,
2346
+ "step": 3300
2347
+ },
2348
+ {
2349
+ "epoch": 1.2716096811371496,
2350
+ "grad_norm": 33.08567428588867,
2351
+ "learning_rate": 7.404086738949124e-06,
2352
+ "loss": 0.5662,
2353
+ "step": 3310
2354
+ },
2355
+ {
2356
+ "epoch": 1.2754514022281982,
2357
+ "grad_norm": 3.4354841709136963,
2358
+ "learning_rate": 7.393661384487073e-06,
2359
+ "loss": 0.3871,
2360
+ "step": 3320
2361
+ },
2362
+ {
2363
+ "epoch": 1.279293123319247,
2364
+ "grad_norm": 7.866048336029053,
2365
+ "learning_rate": 7.383236030025022e-06,
2366
+ "loss": 0.3705,
2367
+ "step": 3330
2368
+ },
2369
+ {
2370
+ "epoch": 1.283134844410296,
2371
+ "grad_norm": 21.70540428161621,
2372
+ "learning_rate": 7.37281067556297e-06,
2373
+ "loss": 0.4563,
2374
+ "step": 3340
2375
+ },
2376
+ {
2377
+ "epoch": 1.2869765655013445,
2378
+ "grad_norm": 8.615718841552734,
2379
+ "learning_rate": 7.362385321100918e-06,
2380
+ "loss": 0.4756,
2381
+ "step": 3350
2382
+ },
2383
+ {
2384
+ "epoch": 1.2908182865923934,
2385
+ "grad_norm": 7.302259922027588,
2386
+ "learning_rate": 7.351959966638866e-06,
2387
+ "loss": 0.4554,
2388
+ "step": 3360
2389
+ },
2390
+ {
2391
+ "epoch": 1.2946600076834422,
2392
+ "grad_norm": 22.20801544189453,
2393
+ "learning_rate": 7.341534612176815e-06,
2394
+ "loss": 0.5313,
2395
+ "step": 3370
2396
+ },
2397
+ {
2398
+ "epoch": 1.2985017287744909,
2399
+ "grad_norm": 30.15619468688965,
2400
+ "learning_rate": 7.331109257714763e-06,
2401
+ "loss": 0.496,
2402
+ "step": 3380
2403
+ },
2404
+ {
2405
+ "epoch": 1.3023434498655397,
2406
+ "grad_norm": 14.92813491821289,
2407
+ "learning_rate": 7.320683903252711e-06,
2408
+ "loss": 0.3749,
2409
+ "step": 3390
2410
+ },
2411
+ {
2412
+ "epoch": 1.3061851709565886,
2413
+ "grad_norm": 29.67280387878418,
2414
+ "learning_rate": 7.31025854879066e-06,
2415
+ "loss": 0.6701,
2416
+ "step": 3400
2417
+ },
2418
+ {
2419
+ "epoch": 1.3100268920476372,
2420
+ "grad_norm": 15.611763954162598,
2421
+ "learning_rate": 7.299833194328608e-06,
2422
+ "loss": 0.4524,
2423
+ "step": 3410
2424
+ },
2425
+ {
2426
+ "epoch": 1.313868613138686,
2427
+ "grad_norm": 26.292102813720703,
2428
+ "learning_rate": 7.2894078398665556e-06,
2429
+ "loss": 0.4924,
2430
+ "step": 3420
2431
+ },
2432
+ {
2433
+ "epoch": 1.317710334229735,
2434
+ "grad_norm": 15.659175872802734,
2435
+ "learning_rate": 7.278982485404504e-06,
2436
+ "loss": 0.6496,
2437
+ "step": 3430
2438
+ },
2439
+ {
2440
+ "epoch": 1.3215520553207838,
2441
+ "grad_norm": 14.91651725769043,
2442
+ "learning_rate": 7.268557130942453e-06,
2443
+ "loss": 0.4623,
2444
+ "step": 3440
2445
+ },
2446
+ {
2447
+ "epoch": 1.3253937764118324,
2448
+ "grad_norm": 31.96514892578125,
2449
+ "learning_rate": 7.258131776480401e-06,
2450
+ "loss": 0.6592,
2451
+ "step": 3450
2452
+ },
2453
+ {
2454
+ "epoch": 1.3292354975028813,
2455
+ "grad_norm": 21.0323543548584,
2456
+ "learning_rate": 7.247706422018349e-06,
2457
+ "loss": 0.67,
2458
+ "step": 3460
2459
+ },
2460
+ {
2461
+ "epoch": 1.3330772185939301,
2462
+ "grad_norm": 8.115620613098145,
2463
+ "learning_rate": 7.2372810675562975e-06,
2464
+ "loss": 0.3956,
2465
+ "step": 3470
2466
+ },
2467
+ {
2468
+ "epoch": 1.336918939684979,
2469
+ "grad_norm": 5.198733329772949,
2470
+ "learning_rate": 7.226855713094246e-06,
2471
+ "loss": 0.3424,
2472
+ "step": 3480
2473
+ },
2474
+ {
2475
+ "epoch": 1.3407606607760276,
2476
+ "grad_norm": 3.520685911178589,
2477
+ "learning_rate": 7.2164303586321935e-06,
2478
+ "loss": 0.5398,
2479
+ "step": 3490
2480
+ },
2481
+ {
2482
+ "epoch": 1.3446023818670765,
2483
+ "grad_norm": 17.651782989501953,
2484
+ "learning_rate": 7.206005004170143e-06,
2485
+ "loss": 0.5158,
2486
+ "step": 3500
2487
+ },
2488
+ {
2489
+ "epoch": 1.3484441029581253,
2490
+ "grad_norm": 6.332894325256348,
2491
+ "learning_rate": 7.195579649708091e-06,
2492
+ "loss": 0.396,
2493
+ "step": 3510
2494
+ },
2495
+ {
2496
+ "epoch": 1.352285824049174,
2497
+ "grad_norm": 17.260141372680664,
2498
+ "learning_rate": 7.185154295246039e-06,
2499
+ "loss": 0.6168,
2500
+ "step": 3520
2501
+ },
2502
+ {
2503
+ "epoch": 1.3561275451402228,
2504
+ "grad_norm": 13.815728187561035,
2505
+ "learning_rate": 7.174728940783987e-06,
2506
+ "loss": 0.3849,
2507
+ "step": 3530
2508
+ },
2509
+ {
2510
+ "epoch": 1.3599692662312717,
2511
+ "grad_norm": 64.78901672363281,
2512
+ "learning_rate": 7.1643035863219355e-06,
2513
+ "loss": 0.6198,
2514
+ "step": 3540
2515
+ },
2516
+ {
2517
+ "epoch": 1.3638109873223203,
2518
+ "grad_norm": 18.190109252929688,
2519
+ "learning_rate": 7.153878231859885e-06,
2520
+ "loss": 0.5692,
2521
+ "step": 3550
2522
+ },
2523
+ {
2524
+ "epoch": 1.3676527084133692,
2525
+ "grad_norm": 21.627002716064453,
2526
+ "learning_rate": 7.143452877397831e-06,
2527
+ "loss": 0.5806,
2528
+ "step": 3560
2529
+ },
2530
+ {
2531
+ "epoch": 1.371494429504418,
2532
+ "grad_norm": 13.441298484802246,
2533
+ "learning_rate": 7.133027522935781e-06,
2534
+ "loss": 0.5592,
2535
+ "step": 3570
2536
+ },
2537
+ {
2538
+ "epoch": 1.3753361505954667,
2539
+ "grad_norm": 19.517723083496094,
2540
+ "learning_rate": 7.122602168473729e-06,
2541
+ "loss": 0.4344,
2542
+ "step": 3580
2543
+ },
2544
+ {
2545
+ "epoch": 1.3791778716865155,
2546
+ "grad_norm": 16.326400756835938,
2547
+ "learning_rate": 7.112176814011677e-06,
2548
+ "loss": 0.4496,
2549
+ "step": 3590
2550
+ },
2551
+ {
2552
+ "epoch": 1.3830195927775644,
2553
+ "grad_norm": 7.95460844039917,
2554
+ "learning_rate": 7.101751459549625e-06,
2555
+ "loss": 0.2619,
2556
+ "step": 3600
2557
+ },
2558
+ {
2559
+ "epoch": 1.3868613138686132,
2560
+ "grad_norm": 23.37911033630371,
2561
+ "learning_rate": 7.091326105087573e-06,
2562
+ "loss": 0.467,
2563
+ "step": 3610
2564
+ },
2565
+ {
2566
+ "epoch": 1.3907030349596619,
2567
+ "grad_norm": 13.212058067321777,
2568
+ "learning_rate": 7.080900750625523e-06,
2569
+ "loss": 0.4576,
2570
+ "step": 3620
2571
+ },
2572
+ {
2573
+ "epoch": 1.3945447560507107,
2574
+ "grad_norm": 21.34543800354004,
2575
+ "learning_rate": 7.07047539616347e-06,
2576
+ "loss": 0.5783,
2577
+ "step": 3630
2578
+ },
2579
+ {
2580
+ "epoch": 1.3983864771417596,
2581
+ "grad_norm": 20.114788055419922,
2582
+ "learning_rate": 7.0600500417014186e-06,
2583
+ "loss": 0.5636,
2584
+ "step": 3640
2585
+ },
2586
+ {
2587
+ "epoch": 1.4022281982328084,
2588
+ "grad_norm": 2.998847484588623,
2589
+ "learning_rate": 7.049624687239367e-06,
2590
+ "loss": 0.4757,
2591
+ "step": 3650
2592
+ },
2593
+ {
2594
+ "epoch": 1.406069919323857,
2595
+ "grad_norm": 11.517350196838379,
2596
+ "learning_rate": 7.0391993327773145e-06,
2597
+ "loss": 0.3284,
2598
+ "step": 3660
2599
+ },
2600
+ {
2601
+ "epoch": 1.409911640414906,
2602
+ "grad_norm": 14.91612434387207,
2603
+ "learning_rate": 7.028773978315263e-06,
2604
+ "loss": 0.4234,
2605
+ "step": 3670
2606
+ },
2607
+ {
2608
+ "epoch": 1.4137533615059548,
2609
+ "grad_norm": 20.586612701416016,
2610
+ "learning_rate": 7.018348623853211e-06,
2611
+ "loss": 0.5065,
2612
+ "step": 3680
2613
+ },
2614
+ {
2615
+ "epoch": 1.4175950825970034,
2616
+ "grad_norm": 23.526439666748047,
2617
+ "learning_rate": 7.0079232693911605e-06,
2618
+ "loss": 0.6208,
2619
+ "step": 3690
2620
+ },
2621
+ {
2622
+ "epoch": 1.4214368036880523,
2623
+ "grad_norm": 27.38988494873047,
2624
+ "learning_rate": 6.997497914929108e-06,
2625
+ "loss": 0.4807,
2626
+ "step": 3700
2627
+ },
2628
+ {
2629
+ "epoch": 1.425278524779101,
2630
+ "grad_norm": 4.145619869232178,
2631
+ "learning_rate": 6.9870725604670565e-06,
2632
+ "loss": 0.3003,
2633
+ "step": 3710
2634
+ },
2635
+ {
2636
+ "epoch": 1.4291202458701497,
2637
+ "grad_norm": 7.592724323272705,
2638
+ "learning_rate": 6.976647206005005e-06,
2639
+ "loss": 0.7445,
2640
+ "step": 3720
2641
+ },
2642
+ {
2643
+ "epoch": 1.4329619669611986,
2644
+ "grad_norm": 7.246058940887451,
2645
+ "learning_rate": 6.9662218515429524e-06,
2646
+ "loss": 0.4186,
2647
+ "step": 3730
2648
+ },
2649
+ {
2650
+ "epoch": 1.4368036880522475,
2651
+ "grad_norm": 15.480023384094238,
2652
+ "learning_rate": 6.955796497080901e-06,
2653
+ "loss": 0.6303,
2654
+ "step": 3740
2655
+ },
2656
+ {
2657
+ "epoch": 1.440645409143296,
2658
+ "grad_norm": 33.452980041503906,
2659
+ "learning_rate": 6.94537114261885e-06,
2660
+ "loss": 0.3228,
2661
+ "step": 3750
2662
+ },
2663
+ {
2664
+ "epoch": 1.444487130234345,
2665
+ "grad_norm": 9.024140357971191,
2666
+ "learning_rate": 6.9349457881567985e-06,
2667
+ "loss": 0.5242,
2668
+ "step": 3760
2669
+ },
2670
+ {
2671
+ "epoch": 1.4483288513253938,
2672
+ "grad_norm": 34.077571868896484,
2673
+ "learning_rate": 6.924520433694746e-06,
2674
+ "loss": 0.5266,
2675
+ "step": 3770
2676
+ },
2677
+ {
2678
+ "epoch": 1.4521705724164424,
2679
+ "grad_norm": 23.063976287841797,
2680
+ "learning_rate": 6.914095079232694e-06,
2681
+ "loss": 0.4766,
2682
+ "step": 3780
2683
+ },
2684
+ {
2685
+ "epoch": 1.4560122935074913,
2686
+ "grad_norm": 29.312820434570312,
2687
+ "learning_rate": 6.903669724770643e-06,
2688
+ "loss": 0.4143,
2689
+ "step": 3790
2690
+ },
2691
+ {
2692
+ "epoch": 1.4598540145985401,
2693
+ "grad_norm": 31.113893508911133,
2694
+ "learning_rate": 6.89324437030859e-06,
2695
+ "loss": 0.805,
2696
+ "step": 3800
2697
+ },
2698
+ {
2699
+ "epoch": 1.463695735689589,
2700
+ "grad_norm": 8.02818489074707,
2701
+ "learning_rate": 6.882819015846539e-06,
2702
+ "loss": 0.5619,
2703
+ "step": 3810
2704
+ },
2705
+ {
2706
+ "epoch": 1.4675374567806379,
2707
+ "grad_norm": 25.848047256469727,
2708
+ "learning_rate": 6.872393661384488e-06,
2709
+ "loss": 0.542,
2710
+ "step": 3820
2711
+ },
2712
+ {
2713
+ "epoch": 1.4713791778716865,
2714
+ "grad_norm": 17.603303909301758,
2715
+ "learning_rate": 6.861968306922436e-06,
2716
+ "loss": 0.5158,
2717
+ "step": 3830
2718
+ },
2719
+ {
2720
+ "epoch": 1.4752208989627353,
2721
+ "grad_norm": 5.893566608428955,
2722
+ "learning_rate": 6.851542952460384e-06,
2723
+ "loss": 0.4472,
2724
+ "step": 3840
2725
+ },
2726
+ {
2727
+ "epoch": 1.4790626200537842,
2728
+ "grad_norm": 25.97250747680664,
2729
+ "learning_rate": 6.841117597998332e-06,
2730
+ "loss": 0.416,
2731
+ "step": 3850
2732
+ },
2733
+ {
2734
+ "epoch": 1.4829043411448328,
2735
+ "grad_norm": 13.01366901397705,
2736
+ "learning_rate": 6.830692243536281e-06,
2737
+ "loss": 0.4919,
2738
+ "step": 3860
2739
+ },
2740
+ {
2741
+ "epoch": 1.4867460622358817,
2742
+ "grad_norm": 16.056318283081055,
2743
+ "learning_rate": 6.82026688907423e-06,
2744
+ "loss": 0.3523,
2745
+ "step": 3870
2746
+ },
2747
+ {
2748
+ "epoch": 1.4905877833269305,
2749
+ "grad_norm": 28.603076934814453,
2750
+ "learning_rate": 6.8098415346121775e-06,
2751
+ "loss": 0.3091,
2752
+ "step": 3880
2753
+ },
2754
+ {
2755
+ "epoch": 1.4944295044179792,
2756
+ "grad_norm": 30.29176902770996,
2757
+ "learning_rate": 6.799416180150126e-06,
2758
+ "loss": 0.4874,
2759
+ "step": 3890
2760
+ },
2761
+ {
2762
+ "epoch": 1.498271225509028,
2763
+ "grad_norm": 22.889122009277344,
2764
+ "learning_rate": 6.788990825688074e-06,
2765
+ "loss": 0.5437,
2766
+ "step": 3900
2767
+ },
2768
+ {
2769
+ "epoch": 1.5021129466000769,
2770
+ "grad_norm": 7.505212306976318,
2771
+ "learning_rate": 6.778565471226022e-06,
2772
+ "loss": 0.4872,
2773
+ "step": 3910
2774
+ },
2775
+ {
2776
+ "epoch": 1.5059546676911255,
2777
+ "grad_norm": 23.37257194519043,
2778
+ "learning_rate": 6.76814011676397e-06,
2779
+ "loss": 0.3906,
2780
+ "step": 3920
2781
+ },
2782
+ {
2783
+ "epoch": 1.5097963887821744,
2784
+ "grad_norm": 9.461624145507812,
2785
+ "learning_rate": 6.757714762301919e-06,
2786
+ "loss": 0.4099,
2787
+ "step": 3930
2788
+ },
2789
+ {
2790
+ "epoch": 1.5136381098732232,
2791
+ "grad_norm": 21.49395751953125,
2792
+ "learning_rate": 6.747289407839868e-06,
2793
+ "loss": 0.5254,
2794
+ "step": 3940
2795
+ },
2796
+ {
2797
+ "epoch": 1.5174798309642719,
2798
+ "grad_norm": 17.44995880126953,
2799
+ "learning_rate": 6.736864053377815e-06,
2800
+ "loss": 0.4447,
2801
+ "step": 3950
2802
+ },
2803
+ {
2804
+ "epoch": 1.521321552055321,
2805
+ "grad_norm": 11.34809398651123,
2806
+ "learning_rate": 6.726438698915764e-06,
2807
+ "loss": 0.5709,
2808
+ "step": 3960
2809
+ },
2810
+ {
2811
+ "epoch": 1.5251632731463696,
2812
+ "grad_norm": 18.853090286254883,
2813
+ "learning_rate": 6.716013344453712e-06,
2814
+ "loss": 0.34,
2815
+ "step": 3970
2816
+ },
2817
+ {
2818
+ "epoch": 1.5290049942374182,
2819
+ "grad_norm": 42.40155792236328,
2820
+ "learning_rate": 6.70558798999166e-06,
2821
+ "loss": 0.4124,
2822
+ "step": 3980
2823
+ },
2824
+ {
2825
+ "epoch": 1.5328467153284673,
2826
+ "grad_norm": 16.232521057128906,
2827
+ "learning_rate": 6.695162635529608e-06,
2828
+ "loss": 0.5542,
2829
+ "step": 3990
2830
+ },
2831
+ {
2832
+ "epoch": 1.536688436419516,
2833
+ "grad_norm": 3.611929178237915,
2834
+ "learning_rate": 6.684737281067557e-06,
2835
+ "loss": 0.4139,
2836
+ "step": 4000
2837
+ },
2838
+ {
2839
+ "epoch": 1.5405301575105648,
2840
+ "grad_norm": 10.575961112976074,
2841
+ "learning_rate": 6.674311926605506e-06,
2842
+ "loss": 0.4181,
2843
+ "step": 4010
2844
+ },
2845
+ {
2846
+ "epoch": 1.5443718786016136,
2847
+ "grad_norm": 12.085956573486328,
2848
+ "learning_rate": 6.663886572143453e-06,
2849
+ "loss": 0.3825,
2850
+ "step": 4020
2851
+ },
2852
+ {
2853
+ "epoch": 1.5482135996926623,
2854
+ "grad_norm": 20.601011276245117,
2855
+ "learning_rate": 6.653461217681402e-06,
2856
+ "loss": 0.3923,
2857
+ "step": 4030
2858
+ },
2859
+ {
2860
+ "epoch": 1.5520553207837111,
2861
+ "grad_norm": 9.62112808227539,
2862
+ "learning_rate": 6.64303586321935e-06,
2863
+ "loss": 0.3697,
2864
+ "step": 4040
2865
+ },
2866
+ {
2867
+ "epoch": 1.55589704187476,
2868
+ "grad_norm": 34.249290466308594,
2869
+ "learning_rate": 6.632610508757298e-06,
2870
+ "loss": 0.5826,
2871
+ "step": 4050
2872
+ },
2873
+ {
2874
+ "epoch": 1.5597387629658086,
2875
+ "grad_norm": 16.832563400268555,
2876
+ "learning_rate": 6.622185154295246e-06,
2877
+ "loss": 0.52,
2878
+ "step": 4060
2879
+ },
2880
+ {
2881
+ "epoch": 1.5635804840568575,
2882
+ "grad_norm": 33.596351623535156,
2883
+ "learning_rate": 6.611759799833195e-06,
2884
+ "loss": 0.4241,
2885
+ "step": 4070
2886
+ },
2887
+ {
2888
+ "epoch": 1.5674222051479063,
2889
+ "grad_norm": 7.013982772827148,
2890
+ "learning_rate": 6.601334445371144e-06,
2891
+ "loss": 0.46,
2892
+ "step": 4080
2893
+ },
2894
+ {
2895
+ "epoch": 1.571263926238955,
2896
+ "grad_norm": 24.022945404052734,
2897
+ "learning_rate": 6.590909090909091e-06,
2898
+ "loss": 0.4563,
2899
+ "step": 4090
2900
+ },
2901
+ {
2902
+ "epoch": 1.5751056473300038,
2903
+ "grad_norm": 7.977485656738281,
2904
+ "learning_rate": 6.58048373644704e-06,
2905
+ "loss": 0.4118,
2906
+ "step": 4100
2907
+ },
2908
+ {
2909
+ "epoch": 1.5789473684210527,
2910
+ "grad_norm": 15.87927532196045,
2911
+ "learning_rate": 6.570058381984988e-06,
2912
+ "loss": 0.5734,
2913
+ "step": 4110
2914
+ },
2915
+ {
2916
+ "epoch": 1.5827890895121013,
2917
+ "grad_norm": 18.238597869873047,
2918
+ "learning_rate": 6.559633027522936e-06,
2919
+ "loss": 0.5423,
2920
+ "step": 4120
2921
+ },
2922
+ {
2923
+ "epoch": 1.5866308106031504,
2924
+ "grad_norm": 12.554328918457031,
2925
+ "learning_rate": 6.549207673060885e-06,
2926
+ "loss": 0.3568,
2927
+ "step": 4130
2928
+ },
2929
+ {
2930
+ "epoch": 1.590472531694199,
2931
+ "grad_norm": 12.9606294631958,
2932
+ "learning_rate": 6.538782318598833e-06,
2933
+ "loss": 0.5037,
2934
+ "step": 4140
2935
+ },
2936
+ {
2937
+ "epoch": 1.5943142527852476,
2938
+ "grad_norm": 15.28049373626709,
2939
+ "learning_rate": 6.528356964136782e-06,
2940
+ "loss": 0.5114,
2941
+ "step": 4150
2942
+ },
2943
+ {
2944
+ "epoch": 1.5981559738762967,
2945
+ "grad_norm": 22.42987823486328,
2946
+ "learning_rate": 6.517931609674729e-06,
2947
+ "loss": 0.5374,
2948
+ "step": 4160
2949
+ },
2950
+ {
2951
+ "epoch": 1.6019976949673453,
2952
+ "grad_norm": 36.76483917236328,
2953
+ "learning_rate": 6.5075062552126776e-06,
2954
+ "loss": 0.5011,
2955
+ "step": 4170
2956
+ },
2957
+ {
2958
+ "epoch": 1.6058394160583942,
2959
+ "grad_norm": 22.071678161621094,
2960
+ "learning_rate": 6.497080900750626e-06,
2961
+ "loss": 0.4214,
2962
+ "step": 4180
2963
+ },
2964
+ {
2965
+ "epoch": 1.609681137149443,
2966
+ "grad_norm": 2.2837672233581543,
2967
+ "learning_rate": 6.4866555462885735e-06,
2968
+ "loss": 0.4492,
2969
+ "step": 4190
2970
+ },
2971
+ {
2972
+ "epoch": 1.6135228582404917,
2973
+ "grad_norm": 16.134366989135742,
2974
+ "learning_rate": 6.476230191826523e-06,
2975
+ "loss": 0.5131,
2976
+ "step": 4200
2977
+ },
2978
+ {
2979
+ "epoch": 1.6173645793315405,
2980
+ "grad_norm": 14.408499717712402,
2981
+ "learning_rate": 6.465804837364471e-06,
2982
+ "loss": 0.6302,
2983
+ "step": 4210
2984
+ },
2985
+ {
2986
+ "epoch": 1.6212063004225894,
2987
+ "grad_norm": 3.4417994022369385,
2988
+ "learning_rate": 6.4553794829024195e-06,
2989
+ "loss": 0.5384,
2990
+ "step": 4220
2991
+ },
2992
+ {
2993
+ "epoch": 1.625048021513638,
2994
+ "grad_norm": 11.830604553222656,
2995
+ "learning_rate": 6.444954128440367e-06,
2996
+ "loss": 0.4217,
2997
+ "step": 4230
2998
+ },
2999
+ {
3000
+ "epoch": 1.6288897426046869,
3001
+ "grad_norm": 9.047849655151367,
3002
+ "learning_rate": 6.4345287739783155e-06,
3003
+ "loss": 0.3594,
3004
+ "step": 4240
3005
+ },
3006
+ {
3007
+ "epoch": 1.6327314636957357,
3008
+ "grad_norm": 7.678354740142822,
3009
+ "learning_rate": 6.424103419516265e-06,
3010
+ "loss": 0.4508,
3011
+ "step": 4250
3012
+ },
3013
+ {
3014
+ "epoch": 1.6365731847867844,
3015
+ "grad_norm": 5.697566032409668,
3016
+ "learning_rate": 6.413678065054212e-06,
3017
+ "loss": 0.4601,
3018
+ "step": 4260
3019
+ },
3020
+ {
3021
+ "epoch": 1.6404149058778332,
3022
+ "grad_norm": 15.278615951538086,
3023
+ "learning_rate": 6.403252710592161e-06,
3024
+ "loss": 0.4632,
3025
+ "step": 4270
3026
+ },
3027
+ {
3028
+ "epoch": 1.644256626968882,
3029
+ "grad_norm": 6.866236209869385,
3030
+ "learning_rate": 6.392827356130109e-06,
3031
+ "loss": 0.6104,
3032
+ "step": 4280
3033
+ },
3034
+ {
3035
+ "epoch": 1.6480983480599307,
3036
+ "grad_norm": 42.67068862915039,
3037
+ "learning_rate": 6.3824020016680575e-06,
3038
+ "loss": 0.7307,
3039
+ "step": 4290
3040
+ },
3041
+ {
3042
+ "epoch": 1.6519400691509798,
3043
+ "grad_norm": 42.591697692871094,
3044
+ "learning_rate": 6.371976647206005e-06,
3045
+ "loss": 0.4915,
3046
+ "step": 4300
3047
+ },
3048
+ {
3049
+ "epoch": 1.6557817902420284,
3050
+ "grad_norm": 33.19038391113281,
3051
+ "learning_rate": 6.361551292743953e-06,
3052
+ "loss": 0.3862,
3053
+ "step": 4310
3054
+ },
3055
+ {
3056
+ "epoch": 1.659623511333077,
3057
+ "grad_norm": 21.831031799316406,
3058
+ "learning_rate": 6.351125938281903e-06,
3059
+ "loss": 0.4684,
3060
+ "step": 4320
3061
+ },
3062
+ {
3063
+ "epoch": 1.6634652324241261,
3064
+ "grad_norm": 10.749385833740234,
3065
+ "learning_rate": 6.34070058381985e-06,
3066
+ "loss": 0.5235,
3067
+ "step": 4330
3068
+ },
3069
+ {
3070
+ "epoch": 1.6673069535151748,
3071
+ "grad_norm": 8.519107818603516,
3072
+ "learning_rate": 6.330275229357799e-06,
3073
+ "loss": 0.4614,
3074
+ "step": 4340
3075
+ },
3076
+ {
3077
+ "epoch": 1.6711486746062236,
3078
+ "grad_norm": 10.042859077453613,
3079
+ "learning_rate": 6.319849874895747e-06,
3080
+ "loss": 0.4232,
3081
+ "step": 4350
3082
+ },
3083
+ {
3084
+ "epoch": 1.6749903956972725,
3085
+ "grad_norm": 15.31753921508789,
3086
+ "learning_rate": 6.309424520433695e-06,
3087
+ "loss": 0.6633,
3088
+ "step": 4360
3089
+ },
3090
+ {
3091
+ "epoch": 1.6788321167883211,
3092
+ "grad_norm": 15.346444129943848,
3093
+ "learning_rate": 6.298999165971643e-06,
3094
+ "loss": 0.4809,
3095
+ "step": 4370
3096
+ },
3097
+ {
3098
+ "epoch": 1.68267383787937,
3099
+ "grad_norm": 14.51533031463623,
3100
+ "learning_rate": 6.288573811509592e-06,
3101
+ "loss": 0.3116,
3102
+ "step": 4380
3103
+ },
3104
+ {
3105
+ "epoch": 1.6865155589704188,
3106
+ "grad_norm": 10.794851303100586,
3107
+ "learning_rate": 6.2781484570475406e-06,
3108
+ "loss": 0.4442,
3109
+ "step": 4390
3110
+ },
3111
+ {
3112
+ "epoch": 1.6903572800614675,
3113
+ "grad_norm": 26.648527145385742,
3114
+ "learning_rate": 6.267723102585488e-06,
3115
+ "loss": 0.4151,
3116
+ "step": 4400
3117
+ },
3118
+ {
3119
+ "epoch": 1.6941990011525163,
3120
+ "grad_norm": 14.879584312438965,
3121
+ "learning_rate": 6.2572977481234365e-06,
3122
+ "loss": 0.5536,
3123
+ "step": 4410
3124
+ },
3125
+ {
3126
+ "epoch": 1.6980407222435652,
3127
+ "grad_norm": 28.443931579589844,
3128
+ "learning_rate": 6.246872393661385e-06,
3129
+ "loss": 0.4466,
3130
+ "step": 4420
3131
+ },
3132
+ {
3133
+ "epoch": 1.7018824433346138,
3134
+ "grad_norm": 22.08489227294922,
3135
+ "learning_rate": 6.236447039199333e-06,
3136
+ "loss": 0.5109,
3137
+ "step": 4430
3138
+ },
3139
+ {
3140
+ "epoch": 1.7057241644256627,
3141
+ "grad_norm": 21.044944763183594,
3142
+ "learning_rate": 6.226021684737281e-06,
3143
+ "loss": 0.5316,
3144
+ "step": 4440
3145
+ },
3146
+ {
3147
+ "epoch": 1.7095658855167115,
3148
+ "grad_norm": 29.13035774230957,
3149
+ "learning_rate": 6.21559633027523e-06,
3150
+ "loss": 0.4993,
3151
+ "step": 4450
3152
+ },
3153
+ {
3154
+ "epoch": 1.7134076066077601,
3155
+ "grad_norm": 11.93812084197998,
3156
+ "learning_rate": 6.2051709758131785e-06,
3157
+ "loss": 0.4563,
3158
+ "step": 4460
3159
+ },
3160
+ {
3161
+ "epoch": 1.7172493276988092,
3162
+ "grad_norm": 25.17680549621582,
3163
+ "learning_rate": 6.194745621351126e-06,
3164
+ "loss": 0.4524,
3165
+ "step": 4470
3166
+ },
3167
+ {
3168
+ "epoch": 1.7210910487898579,
3169
+ "grad_norm": 11.476607322692871,
3170
+ "learning_rate": 6.1843202668890744e-06,
3171
+ "loss": 0.4841,
3172
+ "step": 4480
3173
+ },
3174
+ {
3175
+ "epoch": 1.7249327698809065,
3176
+ "grad_norm": 18.251564025878906,
3177
+ "learning_rate": 6.173894912427023e-06,
3178
+ "loss": 0.513,
3179
+ "step": 4490
3180
+ },
3181
+ {
3182
+ "epoch": 1.7287744909719556,
3183
+ "grad_norm": 19.187213897705078,
3184
+ "learning_rate": 6.163469557964972e-06,
3185
+ "loss": 0.5729,
3186
+ "step": 4500
3187
+ },
3188
+ {
3189
+ "epoch": 1.7326162120630042,
3190
+ "grad_norm": 6.800131320953369,
3191
+ "learning_rate": 6.15304420350292e-06,
3192
+ "loss": 0.4997,
3193
+ "step": 4510
3194
+ },
3195
+ {
3196
+ "epoch": 1.736457933154053,
3197
+ "grad_norm": 4.658173561096191,
3198
+ "learning_rate": 6.142618849040868e-06,
3199
+ "loss": 0.6002,
3200
+ "step": 4520
3201
+ },
3202
+ {
3203
+ "epoch": 1.740299654245102,
3204
+ "grad_norm": 16.352256774902344,
3205
+ "learning_rate": 6.132193494578816e-06,
3206
+ "loss": 0.4366,
3207
+ "step": 4530
3208
+ },
3209
+ {
3210
+ "epoch": 1.7441413753361505,
3211
+ "grad_norm": 15.1599760055542,
3212
+ "learning_rate": 6.121768140116765e-06,
3213
+ "loss": 0.4287,
3214
+ "step": 4540
3215
+ },
3216
+ {
3217
+ "epoch": 1.7479830964271994,
3218
+ "grad_norm": 7.974790573120117,
3219
+ "learning_rate": 6.111342785654712e-06,
3220
+ "loss": 0.4333,
3221
+ "step": 4550
3222
+ },
3223
+ {
3224
+ "epoch": 1.7518248175182483,
3225
+ "grad_norm": 16.976285934448242,
3226
+ "learning_rate": 6.100917431192661e-06,
3227
+ "loss": 0.3914,
3228
+ "step": 4560
3229
+ },
3230
+ {
3231
+ "epoch": 1.755666538609297,
3232
+ "grad_norm": 3.174334764480591,
3233
+ "learning_rate": 6.09049207673061e-06,
3234
+ "loss": 0.3679,
3235
+ "step": 4570
3236
+ },
3237
+ {
3238
+ "epoch": 1.7595082597003457,
3239
+ "grad_norm": 32.927494049072266,
3240
+ "learning_rate": 6.0800667222685575e-06,
3241
+ "loss": 0.6631,
3242
+ "step": 4580
3243
+ },
3244
+ {
3245
+ "epoch": 1.7633499807913946,
3246
+ "grad_norm": 10.811646461486816,
3247
+ "learning_rate": 6.069641367806506e-06,
3248
+ "loss": 0.5384,
3249
+ "step": 4590
3250
+ },
3251
+ {
3252
+ "epoch": 1.7671917018824432,
3253
+ "grad_norm": 18.781339645385742,
3254
+ "learning_rate": 6.059216013344454e-06,
3255
+ "loss": 0.5194,
3256
+ "step": 4600
3257
+ },
3258
+ {
3259
+ "epoch": 1.771033422973492,
3260
+ "grad_norm": 21.202974319458008,
3261
+ "learning_rate": 6.048790658882403e-06,
3262
+ "loss": 0.5962,
3263
+ "step": 4610
3264
+ },
3265
+ {
3266
+ "epoch": 1.774875144064541,
3267
+ "grad_norm": 15.951532363891602,
3268
+ "learning_rate": 6.03836530442035e-06,
3269
+ "loss": 0.515,
3270
+ "step": 4620
3271
+ },
3272
+ {
3273
+ "epoch": 1.7787168651555896,
3274
+ "grad_norm": 21.22430419921875,
3275
+ "learning_rate": 6.0279399499582995e-06,
3276
+ "loss": 0.5223,
3277
+ "step": 4630
3278
+ },
3279
+ {
3280
+ "epoch": 1.7825585862466387,
3281
+ "grad_norm": 15.455586433410645,
3282
+ "learning_rate": 6.017514595496248e-06,
3283
+ "loss": 0.526,
3284
+ "step": 4640
3285
+ },
3286
+ {
3287
+ "epoch": 1.7864003073376873,
3288
+ "grad_norm": 7.954679012298584,
3289
+ "learning_rate": 6.0070892410341954e-06,
3290
+ "loss": 0.528,
3291
+ "step": 4650
3292
+ },
3293
+ {
3294
+ "epoch": 1.790242028428736,
3295
+ "grad_norm": 10.174606323242188,
3296
+ "learning_rate": 5.996663886572144e-06,
3297
+ "loss": 0.4072,
3298
+ "step": 4660
3299
+ },
3300
+ {
3301
+ "epoch": 1.794083749519785,
3302
+ "grad_norm": 30.961978912353516,
3303
+ "learning_rate": 5.986238532110092e-06,
3304
+ "loss": 0.4113,
3305
+ "step": 4670
3306
+ },
3307
+ {
3308
+ "epoch": 1.7979254706108336,
3309
+ "grad_norm": 20.904510498046875,
3310
+ "learning_rate": 5.975813177648041e-06,
3311
+ "loss": 0.5494,
3312
+ "step": 4680
3313
+ },
3314
+ {
3315
+ "epoch": 1.8017671917018825,
3316
+ "grad_norm": 32.23243713378906,
3317
+ "learning_rate": 5.965387823185988e-06,
3318
+ "loss": 0.65,
3319
+ "step": 4690
3320
+ },
3321
+ {
3322
+ "epoch": 1.8056089127929313,
3323
+ "grad_norm": 9.392688751220703,
3324
+ "learning_rate": 5.954962468723937e-06,
3325
+ "loss": 0.4458,
3326
+ "step": 4700
3327
+ },
3328
+ {
3329
+ "epoch": 1.80945063388398,
3330
+ "grad_norm": 23.203550338745117,
3331
+ "learning_rate": 5.944537114261886e-06,
3332
+ "loss": 0.3476,
3333
+ "step": 4710
3334
+ },
3335
+ {
3336
+ "epoch": 1.8132923549750288,
3337
+ "grad_norm": 21.198944091796875,
3338
+ "learning_rate": 5.934111759799833e-06,
3339
+ "loss": 0.4905,
3340
+ "step": 4720
3341
+ },
3342
+ {
3343
+ "epoch": 1.8171340760660777,
3344
+ "grad_norm": 7.847315788269043,
3345
+ "learning_rate": 5.923686405337782e-06,
3346
+ "loss": 0.2561,
3347
+ "step": 4730
3348
+ },
3349
+ {
3350
+ "epoch": 1.8209757971571263,
3351
+ "grad_norm": 19.153392791748047,
3352
+ "learning_rate": 5.91326105087573e-06,
3353
+ "loss": 0.5025,
3354
+ "step": 4740
3355
+ },
3356
+ {
3357
+ "epoch": 1.8248175182481752,
3358
+ "grad_norm": 13.056893348693848,
3359
+ "learning_rate": 5.902835696413679e-06,
3360
+ "loss": 0.6548,
3361
+ "step": 4750
3362
+ },
3363
+ {
3364
+ "epoch": 1.828659239339224,
3365
+ "grad_norm": 6.967535972595215,
3366
+ "learning_rate": 5.892410341951627e-06,
3367
+ "loss": 0.7137,
3368
+ "step": 4760
3369
+ },
3370
+ {
3371
+ "epoch": 1.8325009604302727,
3372
+ "grad_norm": 15.452996253967285,
3373
+ "learning_rate": 5.881984987489575e-06,
3374
+ "loss": 0.4757,
3375
+ "step": 4770
3376
+ },
3377
+ {
3378
+ "epoch": 1.8363426815213215,
3379
+ "grad_norm": 26.09940528869629,
3380
+ "learning_rate": 5.871559633027524e-06,
3381
+ "loss": 0.3872,
3382
+ "step": 4780
3383
+ },
3384
+ {
3385
+ "epoch": 1.8401844026123704,
3386
+ "grad_norm": 13.307591438293457,
3387
+ "learning_rate": 5.861134278565471e-06,
3388
+ "loss": 0.3825,
3389
+ "step": 4790
3390
+ },
3391
+ {
3392
+ "epoch": 1.844026123703419,
3393
+ "grad_norm": 11.655804634094238,
3394
+ "learning_rate": 5.85070892410342e-06,
3395
+ "loss": 0.3827,
3396
+ "step": 4800
3397
+ },
3398
+ {
3399
+ "epoch": 1.847867844794468,
3400
+ "grad_norm": 30.40113067626953,
3401
+ "learning_rate": 5.840283569641368e-06,
3402
+ "loss": 0.4702,
3403
+ "step": 4810
3404
+ },
3405
+ {
3406
+ "epoch": 1.8517095658855167,
3407
+ "grad_norm": 35.177120208740234,
3408
+ "learning_rate": 5.829858215179317e-06,
3409
+ "loss": 0.3508,
3410
+ "step": 4820
3411
+ },
3412
+ {
3413
+ "epoch": 1.8555512869765654,
3414
+ "grad_norm": 22.448490142822266,
3415
+ "learning_rate": 5.819432860717265e-06,
3416
+ "loss": 0.5955,
3417
+ "step": 4830
3418
+ },
3419
+ {
3420
+ "epoch": 1.8593930080676144,
3421
+ "grad_norm": 12.29417610168457,
3422
+ "learning_rate": 5.809007506255213e-06,
3423
+ "loss": 0.382,
3424
+ "step": 4840
3425
+ },
3426
+ {
3427
+ "epoch": 1.863234729158663,
3428
+ "grad_norm": 49.52952194213867,
3429
+ "learning_rate": 5.798582151793162e-06,
3430
+ "loss": 0.519,
3431
+ "step": 4850
3432
+ },
3433
+ {
3434
+ "epoch": 1.867076450249712,
3435
+ "grad_norm": 11.148819923400879,
3436
+ "learning_rate": 5.788156797331109e-06,
3437
+ "loss": 0.568,
3438
+ "step": 4860
3439
+ },
3440
+ {
3441
+ "epoch": 1.8709181713407608,
3442
+ "grad_norm": 5.599549770355225,
3443
+ "learning_rate": 5.777731442869058e-06,
3444
+ "loss": 0.5568,
3445
+ "step": 4870
3446
+ },
3447
+ {
3448
+ "epoch": 1.8747598924318094,
3449
+ "grad_norm": 10.196747779846191,
3450
+ "learning_rate": 5.767306088407007e-06,
3451
+ "loss": 0.4946,
3452
+ "step": 4880
3453
+ },
3454
+ {
3455
+ "epoch": 1.8786016135228583,
3456
+ "grad_norm": 32.05455017089844,
3457
+ "learning_rate": 5.756880733944955e-06,
3458
+ "loss": 0.4545,
3459
+ "step": 4890
3460
+ },
3461
+ {
3462
+ "epoch": 1.8824433346139071,
3463
+ "grad_norm": 10.12232494354248,
3464
+ "learning_rate": 5.746455379482903e-06,
3465
+ "loss": 0.534,
3466
+ "step": 4900
3467
+ },
3468
+ {
3469
+ "epoch": 1.8862850557049557,
3470
+ "grad_norm": 2.4176321029663086,
3471
+ "learning_rate": 5.736030025020851e-06,
3472
+ "loss": 0.4063,
3473
+ "step": 4910
3474
+ },
3475
+ {
3476
+ "epoch": 1.8901267767960046,
3477
+ "grad_norm": 10.467967987060547,
3478
+ "learning_rate": 5.7256046705587996e-06,
3479
+ "loss": 0.5102,
3480
+ "step": 4920
3481
+ },
3482
+ {
3483
+ "epoch": 1.8939684978870535,
3484
+ "grad_norm": 13.962821006774902,
3485
+ "learning_rate": 5.715179316096747e-06,
3486
+ "loss": 0.5077,
3487
+ "step": 4930
3488
+ },
3489
+ {
3490
+ "epoch": 1.897810218978102,
3491
+ "grad_norm": 5.787616729736328,
3492
+ "learning_rate": 5.7047539616346955e-06,
3493
+ "loss": 0.4232,
3494
+ "step": 4940
3495
+ },
3496
+ {
3497
+ "epoch": 1.901651940069151,
3498
+ "grad_norm": 17.215404510498047,
3499
+ "learning_rate": 5.694328607172645e-06,
3500
+ "loss": 0.5902,
3501
+ "step": 4950
3502
+ },
3503
+ {
3504
+ "epoch": 1.9054936611601998,
3505
+ "grad_norm": 11.960589408874512,
3506
+ "learning_rate": 5.683903252710593e-06,
3507
+ "loss": 0.4809,
3508
+ "step": 4960
3509
+ },
3510
+ {
3511
+ "epoch": 1.9093353822512484,
3512
+ "grad_norm": 25.745502471923828,
3513
+ "learning_rate": 5.673477898248541e-06,
3514
+ "loss": 0.4995,
3515
+ "step": 4970
3516
+ },
3517
+ {
3518
+ "epoch": 1.9131771033422975,
3519
+ "grad_norm": 10.54256534576416,
3520
+ "learning_rate": 5.663052543786489e-06,
3521
+ "loss": 0.3592,
3522
+ "step": 4980
3523
+ },
3524
+ {
3525
+ "epoch": 1.9170188244333461,
3526
+ "grad_norm": 19.720035552978516,
3527
+ "learning_rate": 5.6526271893244375e-06,
3528
+ "loss": 0.3611,
3529
+ "step": 4990
3530
+ },
3531
+ {
3532
+ "epoch": 1.9208605455243948,
3533
+ "grad_norm": 8.2754487991333,
3534
+ "learning_rate": 5.642201834862385e-06,
3535
+ "loss": 0.3646,
3536
+ "step": 5000
3537
+ },
3538
+ {
3539
+ "epoch": 1.9247022666154439,
3540
+ "grad_norm": 16.336483001708984,
3541
+ "learning_rate": 5.631776480400334e-06,
3542
+ "loss": 0.6616,
3543
+ "step": 5010
3544
+ },
3545
+ {
3546
+ "epoch": 1.9285439877064925,
3547
+ "grad_norm": 24.838665008544922,
3548
+ "learning_rate": 5.621351125938283e-06,
3549
+ "loss": 0.529,
3550
+ "step": 5020
3551
+ },
3552
+ {
3553
+ "epoch": 1.9323857087975413,
3554
+ "grad_norm": 16.165803909301758,
3555
+ "learning_rate": 5.610925771476231e-06,
3556
+ "loss": 0.5018,
3557
+ "step": 5030
3558
+ },
3559
+ {
3560
+ "epoch": 1.9362274298885902,
3561
+ "grad_norm": 12.195728302001953,
3562
+ "learning_rate": 5.600500417014179e-06,
3563
+ "loss": 0.4034,
3564
+ "step": 5040
3565
+ },
3566
+ {
3567
+ "epoch": 1.9400691509796388,
3568
+ "grad_norm": 16.773571014404297,
3569
+ "learning_rate": 5.590075062552127e-06,
3570
+ "loss": 0.5083,
3571
+ "step": 5050
3572
+ },
3573
+ {
3574
+ "epoch": 1.9439108720706877,
3575
+ "grad_norm": 22.202640533447266,
3576
+ "learning_rate": 5.579649708090075e-06,
3577
+ "loss": 0.6077,
3578
+ "step": 5060
3579
+ },
3580
+ {
3581
+ "epoch": 1.9477525931617365,
3582
+ "grad_norm": 14.229264259338379,
3583
+ "learning_rate": 5.569224353628023e-06,
3584
+ "loss": 0.3105,
3585
+ "step": 5070
3586
+ },
3587
+ {
3588
+ "epoch": 1.9515943142527852,
3589
+ "grad_norm": 13.377602577209473,
3590
+ "learning_rate": 5.558798999165972e-06,
3591
+ "loss": 0.4201,
3592
+ "step": 5080
3593
+ },
3594
+ {
3595
+ "epoch": 1.955436035343834,
3596
+ "grad_norm": 3.7054190635681152,
3597
+ "learning_rate": 5.548373644703921e-06,
3598
+ "loss": 0.3833,
3599
+ "step": 5090
3600
+ },
3601
+ {
3602
+ "epoch": 1.9592777564348829,
3603
+ "grad_norm": 1.4279061555862427,
3604
+ "learning_rate": 5.537948290241869e-06,
3605
+ "loss": 0.2661,
3606
+ "step": 5100
3607
+ },
3608
+ {
3609
+ "epoch": 1.9631194775259315,
3610
+ "grad_norm": 13.63090705871582,
3611
+ "learning_rate": 5.5275229357798165e-06,
3612
+ "loss": 0.559,
3613
+ "step": 5110
3614
+ },
3615
+ {
3616
+ "epoch": 1.9669611986169804,
3617
+ "grad_norm": 5.49478816986084,
3618
+ "learning_rate": 5.517097581317765e-06,
3619
+ "loss": 0.5809,
3620
+ "step": 5120
3621
+ },
3622
+ {
3623
+ "epoch": 1.9708029197080292,
3624
+ "grad_norm": 14.8590726852417,
3625
+ "learning_rate": 5.506672226855714e-06,
3626
+ "loss": 0.3949,
3627
+ "step": 5130
3628
+ },
3629
+ {
3630
+ "epoch": 1.9746446407990779,
3631
+ "grad_norm": 24.46096420288086,
3632
+ "learning_rate": 5.496246872393661e-06,
3633
+ "loss": 0.5104,
3634
+ "step": 5140
3635
+ },
3636
+ {
3637
+ "epoch": 1.978486361890127,
3638
+ "grad_norm": 18.829071044921875,
3639
+ "learning_rate": 5.48582151793161e-06,
3640
+ "loss": 0.5408,
3641
+ "step": 5150
3642
+ },
3643
+ {
3644
+ "epoch": 1.9823280829811756,
3645
+ "grad_norm": 12.263307571411133,
3646
+ "learning_rate": 5.4753961634695585e-06,
3647
+ "loss": 0.4514,
3648
+ "step": 5160
3649
+ },
3650
+ {
3651
+ "epoch": 1.9861698040722242,
3652
+ "grad_norm": 3.8610880374908447,
3653
+ "learning_rate": 5.464970809007507e-06,
3654
+ "loss": 0.534,
3655
+ "step": 5170
3656
+ },
3657
+ {
3658
+ "epoch": 1.9900115251632733,
3659
+ "grad_norm": 14.170186042785645,
3660
+ "learning_rate": 5.4545454545454545e-06,
3661
+ "loss": 0.4005,
3662
+ "step": 5180
3663
+ },
3664
+ {
3665
+ "epoch": 1.993853246254322,
3666
+ "grad_norm": 13.322096824645996,
3667
+ "learning_rate": 5.444120100083403e-06,
3668
+ "loss": 0.3869,
3669
+ "step": 5190
3670
+ },
3671
+ {
3672
+ "epoch": 1.9976949673453708,
3673
+ "grad_norm": 24.19983673095703,
3674
+ "learning_rate": 5.433694745621352e-06,
3675
+ "loss": 0.6315,
3676
+ "step": 5200
3677
+ },
3678
+ {
3679
+ "epoch": 2.0,
3680
+ "eval_accuracy": 0.7710114302180386,
3681
+ "eval_f1_per_label": [
3682
+ 0.7862362971985384,
3683
+ 0.701864199210777,
3684
+ 0.8301230992034757
3685
+ ],
3686
+ "eval_f1_weighted": 0.768826861103172,
3687
+ "eval_loss": 0.6022440791130066,
3688
+ "eval_precision_per_label": [
3689
+ 0.7314447592067989,
3690
+ 0.7556401992382069,
3691
+ 0.8264129181084199
3692
+ ],
3693
+ "eval_precision_weighted": 0.77194411701186,
3694
+ "eval_recall_per_label": [
3695
+ 0.8499012508229098,
3696
+ 0.6552337398373984,
3697
+ 0.8338667442537097
3698
+ ],
3699
+ "eval_recall_weighted": 0.7710114302180386,
3700
+ "eval_runtime": 38.8175,
3701
+ "eval_samples_per_second": 268.204,
3702
+ "eval_steps_per_second": 33.542,
3703
+ "step": 5206
3704
+ }
3705
+ ],
3706
+ "logging_steps": 10,
3707
+ "max_steps": 10412,
3708
+ "num_input_tokens_seen": 0,
3709
+ "num_train_epochs": 4,
3710
+ "save_steps": 500,
3711
+ "stateful_callbacks": {
3712
+ "EarlyStoppingCallback": {
3713
+ "args": {
3714
+ "early_stopping_patience": 3,
3715
+ "early_stopping_threshold": 0.0
3716
+ },
3717
+ "attributes": {
3718
+ "early_stopping_patience_counter": 1
3719
+ }
3720
+ },
3721
+ "TrainerControl": {
3722
+ "args": {
3723
+ "should_epoch_stop": false,
3724
+ "should_evaluate": false,
3725
+ "should_log": false,
3726
+ "should_save": true,
3727
+ "should_training_stop": false
3728
+ },
3729
+ "attributes": {}
3730
+ }
3731
+ },
3732
+ "total_flos": 1317340020849300.0,
3733
+ "train_batch_size": 8,
3734
+ "trial_name": null,
3735
+ "trial_params": null
3736
+ }
checkpoint-5206/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8dba586531be0fd60261c0b3a7e765f40602a7ad1431e5af47acc67f5d677d1
3
+ size 5368
checkpoint-5206/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-7809/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "LABEL_0",
16
+ "1": "LABEL_1",
17
+ "2": "LABEL_2"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 3072,
21
+ "label2id": {
22
+ "LABEL_0": 0,
23
+ "LABEL_1": 1,
24
+ "LABEL_2": 2
25
+ },
26
+ "layer_norm_eps": 1e-05,
27
+ "max_position_embeddings": 514,
28
+ "model_type": "roberta",
29
+ "num_attention_heads": 12,
30
+ "num_hidden_layers": 12,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "problem_type": "single_label_classification",
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.47.1",
36
+ "type_vocab_size": 1,
37
+ "use_cache": true,
38
+ "vocab_size": 50265
39
+ }
checkpoint-7809/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-7809/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cce826b33e5816e672055668fb8149b8d1dd0203246fa0a06e3ada97c32f85
3
+ size 498615900
checkpoint-7809/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0926251872637ebbb6a74f3eb1f85ef667f6fe3228bb253ff8a4de24340a082
3
+ size 997351674
checkpoint-7809/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de5d9c449c71e20d8763ad1186f02df348ab0a9805a55ff9f2e873803c0da56a
3
+ size 14244
checkpoint-7809/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19bdb55452d79a708b33d99d1daca914556ee6da744864f7c9c064d8bf4e02b5
3
+ size 1064
checkpoint-7809/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-7809/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-7809/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 1000000000000000019884624838656,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
checkpoint-7809/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-7809/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8dba586531be0fd60261c0b3a7e765f40602a7ad1431e5af47acc67f5d677d1
3
+ size 5368
checkpoint-7809/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "LABEL_0",
16
+ "1": "LABEL_1",
17
+ "2": "LABEL_2"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 3072,
21
+ "label2id": {
22
+ "LABEL_0": 0,
23
+ "LABEL_1": 1,
24
+ "LABEL_2": 2
25
+ },
26
+ "layer_norm_eps": 1e-05,
27
+ "max_position_embeddings": 514,
28
+ "model_type": "roberta",
29
+ "num_attention_heads": 12,
30
+ "num_hidden_layers": 12,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "problem_type": "single_label_classification",
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.47.1",
36
+ "type_vocab_size": 1,
37
+ "use_cache": true,
38
+ "vocab_size": 50265
39
+ }
events.out.tfevents.1735229724.0be04a97d1bd.376.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e4b59c62c542d9a743478032642e343a45a8925bf8163b37ead18c31447535e
3
+ size 227372
merges.txt ADDED
The diff for this file is too large to render. See raw diff