calvintwr committed on
Commit bd47b51 · verified · 1 parent: bb4a71e

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +3 -0
  2. 2024-09-23/06-36-18/.hydra/config.yaml +74 -0
  3. 2024-09-23/06-36-18/.hydra/hydra.yaml +154 -0
  4. 2024-09-23/06-36-18/.hydra/overrides.yaml +1 -0
  5. 2024-09-23/06-36-18/train.log +0 -0
  6. 2024-09-23/07-06-14/.hydra/config.yaml +74 -0
  7. 2024-09-23/07-06-14/.hydra/hydra.yaml +154 -0
  8. 2024-09-23/07-06-14/.hydra/overrides.yaml +1 -0
  9. 2024-09-23/07-06-14/train.log +0 -0
  10. 2024-09-23/08-39-13/.hydra/config.yaml +74 -0
  11. 2024-09-23/08-39-13/.hydra/hydra.yaml +154 -0
  12. 2024-09-23/08-39-13/.hydra/overrides.yaml +1 -0
  13. 2024-09-23/08-39-13/train.log +0 -0
  14. 2024-09-23/08-40-08/.hydra/config.yaml +74 -0
  15. 2024-09-23/08-40-08/.hydra/hydra.yaml +154 -0
  16. 2024-09-23/08-40-08/.hydra/overrides.yaml +1 -0
  17. 2024-09-23/08-40-08/train.log +0 -0
  18. 2024-09-23/08-40-08/wandb/debug-internal.log +14 -0
  19. 2024-09-23/08-40-08/wandb/debug.log +26 -0
  20. 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/config.yaml +114 -0
  21. 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/output.log +3 -0
  22. 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/requirements.txt +121 -0
  23. 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/wandb-metadata.json +88 -0
  24. 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/wandb-summary.json +1 -0
  25. 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log +12 -0
  26. 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-internal.log +14 -0
  27. 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug.log +26 -0
  28. 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/run-a2kxhd8v.wandb +0 -0
  29. 2024-09-23/09-32-28/.hydra/config.yaml +74 -0
  30. 2024-09-23/09-32-28/.hydra/hydra.yaml +154 -0
  31. 2024-09-23/09-32-28/.hydra/overrides.yaml +1 -0
  32. 2024-09-23/09-32-28/train.log +0 -0
  33. 2024-09-23/09-32-28/wandb/debug-internal.log +18 -0
  34. 2024-09-23/09-32-28/wandb/debug.log +26 -0
  35. 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/config.yaml +115 -0
  36. 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/output.log +33 -0
  37. 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/wandb-metadata.json +88 -0
  38. 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/wandb-summary.json +1 -0
  39. 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log +13 -0
  40. 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-internal.log +18 -0
  41. 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug.log +26 -0
  42. 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/run-tkkvzfon.wandb +0 -0
  43. 2024-09-23/09-33-58/.hydra/config.yaml +74 -0
  44. 2024-09-23/09-33-58/.hydra/hydra.yaml +154 -0
  45. 2024-09-23/09-33-58/.hydra/overrides.yaml +1 -0
  46. 2024-09-23/09-33-58/checkpoints/ckpt_1000.pt +3 -0
  47. 2024-09-23/09-33-58/checkpoints/ckpt_2000.pt +3 -0
  48. 2024-09-23/09-33-58/checkpoints/ckpt_3000.pt +3 -0
  49. 2024-09-23/09-33-58/checkpoints/ckpt_4000.pt +3 -0
  50. 2024-09-23/09-33-58/checkpoints/ckpt_5000.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ 2024-09-23/09-33-58/wandb/run-20240923_093407-jnzzkcth/run-jnzzkcth.wandb filter=lfs diff=lfs merge=lfs -text
+ 2024-09-23/15-02-55/wandb/run-20240923_150304-bbl5fd2u/run-bbl5fd2u.wandb filter=lfs diff=lfs merge=lfs -text
+ 2024-09-23/15-28-03/wandb/run-20240923_152812-jp82yqcj/run-jp82yqcj.wandb filter=lfs diff=lfs merge=lfs -text
2024-09-23/06-36-18/.hydra/config.yaml ADDED
@@ -0,0 +1,74 @@
1
+ experimental:
2
+ model:
3
+ core_model_type: pass_through
4
+ hidden_dim: 384
5
+ byte_hidden: 128
6
+ max_chunk_length: 12
7
+ max_num_chunks: 1024
8
+ num_delimiter_layers: 3
9
+ num_byte_decoder_layers: 5
10
+ target_chunk_len: 8.0
11
+ chunk_len_loss_weight: 0.1
12
+ chunk_len_penalty: 0.1
13
+ context_window: 8192
14
+ embedding_model_type: byte_level
15
+ tokenizer_type: bpe
16
+ tokenizer_dataset_name: simple_en_wiki
17
+ tokenizer_simplify_data: true
18
+ vocab_size: 259
19
+ lm_head_type: byte_level
20
+ lm_head_normalization: rms_norm
21
+ lm_head_bias: false
22
+ lm_head_dropout: 0.0
23
+ model_shell_type: byte_autoencoder_shell
24
+ embedding_weight_tying: true
25
+ ffn_weight_tying: false
26
+ cproj_weight_tying: false
27
+ positional_encoding_type: rope
28
+ trainer:
29
+ trainer_type: base_trainer
30
+ dataset: fineweb_edu_10B
31
+ batch_size: 6
32
+ gradient_accumulation_steps: 8
33
+ max_iters: 10000
34
+ eval_interval: 50000000
35
+ log_interval: 1
36
+ checkpoint_interval: 1000
37
+ eval_iters: 1000
38
+ run_eval: false
39
+ eval:
40
+ mcq_benchmarks: null
41
+ mcq_num_samples: 1000
42
+ eval_byte_metrics: false
43
+ text_modeling_eval: false
44
+ text_generation_eval: false
45
+ optimizer:
46
+ optimizer_name: adamW
47
+ lr: 0.0005
48
+ min_lr: 5.0e-05
49
+ weight_decay: 0.01
50
+ beta1: 0.9
51
+ beta2: 0.95
52
+ grad_clip: 1.0
53
+ lr_scheduler:
54
+ name: cosine
55
+ warmup_iters: 100
56
+ dataloader:
57
+ name: autoencoder
58
+ datasampling:
59
+ name: standard
60
+ loss_fn:
61
+ name: pass_through
62
+ general:
63
+ logging:
64
+ wandb_log: true
65
+ wandb_project: SuperTinyLanguageModels
66
+ wandb_run_name: null
67
+ group_name: experimental_byte_level
68
+ paths:
69
+ output_dir: outputs
70
+ data_dir: data
71
+ checkpoint_dir: checkpoints
72
+ eval_dir: evals
73
+ seed: 489
74
+ device: cuda
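Each run directory above snapshots the fully composed training configuration under `.hydra/config.yaml`. As a minimal sketch (not part of this commit), the snapshot can be inspected with OmegaConf, which is pinned in the run's requirements.txt; only the file path and config contents come from the diff above, the rest is illustrative.

```python
# Minimal sketch (not part of this commit): load the composed Hydra config
# snapshot from one of the run directories added above and inspect it.
from omegaconf import OmegaConf  # omegaconf==2.3.0 per the run's requirements.txt

cfg = OmegaConf.load("2024-09-23/06-36-18/.hydra/config.yaml")

# Print the full composed config (model, trainer, optimizer, logging, ...).
print(OmegaConf.to_yaml(cfg))

# Top-level groups as recorded in the snapshot.
print(list(cfg.keys()))
```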
2024-09-23/06-36-18/.hydra/hydra.yaml ADDED
@@ -0,0 +1,154 @@
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task: []
115
+ job:
116
+ name: train
117
+ chdir: null
118
+ override_dirname: ''
119
+ id: ???
120
+ num: ???
121
+ config_name: experimental/byte_autoencoder_1
122
+ env_set: {}
123
+ env_copy: []
124
+ config:
125
+ override_dirname:
126
+ kv_sep: '='
127
+ item_sep: ','
128
+ exclude_keys: []
129
+ runtime:
130
+ version: 1.3.2
131
+ version_base: '1.1'
132
+ cwd: /root/SuperTinyLanguageModels
133
+ config_sources:
134
+ - path: hydra.conf
135
+ schema: pkg
136
+ provider: hydra
137
+ - path: /root/SuperTinyLanguageModels/configs/train
138
+ schema: file
139
+ provider: main
140
+ - path: ''
141
+ schema: structured
142
+ provider: schema
143
+ output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/06-36-18
144
+ choices:
145
+ hydra/env: default
146
+ hydra/callbacks: null
147
+ hydra/job_logging: default
148
+ hydra/hydra_logging: default
149
+ hydra/hydra_help: default
150
+ hydra/help: default
151
+ hydra/sweeper: basic
152
+ hydra/launcher: basic
153
+ hydra/output: default
154
+ verbose: false
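The accompanying `.hydra/hydra.yaml` records how the run was composed: config name `experimental/byte_autoencoder_1`, the config search path `configs/train`, and the `outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}` run-directory pattern that produced these folder names. A hypothetical entry point of the following shape would produce such a snapshot; the actual `train.py` in the SuperTinyLanguageModels repository may differ.

```python
# Hypothetical sketch of a Hydra entry point consistent with the hydra.yaml
# above; not the actual train.py from the repository.
import hydra
from omegaconf import DictConfig, OmegaConf

@hydra.main(config_path="configs/train", config_name=None, version_base=None)
def main(cfg: DictConfig) -> None:
    # Hydra creates outputs/<date>/<time> per hydra.run.dir and writes the
    # .hydra/ snapshots shown in this commit before the job body runs.
    print(OmegaConf.to_yaml(cfg))

if __name__ == "__main__":
    main()

# Launched as: python train.py --config-name experimental/byte_autoencoder_1
```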
2024-09-23/06-36-18/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
+ []
2024-09-23/06-36-18/train.log ADDED
File without changes
2024-09-23/07-06-14/.hydra/config.yaml ADDED
@@ -0,0 +1,74 @@
1
+ experimental:
2
+ model:
3
+ core_model_type: pass_through
4
+ hidden_dim: 384
5
+ byte_hidden: 128
6
+ max_chunk_length: 12
7
+ max_num_chunks: 1024
8
+ num_delimiter_layers: 3
9
+ num_byte_decoder_layers: 5
10
+ target_chunk_len: 8.0
11
+ chunk_len_loss_weight: 0.1
12
+ chunk_len_penalty: 0.1
13
+ context_window: 8192
14
+ embedding_model_type: byte_level
15
+ tokenizer_type: bpe
16
+ tokenizer_dataset_name: simple_en_wiki
17
+ tokenizer_simplify_data: true
18
+ vocab_size: 259
19
+ lm_head_type: byte_level
20
+ lm_head_normalization: rms_norm
21
+ lm_head_bias: false
22
+ lm_head_dropout: 0.0
23
+ model_shell_type: byte_autoencoder_shell
24
+ embedding_weight_tying: true
25
+ ffn_weight_tying: false
26
+ cproj_weight_tying: false
27
+ positional_encoding_type: rope
28
+ trainer:
29
+ trainer_type: base_trainer
30
+ dataset: fineweb_edu_10B
31
+ batch_size: 6
32
+ gradient_accumulation_steps: 8
33
+ max_iters: 10000
34
+ eval_interval: 50000000
35
+ log_interval: 1
36
+ checkpoint_interval: 1000
37
+ eval_iters: 1000
38
+ run_eval: false
39
+ eval:
40
+ mcq_benchmarks: null
41
+ mcq_num_samples: 1000
42
+ eval_byte_metrics: false
43
+ text_modeling_eval: false
44
+ text_generation_eval: false
45
+ optimizer:
46
+ optimizer_name: adamW
47
+ lr: 0.0005
48
+ min_lr: 5.0e-05
49
+ weight_decay: 0.01
50
+ beta1: 0.9
51
+ beta2: 0.95
52
+ grad_clip: 1.0
53
+ lr_scheduler:
54
+ name: cosine
55
+ warmup_iters: 100
56
+ dataloader:
57
+ name: autoencoder
58
+ datasampling:
59
+ name: standard
60
+ loss_fn:
61
+ name: pass_through
62
+ general:
63
+ logging:
64
+ wandb_log: true
65
+ wandb_project: SuperTinyLanguageModels
66
+ wandb_run_name: null
67
+ group_name: experimental_byte_level
68
+ paths:
69
+ output_dir: outputs
70
+ data_dir: data
71
+ checkpoint_dir: checkpoints
72
+ eval_dir: evals
73
+ seed: 489
74
+ device: cuda
2024-09-23/07-06-14/.hydra/hydra.yaml ADDED
@@ -0,0 +1,154 @@
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task: []
115
+ job:
116
+ name: train
117
+ chdir: null
118
+ override_dirname: ''
119
+ id: ???
120
+ num: ???
121
+ config_name: experimental/byte_autoencoder_1
122
+ env_set: {}
123
+ env_copy: []
124
+ config:
125
+ override_dirname:
126
+ kv_sep: '='
127
+ item_sep: ','
128
+ exclude_keys: []
129
+ runtime:
130
+ version: 1.3.2
131
+ version_base: '1.1'
132
+ cwd: /root/SuperTinyLanguageModels
133
+ config_sources:
134
+ - path: hydra.conf
135
+ schema: pkg
136
+ provider: hydra
137
+ - path: /root/SuperTinyLanguageModels/configs/train
138
+ schema: file
139
+ provider: main
140
+ - path: ''
141
+ schema: structured
142
+ provider: schema
143
+ output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/07-06-14
144
+ choices:
145
+ hydra/env: default
146
+ hydra/callbacks: null
147
+ hydra/job_logging: default
148
+ hydra/hydra_logging: default
149
+ hydra/hydra_help: default
150
+ hydra/help: default
151
+ hydra/sweeper: basic
152
+ hydra/launcher: basic
153
+ hydra/output: default
154
+ verbose: false
2024-09-23/07-06-14/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
+ []
2024-09-23/07-06-14/train.log ADDED
File without changes
2024-09-23/08-39-13/.hydra/config.yaml ADDED
@@ -0,0 +1,74 @@
1
+ experimental:
2
+ model:
3
+ core_model_type: pass_through
4
+ hidden_dim: 384
5
+ byte_hidden: 128
6
+ max_chunk_length: 12
7
+ max_num_chunks: 1024
8
+ num_delimiter_layers: 3
9
+ num_byte_decoder_layers: 5
10
+ target_chunk_len: 8.0
11
+ chunk_len_loss_weight: 0.1
12
+ chunk_len_penalty: 0.1
13
+ context_window: 8192
14
+ embedding_model_type: byte_level
15
+ tokenizer_type: bpe
16
+ tokenizer_dataset_name: simple_en_wiki
17
+ tokenizer_simplify_data: true
18
+ vocab_size: 259
19
+ lm_head_type: byte_level
20
+ lm_head_normalization: rms_norm
21
+ lm_head_bias: false
22
+ lm_head_dropout: 0.0
23
+ model_shell_type: byte_autoencoder_shell
24
+ embedding_weight_tying: true
25
+ ffn_weight_tying: false
26
+ cproj_weight_tying: false
27
+ positional_encoding_type: rope
28
+ trainer:
29
+ trainer_type: base_trainer
30
+ dataset: fineweb_edu_10B
31
+ batch_size: 6
32
+ gradient_accumulation_steps: 8
33
+ max_iters: 10000
34
+ eval_interval: 50000000
35
+ log_interval: 1
36
+ checkpoint_interval: 1000
37
+ eval_iters: 1000
38
+ run_eval: false
39
+ eval:
40
+ mcq_benchmarks: null
41
+ mcq_num_samples: 1000
42
+ eval_byte_metrics: false
43
+ text_modeling_eval: false
44
+ text_generation_eval: false
45
+ optimizer:
46
+ optimizer_name: adamW
47
+ lr: 0.0005
48
+ min_lr: 5.0e-05
49
+ weight_decay: 0.01
50
+ beta1: 0.9
51
+ beta2: 0.95
52
+ grad_clip: 1.0
53
+ lr_scheduler:
54
+ name: cosine
55
+ warmup_iters: 100
56
+ dataloader:
57
+ name: autoencoder
58
+ datasampling:
59
+ name: standard
60
+ loss_fn:
61
+ name: pass_through
62
+ general:
63
+ logging:
64
+ wandb_log: true
65
+ wandb_project: SuperTinyLanguageModels
66
+ wandb_run_name: null
67
+ group_name: experimental_byte_level
68
+ paths:
69
+ output_dir: outputs
70
+ data_dir: data
71
+ checkpoint_dir: checkpoints
72
+ eval_dir: evals
73
+ seed: 489
74
+ device: cuda
2024-09-23/08-39-13/.hydra/hydra.yaml ADDED
@@ -0,0 +1,154 @@
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task: []
115
+ job:
116
+ name: train
117
+ chdir: null
118
+ override_dirname: ''
119
+ id: ???
120
+ num: ???
121
+ config_name: experimental/byte_autoencoder_1
122
+ env_set: {}
123
+ env_copy: []
124
+ config:
125
+ override_dirname:
126
+ kv_sep: '='
127
+ item_sep: ','
128
+ exclude_keys: []
129
+ runtime:
130
+ version: 1.3.2
131
+ version_base: '1.1'
132
+ cwd: /root/SuperTinyLanguageModels
133
+ config_sources:
134
+ - path: hydra.conf
135
+ schema: pkg
136
+ provider: hydra
137
+ - path: /root/SuperTinyLanguageModels/configs/train
138
+ schema: file
139
+ provider: main
140
+ - path: ''
141
+ schema: structured
142
+ provider: schema
143
+ output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/08-39-13
144
+ choices:
145
+ hydra/env: default
146
+ hydra/callbacks: null
147
+ hydra/job_logging: default
148
+ hydra/hydra_logging: default
149
+ hydra/hydra_help: default
150
+ hydra/help: default
151
+ hydra/sweeper: basic
152
+ hydra/launcher: basic
153
+ hydra/output: default
154
+ verbose: false
2024-09-23/08-39-13/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
+ []
2024-09-23/08-39-13/train.log ADDED
File without changes
2024-09-23/08-40-08/.hydra/config.yaml ADDED
@@ -0,0 +1,74 @@
1
+ experimental:
2
+ model:
3
+ core_model_type: pass_through
4
+ hidden_dim: 384
5
+ byte_hidden: 128
6
+ max_chunk_length: 12
7
+ max_num_chunks: 1024
8
+ num_delimiter_layers: 3
9
+ num_byte_decoder_layers: 5
10
+ target_chunk_len: 8.0
11
+ chunk_len_loss_weight: 0.1
12
+ chunk_len_penalty: 0.1
13
+ context_window: 8192
14
+ embedding_model_type: byte_level
15
+ tokenizer_type: bpe
16
+ tokenizer_dataset_name: simple_en_wiki
17
+ tokenizer_simplify_data: true
18
+ vocab_size: 259
19
+ lm_head_type: byte_level
20
+ lm_head_normalization: rms_norm
21
+ lm_head_bias: false
22
+ lm_head_dropout: 0.0
23
+ model_shell_type: byte_autoencoder_shell
24
+ embedding_weight_tying: true
25
+ ffn_weight_tying: false
26
+ cproj_weight_tying: false
27
+ positional_encoding_type: rope
28
+ trainer:
29
+ trainer_type: base_trainer
30
+ dataset: fineweb_edu_10B
31
+ batch_size: 6
32
+ gradient_accumulation_steps: 8
33
+ max_iters: 10000
34
+ eval_interval: 50000000
35
+ log_interval: 1
36
+ checkpoint_interval: 1000
37
+ eval_iters: 1000
38
+ run_eval: false
39
+ eval:
40
+ mcq_benchmarks: null
41
+ mcq_num_samples: 1000
42
+ eval_byte_metrics: false
43
+ text_modeling_eval: false
44
+ text_generation_eval: false
45
+ optimizer:
46
+ optimizer_name: adamW
47
+ lr: 0.0005
48
+ min_lr: 5.0e-05
49
+ weight_decay: 0.01
50
+ beta1: 0.9
51
+ beta2: 0.95
52
+ grad_clip: 1.0
53
+ lr_scheduler:
54
+ name: cosine
55
+ warmup_iters: 100
56
+ dataloader:
57
+ name: autoencoder
58
+ datasampling:
59
+ name: standard
60
+ loss_fn:
61
+ name: pass_through
62
+ general:
63
+ logging:
64
+ wandb_log: true
65
+ wandb_project: SuperTinyLanguageModels
66
+ wandb_run_name: null
67
+ group_name: experimental_byte_level
68
+ paths:
69
+ output_dir: outputs
70
+ data_dir: data
71
+ checkpoint_dir: checkpoints
72
+ eval_dir: evals
73
+ seed: 489
74
+ device: cuda
2024-09-23/08-40-08/.hydra/hydra.yaml ADDED
@@ -0,0 +1,154 @@
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task: []
115
+ job:
116
+ name: train
117
+ chdir: null
118
+ override_dirname: ''
119
+ id: ???
120
+ num: ???
121
+ config_name: experimental/byte_autoencoder_1
122
+ env_set: {}
123
+ env_copy: []
124
+ config:
125
+ override_dirname:
126
+ kv_sep: '='
127
+ item_sep: ','
128
+ exclude_keys: []
129
+ runtime:
130
+ version: 1.3.2
131
+ version_base: '1.1'
132
+ cwd: /root/SuperTinyLanguageModels
133
+ config_sources:
134
+ - path: hydra.conf
135
+ schema: pkg
136
+ provider: hydra
137
+ - path: /root/SuperTinyLanguageModels/configs/train
138
+ schema: file
139
+ provider: main
140
+ - path: ''
141
+ schema: structured
142
+ provider: schema
143
+ output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08
144
+ choices:
145
+ hydra/env: default
146
+ hydra/callbacks: null
147
+ hydra/job_logging: default
148
+ hydra/hydra_logging: default
149
+ hydra/hydra_help: default
150
+ hydra/help: default
151
+ hydra/sweeper: basic
152
+ hydra/launcher: basic
153
+ hydra/output: default
154
+ verbose: false
2024-09-23/08-40-08/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
+ []
2024-09-23/08-40-08/train.log ADDED
File without changes
2024-09-23/08-40-08/wandb/debug-internal.log ADDED
@@ -0,0 +1,14 @@
1
+ {"time":"2024-09-23T09:14:22.59580271Z","level":"INFO","msg":"using version","core version":"0.18.1"}
2
+ {"time":"2024-09-23T09:14:22.59581747Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log"}
3
+ {"time":"2024-09-23T09:14:22.595881422Z","level":"INFO","msg":"using version","core version":"0.18.1"}
4
+ {"time":"2024-09-23T09:14:22.595887882Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log"}
5
+ {"time":"2024-09-23T09:14:22.59917443Z","level":"INFO","msg":"created new stream","id":"a2kxhd8v"}
6
+ {"time":"2024-09-23T09:14:22.59919309Z","level":"INFO","msg":"stream: started","id":"a2kxhd8v"}
7
+ {"time":"2024-09-23T09:14:22.59921417Z","level":"INFO","msg":"sender: started","stream_id":{"value":"a2kxhd8v"}}
8
+ {"time":"2024-09-23T09:14:22.599226691Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"a2kxhd8v"}}
9
+ {"time":"2024-09-23T09:14:22.599236461Z","level":"INFO","msg":"handler: started","stream_id":{"value":"a2kxhd8v"}}
10
+ {"time":"2024-09-23T09:14:22.982350736Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
11
+ {"time":"2024-09-23T09:14:22.985015444Z","level":"INFO","msg":"Starting system monitor"}
12
+ {"time":"2024-09-23T09:14:27.10372121Z","level":"INFO","msg":"stream: closing","id":"a2kxhd8v"}
13
+ {"time":"2024-09-23T09:14:27.103806442Z","level":"INFO","msg":"Stopping system monitor"}
14
+ {"time":"2024-09-23T09:14:27.104964992Z","level":"INFO","msg":"Stopped system monitor"}
2024-09-23/08-40-08/wandb/debug.log ADDED
@@ -0,0 +1,26 @@
1
+ 2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
2
+ 2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Configure stats pid to 78108
3
+ 2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
4
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/settings
5
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
6
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/SuperTinyLanguageModels/train.py', 'program': '/root/SuperTinyLanguageModels/train.py'}
8
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Applying login settings: {}
9
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:_log_setup():532] Logging user logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug.log
10
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:_log_setup():533] Logging internal logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-internal.log
11
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():616] calling init triggers
12
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
13
+ config: {'model': {'core_model_type': 'pass_through', 'hidden_dim': 384, 'byte_hidden': 128, 'max_chunk_length': 12, 'max_num_chunks': 1024, 'num_delimiter_layers': 3, 'num_byte_decoder_layers': 5, 'target_chunk_len': 8.0, 'chunk_len_loss_weight': 0.1, 'chunk_len_penalty': 0.1, 'context_window': 8192, 'embedding_model_type': 'byte_level', 'tokenizer_type': 'bpe', 'tokenizer_dataset_name': 'simple_en_wiki', 'tokenizer_simplify_data': True, 'vocab_size': 259, 'lm_head_type': 'byte_level', 'lm_head_normalization': 'rms_norm', 'lm_head_bias': False, 'lm_head_dropout': 0.0, 'model_shell_type': 'byte_autoencoder_shell', 'embedding_weight_tying': True, 'ffn_weight_tying': False, 'cproj_weight_tying': False, 'positional_encoding_type': 'rope'}, 'trainer': {'trainer_type': 'base_trainer', 'dataset': 'fineweb_edu_10B', 'batch_size': 6, 'gradient_accumulation_steps': 8, 'max_iters': 10000, 'eval_interval': 50000000, 'log_interval': 1, 'checkpoint_interval': 1000, 'eval_iters': 1000, 'run_eval': False, 'eval': {'mcq_benchmarks': None, 'mcq_num_samples': 1000, 'eval_byte_metrics': False, 'text_modeling_eval': False, 'text_generation_eval': False}, 'optimizer': {'optimizer_name': 'adamW', 'lr': 0.0005, 'min_lr': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0}, 'lr_scheduler': {'name': 'cosine', 'warmup_iters': 100}, 'dataloader': {'name': 'autoencoder'}, 'datasampling': {'name': 'standard'}, 'loss_fn': {'name': 'pass_through'}}, 'general': {'logging': {'wandb_log': True, 'wandb_project': 'SuperTinyLanguageModels', 'wandb_run_name': None, 'group_name': 'experimental_byte_level'}, 'paths': {'output_dir': 'outputs', 'data_dir': '/root/SuperTinyLanguageModels/data', 'checkpoint_dir': 'checkpoints', 'eval_dir': '/root/SuperTinyLanguageModels/evals'}, 'seed': 489, 'device': 'cuda'}}
14
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():666] starting backend
15
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():670] setting up manager
16
+ 2024-09-23 09:14:22,584 INFO MainThread:78108 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-09-23 09:14:22,586 INFO MainThread:78108 [wandb_init.py:init():678] backend started and connected
18
+ 2024-09-23 09:14:22,588 INFO MainThread:78108 [wandb_init.py:init():773] updated telemetry
19
+ 2024-09-23 09:14:22,598 INFO MainThread:78108 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
20
+ 2024-09-23 09:14:22,974 INFO MainThread:78108 [wandb_init.py:init():857] starting run threads in backend
21
+ 2024-09-23 09:14:23,128 INFO MainThread:78108 [wandb_run.py:_console_start():2459] atexit reg
22
+ 2024-09-23 09:14:23,128 INFO MainThread:78108 [wandb_run.py:_redirect():2307] redirect: wrap_raw
23
+ 2024-09-23 09:14:23,129 INFO MainThread:78108 [wandb_run.py:_redirect():2372] Wrapping output streams.
24
+ 2024-09-23 09:14:23,129 INFO MainThread:78108 [wandb_run.py:_redirect():2397] Redirects installed.
25
+ 2024-09-23 09:14:23,135 INFO MainThread:78108 [wandb_init.py:init():900] run started, returning control to user process
26
+ 2024-09-23 09:14:27,104 WARNING MsgRouterThr:78108 [router.py:message_loop():77] message_loop has been closed
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/config.yaml ADDED
@@ -0,0 +1,114 @@
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.18.1
4
+ m: []
5
+ python_version: 3.10.14
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 5
10
+ - 11
11
+ - 49
12
+ - 50
13
+ - 51
14
+ - 53
15
+ - 55
16
+ "2":
17
+ - 1
18
+ - 5
19
+ - 11
20
+ - 49
21
+ - 50
22
+ - 51
23
+ - 53
24
+ - 55
25
+ "3":
26
+ - 13
27
+ - 15
28
+ - 16
29
+ - 23
30
+ - 55
31
+ "4": 3.10.14
32
+ "5": 0.18.1
33
+ "6": 4.44.2
34
+ "8":
35
+ - 5
36
+ - 9
37
+ "12": 0.18.1
38
+ "13": linux-x86_64
39
+ general:
40
+ value:
41
+ device: cuda
42
+ logging:
43
+ group_name: experimental_byte_level
44
+ wandb_log: true
45
+ wandb_project: SuperTinyLanguageModels
46
+ wandb_run_name: null
47
+ paths:
48
+ checkpoint_dir: checkpoints
49
+ data_dir: /root/SuperTinyLanguageModels/data
50
+ eval_dir: /root/SuperTinyLanguageModels/evals
51
+ output_dir: outputs
52
+ seed: 489
53
+ model:
54
+ value:
55
+ byte_hidden: 128
56
+ chunk_len_loss_weight: 0.1
57
+ chunk_len_penalty: 0.1
58
+ context_window: 8192
59
+ core_model_type: pass_through
60
+ cproj_weight_tying: false
61
+ embedding_model_type: byte_level
62
+ embedding_weight_tying: true
63
+ ffn_weight_tying: false
64
+ hidden_dim: 384
65
+ lm_head_bias: false
66
+ lm_head_dropout: 0
67
+ lm_head_normalization: rms_norm
68
+ lm_head_type: byte_level
69
+ max_chunk_length: 12
70
+ max_num_chunks: 1024
71
+ model_shell_type: byte_autoencoder_shell
72
+ num_byte_decoder_layers: 5
73
+ num_delimiter_layers: 3
74
+ positional_encoding_type: rope
75
+ target_chunk_len: 8
76
+ tokenizer_dataset_name: simple_en_wiki
77
+ tokenizer_simplify_data: true
78
+ tokenizer_type: bpe
79
+ vocab_size: 259
80
+ trainer:
81
+ value:
82
+ batch_size: 6
83
+ checkpoint_interval: 1000
84
+ dataloader:
85
+ name: autoencoder
86
+ datasampling:
87
+ name: standard
88
+ dataset: fineweb_edu_10B
89
+ eval:
90
+ eval_byte_metrics: false
91
+ mcq_benchmarks: null
92
+ mcq_num_samples: 1000
93
+ text_generation_eval: false
94
+ text_modeling_eval: false
95
+ eval_interval: 50000000
96
+ eval_iters: 1000
97
+ gradient_accumulation_steps: 8
98
+ log_interval: 1
99
+ loss_fn:
100
+ name: pass_through
101
+ lr_scheduler:
102
+ name: cosine
103
+ warmup_iters: 100
104
+ max_iters: 10000
105
+ optimizer:
106
+ beta1: 0.9
107
+ beta2: 0.95
108
+ grad_clip: 1
109
+ lr: 0.0005
110
+ min_lr: 5e-05
111
+ optimizer_name: adamW
112
+ weight_decay: 0.01
113
+ run_eval: false
114
+ trainer_type: base_trainer
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/output.log ADDED
@@ -0,0 +1,3 @@
+ Weight and Biases Initialized
+ Rank0 Trainer built
+ Training loop is starting
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/requirements.txt ADDED
@@ -0,0 +1,121 @@
1
+ setuptools==75.1.0
2
+ wheel==0.44.0
3
+ pip==24.2
4
+ wcwidth==0.2.13
5
+ sentencepiece==0.2.0
6
+ pytz==2024.2
7
+ mpmath==1.3.0
8
+ distlib==0.3.8
9
+ antlr4-python3-runtime==4.9.3
10
+ xxhash==3.5.0
11
+ urllib3==2.2.3
12
+ tzdata==2024.1
13
+ typing_extensions==4.12.2
14
+ tqdm==4.66.5
15
+ threadpoolctl==3.5.0
16
+ sympy==1.13.3
17
+ smmap==5.0.1
18
+ six==1.16.0
19
+ setproctitle==1.3.3
20
+ safetensors==0.4.5
21
+ regex==2024.9.11
22
+ rapidfuzz==3.9.7
23
+ PyYAML==6.0.2
24
+ pytrec-eval-terrier==0.5.6
25
+ pyphen==0.16.0
26
+ Pygments==2.18.0
27
+ psutil==6.0.0
28
+ protobuf==5.28.2
29
+ prettytable==3.11.0
30
+ polars==1.7.1
31
+ platformdirs==4.3.6
32
+ pillow==10.4.0
33
+ packaging==24.1
34
+ nvidia-nvtx-cu12==12.1.105
35
+ nvidia-nvjitlink-cu12==12.6.68
36
+ nvidia-nccl-cu12==2.20.5
37
+ nvidia-curand-cu12==10.3.2.106
38
+ nvidia-cufft-cu12==11.0.2.54
39
+ nvidia-cuda-runtime-cu12==12.1.105
40
+ nvidia-cuda-nvrtc-cu12==12.1.105
41
+ nvidia-cuda-cupti-cu12==12.1.105
42
+ nvidia-cublas-cu12==12.1.3.1
43
+ numpy==1.26.4
44
+ nodeenv==1.9.1
45
+ networkx==3.3
46
+ mdurl==0.1.2
47
+ MarkupSafe==2.1.5
48
+ joblib==1.4.2
49
+ idna==3.10
50
+ identify==2.6.1
51
+ fsspec==2024.6.1
52
+ frozenlist==1.4.1
53
+ filelock==3.16.1
54
+ eval_type_backport==0.2.0
55
+ dill==0.3.8
56
+ click==8.1.7
57
+ charset-normalizer==3.3.2
58
+ cfgv==3.4.0
59
+ certifi==2024.8.30
60
+ attrs==24.2.0
61
+ async-timeout==4.0.3
62
+ annotated-types==0.7.0
63
+ aiohappyeyeballs==2.4.0
64
+ virtualenv==20.26.5
65
+ triton==3.0.0
66
+ textstat==0.7.4
67
+ sentry-sdk==2.14.0
68
+ scipy==1.14.1
69
+ requests==2.32.3
70
+ python-dateutil==2.9.0.post0
71
+ pydantic_core==2.23.4
72
+ pyarrow==17.0.0
73
+ omegaconf==2.3.0
74
+ nvidia-cusparse-cu12==12.1.0.106
75
+ nvidia-cudnn-cu12==9.1.0.70
76
+ nltk==3.9.1
77
+ multiprocess==0.70.16
78
+ multidict==6.1.0
79
+ markdown-it-py==3.0.0
80
+ Levenshtein==0.26.0
81
+ Jinja2==3.1.4
82
+ gitdb==4.0.11
83
+ docker-pycreds==0.4.0
84
+ aiosignal==1.3.1
85
+ yarl==1.11.1
86
+ tiktoken==0.7.0
87
+ scikit-learn==1.5.2
88
+ rich==13.8.1
89
+ pydantic==2.9.2
90
+ pre-commit==3.8.0
91
+ pandas==2.2.3
92
+ nvidia-cusolver-cu12==11.4.5.107
93
+ language_tool_python==2.8.1
94
+ hydra-core==1.3.2
95
+ huggingface-hub==0.25.0
96
+ GitPython==3.1.43
97
+ wandb==0.18.1
98
+ torch==2.4.1
99
+ tokenizers==0.19.1
100
+ aiohttp==3.10.5
101
+ transformers==4.44.2
102
+ sentence-transformers==3.1.1
103
+ datasets==3.0.0
104
+ mteb==1.14.21
105
+ autocommand==2.2.2
106
+ backports.tarfile==1.2.0
107
+ importlib_metadata==8.0.0
108
+ importlib_resources==6.4.0
109
+ inflect==7.3.1
110
+ jaraco.collections==5.1.0
111
+ jaraco.context==5.3.0
112
+ jaraco.functools==4.0.1
113
+ jaraco.text==3.12.1
114
+ more-itertools==10.3.0
115
+ packaging==24.1
116
+ platformdirs==4.2.2
117
+ tomli==2.0.1
118
+ typeguard==4.3.0
119
+ typing_extensions==4.12.2
120
+ wheel==0.43.0
121
+ zipp==3.19.2
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/wandb-metadata.json ADDED
@@ -0,0 +1,88 @@
1
+ {
2
+ "os": "Linux-5.15.0-117-generic-x86_64-with-glibc2.31",
3
+ "python": "3.10.14",
4
+ "startedAt": "2024-09-23T09:14:22.586171Z",
5
+ "args": [
6
+ "--config-name",
7
+ "experimental/byte_autoencoder_1"
8
+ ],
9
+ "program": "/root/SuperTinyLanguageModels/train.py",
10
+ "codePath": "train.py",
11
+ "git": {
12
+ "remote": "https://github.com/LeonGuertler/SuperTinyLanguageModels.git",
13
+ "commit": "ebdf9039e89c5d337997d0c2b11bf4e992886243"
14
+ },
15
+ "email": "[email protected]",
16
+ "root": "/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08",
17
+ "host": "11c6e13f6a55",
18
+ "username": "root",
19
+ "executable": "/root/SuperTinyLanguageModels/.conda/bin/python3",
20
+ "cpu_count": 128,
21
+ "cpu_count_logical": 256,
22
+ "gpu": "[NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090]",
23
+ "gpu_count": 8,
24
+ "disk": {
25
+ "/": {
26
+ "total": "1123133947904",
27
+ "used": "551794225152"
28
+ }
29
+ },
30
+ "memory": {
31
+ "total": "540812599296"
32
+ },
33
+ "cpu": {
34
+ "count": 128,
35
+ "countLogical": 256
36
+ },
37
+ "gpu_nvidia": [
38
+ {
39
+ "name": "NVIDIA GeForce RTX 4090",
40
+ "memoryTotal": "25757220864",
41
+ "cudaCores": 16384,
42
+ "architecture": "Ada"
43
+ },
44
+ {
45
+ "name": "NVIDIA GeForce RTX 4090",
46
+ "memoryTotal": "25757220864",
47
+ "cudaCores": 16384,
48
+ "architecture": "Ada"
49
+ },
50
+ {
51
+ "name": "NVIDIA GeForce RTX 4090",
52
+ "memoryTotal": "25757220864",
53
+ "cudaCores": 16384,
54
+ "architecture": "Ada"
55
+ },
56
+ {
57
+ "name": "NVIDIA GeForce RTX 4090",
58
+ "memoryTotal": "25757220864",
59
+ "cudaCores": 16384,
60
+ "architecture": "Ada"
61
+ },
62
+ {
63
+ "name": "NVIDIA GeForce RTX 4090",
64
+ "memoryTotal": "25757220864",
65
+ "cudaCores": 16384,
66
+ "architecture": "Ada"
67
+ },
68
+ {
69
+ "name": "NVIDIA GeForce RTX 4090",
70
+ "memoryTotal": "25757220864",
71
+ "cudaCores": 16384,
72
+ "architecture": "Ada"
73
+ },
74
+ {
75
+ "name": "NVIDIA GeForce RTX 4090",
76
+ "memoryTotal": "25757220864",
77
+ "cudaCores": 16384,
78
+ "architecture": "Ada"
79
+ },
80
+ {
81
+ "name": "NVIDIA GeForce RTX 4090",
82
+ "memoryTotal": "25757220864",
83
+ "cudaCores": 16384,
84
+ "architecture": "Ada"
85
+ }
86
+ ],
87
+ "cudaVersion": "12.5"
88
+ }
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb":{"runtime":4}}
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log ADDED
@@ -0,0 +1,12 @@
1
+ {"time":"2024-09-23T09:14:21.933081362Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp9hgpve6u/port-78108.txt","pid":78108,"debug":false,"disable-analytics":false}
2
+ {"time":"2024-09-23T09:14:21.933136193Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2024-09-23T09:14:21.935284221Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":78108}
4
+ {"time":"2024-09-23T09:14:21.935348272Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":43153,"Zone":""}}
5
+ {"time":"2024-09-23T09:14:22.076126266Z","level":"INFO","msg":"created new connection","id":"127.0.0.1:57616"}
6
+ {"time":"2024-09-23T09:14:22.595626377Z","level":"INFO","msg":"connection init received","streamId":"a2kxhd8v","id":"127.0.0.1:57616"}
7
+ {"time":"2024-09-23T09:14:22.595853241Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240923_091421.log /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log: file exists"}
8
+ {"time":"2024-09-23T09:14:22.59919809Z","level":"INFO","msg":"connection init completed","streamId":"a2kxhd8v","id":"127.0.0.1:57616"}
9
+ {"time":"2024-09-23T09:14:27.103590738Z","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:57616"}
10
+ {"time":"2024-09-23T09:14:27.103797162Z","level":"INFO","msg":"server is shutting down"}
11
+ {"time":"2024-09-23T09:14:27.104072727Z","level":"INFO","msg":"closed connection","id":"127.0.0.1:57616"}
12
+ {"time":"2024-09-23T09:14:28.465863147Z","level":"INFO","msg":"Parent process exited, terminating service process."}
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-internal.log ADDED
@@ -0,0 +1,14 @@
1
+ {"time":"2024-09-23T09:14:22.59580271Z","level":"INFO","msg":"using version","core version":"0.18.1"}
2
+ {"time":"2024-09-23T09:14:22.59581747Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log"}
3
+ {"time":"2024-09-23T09:14:22.595881422Z","level":"INFO","msg":"using version","core version":"0.18.1"}
4
+ {"time":"2024-09-23T09:14:22.595887882Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log"}
5
+ {"time":"2024-09-23T09:14:22.59917443Z","level":"INFO","msg":"created new stream","id":"a2kxhd8v"}
6
+ {"time":"2024-09-23T09:14:22.59919309Z","level":"INFO","msg":"stream: started","id":"a2kxhd8v"}
7
+ {"time":"2024-09-23T09:14:22.59921417Z","level":"INFO","msg":"sender: started","stream_id":{"value":"a2kxhd8v"}}
8
+ {"time":"2024-09-23T09:14:22.599226691Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"a2kxhd8v"}}
9
+ {"time":"2024-09-23T09:14:22.599236461Z","level":"INFO","msg":"handler: started","stream_id":{"value":"a2kxhd8v"}}
10
+ {"time":"2024-09-23T09:14:22.982350736Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
11
+ {"time":"2024-09-23T09:14:22.985015444Z","level":"INFO","msg":"Starting system monitor"}
12
+ {"time":"2024-09-23T09:14:27.10372121Z","level":"INFO","msg":"stream: closing","id":"a2kxhd8v"}
13
+ {"time":"2024-09-23T09:14:27.103806442Z","level":"INFO","msg":"Stopping system monitor"}
14
+ {"time":"2024-09-23T09:14:27.104964992Z","level":"INFO","msg":"Stopped system monitor"}
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug.log ADDED
@@ -0,0 +1,26 @@
1
+ 2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
2
+ 2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Configure stats pid to 78108
3
+ 2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
4
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/settings
5
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
6
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/SuperTinyLanguageModels/train.py', 'program': '/root/SuperTinyLanguageModels/train.py'}
8
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Applying login settings: {}
9
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:_log_setup():532] Logging user logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug.log
10
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:_log_setup():533] Logging internal logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-internal.log
11
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():616] calling init triggers
12
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
13
+ config: {'model': {'core_model_type': 'pass_through', 'hidden_dim': 384, 'byte_hidden': 128, 'max_chunk_length': 12, 'max_num_chunks': 1024, 'num_delimiter_layers': 3, 'num_byte_decoder_layers': 5, 'target_chunk_len': 8.0, 'chunk_len_loss_weight': 0.1, 'chunk_len_penalty': 0.1, 'context_window': 8192, 'embedding_model_type': 'byte_level', 'tokenizer_type': 'bpe', 'tokenizer_dataset_name': 'simple_en_wiki', 'tokenizer_simplify_data': True, 'vocab_size': 259, 'lm_head_type': 'byte_level', 'lm_head_normalization': 'rms_norm', 'lm_head_bias': False, 'lm_head_dropout': 0.0, 'model_shell_type': 'byte_autoencoder_shell', 'embedding_weight_tying': True, 'ffn_weight_tying': False, 'cproj_weight_tying': False, 'positional_encoding_type': 'rope'}, 'trainer': {'trainer_type': 'base_trainer', 'dataset': 'fineweb_edu_10B', 'batch_size': 6, 'gradient_accumulation_steps': 8, 'max_iters': 10000, 'eval_interval': 50000000, 'log_interval': 1, 'checkpoint_interval': 1000, 'eval_iters': 1000, 'run_eval': False, 'eval': {'mcq_benchmarks': None, 'mcq_num_samples': 1000, 'eval_byte_metrics': False, 'text_modeling_eval': False, 'text_generation_eval': False}, 'optimizer': {'optimizer_name': 'adamW', 'lr': 0.0005, 'min_lr': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0}, 'lr_scheduler': {'name': 'cosine', 'warmup_iters': 100}, 'dataloader': {'name': 'autoencoder'}, 'datasampling': {'name': 'standard'}, 'loss_fn': {'name': 'pass_through'}}, 'general': {'logging': {'wandb_log': True, 'wandb_project': 'SuperTinyLanguageModels', 'wandb_run_name': None, 'group_name': 'experimental_byte_level'}, 'paths': {'output_dir': 'outputs', 'data_dir': '/root/SuperTinyLanguageModels/data', 'checkpoint_dir': 'checkpoints', 'eval_dir': '/root/SuperTinyLanguageModels/evals'}, 'seed': 489, 'device': 'cuda'}}
14
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():666] starting backend
15
+ 2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():670] setting up manager
16
+ 2024-09-23 09:14:22,584 INFO MainThread:78108 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-09-23 09:14:22,586 INFO MainThread:78108 [wandb_init.py:init():678] backend started and connected
18
+ 2024-09-23 09:14:22,588 INFO MainThread:78108 [wandb_init.py:init():773] updated telemetry
19
+ 2024-09-23 09:14:22,598 INFO MainThread:78108 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
20
+ 2024-09-23 09:14:22,974 INFO MainThread:78108 [wandb_init.py:init():857] starting run threads in backend
21
+ 2024-09-23 09:14:23,128 INFO MainThread:78108 [wandb_run.py:_console_start():2459] atexit reg
22
+ 2024-09-23 09:14:23,128 INFO MainThread:78108 [wandb_run.py:_redirect():2307] redirect: wrap_raw
23
+ 2024-09-23 09:14:23,129 INFO MainThread:78108 [wandb_run.py:_redirect():2372] Wrapping output streams.
24
+ 2024-09-23 09:14:23,129 INFO MainThread:78108 [wandb_run.py:_redirect():2397] Redirects installed.
25
+ 2024-09-23 09:14:23,135 INFO MainThread:78108 [wandb_init.py:init():900] run started, returning control to user process
26
+ 2024-09-23 09:14:27,104 WARNING MsgRouterThr:78108 [router.py:message_loop():77] message_loop has been closed
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/run-a2kxhd8v.wandb ADDED
File without changes
2024-09-23/09-32-28/.hydra/config.yaml ADDED
@@ -0,0 +1,74 @@
1
+ experimental:
2
+ model:
3
+ core_model_type: pass_through
4
+ hidden_dim: 384
5
+ byte_hidden: 128
6
+ max_chunk_length: 12
7
+ max_num_chunks: 1024
8
+ num_delimiter_layers: 3
9
+ num_byte_decoder_layers: 5
10
+ target_chunk_len: 8.0
11
+ chunk_len_loss_weight: 0.1
12
+ chunk_len_penalty: 0.1
13
+ context_window: 8192
14
+ embedding_model_type: byte_level
15
+ tokenizer_type: bpe
16
+ tokenizer_dataset_name: simple_en_wiki
17
+ tokenizer_simplify_data: true
18
+ vocab_size: 259
19
+ lm_head_type: byte_level
20
+ lm_head_normalization: rms_norm
21
+ lm_head_bias: false
22
+ lm_head_dropout: 0.0
23
+ model_shell_type: byte_autoencoder_shell
24
+ embedding_weight_tying: true
25
+ ffn_weight_tying: false
26
+ cproj_weight_tying: false
27
+ positional_encoding_type: rope
28
+ trainer:
29
+ trainer_type: base_trainer
30
+ dataset: fineweb_edu_10B
31
+ batch_size: 6
32
+ gradient_accumulation_steps: 8
33
+ max_iters: 10000
34
+ eval_interval: 50000000
35
+ log_interval: 1
36
+ checkpoint_interval: 1000
37
+ eval_iters: 1000
38
+ run_eval: false
39
+ eval:
40
+ mcq_benchmarks: null
41
+ mcq_num_samples: 1000
42
+ eval_byte_metrics: false
43
+ text_modeling_eval: false
44
+ text_generation_eval: false
45
+ optimizer:
46
+ optimizer_name: adamW
47
+ lr: 0.0005
48
+ min_lr: 5.0e-05
49
+ weight_decay: 0.01
50
+ beta1: 0.9
51
+ beta2: 0.95
52
+ grad_clip: 1.0
53
+ lr_scheduler:
54
+ name: cosine
55
+ warmup_iters: 100
56
+ dataloader:
57
+ name: autoencoder
58
+ datasampling:
59
+ name: standard
60
+ loss_fn:
61
+ name: pass_through
62
+ general:
63
+ logging:
64
+ wandb_log: true
65
+ wandb_project: SuperTinyLanguageModels
66
+ wandb_run_name: null
67
+ group_name: experimental_byte_level
68
+ paths:
69
+ output_dir: outputs
70
+ data_dir: data
71
+ checkpoint_dir: checkpoints
72
+ eval_dir: evals
73
+ seed: 489
74
+ device: cuda
2024-09-23/09-32-28/.hydra/hydra.yaml ADDED
@@ -0,0 +1,154 @@
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task: []
115
+ job:
116
+ name: train
117
+ chdir: null
118
+ override_dirname: ''
119
+ id: ???
120
+ num: ???
121
+ config_name: experimental/byte_autoencoder_1
122
+ env_set: {}
123
+ env_copy: []
124
+ config:
125
+ override_dirname:
126
+ kv_sep: '='
127
+ item_sep: ','
128
+ exclude_keys: []
129
+ runtime:
130
+ version: 1.3.2
131
+ version_base: '1.1'
132
+ cwd: /root/SuperTinyLanguageModels
133
+ config_sources:
134
+ - path: hydra.conf
135
+ schema: pkg
136
+ provider: hydra
137
+ - path: /root/SuperTinyLanguageModels/configs/train
138
+ schema: file
139
+ provider: main
140
+ - path: ''
141
+ schema: structured
142
+ provider: schema
143
+ output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28
144
+ choices:
145
+ hydra/env: default
146
+ hydra/callbacks: null
147
+ hydra/job_logging: default
148
+ hydra/hydra_logging: default
149
+ hydra/hydra_help: default
150
+ hydra/help: default
151
+ hydra/sweeper: basic
152
+ hydra/launcher: basic
153
+ hydra/output: default
154
+ verbose: false
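One practical detail worth noting from this file: `hydra.run.dir` is set to `outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}`, which is exactly where the timestamped folders in this upload (e.g. `2024-09-23/09-32-28`) come from. A tiny illustration of that interpolation, using the launch time implied by the folder name as an assumed example:

```python
# Illustration only: how the ${now:...} resolver in hydra.run.dir maps a launch
# time to the folder names seen in this upload. The datetime below is assumed
# from the folder name itself.
from datetime import datetime

launch = datetime(2024, 9, 23, 9, 32, 28)
run_dir = f"outputs/{launch:%Y-%m-%d}/{launch:%H-%M-%S}"
print(run_dir)  # outputs/2024-09-23/09-32-28
```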
2024-09-23/09-32-28/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
1
+ []
2024-09-23/09-32-28/train.log ADDED
File without changes
2024-09-23/09-32-28/wandb/debug-internal.log ADDED
@@ -0,0 +1,18 @@
1
+ {"time":"2024-09-23T09:32:37.2270228Z","level":"INFO","msg":"using version","core version":"0.18.1"}
2
+ {"time":"2024-09-23T09:32:37.227060611Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log"}
3
+ {"time":"2024-09-23T09:32:37.227169702Z","level":"INFO","msg":"using version","core version":"0.18.1"}
4
+ {"time":"2024-09-23T09:32:37.227182172Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log"}
5
+ {"time":"2024-09-23T09:32:37.230824708Z","level":"INFO","msg":"created new stream","id":"tkkvzfon"}
6
+ {"time":"2024-09-23T09:32:37.230859859Z","level":"INFO","msg":"stream: started","id":"tkkvzfon"}
7
+ {"time":"2024-09-23T09:32:37.230903499Z","level":"INFO","msg":"sender: started","stream_id":{"value":"tkkvzfon"}}
8
+ {"time":"2024-09-23T09:32:37.23092371Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"tkkvzfon"}}
9
+ {"time":"2024-09-23T09:32:37.23097304Z","level":"INFO","msg":"handler: started","stream_id":{"value":"tkkvzfon"}}
10
+ {"time":"2024-09-23T09:32:37.634282756Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
11
+ {"time":"2024-09-23T09:32:37.636527894Z","level":"INFO","msg":"Starting system monitor"}
12
+ {"time":"2024-09-23T09:33:46.746283667Z","level":"INFO","msg":"stream: closing","id":"tkkvzfon"}
13
+ {"time":"2024-09-23T09:33:46.746349498Z","level":"INFO","msg":"Stopping system monitor"}
14
+ {"time":"2024-09-23T09:33:46.747359311Z","level":"INFO","msg":"Stopped system monitor"}
15
+ {"time":"2024-09-23T09:33:49.926631346Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"tkkvzfon"}}
16
+ {"time":"2024-09-23T09:33:49.926725448Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"tkkvzfon"}}
17
+ {"time":"2024-09-23T09:33:49.926795918Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"tkkvzfon"}}
18
+ {"time":"2024-09-23T09:33:49.927056922Z","level":"INFO","msg":"stream: closed","id":"tkkvzfon"}
2024-09-23/09-32-28/wandb/debug.log ADDED
@@ -0,0 +1,26 @@
1
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
2
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Configure stats pid to 81916
3
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
4
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/settings
5
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
6
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/SuperTinyLanguageModels/train.py', 'program': '/root/SuperTinyLanguageModels/train.py'}
8
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Applying login settings: {}
9
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_init.py:_log_setup():532] Logging user logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug.log
10
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_init.py:_log_setup():533] Logging internal logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-internal.log
11
+ 2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():616] calling init triggers
12
+ 2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
13
+ config: {'model': {'core_model_type': 'pass_through', 'hidden_dim': 384, 'byte_hidden': 128, 'max_chunk_length': 12, 'max_num_chunks': 1024, 'num_delimiter_layers': 3, 'num_byte_decoder_layers': 5, 'target_chunk_len': 8.0, 'chunk_len_loss_weight': 0.1, 'chunk_len_penalty': 0.1, 'context_window': 8192, 'embedding_model_type': 'byte_level', 'tokenizer_type': 'bpe', 'tokenizer_dataset_name': 'simple_en_wiki', 'tokenizer_simplify_data': True, 'vocab_size': 259, 'lm_head_type': 'byte_level', 'lm_head_normalization': 'rms_norm', 'lm_head_bias': False, 'lm_head_dropout': 0.0, 'model_shell_type': 'byte_autoencoder_shell', 'embedding_weight_tying': True, 'ffn_weight_tying': False, 'cproj_weight_tying': False, 'positional_encoding_type': 'rope'}, 'trainer': {'trainer_type': 'base_trainer', 'dataset': 'fineweb_edu_10B', 'batch_size': 6, 'gradient_accumulation_steps': 8, 'max_iters': 10000, 'eval_interval': 50000000, 'log_interval': 1, 'checkpoint_interval': 1000, 'eval_iters': 1000, 'run_eval': False, 'eval': {'mcq_benchmarks': None, 'mcq_num_samples': 1000, 'eval_byte_metrics': False, 'text_modeling_eval': False, 'text_generation_eval': False}, 'optimizer': {'optimizer_name': 'adamW', 'lr': 0.0005, 'min_lr': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0}, 'lr_scheduler': {'name': 'cosine', 'warmup_iters': 100}, 'dataloader': {'name': 'autoencoder'}, 'datasampling': {'name': 'standard'}, 'loss_fn': {'name': 'pass_through'}}, 'general': {'logging': {'wandb_log': True, 'wandb_project': 'SuperTinyLanguageModels', 'wandb_run_name': None, 'group_name': 'experimental_byte_level'}, 'paths': {'output_dir': 'outputs', 'data_dir': '/root/SuperTinyLanguageModels/data', 'checkpoint_dir': 'checkpoints', 'eval_dir': '/root/SuperTinyLanguageModels/evals'}, 'seed': 489, 'device': 'cuda'}}
14
+ 2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():666] starting backend
15
+ 2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():670] setting up manager
16
+ 2024-09-23 09:32:37,223 INFO MainThread:81916 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-09-23 09:32:37,224 INFO MainThread:81916 [wandb_init.py:init():678] backend started and connected
18
+ 2024-09-23 09:32:37,227 INFO MainThread:81916 [wandb_init.py:init():773] updated telemetry
19
+ 2024-09-23 09:32:37,236 INFO MainThread:81916 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
20
+ 2024-09-23 09:32:37,631 INFO MainThread:81916 [wandb_init.py:init():857] starting run threads in backend
21
+ 2024-09-23 09:32:37,802 INFO MainThread:81916 [wandb_run.py:_console_start():2459] atexit reg
22
+ 2024-09-23 09:32:37,802 INFO MainThread:81916 [wandb_run.py:_redirect():2307] redirect: wrap_raw
23
+ 2024-09-23 09:32:37,803 INFO MainThread:81916 [wandb_run.py:_redirect():2372] Wrapping output streams.
24
+ 2024-09-23 09:32:37,803 INFO MainThread:81916 [wandb_run.py:_redirect():2397] Redirects installed.
25
+ 2024-09-23 09:32:37,806 INFO MainThread:81916 [wandb_init.py:init():900] run started, returning control to user process
26
+ 2024-09-23 09:33:46,746 WARNING MsgRouterThr:81916 [router.py:message_loop():77] message_loop has been closed
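The init sequence above (settings load, backend spawn, run threads started) corresponds to a standard `wandb.init` call. A minimal sketch of what the training script presumably does; the project and group names come from the config in this diff, while the abbreviated config dict and everything else are assumptions:

```python
# Sketch only: the actual wandb.init call in train.py is not shown in this diff.
# project/group are taken from the config above; the metric names ("iter",
# "loss", "lr") match the wandb summary later in this commit.
import wandb

cfg_dict = {"trainer": {"batch_size": 6, "max_iters": 10000}}  # abbreviated config

run = wandb.init(
    project="SuperTinyLanguageModels",
    group="experimental_byte_level",
    config=cfg_dict,
)
wandb.log({"iter": 1, "loss": 10.4062, "lr": 5e-06})
run.finish()
```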
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/config.yaml ADDED
@@ -0,0 +1,115 @@
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.18.1
4
+ m: []
5
+ python_version: 3.10.14
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 5
10
+ - 11
11
+ - 49
12
+ - 50
13
+ - 51
14
+ - 53
15
+ - 55
16
+ "2":
17
+ - 1
18
+ - 5
19
+ - 11
20
+ - 49
21
+ - 50
22
+ - 51
23
+ - 53
24
+ - 55
25
+ "3":
26
+ - 13
27
+ - 15
28
+ - 16
29
+ - 23
30
+ - 55
31
+ - 61
32
+ "4": 3.10.14
33
+ "5": 0.18.1
34
+ "6": 4.44.2
35
+ "8":
36
+ - 5
37
+ - 9
38
+ "12": 0.18.1
39
+ "13": linux-x86_64
40
+ general:
41
+ value:
42
+ device: cuda
43
+ logging:
44
+ group_name: experimental_byte_level
45
+ wandb_log: true
46
+ wandb_project: SuperTinyLanguageModels
47
+ wandb_run_name: null
48
+ paths:
49
+ checkpoint_dir: checkpoints
50
+ data_dir: /root/SuperTinyLanguageModels/data
51
+ eval_dir: /root/SuperTinyLanguageModels/evals
52
+ output_dir: outputs
53
+ seed: 489
54
+ model:
55
+ value:
56
+ byte_hidden: 128
57
+ chunk_len_loss_weight: 0.1
58
+ chunk_len_penalty: 0.1
59
+ context_window: 8192
60
+ core_model_type: pass_through
61
+ cproj_weight_tying: false
62
+ embedding_model_type: byte_level
63
+ embedding_weight_tying: true
64
+ ffn_weight_tying: false
65
+ hidden_dim: 384
66
+ lm_head_bias: false
67
+ lm_head_dropout: 0
68
+ lm_head_normalization: rms_norm
69
+ lm_head_type: byte_level
70
+ max_chunk_length: 12
71
+ max_num_chunks: 1024
72
+ model_shell_type: byte_autoencoder_shell
73
+ num_byte_decoder_layers: 5
74
+ num_delimiter_layers: 3
75
+ positional_encoding_type: rope
76
+ target_chunk_len: 8
77
+ tokenizer_dataset_name: simple_en_wiki
78
+ tokenizer_simplify_data: true
79
+ tokenizer_type: bpe
80
+ vocab_size: 259
81
+ trainer:
82
+ value:
83
+ batch_size: 6
84
+ checkpoint_interval: 1000
85
+ dataloader:
86
+ name: autoencoder
87
+ datasampling:
88
+ name: standard
89
+ dataset: fineweb_edu_10B
90
+ eval:
91
+ eval_byte_metrics: false
92
+ mcq_benchmarks: null
93
+ mcq_num_samples: 1000
94
+ text_generation_eval: false
95
+ text_modeling_eval: false
96
+ eval_interval: 50000000
97
+ eval_iters: 1000
98
+ gradient_accumulation_steps: 8
99
+ log_interval: 1
100
+ loss_fn:
101
+ name: pass_through
102
+ lr_scheduler:
103
+ name: cosine
104
+ warmup_iters: 100
105
+ max_iters: 10000
106
+ optimizer:
107
+ beta1: 0.9
108
+ beta2: 0.95
109
+ grad_clip: 1
110
+ lr: 0.0005
111
+ min_lr: 5e-05
112
+ optimizer_name: adamW
113
+ weight_decay: 0.01
114
+ run_eval: false
115
+ trainer_type: base_trainer
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/output.log ADDED
@@ -0,0 +1,33 @@
1
+ Weight and Biases Initialized
2
+ Rank0 Trainer built
3
+ Training loop is starting
4
+ All GPU(s): step 1: loss 10.4062, lr 5.0e-06, dt 2.1s
5
+ All GPU(s): step 2: loss 10.4297, lr 1.0e-05, dt 2.1s
6
+ All GPU(s): step 3: loss 10.3672, lr 1.5e-05, dt 2.1s
7
+ All GPU(s): step 4: loss 10.3203, lr 2.0e-05, dt 2.1s
8
+ All GPU(s): step 5: loss 10.2344, lr 2.5e-05, dt 2.1s
9
+ All GPU(s): step 6: loss 10.1406, lr 3.0e-05, dt 2.1s
10
+ All GPU(s): step 7: loss 10.0234, lr 3.5e-05, dt 2.1s
11
+ All GPU(s): step 8: loss 9.9688, lr 4.0e-05, dt 2.1s
12
+ All GPU(s): step 9: loss 9.8594, lr 4.5e-05, dt 2.2s
13
+ All GPU(s): step 10: loss 9.6328, lr 5.0e-05, dt 2.1s
14
+ All GPU(s): step 11: loss 9.5312, lr 5.5e-05, dt 2.1s
15
+ All GPU(s): step 12: loss 9.3750, lr 6.0e-05, dt 2.1s
16
+ All GPU(s): step 13: loss 9.2109, lr 6.5e-05, dt 2.1s
17
+ All GPU(s): step 14: loss 9.0078, lr 7.0e-05, dt 2.1s
18
+ All GPU(s): step 15: loss 8.8203, lr 7.5e-05, dt 2.1s
19
+ All GPU(s): step 16: loss 8.6562, lr 8.0e-05, dt 2.0s
20
+ All GPU(s): step 17: loss 8.4922, lr 8.5e-05, dt 2.1s
21
+ All GPU(s): step 18: loss 8.2891, lr 9.0e-05, dt 2.1s
22
+ All GPU(s): step 19: loss 8.1328, lr 9.5e-05, dt 2.1s
23
+ All GPU(s): step 20: loss 7.9414, lr 1.0e-04, dt 2.0s
24
+ All GPU(s): step 21: loss 7.7852, lr 1.1e-04, dt 2.1s
25
+ All GPU(s): step 22: loss 7.5977, lr 1.1e-04, dt 2.1s
26
+ All GPU(s): step 23: loss 7.4453, lr 1.2e-04, dt 2.1s
27
+ All GPU(s): step 24: loss 7.3164, lr 1.2e-04, dt 2.1s
28
+ All GPU(s): step 25: loss 7.1836, lr 1.3e-04, dt 2.1s
29
+ All GPU(s): step 26: loss 7.1406, lr 1.3e-04, dt 2.1s
30
+ All GPU(s): step 27: loss 6.9414, lr 1.4e-04, dt 2.1s
31
+ All GPU(s): step 28: loss 6.8633, lr 1.4e-04, dt 2.2s
32
+ All GPU(s): step 29: loss 6.7461, lr 1.5e-04, dt 2.1s
33
+ All GPU(s): step 30: loss 6.5742, lr 1.5e-04, dt 2.1s
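The learning-rate column in this log matches a plain linear warmup toward the configured peak: with lr = 5e-4 and warmup_iters = 100, step 1 gives 5e-4 × 1/100 = 5.0e-06 and step 30 gives 1.5e-04, exactly as printed. Below is a small sketch of a warmup-plus-cosine schedule consistent with these numbers; the repository's actual scheduler implementation is not shown here, so treat this as an illustration rather than the implementation.

```python
# Linear warmup to lr=5e-4 over 100 iters, then cosine decay to min_lr=5e-5 over
# max_iters=10000 (values from the trainer config). Only the warmup portion is
# directly verifiable against the log above; the decay shape is an assumption.
import math

max_lr, min_lr = 5e-4, 5e-5
warmup_iters, max_iters = 100, 10_000


def lr_at(step: int) -> float:
    if step <= warmup_iters:
        return max_lr * step / warmup_iters
    progress = (step - warmup_iters) / (max_iters - warmup_iters)
    return min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * progress))


for step in (1, 10, 20, 30):
    print(f"step {step}: lr {lr_at(step):.1e}")  # 5.0e-06, 5.0e-05, 1.0e-04, 1.5e-04
```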
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/wandb-metadata.json ADDED
@@ -0,0 +1,88 @@
1
+ {
2
+ "os": "Linux-5.15.0-117-generic-x86_64-with-glibc2.31",
3
+ "python": "3.10.14",
4
+ "startedAt": "2024-09-23T09:32:37.224689Z",
5
+ "args": [
6
+ "--config-name",
7
+ "experimental/byte_autoencoder_1"
8
+ ],
9
+ "program": "/root/SuperTinyLanguageModels/train.py",
10
+ "codePath": "train.py",
11
+ "git": {
12
+ "remote": "https://github.com/LeonGuertler/SuperTinyLanguageModels.git",
13
+ "commit": "c36bf6b78927d4d365c52a835f0e178edacbab29"
14
+ },
15
+ "email": "[email protected]",
16
+ "root": "/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28",
17
+ "host": "11c6e13f6a55",
18
+ "username": "root",
19
+ "executable": "/root/SuperTinyLanguageModels/.conda/bin/python3",
20
+ "cpu_count": 128,
21
+ "cpu_count_logical": 256,
22
+ "gpu": "[NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090]",
23
+ "gpu_count": 8,
24
+ "disk": {
25
+ "/": {
26
+ "total": "1123133947904",
27
+ "used": "551794495488"
28
+ }
29
+ },
30
+ "memory": {
31
+ "total": "540812599296"
32
+ },
33
+ "cpu": {
34
+ "count": 128,
35
+ "countLogical": 256
36
+ },
37
+ "gpu_nvidia": [
38
+ {
39
+ "name": "NVIDIA GeForce RTX 4090",
40
+ "memoryTotal": "25757220864",
41
+ "cudaCores": 16384,
42
+ "architecture": "Ada"
43
+ },
44
+ {
45
+ "name": "NVIDIA GeForce RTX 4090",
46
+ "memoryTotal": "25757220864",
47
+ "cudaCores": 16384,
48
+ "architecture": "Ada"
49
+ },
50
+ {
51
+ "name": "NVIDIA GeForce RTX 4090",
52
+ "memoryTotal": "25757220864",
53
+ "cudaCores": 16384,
54
+ "architecture": "Ada"
55
+ },
56
+ {
57
+ "name": "NVIDIA GeForce RTX 4090",
58
+ "memoryTotal": "25757220864",
59
+ "cudaCores": 16384,
60
+ "architecture": "Ada"
61
+ },
62
+ {
63
+ "name": "NVIDIA GeForce RTX 4090",
64
+ "memoryTotal": "25757220864",
65
+ "cudaCores": 16384,
66
+ "architecture": "Ada"
67
+ },
68
+ {
69
+ "name": "NVIDIA GeForce RTX 4090",
70
+ "memoryTotal": "25757220864",
71
+ "cudaCores": 16384,
72
+ "architecture": "Ada"
73
+ },
74
+ {
75
+ "name": "NVIDIA GeForce RTX 4090",
76
+ "memoryTotal": "25757220864",
77
+ "cudaCores": 16384,
78
+ "architecture": "Ada"
79
+ },
80
+ {
81
+ "name": "NVIDIA GeForce RTX 4090",
82
+ "memoryTotal": "25757220864",
83
+ "cudaCores": 16384,
84
+ "architecture": "Ada"
85
+ }
86
+ ],
87
+ "cudaVersion": "12.5"
88
+ }
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"additional_info/chunk_len_penalty_loss":0,"additional_info/total-loss":6.543508529663086,"_step":1474560,"additional_info/chunk_len_loss":2.0561606884002686,"iter":30,"token_num":1474560,"additional_info/BCE-loss":4.487347602844238,"loss":6.57421875,"lr":0.00015,"_timestamp":1.7270840240730202e+09,"_runtime":69.521643938,"additional_info/average_chunk_length":3.4655094146728516,"_wandb":{"runtime":69}}
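Two quick consistency checks on this summary: the reported total loss is the sum of the BCE term and the chunk-length term, and the token counter implies a fixed number of tokens per optimizer step. The factorization of that per-step count (6 × 8 × 1024) is an inference from the config, not something stated in the diff.

```python
# Values copied verbatim from the summary JSON above.
bce_loss = 4.487347602844238
chunk_len_loss = 2.0561606884002686
total_loss = 6.543508529663086
assert abs((bce_loss + chunk_len_loss) - total_loss) < 1e-3  # total = BCE + chunk-length loss

token_num, iters = 1_474_560, 30
print(token_num // iters)  # 49152 tokens per optimizer step
print(6 * 8 * 1024)        # 49152 = batch_size * grad_accum_steps * 1024 (assumed per-sample length)
```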
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log ADDED
@@ -0,0 +1,13 @@
1
+ {"time":"2024-09-23T09:32:36.53490736Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmppr55fcxh/port-81916.txt","pid":81916,"debug":false,"disable-analytics":false}
2
+ {"time":"2024-09-23T09:32:36.534984841Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2024-09-23T09:32:36.551541231Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":81916}
4
+ {"time":"2024-09-23T09:32:36.55148544Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44587,"Zone":""}}
5
+ {"time":"2024-09-23T09:32:36.722786198Z","level":"INFO","msg":"created new connection","id":"127.0.0.1:60908"}
6
+ {"time":"2024-09-23T09:32:37.226730857Z","level":"INFO","msg":"connection init received","streamId":"tkkvzfon","id":"127.0.0.1:60908"}
7
+ {"time":"2024-09-23T09:32:37.227116001Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240923_093236.log /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log: file exists"}
8
+ {"time":"2024-09-23T09:32:37.230871019Z","level":"INFO","msg":"connection init completed","streamId":"tkkvzfon","id":"127.0.0.1:60908"}
9
+ {"time":"2024-09-23T09:33:46.746114105Z","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:60908"}
10
+ {"time":"2024-09-23T09:33:46.746363968Z","level":"INFO","msg":"server is shutting down"}
11
+ {"time":"2024-09-23T09:33:46.746627582Z","level":"INFO","msg":"closed connection","id":"127.0.0.1:60908"}
12
+ {"time":"2024-09-23T09:33:49.927260015Z","level":"INFO","msg":"connection closed","id":"127.0.0.1:60908"}
13
+ {"time":"2024-09-23T09:33:49.927297555Z","level":"INFO","msg":"server is closed"}
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-internal.log ADDED
@@ -0,0 +1,18 @@
1
+ {"time":"2024-09-23T09:32:37.2270228Z","level":"INFO","msg":"using version","core version":"0.18.1"}
2
+ {"time":"2024-09-23T09:32:37.227060611Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log"}
3
+ {"time":"2024-09-23T09:32:37.227169702Z","level":"INFO","msg":"using version","core version":"0.18.1"}
4
+ {"time":"2024-09-23T09:32:37.227182172Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log"}
5
+ {"time":"2024-09-23T09:32:37.230824708Z","level":"INFO","msg":"created new stream","id":"tkkvzfon"}
6
+ {"time":"2024-09-23T09:32:37.230859859Z","level":"INFO","msg":"stream: started","id":"tkkvzfon"}
7
+ {"time":"2024-09-23T09:32:37.230903499Z","level":"INFO","msg":"sender: started","stream_id":{"value":"tkkvzfon"}}
8
+ {"time":"2024-09-23T09:32:37.23092371Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"tkkvzfon"}}
9
+ {"time":"2024-09-23T09:32:37.23097304Z","level":"INFO","msg":"handler: started","stream_id":{"value":"tkkvzfon"}}
10
+ {"time":"2024-09-23T09:32:37.634282756Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
11
+ {"time":"2024-09-23T09:32:37.636527894Z","level":"INFO","msg":"Starting system monitor"}
12
+ {"time":"2024-09-23T09:33:46.746283667Z","level":"INFO","msg":"stream: closing","id":"tkkvzfon"}
13
+ {"time":"2024-09-23T09:33:46.746349498Z","level":"INFO","msg":"Stopping system monitor"}
14
+ {"time":"2024-09-23T09:33:46.747359311Z","level":"INFO","msg":"Stopped system monitor"}
15
+ {"time":"2024-09-23T09:33:49.926631346Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"tkkvzfon"}}
16
+ {"time":"2024-09-23T09:33:49.926725448Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"tkkvzfon"}}
17
+ {"time":"2024-09-23T09:33:49.926795918Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"tkkvzfon"}}
18
+ {"time":"2024-09-23T09:33:49.927056922Z","level":"INFO","msg":"stream: closed","id":"tkkvzfon"}
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug.log ADDED
@@ -0,0 +1,26 @@
1
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
2
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Configure stats pid to 81916
3
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
4
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/settings
5
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
6
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/SuperTinyLanguageModels/train.py', 'program': '/root/SuperTinyLanguageModels/train.py'}
8
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Applying login settings: {}
9
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_init.py:_log_setup():532] Logging user logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug.log
10
+ 2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_init.py:_log_setup():533] Logging internal logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-internal.log
11
+ 2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():616] calling init triggers
12
+ 2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
13
+ config: {'model': {'core_model_type': 'pass_through', 'hidden_dim': 384, 'byte_hidden': 128, 'max_chunk_length': 12, 'max_num_chunks': 1024, 'num_delimiter_layers': 3, 'num_byte_decoder_layers': 5, 'target_chunk_len': 8.0, 'chunk_len_loss_weight': 0.1, 'chunk_len_penalty': 0.1, 'context_window': 8192, 'embedding_model_type': 'byte_level', 'tokenizer_type': 'bpe', 'tokenizer_dataset_name': 'simple_en_wiki', 'tokenizer_simplify_data': True, 'vocab_size': 259, 'lm_head_type': 'byte_level', 'lm_head_normalization': 'rms_norm', 'lm_head_bias': False, 'lm_head_dropout': 0.0, 'model_shell_type': 'byte_autoencoder_shell', 'embedding_weight_tying': True, 'ffn_weight_tying': False, 'cproj_weight_tying': False, 'positional_encoding_type': 'rope'}, 'trainer': {'trainer_type': 'base_trainer', 'dataset': 'fineweb_edu_10B', 'batch_size': 6, 'gradient_accumulation_steps': 8, 'max_iters': 10000, 'eval_interval': 50000000, 'log_interval': 1, 'checkpoint_interval': 1000, 'eval_iters': 1000, 'run_eval': False, 'eval': {'mcq_benchmarks': None, 'mcq_num_samples': 1000, 'eval_byte_metrics': False, 'text_modeling_eval': False, 'text_generation_eval': False}, 'optimizer': {'optimizer_name': 'adamW', 'lr': 0.0005, 'min_lr': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0}, 'lr_scheduler': {'name': 'cosine', 'warmup_iters': 100}, 'dataloader': {'name': 'autoencoder'}, 'datasampling': {'name': 'standard'}, 'loss_fn': {'name': 'pass_through'}}, 'general': {'logging': {'wandb_log': True, 'wandb_project': 'SuperTinyLanguageModels', 'wandb_run_name': None, 'group_name': 'experimental_byte_level'}, 'paths': {'output_dir': 'outputs', 'data_dir': '/root/SuperTinyLanguageModels/data', 'checkpoint_dir': 'checkpoints', 'eval_dir': '/root/SuperTinyLanguageModels/evals'}, 'seed': 489, 'device': 'cuda'}}
14
+ 2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():666] starting backend
15
+ 2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():670] setting up manager
16
+ 2024-09-23 09:32:37,223 INFO MainThread:81916 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-09-23 09:32:37,224 INFO MainThread:81916 [wandb_init.py:init():678] backend started and connected
18
+ 2024-09-23 09:32:37,227 INFO MainThread:81916 [wandb_init.py:init():773] updated telemetry
19
+ 2024-09-23 09:32:37,236 INFO MainThread:81916 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
20
+ 2024-09-23 09:32:37,631 INFO MainThread:81916 [wandb_init.py:init():857] starting run threads in backend
21
+ 2024-09-23 09:32:37,802 INFO MainThread:81916 [wandb_run.py:_console_start():2459] atexit reg
22
+ 2024-09-23 09:32:37,802 INFO MainThread:81916 [wandb_run.py:_redirect():2307] redirect: wrap_raw
23
+ 2024-09-23 09:32:37,803 INFO MainThread:81916 [wandb_run.py:_redirect():2372] Wrapping output streams.
24
+ 2024-09-23 09:32:37,803 INFO MainThread:81916 [wandb_run.py:_redirect():2397] Redirects installed.
25
+ 2024-09-23 09:32:37,806 INFO MainThread:81916 [wandb_init.py:init():900] run started, returning control to user process
26
+ 2024-09-23 09:33:46,746 WARNING MsgRouterThr:81916 [router.py:message_loop():77] message_loop has been closed
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/run-tkkvzfon.wandb ADDED
Binary file (124 kB).
2024-09-23/09-33-58/.hydra/config.yaml ADDED
@@ -0,0 +1,74 @@
1
+ experimental:
2
+ model:
3
+ core_model_type: pass_through
4
+ hidden_dim: 384
5
+ byte_hidden: 128
6
+ max_chunk_length: 12
7
+ max_num_chunks: 1024
8
+ num_delimiter_layers: 3
9
+ num_byte_decoder_layers: 5
10
+ target_chunk_len: 8.0
11
+ chunk_len_loss_weight: 0.1
12
+ chunk_len_penalty: 0.1
13
+ context_window: 8192
14
+ embedding_model_type: byte_level
15
+ tokenizer_type: bpe
16
+ tokenizer_dataset_name: simple_en_wiki
17
+ tokenizer_simplify_data: true
18
+ vocab_size: 259
19
+ lm_head_type: byte_level
20
+ lm_head_normalization: rms_norm
21
+ lm_head_bias: false
22
+ lm_head_dropout: 0.0
23
+ model_shell_type: byte_autoencoder_shell
24
+ embedding_weight_tying: true
25
+ ffn_weight_tying: false
26
+ cproj_weight_tying: false
27
+ positional_encoding_type: rope
28
+ trainer:
29
+ trainer_type: base_trainer
30
+ dataset: fineweb_edu_10B
31
+ batch_size: 6
32
+ gradient_accumulation_steps: 8
33
+ max_iters: 10000
34
+ eval_interval: 50000000
35
+ log_interval: 1
36
+ checkpoint_interval: 1000
37
+ eval_iters: 1000
38
+ run_eval: false
39
+ eval:
40
+ mcq_benchmarks: null
41
+ mcq_num_samples: 1000
42
+ eval_byte_metrics: false
43
+ text_modeling_eval: false
44
+ text_generation_eval: false
45
+ optimizer:
46
+ optimizer_name: adamW
47
+ lr: 0.0005
48
+ min_lr: 5.0e-05
49
+ weight_decay: 0.01
50
+ beta1: 0.9
51
+ beta2: 0.95
52
+ grad_clip: 1.0
53
+ lr_scheduler:
54
+ name: cosine
55
+ warmup_iters: 100
56
+ dataloader:
57
+ name: autoencoder
58
+ datasampling:
59
+ name: standard
60
+ loss_fn:
61
+ name: pass_through
62
+ general:
63
+ logging:
64
+ wandb_log: true
65
+ wandb_project: SuperTinyLanguageModels
66
+ wandb_run_name: null
67
+ group_name: experimental_byte_level
68
+ paths:
69
+ output_dir: outputs
70
+ data_dir: data
71
+ checkpoint_dir: checkpoints
72
+ eval_dir: evals
73
+ seed: 489
74
+ device: cuda
2024-09-23/09-33-58/.hydra/hydra.yaml ADDED
@@ -0,0 +1,154 @@
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task: []
115
+ job:
116
+ name: train
117
+ chdir: null
118
+ override_dirname: ''
119
+ id: ???
120
+ num: ???
121
+ config_name: experimental/byte_autoencoder_1
122
+ env_set: {}
123
+ env_copy: []
124
+ config:
125
+ override_dirname:
126
+ kv_sep: '='
127
+ item_sep: ','
128
+ exclude_keys: []
129
+ runtime:
130
+ version: 1.3.2
131
+ version_base: '1.1'
132
+ cwd: /root/SuperTinyLanguageModels
133
+ config_sources:
134
+ - path: hydra.conf
135
+ schema: pkg
136
+ provider: hydra
137
+ - path: /root/SuperTinyLanguageModels/configs/train
138
+ schema: file
139
+ provider: main
140
+ - path: ''
141
+ schema: structured
142
+ provider: schema
143
+ output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/09-33-58
144
+ choices:
145
+ hydra/env: default
146
+ hydra/callbacks: null
147
+ hydra/job_logging: default
148
+ hydra/hydra_logging: default
149
+ hydra/hydra_help: default
150
+ hydra/help: default
151
+ hydra/sweeper: basic
152
+ hydra/launcher: basic
153
+ hydra/output: default
154
+ verbose: false
2024-09-23/09-33-58/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
1
+ []
2024-09-23/09-33-58/checkpoints/ckpt_1000.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9e847e5371dfd2f5ac68ee97e737d4ab63d42fdde1c885d6ab4915a9b3ccf83
3
+ size 69377274
2024-09-23/09-33-58/checkpoints/ckpt_2000.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:261a3f997548dd7b6a92a1a7a51b37b1d559a7b64547c95b98a336bdc2685da0
3
+ size 69377274
2024-09-23/09-33-58/checkpoints/ckpt_3000.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51018e44f695f15948d2cbcd014d62113a7a82a67ca7ca25dc767a77c12ae563
3
+ size 69377274
2024-09-23/09-33-58/checkpoints/ckpt_4000.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf165a859555ddeb74ad0c7b6e10f17fa5f91c1b060a14bd77dd7fedbde5503c
3
+ size 69377274
2024-09-23/09-33-58/checkpoints/ckpt_5000.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99bdcae468dc981532ae56ecd8616824b1cf86801d364510be19a57467a81dbb
3
+ size 69377274
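The ckpt_*.pt entries in this commit are Git LFS pointer files, each resolving to a roughly 69 MB object, so the real checkpoints have to be fetched before use. Below is a hedged sketch of downloading and inspecting one of them via huggingface_hub; the repo id placeholder and the assumption that the file is a torch-serialized dict are mine, not stated in the diff.

```python
# Sketch only: <this-repo-id> is a placeholder for the Hub repo this commit
# belongs to, and the structure of the loaded object is not verified here.
from huggingface_hub import hf_hub_download
import torch

path = hf_hub_download(
    repo_id="<this-repo-id>",  # add repo_type="dataset" if this is a dataset repo
    filename="2024-09-23/09-33-58/checkpoints/ckpt_5000.pt",
)
# On recent PyTorch versions you may need weights_only=False for full pickles.
ckpt = torch.load(path, map_location="cpu")
print(type(ckpt))
if isinstance(ckpt, dict):
    print(list(ckpt.keys()))  # e.g. model / optimizer state, but not verified here
```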