Upload folder using huggingface_hub
This view is limited to 50 files because the commit contains too many changes.
- .gitattributes +3 -0
- 2024-09-23/06-36-18/.hydra/config.yaml +74 -0
- 2024-09-23/06-36-18/.hydra/hydra.yaml +154 -0
- 2024-09-23/06-36-18/.hydra/overrides.yaml +1 -0
- 2024-09-23/06-36-18/train.log +0 -0
- 2024-09-23/07-06-14/.hydra/config.yaml +74 -0
- 2024-09-23/07-06-14/.hydra/hydra.yaml +154 -0
- 2024-09-23/07-06-14/.hydra/overrides.yaml +1 -0
- 2024-09-23/07-06-14/train.log +0 -0
- 2024-09-23/08-39-13/.hydra/config.yaml +74 -0
- 2024-09-23/08-39-13/.hydra/hydra.yaml +154 -0
- 2024-09-23/08-39-13/.hydra/overrides.yaml +1 -0
- 2024-09-23/08-39-13/train.log +0 -0
- 2024-09-23/08-40-08/.hydra/config.yaml +74 -0
- 2024-09-23/08-40-08/.hydra/hydra.yaml +154 -0
- 2024-09-23/08-40-08/.hydra/overrides.yaml +1 -0
- 2024-09-23/08-40-08/train.log +0 -0
- 2024-09-23/08-40-08/wandb/debug-internal.log +14 -0
- 2024-09-23/08-40-08/wandb/debug.log +26 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/config.yaml +114 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/output.log +3 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/requirements.txt +121 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/wandb-metadata.json +88 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/wandb-summary.json +1 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log +12 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-internal.log +14 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug.log +26 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/run-a2kxhd8v.wandb +0 -0
- 2024-09-23/09-32-28/.hydra/config.yaml +74 -0
- 2024-09-23/09-32-28/.hydra/hydra.yaml +154 -0
- 2024-09-23/09-32-28/.hydra/overrides.yaml +1 -0
- 2024-09-23/09-32-28/train.log +0 -0
- 2024-09-23/09-32-28/wandb/debug-internal.log +18 -0
- 2024-09-23/09-32-28/wandb/debug.log +26 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/config.yaml +115 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/output.log +33 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/wandb-metadata.json +88 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/wandb-summary.json +1 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log +13 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-internal.log +18 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug.log +26 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/run-tkkvzfon.wandb +0 -0
- 2024-09-23/09-33-58/.hydra/config.yaml +74 -0
- 2024-09-23/09-33-58/.hydra/hydra.yaml +154 -0
- 2024-09-23/09-33-58/.hydra/overrides.yaml +1 -0
- 2024-09-23/09-33-58/checkpoints/ckpt_1000.pt +3 -0
- 2024-09-23/09-33-58/checkpoints/ckpt_2000.pt +3 -0
- 2024-09-23/09-33-58/checkpoints/ckpt_3000.pt +3 -0
- 2024-09-23/09-33-58/checkpoints/ckpt_4000.pt +3 -0
- 2024-09-23/09-33-58/checkpoints/ckpt_5000.pt +3 -0
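A commit like this one is typically produced with huggingface_hub's upload_folder helper. A minimal sketch, assuming you are already authenticated (e.g. via `huggingface-cli login`); the local folder path and repo_id below are placeholders, since neither appears in this diff:

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="outputs",              # placeholder: local directory of run artifacts
    repo_id="your-username/your-repo",  # placeholder: the target Hub repository
    commit_message="Upload folder using huggingface_hub",
)

Large binaries such as the .wandb run files are routed through Git LFS according to the .gitattributes rules added below.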
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+2024-09-23/09-33-58/wandb/run-20240923_093407-jnzzkcth/run-jnzzkcth.wandb filter=lfs diff=lfs merge=lfs -text
+2024-09-23/15-02-55/wandb/run-20240923_150304-bbl5fd2u/run-bbl5fd2u.wandb filter=lfs diff=lfs merge=lfs -text
+2024-09-23/15-28-03/wandb/run-20240923_152812-jp82yqcj/run-jp82yqcj.wandb filter=lfs diff=lfs merge=lfs -text
2024-09-23/06-36-18/.hydra/config.yaml
ADDED
@@ -0,0 +1,74 @@
+experimental:
+model:
+  core_model_type: pass_through
+  hidden_dim: 384
+  byte_hidden: 128
+  max_chunk_length: 12
+  max_num_chunks: 1024
+  num_delimiter_layers: 3
+  num_byte_decoder_layers: 5
+  target_chunk_len: 8.0
+  chunk_len_loss_weight: 0.1
+  chunk_len_penalty: 0.1
+  context_window: 8192
+  embedding_model_type: byte_level
+  tokenizer_type: bpe
+  tokenizer_dataset_name: simple_en_wiki
+  tokenizer_simplify_data: true
+  vocab_size: 259
+  lm_head_type: byte_level
+  lm_head_normalization: rms_norm
+  lm_head_bias: false
+  lm_head_dropout: 0.0
+  model_shell_type: byte_autoencoder_shell
+  embedding_weight_tying: true
+  ffn_weight_tying: false
+  cproj_weight_tying: false
+  positional_encoding_type: rope
+trainer:
+  trainer_type: base_trainer
+  dataset: fineweb_edu_10B
+  batch_size: 6
+  gradient_accumulation_steps: 8
+  max_iters: 10000
+  eval_interval: 50000000
+  log_interval: 1
+  checkpoint_interval: 1000
+  eval_iters: 1000
+  run_eval: false
+  eval:
+    mcq_benchmarks: null
+    mcq_num_samples: 1000
+    eval_byte_metrics: false
+    text_modeling_eval: false
+    text_generation_eval: false
+  optimizer:
+    optimizer_name: adamW
+    lr: 0.0005
+    min_lr: 5.0e-05
+    weight_decay: 0.01
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+  lr_scheduler:
+    name: cosine
+    warmup_iters: 100
+  dataloader:
+    name: autoencoder
+  datasampling:
+    name: standard
+  loss_fn:
+    name: pass_through
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+    wandb_run_name: null
+    group_name: experimental_byte_level
+  paths:
+    output_dir: outputs
+    data_dir: data
+    checkpoint_dir: checkpoints
+    eval_dir: evals
+  seed: 489
+  device: cuda
2024-09-23/06-36-18/.hydra/hydra.yaml
ADDED
@@ -0,0 +1,154 @@
+hydra:
+  run:
+    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: train
+    chdir: null
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: experimental/byte_autoencoder_1
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /root/SuperTinyLanguageModels
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /root/SuperTinyLanguageModels/configs/train
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/06-36-18
+    choices:
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false
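This hydra.yaml records how the run was launched: job name train, config experimental/byte_autoencoder_1 composed from /root/SuperTinyLanguageModels/configs/train, and a per-run output directory of the form outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}. A minimal sketch of the entry point this implies; the repository's actual train.py may differ, and wandb-metadata.json further below shows the config name was in fact passed on the command line via --config-name:

import hydra
from omegaconf import DictConfig, OmegaConf

@hydra.main(config_path="configs/train", config_name="experimental/byte_autoencoder_1", version_base="1.1")
def main(cfg: DictConfig) -> None:
    # Hydra creates outputs/<date>/<time>, writes the composed config to
    # .hydra/config.yaml (the file shown above), then runs this function there.
    print(OmegaConf.to_yaml(cfg))

if __name__ == "__main__":
    main()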
2024-09-23/06-36-18/.hydra/overrides.yaml
ADDED
@@ -0,0 +1 @@
+[]
2024-09-23/06-36-18/train.log
ADDED
File without changes
2024-09-23/07-06-14/.hydra/config.yaml
ADDED
@@ -0,0 +1,74 @@
+experimental:
+model:
+  core_model_type: pass_through
+  hidden_dim: 384
+  byte_hidden: 128
+  max_chunk_length: 12
+  max_num_chunks: 1024
+  num_delimiter_layers: 3
+  num_byte_decoder_layers: 5
+  target_chunk_len: 8.0
+  chunk_len_loss_weight: 0.1
+  chunk_len_penalty: 0.1
+  context_window: 8192
+  embedding_model_type: byte_level
+  tokenizer_type: bpe
+  tokenizer_dataset_name: simple_en_wiki
+  tokenizer_simplify_data: true
+  vocab_size: 259
+  lm_head_type: byte_level
+  lm_head_normalization: rms_norm
+  lm_head_bias: false
+  lm_head_dropout: 0.0
+  model_shell_type: byte_autoencoder_shell
+  embedding_weight_tying: true
+  ffn_weight_tying: false
+  cproj_weight_tying: false
+  positional_encoding_type: rope
+trainer:
+  trainer_type: base_trainer
+  dataset: fineweb_edu_10B
+  batch_size: 6
+  gradient_accumulation_steps: 8
+  max_iters: 10000
+  eval_interval: 50000000
+  log_interval: 1
+  checkpoint_interval: 1000
+  eval_iters: 1000
+  run_eval: false
+  eval:
+    mcq_benchmarks: null
+    mcq_num_samples: 1000
+    eval_byte_metrics: false
+    text_modeling_eval: false
+    text_generation_eval: false
+  optimizer:
+    optimizer_name: adamW
+    lr: 0.0005
+    min_lr: 5.0e-05
+    weight_decay: 0.01
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+  lr_scheduler:
+    name: cosine
+    warmup_iters: 100
+  dataloader:
+    name: autoencoder
+  datasampling:
+    name: standard
+  loss_fn:
+    name: pass_through
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+    wandb_run_name: null
+    group_name: experimental_byte_level
+  paths:
+    output_dir: outputs
+    data_dir: data
+    checkpoint_dir: checkpoints
+    eval_dir: evals
+  seed: 489
+  device: cuda
2024-09-23/07-06-14/.hydra/hydra.yaml
ADDED
@@ -0,0 +1,154 @@
+hydra:
+  run:
+    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: train
+    chdir: null
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: experimental/byte_autoencoder_1
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /root/SuperTinyLanguageModels
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /root/SuperTinyLanguageModels/configs/train
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/07-06-14
+    choices:
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false
2024-09-23/07-06-14/.hydra/overrides.yaml
ADDED
@@ -0,0 +1 @@
+[]
2024-09-23/07-06-14/train.log
ADDED
File without changes
2024-09-23/08-39-13/.hydra/config.yaml
ADDED
@@ -0,0 +1,74 @@
+experimental:
+model:
+  core_model_type: pass_through
+  hidden_dim: 384
+  byte_hidden: 128
+  max_chunk_length: 12
+  max_num_chunks: 1024
+  num_delimiter_layers: 3
+  num_byte_decoder_layers: 5
+  target_chunk_len: 8.0
+  chunk_len_loss_weight: 0.1
+  chunk_len_penalty: 0.1
+  context_window: 8192
+  embedding_model_type: byte_level
+  tokenizer_type: bpe
+  tokenizer_dataset_name: simple_en_wiki
+  tokenizer_simplify_data: true
+  vocab_size: 259
+  lm_head_type: byte_level
+  lm_head_normalization: rms_norm
+  lm_head_bias: false
+  lm_head_dropout: 0.0
+  model_shell_type: byte_autoencoder_shell
+  embedding_weight_tying: true
+  ffn_weight_tying: false
+  cproj_weight_tying: false
+  positional_encoding_type: rope
+trainer:
+  trainer_type: base_trainer
+  dataset: fineweb_edu_10B
+  batch_size: 6
+  gradient_accumulation_steps: 8
+  max_iters: 10000
+  eval_interval: 50000000
+  log_interval: 1
+  checkpoint_interval: 1000
+  eval_iters: 1000
+  run_eval: false
+  eval:
+    mcq_benchmarks: null
+    mcq_num_samples: 1000
+    eval_byte_metrics: false
+    text_modeling_eval: false
+    text_generation_eval: false
+  optimizer:
+    optimizer_name: adamW
+    lr: 0.0005
+    min_lr: 5.0e-05
+    weight_decay: 0.01
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+  lr_scheduler:
+    name: cosine
+    warmup_iters: 100
+  dataloader:
+    name: autoencoder
+  datasampling:
+    name: standard
+  loss_fn:
+    name: pass_through
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+    wandb_run_name: null
+    group_name: experimental_byte_level
+  paths:
+    output_dir: outputs
+    data_dir: data
+    checkpoint_dir: checkpoints
+    eval_dir: evals
+  seed: 489
+  device: cuda
2024-09-23/08-39-13/.hydra/hydra.yaml
ADDED
@@ -0,0 +1,154 @@
+hydra:
+  run:
+    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: train
+    chdir: null
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: experimental/byte_autoencoder_1
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /root/SuperTinyLanguageModels
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /root/SuperTinyLanguageModels/configs/train
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/08-39-13
+    choices:
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false
2024-09-23/08-39-13/.hydra/overrides.yaml
ADDED
@@ -0,0 +1 @@
+[]
2024-09-23/08-39-13/train.log
ADDED
File without changes
2024-09-23/08-40-08/.hydra/config.yaml
ADDED
@@ -0,0 +1,74 @@
+experimental:
+model:
+  core_model_type: pass_through
+  hidden_dim: 384
+  byte_hidden: 128
+  max_chunk_length: 12
+  max_num_chunks: 1024
+  num_delimiter_layers: 3
+  num_byte_decoder_layers: 5
+  target_chunk_len: 8.0
+  chunk_len_loss_weight: 0.1
+  chunk_len_penalty: 0.1
+  context_window: 8192
+  embedding_model_type: byte_level
+  tokenizer_type: bpe
+  tokenizer_dataset_name: simple_en_wiki
+  tokenizer_simplify_data: true
+  vocab_size: 259
+  lm_head_type: byte_level
+  lm_head_normalization: rms_norm
+  lm_head_bias: false
+  lm_head_dropout: 0.0
+  model_shell_type: byte_autoencoder_shell
+  embedding_weight_tying: true
+  ffn_weight_tying: false
+  cproj_weight_tying: false
+  positional_encoding_type: rope
+trainer:
+  trainer_type: base_trainer
+  dataset: fineweb_edu_10B
+  batch_size: 6
+  gradient_accumulation_steps: 8
+  max_iters: 10000
+  eval_interval: 50000000
+  log_interval: 1
+  checkpoint_interval: 1000
+  eval_iters: 1000
+  run_eval: false
+  eval:
+    mcq_benchmarks: null
+    mcq_num_samples: 1000
+    eval_byte_metrics: false
+    text_modeling_eval: false
+    text_generation_eval: false
+  optimizer:
+    optimizer_name: adamW
+    lr: 0.0005
+    min_lr: 5.0e-05
+    weight_decay: 0.01
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+  lr_scheduler:
+    name: cosine
+    warmup_iters: 100
+  dataloader:
+    name: autoencoder
+  datasampling:
+    name: standard
+  loss_fn:
+    name: pass_through
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+    wandb_run_name: null
+    group_name: experimental_byte_level
+  paths:
+    output_dir: outputs
+    data_dir: data
+    checkpoint_dir: checkpoints
+    eval_dir: evals
+  seed: 489
+  device: cuda
2024-09-23/08-40-08/.hydra/hydra.yaml
ADDED
@@ -0,0 +1,154 @@
+hydra:
+  run:
+    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: train
+    chdir: null
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: experimental/byte_autoencoder_1
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /root/SuperTinyLanguageModels
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /root/SuperTinyLanguageModels/configs/train
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08
+    choices:
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false
2024-09-23/08-40-08/.hydra/overrides.yaml
ADDED
@@ -0,0 +1 @@
+[]
2024-09-23/08-40-08/train.log
ADDED
File without changes
2024-09-23/08-40-08/wandb/debug-internal.log
ADDED
@@ -0,0 +1,14 @@
+{"time":"2024-09-23T09:14:22.59580271Z","level":"INFO","msg":"using version","core version":"0.18.1"}
+{"time":"2024-09-23T09:14:22.59581747Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log"}
+{"time":"2024-09-23T09:14:22.595881422Z","level":"INFO","msg":"using version","core version":"0.18.1"}
+{"time":"2024-09-23T09:14:22.595887882Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log"}
+{"time":"2024-09-23T09:14:22.59917443Z","level":"INFO","msg":"created new stream","id":"a2kxhd8v"}
+{"time":"2024-09-23T09:14:22.59919309Z","level":"INFO","msg":"stream: started","id":"a2kxhd8v"}
+{"time":"2024-09-23T09:14:22.59921417Z","level":"INFO","msg":"sender: started","stream_id":{"value":"a2kxhd8v"}}
+{"time":"2024-09-23T09:14:22.599226691Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"a2kxhd8v"}}
+{"time":"2024-09-23T09:14:22.599236461Z","level":"INFO","msg":"handler: started","stream_id":{"value":"a2kxhd8v"}}
+{"time":"2024-09-23T09:14:22.982350736Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
+{"time":"2024-09-23T09:14:22.985015444Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-09-23T09:14:27.10372121Z","level":"INFO","msg":"stream: closing","id":"a2kxhd8v"}
+{"time":"2024-09-23T09:14:27.103806442Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-09-23T09:14:27.104964992Z","level":"INFO","msg":"Stopped system monitor"}
2024-09-23/08-40-08/wandb/debug.log
ADDED
@@ -0,0 +1,26 @@
+2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
+2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Configure stats pid to 78108
+2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/settings
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/SuperTinyLanguageModels/train.py', 'program': '/root/SuperTinyLanguageModels/train.py'}
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Applying login settings: {}
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:_log_setup():532] Logging user logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug.log
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:_log_setup():533] Logging internal logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-internal.log
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():616] calling init triggers
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
+config: {'model': {'core_model_type': 'pass_through', 'hidden_dim': 384, 'byte_hidden': 128, 'max_chunk_length': 12, 'max_num_chunks': 1024, 'num_delimiter_layers': 3, 'num_byte_decoder_layers': 5, 'target_chunk_len': 8.0, 'chunk_len_loss_weight': 0.1, 'chunk_len_penalty': 0.1, 'context_window': 8192, 'embedding_model_type': 'byte_level', 'tokenizer_type': 'bpe', 'tokenizer_dataset_name': 'simple_en_wiki', 'tokenizer_simplify_data': True, 'vocab_size': 259, 'lm_head_type': 'byte_level', 'lm_head_normalization': 'rms_norm', 'lm_head_bias': False, 'lm_head_dropout': 0.0, 'model_shell_type': 'byte_autoencoder_shell', 'embedding_weight_tying': True, 'ffn_weight_tying': False, 'cproj_weight_tying': False, 'positional_encoding_type': 'rope'}, 'trainer': {'trainer_type': 'base_trainer', 'dataset': 'fineweb_edu_10B', 'batch_size': 6, 'gradient_accumulation_steps': 8, 'max_iters': 10000, 'eval_interval': 50000000, 'log_interval': 1, 'checkpoint_interval': 1000, 'eval_iters': 1000, 'run_eval': False, 'eval': {'mcq_benchmarks': None, 'mcq_num_samples': 1000, 'eval_byte_metrics': False, 'text_modeling_eval': False, 'text_generation_eval': False}, 'optimizer': {'optimizer_name': 'adamW', 'lr': 0.0005, 'min_lr': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0}, 'lr_scheduler': {'name': 'cosine', 'warmup_iters': 100}, 'dataloader': {'name': 'autoencoder'}, 'datasampling': {'name': 'standard'}, 'loss_fn': {'name': 'pass_through'}}, 'general': {'logging': {'wandb_log': True, 'wandb_project': 'SuperTinyLanguageModels', 'wandb_run_name': None, 'group_name': 'experimental_byte_level'}, 'paths': {'output_dir': 'outputs', 'data_dir': '/root/SuperTinyLanguageModels/data', 'checkpoint_dir': 'checkpoints', 'eval_dir': '/root/SuperTinyLanguageModels/evals'}, 'seed': 489, 'device': 'cuda'}}
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():666] starting backend
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():670] setting up manager
+2024-09-23 09:14:22,584 INFO MainThread:78108 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-09-23 09:14:22,586 INFO MainThread:78108 [wandb_init.py:init():678] backend started and connected
+2024-09-23 09:14:22,588 INFO MainThread:78108 [wandb_init.py:init():773] updated telemetry
+2024-09-23 09:14:22,598 INFO MainThread:78108 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
+2024-09-23 09:14:22,974 INFO MainThread:78108 [wandb_init.py:init():857] starting run threads in backend
+2024-09-23 09:14:23,128 INFO MainThread:78108 [wandb_run.py:_console_start():2459] atexit reg
+2024-09-23 09:14:23,128 INFO MainThread:78108 [wandb_run.py:_redirect():2307] redirect: wrap_raw
+2024-09-23 09:14:23,129 INFO MainThread:78108 [wandb_run.py:_redirect():2372] Wrapping output streams.
+2024-09-23 09:14:23,129 INFO MainThread:78108 [wandb_run.py:_redirect():2397] Redirects installed.
+2024-09-23 09:14:23,135 INFO MainThread:78108 [wandb_init.py:init():900] run started, returning control to user process
+2024-09-23 09:14:27,104 WARNING MsgRouterThr:78108 [router.py:message_loop():77] message_loop has been closed
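The debug log above shows wandb.init being called with the composed Hydra config, driven by the general.logging block (wandb_project, group_name, wandb_run_name: null). A minimal sketch of the equivalent call; the config excerpt is illustrative only, and the project's actual wrapper code is not shown in this diff:

import wandb

config = {"model": {"hidden_dim": 384}, "trainer": {"batch_size": 6}}  # excerpt of the logged config

run = wandb.init(
    project="SuperTinyLanguageModels",  # general.logging.wandb_project
    group="experimental_byte_level",    # general.logging.group_name
    name=None,                          # wandb_run_name is null, so wandb auto-generates a name
    config=config,
)
run.finish()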
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/config.yaml
ADDED
@@ -0,0 +1,114 @@
+_wandb:
+  value:
+    cli_version: 0.18.1
+    m: []
+    python_version: 3.10.14
+    t:
+      "1":
+      - 1
+      - 5
+      - 11
+      - 49
+      - 50
+      - 51
+      - 53
+      - 55
+      "2":
+      - 1
+      - 5
+      - 11
+      - 49
+      - 50
+      - 51
+      - 53
+      - 55
+      "3":
+      - 13
+      - 15
+      - 16
+      - 23
+      - 55
+      "4": 3.10.14
+      "5": 0.18.1
+      "6": 4.44.2
+      "8":
+      - 5
+      - 9
+      "12": 0.18.1
+      "13": linux-x86_64
+general:
+  value:
+    device: cuda
+    logging:
+      group_name: experimental_byte_level
+      wandb_log: true
+      wandb_project: SuperTinyLanguageModels
+      wandb_run_name: null
+    paths:
+      checkpoint_dir: checkpoints
+      data_dir: /root/SuperTinyLanguageModels/data
+      eval_dir: /root/SuperTinyLanguageModels/evals
+      output_dir: outputs
+    seed: 489
+model:
+  value:
+    byte_hidden: 128
+    chunk_len_loss_weight: 0.1
+    chunk_len_penalty: 0.1
+    context_window: 8192
+    core_model_type: pass_through
+    cproj_weight_tying: false
+    embedding_model_type: byte_level
+    embedding_weight_tying: true
+    ffn_weight_tying: false
+    hidden_dim: 384
+    lm_head_bias: false
+    lm_head_dropout: 0
+    lm_head_normalization: rms_norm
+    lm_head_type: byte_level
+    max_chunk_length: 12
+    max_num_chunks: 1024
+    model_shell_type: byte_autoencoder_shell
+    num_byte_decoder_layers: 5
+    num_delimiter_layers: 3
+    positional_encoding_type: rope
+    target_chunk_len: 8
+    tokenizer_dataset_name: simple_en_wiki
+    tokenizer_simplify_data: true
+    tokenizer_type: bpe
+    vocab_size: 259
+trainer:
+  value:
+    batch_size: 6
+    checkpoint_interval: 1000
+    dataloader:
+      name: autoencoder
+    datasampling:
+      name: standard
+    dataset: fineweb_edu_10B
+    eval:
+      eval_byte_metrics: false
+      mcq_benchmarks: null
+      mcq_num_samples: 1000
+      text_generation_eval: false
+      text_modeling_eval: false
+    eval_interval: 50000000
+    eval_iters: 1000
+    gradient_accumulation_steps: 8
+    log_interval: 1
+    loss_fn:
+      name: pass_through
+    lr_scheduler:
+      name: cosine
+      warmup_iters: 100
+    max_iters: 10000
+    optimizer:
+      beta1: 0.9
+      beta2: 0.95
+      grad_clip: 1
+      lr: 0.0005
+      min_lr: 5e-05
+      optimizer_name: adamW
+      weight_decay: 0.01
+    run_eval: false
+    trainer_type: base_trainer
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/output.log
ADDED
@@ -0,0 +1,3 @@
+Weight and Biases Initialized
+Rank0 Trainer built
+Training loop is starting
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/requirements.txt
ADDED
@@ -0,0 +1,121 @@
+setuptools==75.1.0
+wheel==0.44.0
+pip==24.2
+wcwidth==0.2.13
+sentencepiece==0.2.0
+pytz==2024.2
+mpmath==1.3.0
+distlib==0.3.8
+antlr4-python3-runtime==4.9.3
+xxhash==3.5.0
+urllib3==2.2.3
+tzdata==2024.1
+typing_extensions==4.12.2
+tqdm==4.66.5
+threadpoolctl==3.5.0
+sympy==1.13.3
+smmap==5.0.1
+six==1.16.0
+setproctitle==1.3.3
+safetensors==0.4.5
+regex==2024.9.11
+rapidfuzz==3.9.7
+PyYAML==6.0.2
+pytrec-eval-terrier==0.5.6
+pyphen==0.16.0
+Pygments==2.18.0
+psutil==6.0.0
+protobuf==5.28.2
+prettytable==3.11.0
+polars==1.7.1
+platformdirs==4.3.6
+pillow==10.4.0
+packaging==24.1
+nvidia-nvtx-cu12==12.1.105
+nvidia-nvjitlink-cu12==12.6.68
+nvidia-nccl-cu12==2.20.5
+nvidia-curand-cu12==10.3.2.106
+nvidia-cufft-cu12==11.0.2.54
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cublas-cu12==12.1.3.1
+numpy==1.26.4
+nodeenv==1.9.1
+networkx==3.3
+mdurl==0.1.2
+MarkupSafe==2.1.5
+joblib==1.4.2
+idna==3.10
+identify==2.6.1
+fsspec==2024.6.1
+frozenlist==1.4.1
+filelock==3.16.1
+eval_type_backport==0.2.0
+dill==0.3.8
+click==8.1.7
+charset-normalizer==3.3.2
+cfgv==3.4.0
+certifi==2024.8.30
+attrs==24.2.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+aiohappyeyeballs==2.4.0
+virtualenv==20.26.5
+triton==3.0.0
+textstat==0.7.4
+sentry-sdk==2.14.0
+scipy==1.14.1
+requests==2.32.3
+python-dateutil==2.9.0.post0
+pydantic_core==2.23.4
+pyarrow==17.0.0
+omegaconf==2.3.0
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-cudnn-cu12==9.1.0.70
+nltk==3.9.1
+multiprocess==0.70.16
+multidict==6.1.0
+markdown-it-py==3.0.0
+Levenshtein==0.26.0
+Jinja2==3.1.4
+gitdb==4.0.11
+docker-pycreds==0.4.0
+aiosignal==1.3.1
+yarl==1.11.1
+tiktoken==0.7.0
+scikit-learn==1.5.2
+rich==13.8.1
+pydantic==2.9.2
+pre-commit==3.8.0
+pandas==2.2.3
+nvidia-cusolver-cu12==11.4.5.107
+language_tool_python==2.8.1
+hydra-core==1.3.2
+huggingface-hub==0.25.0
+GitPython==3.1.43
+wandb==0.18.1
+torch==2.4.1
+tokenizers==0.19.1
+aiohttp==3.10.5
+transformers==4.44.2
+sentence-transformers==3.1.1
+datasets==3.0.0
+mteb==1.14.21
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/wandb-metadata.json
ADDED
@@ -0,0 +1,88 @@
+{
+  "os": "Linux-5.15.0-117-generic-x86_64-with-glibc2.31",
+  "python": "3.10.14",
+  "startedAt": "2024-09-23T09:14:22.586171Z",
+  "args": [
+    "--config-name",
+    "experimental/byte_autoencoder_1"
+  ],
+  "program": "/root/SuperTinyLanguageModels/train.py",
+  "codePath": "train.py",
+  "git": {
+    "remote": "https://github.com/LeonGuertler/SuperTinyLanguageModels.git",
+    "commit": "ebdf9039e89c5d337997d0c2b11bf4e992886243"
+  },
+  "email": "[email protected]",
+  "root": "/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08",
+  "host": "11c6e13f6a55",
+  "username": "root",
+  "executable": "/root/SuperTinyLanguageModels/.conda/bin/python3",
+  "cpu_count": 128,
+  "cpu_count_logical": 256,
+  "gpu": "[NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090]",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "1123133947904",
+      "used": "551794225152"
+    }
+  },
+  "memory": {
+    "total": "540812599296"
+  },
+  "cpu": {
+    "count": 128,
+    "countLogical": 256
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    }
+  ],
+  "cudaVersion": "12.5"
+}
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
+{"_wandb":{"runtime":4}}
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log
ADDED
@@ -0,0 +1,12 @@
+{"time":"2024-09-23T09:14:21.933081362Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp9hgpve6u/port-78108.txt","pid":78108,"debug":false,"disable-analytics":false}
+{"time":"2024-09-23T09:14:21.933136193Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
+{"time":"2024-09-23T09:14:21.935284221Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":78108}
+{"time":"2024-09-23T09:14:21.935348272Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":43153,"Zone":""}}
+{"time":"2024-09-23T09:14:22.076126266Z","level":"INFO","msg":"created new connection","id":"127.0.0.1:57616"}
+{"time":"2024-09-23T09:14:22.595626377Z","level":"INFO","msg":"connection init received","streamId":"a2kxhd8v","id":"127.0.0.1:57616"}
+{"time":"2024-09-23T09:14:22.595853241Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240923_091421.log /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log: file exists"}
+{"time":"2024-09-23T09:14:22.59919809Z","level":"INFO","msg":"connection init completed","streamId":"a2kxhd8v","id":"127.0.0.1:57616"}
+{"time":"2024-09-23T09:14:27.103590738Z","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:57616"}
+{"time":"2024-09-23T09:14:27.103797162Z","level":"INFO","msg":"server is shutting down"}
+{"time":"2024-09-23T09:14:27.104072727Z","level":"INFO","msg":"closed connection","id":"127.0.0.1:57616"}
+{"time":"2024-09-23T09:14:28.465863147Z","level":"INFO","msg":"Parent process exited, terminating service process."}
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-internal.log
ADDED
@@ -0,0 +1,14 @@
+{"time":"2024-09-23T09:14:22.59580271Z","level":"INFO","msg":"using version","core version":"0.18.1"}
+{"time":"2024-09-23T09:14:22.59581747Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log"}
+{"time":"2024-09-23T09:14:22.595881422Z","level":"INFO","msg":"using version","core version":"0.18.1"}
+{"time":"2024-09-23T09:14:22.595887882Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log"}
+{"time":"2024-09-23T09:14:22.59917443Z","level":"INFO","msg":"created new stream","id":"a2kxhd8v"}
+{"time":"2024-09-23T09:14:22.59919309Z","level":"INFO","msg":"stream: started","id":"a2kxhd8v"}
+{"time":"2024-09-23T09:14:22.59921417Z","level":"INFO","msg":"sender: started","stream_id":{"value":"a2kxhd8v"}}
+{"time":"2024-09-23T09:14:22.599226691Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"a2kxhd8v"}}
+{"time":"2024-09-23T09:14:22.599236461Z","level":"INFO","msg":"handler: started","stream_id":{"value":"a2kxhd8v"}}
+{"time":"2024-09-23T09:14:22.982350736Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
+{"time":"2024-09-23T09:14:22.985015444Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-09-23T09:14:27.10372121Z","level":"INFO","msg":"stream: closing","id":"a2kxhd8v"}
+{"time":"2024-09-23T09:14:27.103806442Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-09-23T09:14:27.104964992Z","level":"INFO","msg":"Stopped system monitor"}
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug.log
ADDED
@@ -0,0 +1,26 @@
+2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
+2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Configure stats pid to 78108
+2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/settings
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/SuperTinyLanguageModels/train.py', 'program': '/root/SuperTinyLanguageModels/train.py'}
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Applying login settings: {}
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:_log_setup():532] Logging user logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug.log
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:_log_setup():533] Logging internal logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-internal.log
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():616] calling init triggers
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
+config: {'model': {'core_model_type': 'pass_through', 'hidden_dim': 384, 'byte_hidden': 128, 'max_chunk_length': 12, 'max_num_chunks': 1024, 'num_delimiter_layers': 3, 'num_byte_decoder_layers': 5, 'target_chunk_len': 8.0, 'chunk_len_loss_weight': 0.1, 'chunk_len_penalty': 0.1, 'context_window': 8192, 'embedding_model_type': 'byte_level', 'tokenizer_type': 'bpe', 'tokenizer_dataset_name': 'simple_en_wiki', 'tokenizer_simplify_data': True, 'vocab_size': 259, 'lm_head_type': 'byte_level', 'lm_head_normalization': 'rms_norm', 'lm_head_bias': False, 'lm_head_dropout': 0.0, 'model_shell_type': 'byte_autoencoder_shell', 'embedding_weight_tying': True, 'ffn_weight_tying': False, 'cproj_weight_tying': False, 'positional_encoding_type': 'rope'}, 'trainer': {'trainer_type': 'base_trainer', 'dataset': 'fineweb_edu_10B', 'batch_size': 6, 'gradient_accumulation_steps': 8, 'max_iters': 10000, 'eval_interval': 50000000, 'log_interval': 1, 'checkpoint_interval': 1000, 'eval_iters': 1000, 'run_eval': False, 'eval': {'mcq_benchmarks': None, 'mcq_num_samples': 1000, 'eval_byte_metrics': False, 'text_modeling_eval': False, 'text_generation_eval': False}, 'optimizer': {'optimizer_name': 'adamW', 'lr': 0.0005, 'min_lr': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0}, 'lr_scheduler': {'name': 'cosine', 'warmup_iters': 100}, 'dataloader': {'name': 'autoencoder'}, 'datasampling': {'name': 'standard'}, 'loss_fn': {'name': 'pass_through'}}, 'general': {'logging': {'wandb_log': True, 'wandb_project': 'SuperTinyLanguageModels', 'wandb_run_name': None, 'group_name': 'experimental_byte_level'}, 'paths': {'output_dir': 'outputs', 'data_dir': '/root/SuperTinyLanguageModels/data', 'checkpoint_dir': 'checkpoints', 'eval_dir': '/root/SuperTinyLanguageModels/evals'}, 'seed': 489, 'device': 'cuda'}}
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():666] starting backend
+2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():670] setting up manager
+2024-09-23 09:14:22,584 INFO MainThread:78108 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-09-23 09:14:22,586 INFO MainThread:78108 [wandb_init.py:init():678] backend started and connected
+2024-09-23 09:14:22,588 INFO MainThread:78108 [wandb_init.py:init():773] updated telemetry
+2024-09-23 09:14:22,598 INFO MainThread:78108 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
+2024-09-23 09:14:22,974 INFO MainThread:78108 [wandb_init.py:init():857] starting run threads in backend
+2024-09-23 09:14:23,128 INFO MainThread:78108 [wandb_run.py:_console_start():2459] atexit reg
+2024-09-23 09:14:23,128 INFO MainThread:78108 [wandb_run.py:_redirect():2307] redirect: wrap_raw
+2024-09-23 09:14:23,129 INFO MainThread:78108 [wandb_run.py:_redirect():2372] Wrapping output streams.
+2024-09-23 09:14:23,129 INFO MainThread:78108 [wandb_run.py:_redirect():2397] Redirects installed.
+2024-09-23 09:14:23,135 INFO MainThread:78108 [wandb_init.py:init():900] run started, returning control to user process
+2024-09-23 09:14:27,104 WARNING MsgRouterThr:78108 [router.py:message_loop():77] message_loop has been closed
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/run-a2kxhd8v.wandb
ADDED
File without changes
2024-09-23/09-32-28/.hydra/config.yaml
ADDED
@@ -0,0 +1,74 @@
+experimental:
+model:
+  core_model_type: pass_through
+  hidden_dim: 384
+  byte_hidden: 128
+  max_chunk_length: 12
+  max_num_chunks: 1024
+  num_delimiter_layers: 3
+  num_byte_decoder_layers: 5
+  target_chunk_len: 8.0
+  chunk_len_loss_weight: 0.1
+  chunk_len_penalty: 0.1
+  context_window: 8192
+  embedding_model_type: byte_level
+  tokenizer_type: bpe
+  tokenizer_dataset_name: simple_en_wiki
+  tokenizer_simplify_data: true
+  vocab_size: 259
+  lm_head_type: byte_level
+  lm_head_normalization: rms_norm
+  lm_head_bias: false
+  lm_head_dropout: 0.0
+  model_shell_type: byte_autoencoder_shell
+  embedding_weight_tying: true
+  ffn_weight_tying: false
+  cproj_weight_tying: false
+  positional_encoding_type: rope
+trainer:
+  trainer_type: base_trainer
+  dataset: fineweb_edu_10B
+  batch_size: 6
+  gradient_accumulation_steps: 8
+  max_iters: 10000
+  eval_interval: 50000000
+  log_interval: 1
+  checkpoint_interval: 1000
+  eval_iters: 1000
+  run_eval: false
+  eval:
+    mcq_benchmarks: null
+    mcq_num_samples: 1000
+    eval_byte_metrics: false
+    text_modeling_eval: false
+    text_generation_eval: false
+  optimizer:
+    optimizer_name: adamW
+    lr: 0.0005
+    min_lr: 5.0e-05
+    weight_decay: 0.01
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+  lr_scheduler:
+    name: cosine
+    warmup_iters: 100
+  dataloader:
+    name: autoencoder
+  datasampling:
+    name: standard
+  loss_fn:
+    name: pass_through
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+    wandb_run_name: null
+    group_name: experimental_byte_level
+  paths:
+    output_dir: outputs
+    data_dir: data
+    checkpoint_dir: checkpoints
+    eval_dir: evals
+  seed: 489
+  device: cuda
2024-09-23/09-32-28/.hydra/hydra.yaml
ADDED
@@ -0,0 +1,154 @@
+hydra:
+  run:
+    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: train
+    chdir: null
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: experimental/byte_autoencoder_1
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /root/SuperTinyLanguageModels
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /root/SuperTinyLanguageModels/configs/train
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28
+    choices:
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false
2024-09-23/09-32-28/.hydra/overrides.yaml
ADDED
@@ -0,0 +1 @@
+[]
2024-09-23/09-32-28/train.log
ADDED
File without changes
2024-09-23/09-32-28/wandb/debug-internal.log
ADDED
@@ -0,0 +1,18 @@
+{"time":"2024-09-23T09:32:37.2270228Z","level":"INFO","msg":"using version","core version":"0.18.1"}
+{"time":"2024-09-23T09:32:37.227060611Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log"}
+{"time":"2024-09-23T09:32:37.227169702Z","level":"INFO","msg":"using version","core version":"0.18.1"}
+{"time":"2024-09-23T09:32:37.227182172Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log"}
+{"time":"2024-09-23T09:32:37.230824708Z","level":"INFO","msg":"created new stream","id":"tkkvzfon"}
+{"time":"2024-09-23T09:32:37.230859859Z","level":"INFO","msg":"stream: started","id":"tkkvzfon"}
+{"time":"2024-09-23T09:32:37.230903499Z","level":"INFO","msg":"sender: started","stream_id":{"value":"tkkvzfon"}}
+{"time":"2024-09-23T09:32:37.23092371Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"tkkvzfon"}}
+{"time":"2024-09-23T09:32:37.23097304Z","level":"INFO","msg":"handler: started","stream_id":{"value":"tkkvzfon"}}
+{"time":"2024-09-23T09:32:37.634282756Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
+{"time":"2024-09-23T09:32:37.636527894Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-09-23T09:33:46.746283667Z","level":"INFO","msg":"stream: closing","id":"tkkvzfon"}
+{"time":"2024-09-23T09:33:46.746349498Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-09-23T09:33:46.747359311Z","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-09-23T09:33:49.926631346Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"tkkvzfon"}}
+{"time":"2024-09-23T09:33:49.926725448Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"tkkvzfon"}}
+{"time":"2024-09-23T09:33:49.926795918Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"tkkvzfon"}}
+{"time":"2024-09-23T09:33:49.927056922Z","level":"INFO","msg":"stream: closed","id":"tkkvzfon"}
2024-09-23/09-32-28/wandb/debug.log
ADDED
@@ -0,0 +1,26 @@
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Configure stats pid to 81916
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/settings
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/SuperTinyLanguageModels/train.py', 'program': '/root/SuperTinyLanguageModels/train.py'}
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Applying login settings: {}
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_init.py:_log_setup():532] Logging user logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug.log
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_init.py:_log_setup():533] Logging internal logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-internal.log
+2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():616] calling init triggers
+2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
+config: {'model': {'core_model_type': 'pass_through', 'hidden_dim': 384, 'byte_hidden': 128, 'max_chunk_length': 12, 'max_num_chunks': 1024, 'num_delimiter_layers': 3, 'num_byte_decoder_layers': 5, 'target_chunk_len': 8.0, 'chunk_len_loss_weight': 0.1, 'chunk_len_penalty': 0.1, 'context_window': 8192, 'embedding_model_type': 'byte_level', 'tokenizer_type': 'bpe', 'tokenizer_dataset_name': 'simple_en_wiki', 'tokenizer_simplify_data': True, 'vocab_size': 259, 'lm_head_type': 'byte_level', 'lm_head_normalization': 'rms_norm', 'lm_head_bias': False, 'lm_head_dropout': 0.0, 'model_shell_type': 'byte_autoencoder_shell', 'embedding_weight_tying': True, 'ffn_weight_tying': False, 'cproj_weight_tying': False, 'positional_encoding_type': 'rope'}, 'trainer': {'trainer_type': 'base_trainer', 'dataset': 'fineweb_edu_10B', 'batch_size': 6, 'gradient_accumulation_steps': 8, 'max_iters': 10000, 'eval_interval': 50000000, 'log_interval': 1, 'checkpoint_interval': 1000, 'eval_iters': 1000, 'run_eval': False, 'eval': {'mcq_benchmarks': None, 'mcq_num_samples': 1000, 'eval_byte_metrics': False, 'text_modeling_eval': False, 'text_generation_eval': False}, 'optimizer': {'optimizer_name': 'adamW', 'lr': 0.0005, 'min_lr': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0}, 'lr_scheduler': {'name': 'cosine', 'warmup_iters': 100}, 'dataloader': {'name': 'autoencoder'}, 'datasampling': {'name': 'standard'}, 'loss_fn': {'name': 'pass_through'}}, 'general': {'logging': {'wandb_log': True, 'wandb_project': 'SuperTinyLanguageModels', 'wandb_run_name': None, 'group_name': 'experimental_byte_level'}, 'paths': {'output_dir': 'outputs', 'data_dir': '/root/SuperTinyLanguageModels/data', 'checkpoint_dir': 'checkpoints', 'eval_dir': '/root/SuperTinyLanguageModels/evals'}, 'seed': 489, 'device': 'cuda'}}
+2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():666] starting backend
+2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():670] setting up manager
+2024-09-23 09:32:37,223 INFO MainThread:81916 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-09-23 09:32:37,224 INFO MainThread:81916 [wandb_init.py:init():678] backend started and connected
+2024-09-23 09:32:37,227 INFO MainThread:81916 [wandb_init.py:init():773] updated telemetry
+2024-09-23 09:32:37,236 INFO MainThread:81916 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
+2024-09-23 09:32:37,631 INFO MainThread:81916 [wandb_init.py:init():857] starting run threads in backend
+2024-09-23 09:32:37,802 INFO MainThread:81916 [wandb_run.py:_console_start():2459] atexit reg
+2024-09-23 09:32:37,802 INFO MainThread:81916 [wandb_run.py:_redirect():2307] redirect: wrap_raw
+2024-09-23 09:32:37,803 INFO MainThread:81916 [wandb_run.py:_redirect():2372] Wrapping output streams.
+2024-09-23 09:32:37,803 INFO MainThread:81916 [wandb_run.py:_redirect():2397] Redirects installed.
+2024-09-23 09:32:37,806 INFO MainThread:81916 [wandb_init.py:init():900] run started, returning control to user process
+2024-09-23 09:33:46,746 WARNING MsgRouterThr:81916 [router.py:message_loop():77] message_loop has been closed
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/config.yaml
ADDED
@@ -0,0 +1,115 @@
+_wandb:
+  value:
+    cli_version: 0.18.1
+    m: []
+    python_version: 3.10.14
+    t:
+      "1":
+      - 1
+      - 5
+      - 11
+      - 49
+      - 50
+      - 51
+      - 53
+      - 55
+      "2":
+      - 1
+      - 5
+      - 11
+      - 49
+      - 50
+      - 51
+      - 53
+      - 55
+      "3":
+      - 13
+      - 15
+      - 16
+      - 23
+      - 55
+      - 61
+      "4": 3.10.14
+      "5": 0.18.1
+      "6": 4.44.2
+      "8":
+      - 5
+      - 9
+      "12": 0.18.1
+      "13": linux-x86_64
+general:
+  value:
+    device: cuda
+    logging:
+      group_name: experimental_byte_level
+      wandb_log: true
+      wandb_project: SuperTinyLanguageModels
+      wandb_run_name: null
+    paths:
+      checkpoint_dir: checkpoints
+      data_dir: /root/SuperTinyLanguageModels/data
+      eval_dir: /root/SuperTinyLanguageModels/evals
+      output_dir: outputs
+    seed: 489
+model:
+  value:
+    byte_hidden: 128
+    chunk_len_loss_weight: 0.1
+    chunk_len_penalty: 0.1
+    context_window: 8192
+    core_model_type: pass_through
+    cproj_weight_tying: false
+    embedding_model_type: byte_level
+    embedding_weight_tying: true
+    ffn_weight_tying: false
+    hidden_dim: 384
+    lm_head_bias: false
+    lm_head_dropout: 0
+    lm_head_normalization: rms_norm
+    lm_head_type: byte_level
+    max_chunk_length: 12
+    max_num_chunks: 1024
+    model_shell_type: byte_autoencoder_shell
+    num_byte_decoder_layers: 5
+    num_delimiter_layers: 3
+    positional_encoding_type: rope
+    target_chunk_len: 8
+    tokenizer_dataset_name: simple_en_wiki
+    tokenizer_simplify_data: true
+    tokenizer_type: bpe
+    vocab_size: 259
+trainer:
+  value:
+    batch_size: 6
+    checkpoint_interval: 1000
+    dataloader:
+      name: autoencoder
+    datasampling:
+      name: standard
+    dataset: fineweb_edu_10B
+    eval:
+      eval_byte_metrics: false
+      mcq_benchmarks: null
+      mcq_num_samples: 1000
+      text_generation_eval: false
+      text_modeling_eval: false
+    eval_interval: 50000000
+    eval_iters: 1000
+    gradient_accumulation_steps: 8
+    log_interval: 1
+    loss_fn:
+      name: pass_through
+    lr_scheduler:
+      name: cosine
+      warmup_iters: 100
+    max_iters: 10000
+    optimizer:
+      beta1: 0.9
+      beta2: 0.95
+      grad_clip: 1
+      lr: 0.0005
+      min_lr: 5e-05
+      optimizer_name: adamW
+      weight_decay: 0.01
+    run_eval: false
+    trainer_type: base_trainer
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/output.log
ADDED
@@ -0,0 +1,33 @@
+Weight and Biases Initialized
+Rank0 Trainer built
+Training loop is starting
+All GPU(s): step 1: loss 10.4062, lr 5.0e-06, dt 2.1s
+All GPU(s): step 2: loss 10.4297, lr 1.0e-05, dt 2.1s
+All GPU(s): step 3: loss 10.3672, lr 1.5e-05, dt 2.1s
+All GPU(s): step 4: loss 10.3203, lr 2.0e-05, dt 2.1s
+All GPU(s): step 5: loss 10.2344, lr 2.5e-05, dt 2.1s
+All GPU(s): step 6: loss 10.1406, lr 3.0e-05, dt 2.1s
+All GPU(s): step 7: loss 10.0234, lr 3.5e-05, dt 2.1s
+All GPU(s): step 8: loss 9.9688, lr 4.0e-05, dt 2.1s
+All GPU(s): step 9: loss 9.8594, lr 4.5e-05, dt 2.2s
+All GPU(s): step 10: loss 9.6328, lr 5.0e-05, dt 2.1s
+All GPU(s): step 11: loss 9.5312, lr 5.5e-05, dt 2.1s
+All GPU(s): step 12: loss 9.3750, lr 6.0e-05, dt 2.1s
+All GPU(s): step 13: loss 9.2109, lr 6.5e-05, dt 2.1s
+All GPU(s): step 14: loss 9.0078, lr 7.0e-05, dt 2.1s
+All GPU(s): step 15: loss 8.8203, lr 7.5e-05, dt 2.1s
+All GPU(s): step 16: loss 8.6562, lr 8.0e-05, dt 2.0s
+All GPU(s): step 17: loss 8.4922, lr 8.5e-05, dt 2.1s
+All GPU(s): step 18: loss 8.2891, lr 9.0e-05, dt 2.1s
+All GPU(s): step 19: loss 8.1328, lr 9.5e-05, dt 2.1s
+All GPU(s): step 20: loss 7.9414, lr 1.0e-04, dt 2.0s
+All GPU(s): step 21: loss 7.7852, lr 1.1e-04, dt 2.1s
+All GPU(s): step 22: loss 7.5977, lr 1.1e-04, dt 2.1s
+All GPU(s): step 23: loss 7.4453, lr 1.2e-04, dt 2.1s
+All GPU(s): step 24: loss 7.3164, lr 1.2e-04, dt 2.1s
+All GPU(s): step 25: loss 7.1836, lr 1.3e-04, dt 2.1s
+All GPU(s): step 26: loss 7.1406, lr 1.3e-04, dt 2.1s
+All GPU(s): step 27: loss 6.9414, lr 1.4e-04, dt 2.1s
+All GPU(s): step 28: loss 6.8633, lr 1.4e-04, dt 2.2s
+All GPU(s): step 29: loss 6.7461, lr 1.5e-04, dt 2.1s
+All GPU(s): step 30: loss 6.5742, lr 1.5e-04, dt 2.1s
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/wandb-metadata.json
ADDED
@@ -0,0 +1,88 @@
+{
+  "os": "Linux-5.15.0-117-generic-x86_64-with-glibc2.31",
+  "python": "3.10.14",
+  "startedAt": "2024-09-23T09:32:37.224689Z",
+  "args": [
+    "--config-name",
+    "experimental/byte_autoencoder_1"
+  ],
+  "program": "/root/SuperTinyLanguageModels/train.py",
+  "codePath": "train.py",
+  "git": {
+    "remote": "https://github.com/LeonGuertler/SuperTinyLanguageModels.git",
+    "commit": "c36bf6b78927d4d365c52a835f0e178edacbab29"
+  },
+  "email": "[email protected]",
+  "root": "/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28",
+  "host": "11c6e13f6a55",
+  "username": "root",
+  "executable": "/root/SuperTinyLanguageModels/.conda/bin/python3",
+  "cpu_count": 128,
+  "cpu_count_logical": 256,
+  "gpu": "[NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090]",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "1123133947904",
+      "used": "551794495488"
+    }
+  },
+  "memory": {
+    "total": "540812599296"
+  },
+  "cpu": {
+    "count": 128,
+    "countLogical": 256
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada"
+    }
+  ],
+  "cudaVersion": "12.5"
+}
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
+{"additional_info/chunk_len_penalty_loss":0,"additional_info/total-loss":6.543508529663086,"_step":1474560,"additional_info/chunk_len_loss":2.0561606884002686,"iter":30,"token_num":1474560,"additional_info/BCE-loss":4.487347602844238,"loss":6.57421875,"lr":0.00015,"_timestamp":1.7270840240730202e+09,"_runtime":69.521643938,"additional_info/average_chunk_length":3.4655094146728516,"_wandb":{"runtime":69}}
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log
ADDED
@@ -0,0 +1,13 @@
+{"time":"2024-09-23T09:32:36.53490736Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmppr55fcxh/port-81916.txt","pid":81916,"debug":false,"disable-analytics":false}
+{"time":"2024-09-23T09:32:36.534984841Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
+{"time":"2024-09-23T09:32:36.551541231Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":81916}
+{"time":"2024-09-23T09:32:36.55148544Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44587,"Zone":""}}
+{"time":"2024-09-23T09:32:36.722786198Z","level":"INFO","msg":"created new connection","id":"127.0.0.1:60908"}
+{"time":"2024-09-23T09:32:37.226730857Z","level":"INFO","msg":"connection init received","streamId":"tkkvzfon","id":"127.0.0.1:60908"}
+{"time":"2024-09-23T09:32:37.227116001Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240923_093236.log /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log: file exists"}
+{"time":"2024-09-23T09:32:37.230871019Z","level":"INFO","msg":"connection init completed","streamId":"tkkvzfon","id":"127.0.0.1:60908"}
+{"time":"2024-09-23T09:33:46.746114105Z","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:60908"}
+{"time":"2024-09-23T09:33:46.746363968Z","level":"INFO","msg":"server is shutting down"}
+{"time":"2024-09-23T09:33:46.746627582Z","level":"INFO","msg":"closed connection","id":"127.0.0.1:60908"}
+{"time":"2024-09-23T09:33:49.927260015Z","level":"INFO","msg":"connection closed","id":"127.0.0.1:60908"}
+{"time":"2024-09-23T09:33:49.927297555Z","level":"INFO","msg":"server is closed"}
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-internal.log
ADDED
@@ -0,0 +1,18 @@
+{"time":"2024-09-23T09:32:37.2270228Z","level":"INFO","msg":"using version","core version":"0.18.1"}
+{"time":"2024-09-23T09:32:37.227060611Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log"}
+{"time":"2024-09-23T09:32:37.227169702Z","level":"INFO","msg":"using version","core version":"0.18.1"}
+{"time":"2024-09-23T09:32:37.227182172Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log"}
+{"time":"2024-09-23T09:32:37.230824708Z","level":"INFO","msg":"created new stream","id":"tkkvzfon"}
+{"time":"2024-09-23T09:32:37.230859859Z","level":"INFO","msg":"stream: started","id":"tkkvzfon"}
+{"time":"2024-09-23T09:32:37.230903499Z","level":"INFO","msg":"sender: started","stream_id":{"value":"tkkvzfon"}}
+{"time":"2024-09-23T09:32:37.23092371Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"tkkvzfon"}}
+{"time":"2024-09-23T09:32:37.23097304Z","level":"INFO","msg":"handler: started","stream_id":{"value":"tkkvzfon"}}
+{"time":"2024-09-23T09:32:37.634282756Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
+{"time":"2024-09-23T09:32:37.636527894Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-09-23T09:33:46.746283667Z","level":"INFO","msg":"stream: closing","id":"tkkvzfon"}
+{"time":"2024-09-23T09:33:46.746349498Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-09-23T09:33:46.747359311Z","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-09-23T09:33:49.926631346Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"tkkvzfon"}}
+{"time":"2024-09-23T09:33:49.926725448Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"tkkvzfon"}}
+{"time":"2024-09-23T09:33:49.926795918Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"tkkvzfon"}}
+{"time":"2024-09-23T09:33:49.927056922Z","level":"INFO","msg":"stream: closed","id":"tkkvzfon"}
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug.log
ADDED
@@ -0,0 +1,26 @@
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Configure stats pid to 81916
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/settings
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/SuperTinyLanguageModels/train.py', 'program': '/root/SuperTinyLanguageModels/train.py'}
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Applying login settings: {}
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_init.py:_log_setup():532] Logging user logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug.log
+2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_init.py:_log_setup():533] Logging internal logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-internal.log
+2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():616] calling init triggers
+2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
+config: {'model': {'core_model_type': 'pass_through', 'hidden_dim': 384, 'byte_hidden': 128, 'max_chunk_length': 12, 'max_num_chunks': 1024, 'num_delimiter_layers': 3, 'num_byte_decoder_layers': 5, 'target_chunk_len': 8.0, 'chunk_len_loss_weight': 0.1, 'chunk_len_penalty': 0.1, 'context_window': 8192, 'embedding_model_type': 'byte_level', 'tokenizer_type': 'bpe', 'tokenizer_dataset_name': 'simple_en_wiki', 'tokenizer_simplify_data': True, 'vocab_size': 259, 'lm_head_type': 'byte_level', 'lm_head_normalization': 'rms_norm', 'lm_head_bias': False, 'lm_head_dropout': 0.0, 'model_shell_type': 'byte_autoencoder_shell', 'embedding_weight_tying': True, 'ffn_weight_tying': False, 'cproj_weight_tying': False, 'positional_encoding_type': 'rope'}, 'trainer': {'trainer_type': 'base_trainer', 'dataset': 'fineweb_edu_10B', 'batch_size': 6, 'gradient_accumulation_steps': 8, 'max_iters': 10000, 'eval_interval': 50000000, 'log_interval': 1, 'checkpoint_interval': 1000, 'eval_iters': 1000, 'run_eval': False, 'eval': {'mcq_benchmarks': None, 'mcq_num_samples': 1000, 'eval_byte_metrics': False, 'text_modeling_eval': False, 'text_generation_eval': False}, 'optimizer': {'optimizer_name': 'adamW', 'lr': 0.0005, 'min_lr': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0}, 'lr_scheduler': {'name': 'cosine', 'warmup_iters': 100}, 'dataloader': {'name': 'autoencoder'}, 'datasampling': {'name': 'standard'}, 'loss_fn': {'name': 'pass_through'}}, 'general': {'logging': {'wandb_log': True, 'wandb_project': 'SuperTinyLanguageModels', 'wandb_run_name': None, 'group_name': 'experimental_byte_level'}, 'paths': {'output_dir': 'outputs', 'data_dir': '/root/SuperTinyLanguageModels/data', 'checkpoint_dir': 'checkpoints', 'eval_dir': '/root/SuperTinyLanguageModels/evals'}, 'seed': 489, 'device': 'cuda'}}
+2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():666] starting backend
+2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():670] setting up manager
+2024-09-23 09:32:37,223 INFO MainThread:81916 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-09-23 09:32:37,224 INFO MainThread:81916 [wandb_init.py:init():678] backend started and connected
+2024-09-23 09:32:37,227 INFO MainThread:81916 [wandb_init.py:init():773] updated telemetry
+2024-09-23 09:32:37,236 INFO MainThread:81916 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
+2024-09-23 09:32:37,631 INFO MainThread:81916 [wandb_init.py:init():857] starting run threads in backend
+2024-09-23 09:32:37,802 INFO MainThread:81916 [wandb_run.py:_console_start():2459] atexit reg
+2024-09-23 09:32:37,802 INFO MainThread:81916 [wandb_run.py:_redirect():2307] redirect: wrap_raw
+2024-09-23 09:32:37,803 INFO MainThread:81916 [wandb_run.py:_redirect():2372] Wrapping output streams.
+2024-09-23 09:32:37,803 INFO MainThread:81916 [wandb_run.py:_redirect():2397] Redirects installed.
+2024-09-23 09:32:37,806 INFO MainThread:81916 [wandb_init.py:init():900] run started, returning control to user process
+2024-09-23 09:33:46,746 WARNING MsgRouterThr:81916 [router.py:message_loop():77] message_loop has been closed
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/run-tkkvzfon.wandb
ADDED
Binary file (124 kB)
2024-09-23/09-33-58/.hydra/config.yaml
ADDED
@@ -0,0 +1,74 @@
+experimental:
+model:
+  core_model_type: pass_through
+  hidden_dim: 384
+  byte_hidden: 128
+  max_chunk_length: 12
+  max_num_chunks: 1024
+  num_delimiter_layers: 3
+  num_byte_decoder_layers: 5
+  target_chunk_len: 8.0
+  chunk_len_loss_weight: 0.1
+  chunk_len_penalty: 0.1
+  context_window: 8192
+  embedding_model_type: byte_level
+  tokenizer_type: bpe
+  tokenizer_dataset_name: simple_en_wiki
+  tokenizer_simplify_data: true
+  vocab_size: 259
+  lm_head_type: byte_level
+  lm_head_normalization: rms_norm
+  lm_head_bias: false
+  lm_head_dropout: 0.0
+  model_shell_type: byte_autoencoder_shell
+  embedding_weight_tying: true
+  ffn_weight_tying: false
+  cproj_weight_tying: false
+  positional_encoding_type: rope
+trainer:
+  trainer_type: base_trainer
+  dataset: fineweb_edu_10B
+  batch_size: 6
+  gradient_accumulation_steps: 8
+  max_iters: 10000
+  eval_interval: 50000000
+  log_interval: 1
+  checkpoint_interval: 1000
+  eval_iters: 1000
+  run_eval: false
+  eval:
+    mcq_benchmarks: null
+    mcq_num_samples: 1000
+    eval_byte_metrics: false
+    text_modeling_eval: false
+    text_generation_eval: false
+  optimizer:
+    optimizer_name: adamW
+    lr: 0.0005
+    min_lr: 5.0e-05
+    weight_decay: 0.01
+    beta1: 0.9
+    beta2: 0.95
+    grad_clip: 1.0
+  lr_scheduler:
+    name: cosine
+    warmup_iters: 100
+  dataloader:
+    name: autoencoder
+  datasampling:
+    name: standard
+  loss_fn:
+    name: pass_through
+general:
+  logging:
+    wandb_log: true
+    wandb_project: SuperTinyLanguageModels
+    wandb_run_name: null
+    group_name: experimental_byte_level
+  paths:
+    output_dir: outputs
+    data_dir: data
+    checkpoint_dir: checkpoints
+    eval_dir: evals
+  seed: 489
+  device: cuda
2024-09-23/09-33-58/.hydra/hydra.yaml
ADDED
@@ -0,0 +1,154 @@
+hydra:
+  run:
+    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: train
+    chdir: null
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: experimental/byte_autoencoder_1
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /root/SuperTinyLanguageModels
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /root/SuperTinyLanguageModels/configs/train
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/09-33-58
+    choices:
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false
2024-09-23/09-33-58/.hydra/overrides.yaml
ADDED
@@ -0,0 +1 @@
+[]
2024-09-23/09-33-58/checkpoints/ckpt_1000.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9e847e5371dfd2f5ac68ee97e737d4ab63d42fdde1c885d6ab4915a9b3ccf83
+size 69377274
2024-09-23/09-33-58/checkpoints/ckpt_2000.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:261a3f997548dd7b6a92a1a7a51b37b1d559a7b64547c95b98a336bdc2685da0
+size 69377274
2024-09-23/09-33-58/checkpoints/ckpt_3000.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51018e44f695f15948d2cbcd014d62113a7a82a67ca7ca25dc767a77c12ae563
+size 69377274
2024-09-23/09-33-58/checkpoints/ckpt_4000.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf165a859555ddeb74ad0c7b6e10f17fa5f91c1b060a14bd77dd7fedbde5503c
+size 69377274
2024-09-23/09-33-58/checkpoints/ckpt_5000.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99bdcae468dc981532ae56ecd8616824b1cf86801d364510be19a57467a81dbb
+size 69377274