Base Model -- 1 Gig of semi-structured pretraining data:
Merge LORA into instruct model -- 100 MB of structured story-instruct data:

- Story-instruct tune phase 1 (Constant LR, ~1250 steps, 1 epoch)
- Story-instruct tune phase 2 (Cosine LR, ~1250 steps, 1 epoch)
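
The merge step above folds the trained LoRA adapter back into the instruct model's weights. The README doesn't show the actual merge code; below is a minimal sketch of one way to do it with peft (the paths are hypothetical placeholders, and the author may have used a different mechanism):

```python
# Hypothetical merge sketch using peft; all paths are placeholders.
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("path/to/instruct-model")   # placeholder
model = PeftModel.from_pretrained(base, "path/to/story-instruct-lora")  # placeholder
model = model.merge_and_unload()  # fold the LoRA deltas into the base weights
model.save_pretrained("path/to/merged-model")
```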
Trained using <https://github.com/unslothai/unsloth>

Rough script:
```python
import torch
from transformers import TrainingArguments, IntervalStrategy
from trl import SFTTrainer

trainer = SFTTrainer(
    model = model,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        warmup_steps = 45,
        num_train_epochs = 2,
        fp16 = not torch.cuda.is_bf16_supported(),  # fall back to fp16 where bf16 is unavailable
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 15,
        logging_dir = "logs",
        report_to = "tensorboard",
        output_dir = "outputs",
        save_strategy = IntervalStrategy.STEPS,
        save_steps = 100,
        save_total_limit = 30,
        optim = "adamw_torch_fused",
        lr_scheduler_type = "cosine",  # <- changed over time (constant for phase 1, cosine for phase 2)
        learning_rate = 5e-5,
        weight_decay = 0.10,  # 0.15 for base pretraining
        adam_beta1 = 0.88,    # 0.9 for base pretraining
        adam_beta2 = 0.99,    # 0.999 for base pretraining
    ),
)
```
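
The script assumes `model`, `tokenizer`, `max_seq_length`, and `train_dataset` are already defined. A minimal sketch of the model setup via unsloth (the checkpoint name and values are assumptions, not the actual ones used here):

```python
# Hypothetical setup sketch; model_name and values are placeholders.
from unsloth import FastLanguageModel

max_seq_length = 2048  # assumed; must match the value passed to SFTTrainer

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "path/to/merged-model",  # placeholder checkpoint
    max_seq_length = max_seq_length,
)
```

Per the phase notes above, the `lr_scheduler_type` line is what changed between runs: `"constant"` for phase 1, `"cosine"` for phase 2. Training curves are written to `logs/` and can be viewed with `tensorboard --logdir logs`.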