nassersala committed on May 24, 2024

Commit

78e88b5

verified ·

1 Parent(s): f2b6734

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +154 -3
adapter_config.json +34 -0
adapter_model.bin +3 -0
checkpoint-157/README.md +202 -0
checkpoint-157/adapter_config.json +34 -0
checkpoint-157/adapter_model.safetensors +3 -0
checkpoint-157/optimizer.pt +3 -0
checkpoint-157/rng_state.pth +3 -0
checkpoint-157/scheduler.pt +3 -0
checkpoint-157/special_tokens_map.json +24 -0
checkpoint-157/tokenizer.model +3 -0
checkpoint-157/tokenizer_config.json +43 -0
checkpoint-157/trainer_state.json +1152 -0
checkpoint-157/training_args.bin +3 -0
checkpoint-314/README.md +202 -0
checkpoint-314/adapter_config.json +34 -0
checkpoint-314/adapter_model.safetensors +3 -0
checkpoint-314/optimizer.pt +3 -0
checkpoint-314/rng_state.pth +3 -0
checkpoint-314/scheduler.pt +3 -0
checkpoint-314/special_tokens_map.json +24 -0
checkpoint-314/tokenizer.model +3 -0
checkpoint-314/tokenizer_config.json +43 -0
checkpoint-314/trainer_state.json +2283 -0
checkpoint-314/training_args.bin +3 -0
checkpoint-471/README.md +202 -0
checkpoint-471/adapter_config.json +34 -0
checkpoint-471/adapter_model.safetensors +3 -0
checkpoint-471/optimizer.pt +3 -0
checkpoint-471/rng_state.pth +3 -0
checkpoint-471/scheduler.pt +3 -0
checkpoint-471/special_tokens_map.json +24 -0
checkpoint-471/tokenizer.model +3 -0
checkpoint-471/tokenizer_config.json +43 -0
checkpoint-471/trainer_state.json +3414 -0
checkpoint-471/training_args.bin +3 -0
checkpoint-628/README.md +202 -0
checkpoint-628/adapter_config.json +34 -0
checkpoint-628/adapter_model.safetensors +3 -0
checkpoint-628/optimizer.pt +3 -0
checkpoint-628/rng_state.pth +3 -0
checkpoint-628/scheduler.pt +3 -0
checkpoint-628/special_tokens_map.json +24 -0
checkpoint-628/tokenizer.model +3 -0
checkpoint-628/tokenizer_config.json +43 -0
checkpoint-628/trainer_state.json +0 -0
checkpoint-628/training_args.bin +3 -0
config.json +44 -0
special_tokens_map.json +24 -0
tokenizer.model +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,154 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+library_name: peft
+tags:
+- generated_from_trainer
+base_model: openlm-research/open_llama_3b_v2
+model-index:
+- name: outputs/lora-out
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+<details><summary>See axolotl config</summary>
+axolotl version: `0.4.0`
+```yaml
+base_model: openlm-research/open_llama_3b_v2
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0.02
+adapter: lora
+lora_model_dir:
+sequence_len: 1024
+sample_packing: true
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_modules:
+  - gate_proj
+  - down_proj
+  - up_proj
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+lora_fan_in_fan_out:
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+output_dir: ./outputs/lora-out
+gradient_accumulation_steps: 1
+micro_batch_size: 64
+num_epochs: 4
+optimizer: adamw_bnb_8bit
+torchdistx_path:
+lr_scheduler: cosine
+learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: false
+bf16: false
+fp16: true
+tf32: false
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+gptq_groupsize:
+s2_attention:
+gptq_model_v1:
+warmup_steps: 20
+evals_per_epoch: 4
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.1
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
+```
+</details><br>
+# outputs/lora-out
+This model is a fine-tuned version of [openlm-research/open_llama_3b_v2](https://huggingface.co/openlm-research/open_llama_3b_v2) on the [teknium/GPT4-LLM-Cleaned](https://huggingface.co/datasets/teknium/GPT4-LLM-Cleaned) dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.9697
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 64
+- eval_batch_size: 64
+- seed: 42
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 20
+- num_epochs: 4
+- mixed_precision_training: Native AMP
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.3031        | 0.0064 | 1    | 1.5004          |
+| 1.1084        | 0.2548 | 40   | 1.1224          |
+| 1.0912        | 0.5096 | 80   | 1.0586          |
+| 1.0727        | 0.7643 | 120  | 1.0301          |
+| 1.0438        | 1.0191 | 160  | 1.0126          |
+| 1.0126        | 1.2484 | 200  | 1.0035          |
+| 1.048         | 1.5032 | 240  | 0.9938          |
+| 1.0839        | 1.7580 | 280  | 0.9859          |
+| 1.0817        | 2.0127 | 320  | 0.9801          |
+| 1.0115        | 2.2420 | 360  | 0.9788          |
+| 1.0356        | 2.4968 | 400  | 0.9730          |
+| 0.992         | 2.7516 | 440  | 0.9725          |
+| 1.0219        | 3.0064 | 480  | 0.9682          |
+| 0.9637        | 3.2357 | 520  | 0.9707          |
+| 1.0085        | 3.4904 | 560  | 0.9698          |
+| 0.9547        | 3.7452 | 600  | 0.9697          |
+### Framework versions
+- PEFT 0.10.0
+- Transformers 4.40.2
+- Pytorch 2.1.2+cu118
+- Datasets 2.19.1
+- Tokenizers 0.19.1

adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openlm-research/open_llama_3b_v2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7375d44b826facc7d77f64e65e52effd1b224300382d2c59f31036d16039eef9
+size 50982842

checkpoint-157/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: openlm-research/open_llama_3b_v2
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-157/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openlm-research/open_llama_3b_v2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-157/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e10f39be2721fc1d80a010387b31a6eaad25ab64292e1aa91ef7258e87fe99b
+size 50899792

checkpoint-157/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43f44d3fb92972c8902c0a8d8fc11c2fd1ad0f2f8a78309a25a2c042c68477da
+size 25871492

checkpoint-157/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b3ee827a7a00012c0a116546df467feee35e70376d81a7a85b1a70eb90414d3
+size 14244

checkpoint-157/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4cf2f5c5a3d7f63043a6897d6edde504f6e3996de0e776b0058eec472573140c
+size 1064

checkpoint-157/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-157/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8
+size 511574

checkpoint-157/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-157/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1152 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 40,
+  "global_step": 157,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006369426751592357,
+      "grad_norm": 0.1806156039237976,
+      "learning_rate": 1e-05,
+      "loss": 1.3031,
+      "step": 1
+    },
+    {
+      "epoch": 0.006369426751592357,
+      "eval_loss": 1.5003942251205444,
+      "eval_runtime": 19.6641,
+      "eval_samples_per_second": 55.533,
+      "eval_steps_per_second": 0.915,
+      "step": 1
+    },
+    {
+      "epoch": 0.012738853503184714,
+      "grad_norm": 0.1688886284828186,
+      "learning_rate": 2e-05,
+      "loss": 1.3305,
+      "step": 2
+    },
+    {
+      "epoch": 0.01910828025477707,
+      "grad_norm": 0.20123907923698425,
+      "learning_rate": 3e-05,
+      "loss": 1.324,
+      "step": 3
+    },
+    {
+      "epoch": 0.025477707006369428,
+      "grad_norm": 0.18879620730876923,
+      "learning_rate": 4e-05,
+      "loss": 1.3638,
+      "step": 4
+    },
+    {
+      "epoch": 0.03184713375796178,
+      "grad_norm": 0.20348915457725525,
+      "learning_rate": 5e-05,
+      "loss": 1.3686,
+      "step": 5
+    },
+    {
+      "epoch": 0.03821656050955414,
+      "grad_norm": 0.212239071726799,
+      "learning_rate": 6e-05,
+      "loss": 1.2865,
+      "step": 6
+    },
+    {
+      "epoch": 0.044585987261146494,
+      "grad_norm": 0.19280897080898285,
+      "learning_rate": 7e-05,
+      "loss": 1.313,
+      "step": 7
+    },
+    {
+      "epoch": 0.050955414012738856,
+      "grad_norm": 0.1767151653766632,
+      "learning_rate": 8e-05,
+      "loss": 1.3207,
+      "step": 8
+    },
+    {
+      "epoch": 0.05732484076433121,
+      "grad_norm": 0.20014327764511108,
+      "learning_rate": 9e-05,
+      "loss": 1.3143,
+      "step": 9
+    },
+    {
+      "epoch": 0.06369426751592357,
+      "grad_norm": 0.18035855889320374,
+      "learning_rate": 0.0001,
+      "loss": 1.252,
+      "step": 10
+    },
+    {
+      "epoch": 0.07006369426751592,
+      "grad_norm": 0.19993054866790771,
+      "learning_rate": 0.00011000000000000002,
+      "loss": 1.302,
+      "step": 11
+    },
+    {
+      "epoch": 0.07643312101910828,
+      "grad_norm": 0.18973341584205627,
+      "learning_rate": 0.00012,
+      "loss": 1.2608,
+      "step": 12
+    },
+    {
+      "epoch": 0.08280254777070063,
+      "grad_norm": 0.19669465720653534,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 1.2329,
+      "step": 13
+    },
+    {
+      "epoch": 0.08917197452229299,
+      "grad_norm": 0.1886417716741562,
+      "learning_rate": 0.00014,
+      "loss": 1.241,
+      "step": 14
+    },
+    {
+      "epoch": 0.09554140127388536,
+      "grad_norm": 0.19076582789421082,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.2539,
+      "step": 15
+    },
+    {
+      "epoch": 0.10191082802547771,
+      "grad_norm": 0.16027267277240753,
+      "learning_rate": 0.00016,
+      "loss": 1.2123,
+      "step": 16
+    },
+    {
+      "epoch": 0.10828025477707007,
+      "grad_norm": 0.16112814843654633,
+      "learning_rate": 0.00017,
+      "loss": 1.2465,
+      "step": 17
+    },
+    {
+      "epoch": 0.11464968152866242,
+      "grad_norm": 0.15539830923080444,
+      "learning_rate": 0.00018,
+      "loss": 1.1717,
+      "step": 18
+    },
+    {
+      "epoch": 0.12101910828025478,
+      "grad_norm": 0.15739695727825165,
+      "learning_rate": 0.00019,
+      "loss": 1.1412,
+      "step": 19
+    },
+    {
+      "epoch": 0.12738853503184713,
+      "grad_norm": 0.15658576786518097,
+      "learning_rate": 0.0002,
+      "loss": 1.1731,
+      "step": 20
+    },
+    {
+      "epoch": 0.1337579617834395,
+      "grad_norm": 0.1474328637123108,
+      "learning_rate": 0.00019999866506037345,
+      "loss": 1.2051,
+      "step": 21
+    },
+    {
+      "epoch": 0.14012738853503184,
+      "grad_norm": 0.11234907805919647,
+      "learning_rate": 0.00019999466027713507,
+      "loss": 1.1803,
+      "step": 22
+    },
+    {
+      "epoch": 0.1464968152866242,
+      "grad_norm": 0.1053839772939682,
+      "learning_rate": 0.00019998798575720776,
+      "loss": 1.1436,
+      "step": 23
+    },
+    {
+      "epoch": 0.15286624203821655,
+      "grad_norm": 0.1049942821264267,
+      "learning_rate": 0.00019997864167879312,
+      "loss": 1.1881,
+      "step": 24
+    },
+    {
+      "epoch": 0.1592356687898089,
+      "grad_norm": 0.11039146035909653,
+      "learning_rate": 0.00019996662829136676,
+      "loss": 1.1528,
+      "step": 25
+    },
+    {
+      "epoch": 0.16560509554140126,
+      "grad_norm": 0.09678228944540024,
+      "learning_rate": 0.0001999519459156716,
+      "loss": 1.1496,
+      "step": 26
+    },
+    {
+      "epoch": 0.17197452229299362,
+      "grad_norm": 0.09857058525085449,
+      "learning_rate": 0.0001999345949437094,
+      "loss": 1.1304,
+      "step": 27
+    },
+    {
+      "epoch": 0.17834394904458598,
+      "grad_norm": 0.10835567116737366,
+      "learning_rate": 0.0001999145758387301,
+      "loss": 1.2262,
+      "step": 28
+    },
+    {
+      "epoch": 0.18471337579617833,
+      "grad_norm": 0.09927600622177124,
+      "learning_rate": 0.0001998918891352197,
+      "loss": 1.1382,
+      "step": 29
+    },
+    {
+      "epoch": 0.1910828025477707,
+      "grad_norm": 0.09861327707767487,
+      "learning_rate": 0.00019986653543888568,
+      "loss": 1.1987,
+      "step": 30
+    },
+    {
+      "epoch": 0.19745222929936307,
+      "grad_norm": 0.09174010157585144,
+      "learning_rate": 0.00019983851542664126,
+      "loss": 1.127,
+      "step": 31
+    },
+    {
+      "epoch": 0.20382165605095542,
+      "grad_norm": 0.08863182365894318,
+      "learning_rate": 0.00019980782984658683,
+      "loss": 1.211,
+      "step": 32
+    },
+    {
+      "epoch": 0.21019108280254778,
+      "grad_norm": 0.08810263872146606,
+      "learning_rate": 0.00019977447951799034,
+      "loss": 1.1476,
+      "step": 33
+    },
+    {
+      "epoch": 0.21656050955414013,
+      "grad_norm": 0.08641776442527771,
+      "learning_rate": 0.00019973846533126533,
+      "loss": 1.1497,
+      "step": 34
+    },
+    {
+      "epoch": 0.2229299363057325,
+      "grad_norm": 0.09637051075696945,
+      "learning_rate": 0.00019969978824794707,
+      "loss": 1.1471,
+      "step": 35
+    },
+    {
+      "epoch": 0.22929936305732485,
+      "grad_norm": 0.09402573108673096,
+      "learning_rate": 0.000199658449300667,
+      "loss": 1.0976,
+      "step": 36
+    },
+    {
+      "epoch": 0.2356687898089172,
+      "grad_norm": 0.09077832847833633,
+      "learning_rate": 0.00019961444959312508,
+      "loss": 1.1119,
+      "step": 37
+    },
+    {
+      "epoch": 0.24203821656050956,
+      "grad_norm": 0.08864310383796692,
+      "learning_rate": 0.0001995677903000604,
+      "loss": 1.1157,
+      "step": 38
+    },
+    {
+      "epoch": 0.2484076433121019,
+      "grad_norm": 0.09867957979440689,
+      "learning_rate": 0.0001995184726672197,
+      "loss": 1.1656,
+      "step": 39
+    },
+    {
+      "epoch": 0.25477707006369427,
+      "grad_norm": 0.09343115240335464,
+      "learning_rate": 0.00019946649801132427,
+      "loss": 1.1084,
+      "step": 40
+    },
+    {
+      "epoch": 0.25477707006369427,
+      "eval_loss": 1.1224156618118286,
+      "eval_runtime": 19.2915,
+      "eval_samples_per_second": 56.605,
+      "eval_steps_per_second": 0.933,
+      "step": 40
+    },
+    {
+      "epoch": 0.2611464968152866,
+      "grad_norm": 0.09474795311689377,
+      "learning_rate": 0.00019941186772003464,
+      "loss": 1.1486,
+      "step": 41
+    },
+    {
+      "epoch": 0.267515923566879,
+      "grad_norm": 0.09726471453905106,
+      "learning_rate": 0.00019935458325191365,
+      "loss": 1.1499,
+      "step": 42
+    },
+    {
+      "epoch": 0.27388535031847133,
+      "grad_norm": 0.09273070096969604,
+      "learning_rate": 0.0001992946461363874,
+      "loss": 1.1361,
+      "step": 43
+    },
+    {
+      "epoch": 0.2802547770700637,
+      "grad_norm": 0.10344096273183823,
+      "learning_rate": 0.0001992320579737045,
+      "loss": 1.0999,
+      "step": 44
+    },
+    {
+      "epoch": 0.28662420382165604,
+      "grad_norm": 0.09499648213386536,
+      "learning_rate": 0.00019916682043489336,
+      "loss": 1.0919,
+      "step": 45
+    },
+    {
+      "epoch": 0.2929936305732484,
+      "grad_norm": 0.09483088552951813,
+      "learning_rate": 0.00019909893526171745,
+      "loss": 1.0992,
+      "step": 46
+    },
+    {
+      "epoch": 0.29936305732484075,
+      "grad_norm": 0.10382100939750671,
+      "learning_rate": 0.00019902840426662895,
+      "loss": 1.1093,
+      "step": 47
+    },
+    {
+      "epoch": 0.3057324840764331,
+      "grad_norm": 0.10187891870737076,
+      "learning_rate": 0.00019895522933272028,
+      "loss": 1.1063,
+      "step": 48
+    },
+    {
+      "epoch": 0.31210191082802546,
+      "grad_norm": 0.1022520437836647,
+      "learning_rate": 0.00019887941241367377,
+      "loss": 1.1095,
+      "step": 49
+    },
+    {
+      "epoch": 0.3184713375796178,
+      "grad_norm": 0.11470162868499756,
+      "learning_rate": 0.00019880095553370967,
+      "loss": 1.0859,
+      "step": 50
+    },
+    {
+      "epoch": 0.3248407643312102,
+      "grad_norm": 0.09845008701086044,
+      "learning_rate": 0.0001987198607875319,
+      "loss": 1.0941,
+      "step": 51
+    },
+    {
+      "epoch": 0.33121019108280253,
+      "grad_norm": 0.1080709770321846,
+      "learning_rate": 0.00019863613034027224,
+      "loss": 1.084,
+      "step": 52
+    },
+    {
+      "epoch": 0.3375796178343949,
+      "grad_norm": 0.11064234375953674,
+      "learning_rate": 0.0001985497664274326,
+      "loss": 1.1018,
+      "step": 53
+    },
+    {
+      "epoch": 0.34394904458598724,
+      "grad_norm": 0.10099776834249496,
+      "learning_rate": 0.0001984607713548251,
+      "loss": 1.0881,
+      "step": 54
+    },
+    {
+      "epoch": 0.3503184713375796,
+      "grad_norm": 0.11960357427597046,
+      "learning_rate": 0.0001983691474985108,
+      "loss": 1.0845,
+      "step": 55
+    },
+    {
+      "epoch": 0.35668789808917195,
+      "grad_norm": 0.10840114951133728,
+      "learning_rate": 0.00019827489730473596,
+      "loss": 1.131,
+      "step": 56
+    },
+    {
+      "epoch": 0.3630573248407643,
+      "grad_norm": 0.10177604109048843,
+      "learning_rate": 0.00019817802328986697,
+      "loss": 1.079,
+      "step": 57
+    },
+    {
+      "epoch": 0.36942675159235666,
+      "grad_norm": 0.11752859503030777,
+      "learning_rate": 0.00019807852804032305,
+      "loss": 1.0833,
+      "step": 58
+    },
+    {
+      "epoch": 0.37579617834394907,
+      "grad_norm": 0.11149834841489792,
+      "learning_rate": 0.00019797641421250725,
+      "loss": 1.1009,
+      "step": 59
+    },
+    {
+      "epoch": 0.3821656050955414,
+      "grad_norm": 0.10446681827306747,
+      "learning_rate": 0.00019787168453273544,
+      "loss": 1.1211,
+      "step": 60
+    },
+    {
+      "epoch": 0.3885350318471338,
+      "grad_norm": 0.12820479273796082,
+      "learning_rate": 0.00019776434179716366,
+      "loss": 1.1455,
+      "step": 61
+    },
+    {
+      "epoch": 0.39490445859872614,
+      "grad_norm": 0.10011500865221024,
+      "learning_rate": 0.00019765438887171327,
+      "loss": 1.0779,
+      "step": 62
+    },
+    {
+      "epoch": 0.4012738853503185,
+      "grad_norm": 0.11496227979660034,
+      "learning_rate": 0.0001975418286919947,
+      "loss": 1.1174,
+      "step": 63
+    },
+    {
+      "epoch": 0.40764331210191085,
+      "grad_norm": 0.10938404500484467,
+      "learning_rate": 0.00019742666426322876,
+      "loss": 1.0576,
+      "step": 64
+    },
+    {
+      "epoch": 0.4140127388535032,
+      "grad_norm": 0.12636032700538635,
+      "learning_rate": 0.0001973088986601667,
+      "loss": 1.083,
+      "step": 65
+    },
+    {
+      "epoch": 0.42038216560509556,
+      "grad_norm": 0.10620423406362534,
+      "learning_rate": 0.00019718853502700783,
+      "loss": 1.0728,
+      "step": 66
+    },
+    {
+      "epoch": 0.4267515923566879,
+      "grad_norm": 0.11206210404634476,
+      "learning_rate": 0.0001970655765773159,
+      "loss": 1.1107,
+      "step": 67
+    },
+    {
+      "epoch": 0.43312101910828027,
+      "grad_norm": 0.12613879144191742,
+      "learning_rate": 0.00019694002659393305,
+      "loss": 1.1065,
+      "step": 68
+    },
+    {
+      "epoch": 0.4394904458598726,
+      "grad_norm": 0.10636976361274719,
+      "learning_rate": 0.00019681188842889222,
+      "loss": 1.1192,
+      "step": 69
+    },
+    {
+      "epoch": 0.445859872611465,
+      "grad_norm": 0.11036239564418793,
+      "learning_rate": 0.00019668116550332766,
+      "loss": 1.1362,
+      "step": 70
+    },
+    {
+      "epoch": 0.45222929936305734,
+      "grad_norm": 0.11907072365283966,
+      "learning_rate": 0.0001965478613073837,
+      "loss": 1.1009,
+      "step": 71
+    },
+    {
+      "epoch": 0.4585987261146497,
+      "grad_norm": 0.11267364770174026,
+      "learning_rate": 0.00019641197940012137,
+      "loss": 1.0694,
+      "step": 72
+    },
+    {
+      "epoch": 0.46496815286624205,
+      "grad_norm": 0.10659351199865341,
+      "learning_rate": 0.00019627352340942353,
+      "loss": 1.0844,
+      "step": 73
+    },
+    {
+      "epoch": 0.4713375796178344,
+      "grad_norm": 0.12426211684942245,
+      "learning_rate": 0.00019613249703189796,
+      "loss": 1.1203,
+      "step": 74
+    },
+    {
+      "epoch": 0.47770700636942676,
+      "grad_norm": 0.11883872747421265,
+      "learning_rate": 0.00019598890403277864,
+      "loss": 1.0879,
+      "step": 75
+    },
+    {
+      "epoch": 0.4840764331210191,
+      "grad_norm": 0.11355262994766235,
+      "learning_rate": 0.0001958427482458253,
+      "loss": 1.1045,
+      "step": 76
+    },
+    {
+      "epoch": 0.49044585987261147,
+      "grad_norm": 0.11006154865026474,
+      "learning_rate": 0.0001956940335732209,
+      "loss": 1.1058,
+      "step": 77
+    },
+    {
+      "epoch": 0.4968152866242038,
+      "grad_norm": 0.11379122734069824,
+      "learning_rate": 0.00019554276398546768,
+      "loss": 1.1224,
+      "step": 78
+    },
+    {
+      "epoch": 0.5031847133757962,
+      "grad_norm": 0.11065732687711716,
+      "learning_rate": 0.000195388943521281,
+      "loss": 1.1033,
+      "step": 79
+    },
+    {
+      "epoch": 0.5095541401273885,
+      "grad_norm": 0.11113402247428894,
+      "learning_rate": 0.00019523257628748146,
+      "loss": 1.0912,
+      "step": 80
+    },
+    {
+      "epoch": 0.5095541401273885,
+      "eval_loss": 1.0586377382278442,
+      "eval_runtime": 19.2899,
+      "eval_samples_per_second": 56.61,
+      "eval_steps_per_second": 0.933,
+      "step": 80
+    },
+    {
+      "epoch": 0.5159235668789809,
+      "grad_norm": 0.11783529818058014,
+      "learning_rate": 0.00019507366645888543,
+      "loss": 1.0938,
+      "step": 81
+    },
+    {
+      "epoch": 0.5222929936305732,
+      "grad_norm": 0.12089723348617554,
+      "learning_rate": 0.00019491221827819347,
+      "loss": 1.1068,
+      "step": 82
+    },
+    {
+      "epoch": 0.5286624203821656,
+      "grad_norm": 0.10991813987493515,
+      "learning_rate": 0.00019474823605587703,
+      "loss": 1.1393,
+      "step": 83
+    },
+    {
+      "epoch": 0.535031847133758,
+      "grad_norm": 0.11100416630506516,
+      "learning_rate": 0.00019458172417006347,
+      "loss": 1.1081,
+      "step": 84
+    },
+    {
+      "epoch": 0.5414012738853503,
+      "grad_norm": 0.11886284500360489,
+      "learning_rate": 0.00019441268706641907,
+      "loss": 1.1168,
+      "step": 85
+    },
+    {
+      "epoch": 0.5477707006369427,
+      "grad_norm": 0.11771067976951599,
+      "learning_rate": 0.00019424112925803039,
+      "loss": 1.098,
+      "step": 86
+    },
+    {
+      "epoch": 0.554140127388535,
+      "grad_norm": 0.11022554337978363,
+      "learning_rate": 0.00019406705532528374,
+      "loss": 1.1179,
+      "step": 87
+    },
+    {
+      "epoch": 0.5605095541401274,
+      "grad_norm": 0.11891311407089233,
+      "learning_rate": 0.00019389046991574298,
+      "loss": 1.0866,
+      "step": 88
+    },
+    {
+      "epoch": 0.5668789808917197,
+      "grad_norm": 0.11594802141189575,
+      "learning_rate": 0.00019371137774402527,
+      "loss": 1.1146,
+      "step": 89
+    },
+    {
+      "epoch": 0.5732484076433121,
+      "grad_norm": 0.1181577518582344,
+      "learning_rate": 0.0001935297835916754,
+      "loss": 1.1213,
+      "step": 90
+    },
+    {
+      "epoch": 0.5796178343949044,
+      "grad_norm": 0.10821503400802612,
+      "learning_rate": 0.00019334569230703794,
+      "loss": 1.1121,
+      "step": 91
+    },
+    {
+      "epoch": 0.5859872611464968,
+      "grad_norm": 0.118013896048069,
+      "learning_rate": 0.0001931591088051279,
+      "loss": 1.117,
+      "step": 92
+    },
+    {
+      "epoch": 0.5923566878980892,
+      "grad_norm": 0.11678043752908707,
+      "learning_rate": 0.0001929700380674995,
+      "loss": 1.0974,
+      "step": 93
+    },
+    {
+      "epoch": 0.5987261146496815,
+      "grad_norm": 0.11073200404644012,
+      "learning_rate": 0.00019277848514211317,
+      "loss": 1.1059,
+      "step": 94
+    },
+    {
+      "epoch": 0.6050955414012739,
+      "grad_norm": 0.11440474539995193,
+      "learning_rate": 0.00019258445514320065,
+      "loss": 1.0913,
+      "step": 95
+    },
+    {
+      "epoch": 0.6114649681528662,
+      "grad_norm": 0.11020273715257645,
+      "learning_rate": 0.0001923879532511287,
+      "loss": 1.0836,
+      "step": 96
+    },
+    {
+      "epoch": 0.6178343949044586,
+      "grad_norm": 0.11285867542028427,
+      "learning_rate": 0.0001921889847122605,
+      "loss": 1.0842,
+      "step": 97
+    },
+    {
+      "epoch": 0.6242038216560509,
+      "grad_norm": 0.11981746554374695,
+      "learning_rate": 0.00019198755483881583,
+      "loss": 1.1062,
+      "step": 98
+    },
+    {
+      "epoch": 0.6305732484076433,
+      "grad_norm": 0.11882256716489792,
+      "learning_rate": 0.0001917836690087291,
+      "loss": 1.1012,
+      "step": 99
+    },
+    {
+      "epoch": 0.6369426751592356,
+      "grad_norm": 0.11642686277627945,
+      "learning_rate": 0.00019157733266550575,
+      "loss": 1.0823,
+      "step": 100
+    },
+    {
+      "epoch": 0.643312101910828,
+      "grad_norm": 0.11980683356523514,
+      "learning_rate": 0.00019136855131807705,
+      "loss": 1.105,
+      "step": 101
+    },
+    {
+      "epoch": 0.6496815286624203,
+      "grad_norm": 0.1147085651755333,
+      "learning_rate": 0.0001911573305406528,
+      "loss": 1.0794,
+      "step": 102
+    },
+    {
+      "epoch": 0.6560509554140127,
+      "grad_norm": 0.12037765234708786,
+      "learning_rate": 0.00019094367597257282,
+      "loss": 1.1059,
+      "step": 103
+    },
+    {
+      "epoch": 0.6624203821656051,
+      "grad_norm": 0.12135636061429977,
+      "learning_rate": 0.000190727593318156,
+      "loss": 1.118,
+      "step": 104
+    },
+    {
+      "epoch": 0.6687898089171974,
+      "grad_norm": 0.13285911083221436,
+      "learning_rate": 0.00019050908834654834,
+      "loss": 1.0817,
+      "step": 105
+    },
+    {
+      "epoch": 0.6751592356687898,
+      "grad_norm": 0.11360063403844833,
+      "learning_rate": 0.00019028816689156878,
+      "loss": 1.0711,
+      "step": 106
+    },
+    {
+      "epoch": 0.6815286624203821,
+      "grad_norm": 0.13178926706314087,
+      "learning_rate": 0.00019006483485155338,
+      "loss": 1.1266,
+      "step": 107
+    },
+    {
+      "epoch": 0.6878980891719745,
+      "grad_norm": 0.1290571093559265,
+      "learning_rate": 0.0001898390981891979,
+      "loss": 1.0776,
+      "step": 108
+    },
+    {
+      "epoch": 0.6942675159235668,
+      "grad_norm": 0.11376259475946426,
+      "learning_rate": 0.0001896109629313987,
+      "loss": 1.1026,
+      "step": 109
+    },
+    {
+      "epoch": 0.7006369426751592,
+      "grad_norm": 0.12076874077320099,
+      "learning_rate": 0.0001893804351690917,
+      "loss": 1.104,
+      "step": 110
+    },
+    {
+      "epoch": 0.7070063694267515,
+      "grad_norm": 0.12165362387895584,
+      "learning_rate": 0.0001891475210570898,
+      "loss": 1.0884,
+      "step": 111
+    },
+    {
+      "epoch": 0.7133757961783439,
+      "grad_norm": 0.10634943842887878,
+      "learning_rate": 0.00018891222681391851,
+      "loss": 1.0844,
+      "step": 112
+    },
+    {
+      "epoch": 0.7197452229299363,
+      "grad_norm": 0.11928383260965347,
+      "learning_rate": 0.00018867455872165008,
+      "loss": 1.1205,
+      "step": 113
+    },
+    {
+      "epoch": 0.7261146496815286,
+      "grad_norm": 0.1243489533662796,
+      "learning_rate": 0.00018843452312573554,
+      "loss": 1.0704,
+      "step": 114
+    },
+    {
+      "epoch": 0.732484076433121,
+      "grad_norm": 0.11439479887485504,
+      "learning_rate": 0.0001881921264348355,
+      "loss": 1.0809,
+      "step": 115
+    },
+    {
+      "epoch": 0.7388535031847133,
+      "grad_norm": 0.1184995099902153,
+      "learning_rate": 0.0001879473751206489,
+      "loss": 1.1619,
+      "step": 116
+    },
+    {
+      "epoch": 0.7452229299363057,
+      "grad_norm": 0.11846223473548889,
+      "learning_rate": 0.00018770027571774031,
+      "loss": 1.0835,
+      "step": 117
+    },
+    {
+      "epoch": 0.7515923566878981,
+      "grad_norm": 0.11566226184368134,
+      "learning_rate": 0.00018745083482336544,
+      "loss": 1.0658,
+      "step": 118
+    },
+    {
+      "epoch": 0.7579617834394905,
+      "grad_norm": 0.11553015559911728,
+      "learning_rate": 0.00018719905909729494,
+      "loss": 1.0773,
+      "step": 119
+    },
+    {
+      "epoch": 0.7643312101910829,
+      "grad_norm": 0.13605500757694244,
+      "learning_rate": 0.0001869449552616367,
+      "loss": 1.0727,
+      "step": 120
+    },
+    {
+      "epoch": 0.7643312101910829,
+      "eval_loss": 1.0301120281219482,
+      "eval_runtime": 19.2781,
+      "eval_samples_per_second": 56.645,
+      "eval_steps_per_second": 0.934,
+      "step": 120
+    },
+    {
+      "epoch": 0.7707006369426752,
+      "grad_norm": 0.1149601861834526,
+      "learning_rate": 0.00018668853010065634,
+      "loss": 1.0745,
+      "step": 121
+    },
+    {
+      "epoch": 0.7770700636942676,
+      "grad_norm": 0.11904130131006241,
+      "learning_rate": 0.00018642979046059593,
+      "loss": 1.0574,
+      "step": 122
+    },
+    {
+      "epoch": 0.7834394904458599,
+      "grad_norm": 0.11868870258331299,
+      "learning_rate": 0.00018616874324949159,
+      "loss": 1.0681,
+      "step": 123
+    },
+    {
+      "epoch": 0.7898089171974523,
+      "grad_norm": 0.11400648951530457,
+      "learning_rate": 0.00018590539543698854,
+      "loss": 1.0874,
+      "step": 124
+    },
+    {
+      "epoch": 0.7961783439490446,
+      "grad_norm": 0.12247481942176819,
+      "learning_rate": 0.0001856397540541554,
+      "loss": 1.0832,
+      "step": 125
+    },
+    {
+      "epoch": 0.802547770700637,
+      "grad_norm": 0.11855783313512802,
+      "learning_rate": 0.0001853718261932964,
+      "loss": 1.0775,
+      "step": 126
+    },
+    {
+      "epoch": 0.8089171974522293,
+      "grad_norm": 0.11434577405452728,
+      "learning_rate": 0.00018510161900776187,
+      "loss": 1.048,
+      "step": 127
+    },
+    {
+      "epoch": 0.8152866242038217,
+      "grad_norm": 0.12175115942955017,
+      "learning_rate": 0.00018482913971175737,
+      "loss": 1.0776,
+      "step": 128
+    },
+    {
+      "epoch": 0.821656050955414,
+      "grad_norm": 0.1237318217754364,
+      "learning_rate": 0.00018455439558015115,
+      "loss": 1.0977,
+      "step": 129
+    },
+    {
+      "epoch": 0.8280254777070064,
+      "grad_norm": 0.12041562050580978,
+      "learning_rate": 0.00018427739394827973,
+      "loss": 1.0477,
+      "step": 130
+    },
+    {
+      "epoch": 0.8343949044585988,
+      "grad_norm": 0.11855332553386688,
+      "learning_rate": 0.00018399814221175227,
+      "loss": 1.1026,
+      "step": 131
+    },
+    {
+      "epoch": 0.8407643312101911,
+      "grad_norm": 0.12020997703075409,
+      "learning_rate": 0.00018371664782625287,
+      "loss": 1.0484,
+      "step": 132
+    },
+    {
+      "epoch": 0.8471337579617835,
+      "grad_norm": 0.1116231232881546,
+      "learning_rate": 0.00018343291830734176,
+      "loss": 1.0772,
+      "step": 133
+    },
+    {
+      "epoch": 0.8535031847133758,
+      "grad_norm": 0.12280379235744476,
+      "learning_rate": 0.00018314696123025454,
+      "loss": 1.0829,
+      "step": 134
+    },
+    {
+      "epoch": 0.8598726114649682,
+      "grad_norm": 0.11589805781841278,
+      "learning_rate": 0.00018285878422969983,
+      "loss": 1.0636,
+      "step": 135
+    },
+    {
+      "epoch": 0.8662420382165605,
+      "grad_norm": 0.11667989194393158,
+      "learning_rate": 0.0001825683949996556,
+      "loss": 1.0783,
+      "step": 136
+    },
+    {
+      "epoch": 0.8726114649681529,
+      "grad_norm": 0.11666262894868851,
+      "learning_rate": 0.00018227580129316366,
+      "loss": 1.0587,
+      "step": 137
+    },
+    {
+      "epoch": 0.8789808917197452,
+      "grad_norm": 0.11791834235191345,
+      "learning_rate": 0.00018198101092212267,
+      "loss": 1.0955,
+      "step": 138
+    },
+    {
+      "epoch": 0.8853503184713376,
+      "grad_norm": 0.12023093551397324,
+      "learning_rate": 0.00018168403175707954,
+      "loss": 1.1133,
+      "step": 139
+    },
+    {
+      "epoch": 0.89171974522293,
+      "grad_norm": 0.12082846462726593,
+      "learning_rate": 0.0001813848717270195,
+      "loss": 1.1083,
+      "step": 140
+    },
+    {
+      "epoch": 0.8980891719745223,
+      "grad_norm": 0.1259888857603073,
+      "learning_rate": 0.00018108353881915402,
+      "loss": 1.0931,
+      "step": 141
+    },
+    {
+      "epoch": 0.9044585987261147,
+      "grad_norm": 0.11900565028190613,
+      "learning_rate": 0.00018078004107870797,
+      "loss": 1.0955,
+      "step": 142
+    },
+    {
+      "epoch": 0.910828025477707,
+      "grad_norm": 0.11422552168369293,
+      "learning_rate": 0.00018047438660870446,
+      "loss": 1.0473,
+      "step": 143
+    },
+    {
+      "epoch": 0.9171974522292994,
+      "grad_norm": 0.13001863658428192,
+      "learning_rate": 0.00018016658356974884,
+      "loss": 1.0273,
+      "step": 144
+    },
+    {
+      "epoch": 0.9235668789808917,
+      "grad_norm": 0.11941977590322495,
+      "learning_rate": 0.0001798566401798106,
+      "loss": 1.0774,
+      "step": 145
+    },
+    {
+      "epoch": 0.9299363057324841,
+      "grad_norm": 0.12032714486122131,
+      "learning_rate": 0.00017954456471400393,
+      "loss": 1.1162,
+      "step": 146
+    },
+    {
+      "epoch": 0.9363057324840764,
+      "grad_norm": 0.13784518837928772,
+      "learning_rate": 0.00017923036550436704,
+      "loss": 1.095,
+      "step": 147
+    },
+    {
+      "epoch": 0.9426751592356688,
+      "grad_norm": 0.12085068970918655,
+      "learning_rate": 0.00017891405093963938,
+      "loss": 1.1024,
+      "step": 148
+    },
+    {
+      "epoch": 0.9490445859872612,
+      "grad_norm": 0.11120469868183136,
+      "learning_rate": 0.00017859562946503788,
+      "loss": 1.0502,
+      "step": 149
+    },
+    {
+      "epoch": 0.9554140127388535,
+      "grad_norm": 0.1275676190853119,
+      "learning_rate": 0.00017827510958203147,
+      "loss": 1.0875,
+      "step": 150
+    },
+    {
+      "epoch": 0.9617834394904459,
+      "grad_norm": 0.13544359803199768,
+      "learning_rate": 0.00017795249984811396,
+      "loss": 1.0985,
+      "step": 151
+    },
+    {
+      "epoch": 0.9681528662420382,
+      "grad_norm": 0.11840228736400604,
+      "learning_rate": 0.00017762780887657574,
+      "loss": 1.059,
+      "step": 152
+    },
+    {
+      "epoch": 0.9745222929936306,
+      "grad_norm": 0.12622268497943878,
+      "learning_rate": 0.0001773010453362737,
+      "loss": 1.1034,
+      "step": 153
+    },
+    {
+      "epoch": 0.9808917197452229,
+      "grad_norm": 0.11485569179058075,
+      "learning_rate": 0.0001769722179513998,
+      "loss": 1.0639,
+      "step": 154
+    },
+    {
+      "epoch": 0.9872611464968153,
+      "grad_norm": 0.11948831379413605,
+      "learning_rate": 0.00017664133550124815,
+      "loss": 1.0635,
+      "step": 155
+    },
+    {
+      "epoch": 0.9936305732484076,
+      "grad_norm": 0.1214427575469017,
+      "learning_rate": 0.00017630840681998066,
+      "loss": 1.1361,
+      "step": 156
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.11713624000549316,
+      "learning_rate": 0.00017597344079639112,
+      "loss": 1.0619,
+      "step": 157
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 628,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 157,
+  "total_flos": 2.0567076783046656e+17,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-157/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3ec2d42f7297673d946070db22cc38c40ecdb7e5fb5b23a335c46b1268e0b80
+size 5816

checkpoint-314/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: openlm-research/open_llama_3b_v2
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-314/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openlm-research/open_llama_3b_v2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-314/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e837612b8ff249bddd7090a09a1b5fa2a0d26b9d9c7ee29bc8bf2bafbe8dafa
+size 50899792

checkpoint-314/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:47125b7194d4df4a4d8b0b29e287cb06fc14d2732945312321fba445b9559170
+size 25871876

checkpoint-314/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d245e05e72192c132e0f2edb6fdcae0c578c890f0fe912f17ec7b0bba2d38cc3
+size 14244

checkpoint-314/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b3abaf7b8569e1dde3882a96a36e4c25622fc823315cf4151ad6fe86c0a20fd
+size 1064

checkpoint-314/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-314/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8
+size 511574

checkpoint-314/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-314/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2283 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9745222929936306,
+  "eval_steps": 40,
+  "global_step": 314,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006369426751592357,
+      "grad_norm": 0.1806156039237976,
+      "learning_rate": 1e-05,
+      "loss": 1.3031,
+      "step": 1
+    },
+    {
+      "epoch": 0.006369426751592357,
+      "eval_loss": 1.5003942251205444,
+      "eval_runtime": 19.6641,
+      "eval_samples_per_second": 55.533,
+      "eval_steps_per_second": 0.915,
+      "step": 1
+    },
+    {
+      "epoch": 0.012738853503184714,
+      "grad_norm": 0.1688886284828186,
+      "learning_rate": 2e-05,
+      "loss": 1.3305,
+      "step": 2
+    },
+    {
+      "epoch": 0.01910828025477707,
+      "grad_norm": 0.20123907923698425,
+      "learning_rate": 3e-05,
+      "loss": 1.324,
+      "step": 3
+    },
+    {
+      "epoch": 0.025477707006369428,
+      "grad_norm": 0.18879620730876923,
+      "learning_rate": 4e-05,
+      "loss": 1.3638,
+      "step": 4
+    },
+    {
+      "epoch": 0.03184713375796178,
+      "grad_norm": 0.20348915457725525,
+      "learning_rate": 5e-05,
+      "loss": 1.3686,
+      "step": 5
+    },
+    {
+      "epoch": 0.03821656050955414,
+      "grad_norm": 0.212239071726799,
+      "learning_rate": 6e-05,
+      "loss": 1.2865,
+      "step": 6
+    },
+    {
+      "epoch": 0.044585987261146494,
+      "grad_norm": 0.19280897080898285,
+      "learning_rate": 7e-05,
+      "loss": 1.313,
+      "step": 7
+    },
+    {
+      "epoch": 0.050955414012738856,
+      "grad_norm": 0.1767151653766632,
+      "learning_rate": 8e-05,
+      "loss": 1.3207,
+      "step": 8
+    },
+    {
+      "epoch": 0.05732484076433121,
+      "grad_norm": 0.20014327764511108,
+      "learning_rate": 9e-05,
+      "loss": 1.3143,
+      "step": 9
+    },
+    {
+      "epoch": 0.06369426751592357,
+      "grad_norm": 0.18035855889320374,
+      "learning_rate": 0.0001,
+      "loss": 1.252,
+      "step": 10
+    },
+    {
+      "epoch": 0.07006369426751592,
+      "grad_norm": 0.19993054866790771,
+      "learning_rate": 0.00011000000000000002,
+      "loss": 1.302,
+      "step": 11
+    },
+    {
+      "epoch": 0.07643312101910828,
+      "grad_norm": 0.18973341584205627,
+      "learning_rate": 0.00012,
+      "loss": 1.2608,
+      "step": 12
+    },
+    {
+      "epoch": 0.08280254777070063,
+      "grad_norm": 0.19669465720653534,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 1.2329,
+      "step": 13
+    },
+    {
+      "epoch": 0.08917197452229299,
+      "grad_norm": 0.1886417716741562,
+      "learning_rate": 0.00014,
+      "loss": 1.241,
+      "step": 14
+    },
+    {
+      "epoch": 0.09554140127388536,
+      "grad_norm": 0.19076582789421082,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.2539,
+      "step": 15
+    },
+    {
+      "epoch": 0.10191082802547771,
+      "grad_norm": 0.16027267277240753,
+      "learning_rate": 0.00016,
+      "loss": 1.2123,
+      "step": 16
+    },
+    {
+      "epoch": 0.10828025477707007,
+      "grad_norm": 0.16112814843654633,
+      "learning_rate": 0.00017,
+      "loss": 1.2465,
+      "step": 17
+    },
+    {
+      "epoch": 0.11464968152866242,
+      "grad_norm": 0.15539830923080444,
+      "learning_rate": 0.00018,
+      "loss": 1.1717,
+      "step": 18
+    },
+    {
+      "epoch": 0.12101910828025478,
+      "grad_norm": 0.15739695727825165,
+      "learning_rate": 0.00019,
+      "loss": 1.1412,
+      "step": 19
+    },
+    {
+      "epoch": 0.12738853503184713,
+      "grad_norm": 0.15658576786518097,
+      "learning_rate": 0.0002,
+      "loss": 1.1731,
+      "step": 20
+    },
+    {
+      "epoch": 0.1337579617834395,
+      "grad_norm": 0.1474328637123108,
+      "learning_rate": 0.00019999866506037345,
+      "loss": 1.2051,
+      "step": 21
+    },
+    {
+      "epoch": 0.14012738853503184,
+      "grad_norm": 0.11234907805919647,
+      "learning_rate": 0.00019999466027713507,
+      "loss": 1.1803,
+      "step": 22
+    },
+    {
+      "epoch": 0.1464968152866242,
+      "grad_norm": 0.1053839772939682,
+      "learning_rate": 0.00019998798575720776,
+      "loss": 1.1436,
+      "step": 23
+    },
+    {
+      "epoch": 0.15286624203821655,
+      "grad_norm": 0.1049942821264267,
+      "learning_rate": 0.00019997864167879312,
+      "loss": 1.1881,
+      "step": 24
+    },
+    {
+      "epoch": 0.1592356687898089,
+      "grad_norm": 0.11039146035909653,
+      "learning_rate": 0.00019996662829136676,
+      "loss": 1.1528,
+      "step": 25
+    },
+    {
+      "epoch": 0.16560509554140126,
+      "grad_norm": 0.09678228944540024,
+      "learning_rate": 0.0001999519459156716,
+      "loss": 1.1496,
+      "step": 26
+    },
+    {
+      "epoch": 0.17197452229299362,
+      "grad_norm": 0.09857058525085449,
+      "learning_rate": 0.0001999345949437094,
+      "loss": 1.1304,
+      "step": 27
+    },
+    {
+      "epoch": 0.17834394904458598,
+      "grad_norm": 0.10835567116737366,
+      "learning_rate": 0.0001999145758387301,
+      "loss": 1.2262,
+      "step": 28
+    },
+    {
+      "epoch": 0.18471337579617833,
+      "grad_norm": 0.09927600622177124,
+      "learning_rate": 0.0001998918891352197,
+      "loss": 1.1382,
+      "step": 29
+    },
+    {
+      "epoch": 0.1910828025477707,
+      "grad_norm": 0.09861327707767487,
+      "learning_rate": 0.00019986653543888568,
+      "loss": 1.1987,
+      "step": 30
+    },
+    {
+      "epoch": 0.19745222929936307,
+      "grad_norm": 0.09174010157585144,
+      "learning_rate": 0.00019983851542664126,
+      "loss": 1.127,
+      "step": 31
+    },
+    {
+      "epoch": 0.20382165605095542,
+      "grad_norm": 0.08863182365894318,
+      "learning_rate": 0.00019980782984658683,
+      "loss": 1.211,
+      "step": 32
+    },
+    {
+      "epoch": 0.21019108280254778,
+      "grad_norm": 0.08810263872146606,
+      "learning_rate": 0.00019977447951799034,
+      "loss": 1.1476,
+      "step": 33
+    },
+    {
+      "epoch": 0.21656050955414013,
+      "grad_norm": 0.08641776442527771,
+      "learning_rate": 0.00019973846533126533,
+      "loss": 1.1497,
+      "step": 34
+    },
+    {
+      "epoch": 0.2229299363057325,
+      "grad_norm": 0.09637051075696945,
+      "learning_rate": 0.00019969978824794707,
+      "loss": 1.1471,
+      "step": 35
+    },
+    {
+      "epoch": 0.22929936305732485,
+      "grad_norm": 0.09402573108673096,
+      "learning_rate": 0.000199658449300667,
+      "loss": 1.0976,
+      "step": 36
+    },
+    {
+      "epoch": 0.2356687898089172,
+      "grad_norm": 0.09077832847833633,
+      "learning_rate": 0.00019961444959312508,
+      "loss": 1.1119,
+      "step": 37
+    },
+    {
+      "epoch": 0.24203821656050956,
+      "grad_norm": 0.08864310383796692,
+      "learning_rate": 0.0001995677903000604,
+      "loss": 1.1157,
+      "step": 38
+    },
+    {
+      "epoch": 0.2484076433121019,
+      "grad_norm": 0.09867957979440689,
+      "learning_rate": 0.0001995184726672197,
+      "loss": 1.1656,
+      "step": 39
+    },
+    {
+      "epoch": 0.25477707006369427,
+      "grad_norm": 0.09343115240335464,
+      "learning_rate": 0.00019946649801132427,
+      "loss": 1.1084,
+      "step": 40
+    },
+    {
+      "epoch": 0.25477707006369427,
+      "eval_loss": 1.1224156618118286,
+      "eval_runtime": 19.2915,
+      "eval_samples_per_second": 56.605,
+      "eval_steps_per_second": 0.933,
+      "step": 40
+    },
+    {
+      "epoch": 0.2611464968152866,
+      "grad_norm": 0.09474795311689377,
+      "learning_rate": 0.00019941186772003464,
+      "loss": 1.1486,
+      "step": 41
+    },
+    {
+      "epoch": 0.267515923566879,
+      "grad_norm": 0.09726471453905106,
+      "learning_rate": 0.00019935458325191365,
+      "loss": 1.1499,
+      "step": 42
+    },
+    {
+      "epoch": 0.27388535031847133,
+      "grad_norm": 0.09273070096969604,
+      "learning_rate": 0.0001992946461363874,
+      "loss": 1.1361,
+      "step": 43
+    },
+    {
+      "epoch": 0.2802547770700637,
+      "grad_norm": 0.10344096273183823,
+      "learning_rate": 0.0001992320579737045,
+      "loss": 1.0999,
+      "step": 44
+    },
+    {
+      "epoch": 0.28662420382165604,
+      "grad_norm": 0.09499648213386536,
+      "learning_rate": 0.00019916682043489336,
+      "loss": 1.0919,
+      "step": 45
+    },
+    {
+      "epoch": 0.2929936305732484,
+      "grad_norm": 0.09483088552951813,
+      "learning_rate": 0.00019909893526171745,
+      "loss": 1.0992,
+      "step": 46
+    },
+    {
+      "epoch": 0.29936305732484075,
+      "grad_norm": 0.10382100939750671,
+      "learning_rate": 0.00019902840426662895,
+      "loss": 1.1093,
+      "step": 47
+    },
+    {
+      "epoch": 0.3057324840764331,
+      "grad_norm": 0.10187891870737076,
+      "learning_rate": 0.00019895522933272028,
+      "loss": 1.1063,
+      "step": 48
+    },
+    {
+      "epoch": 0.31210191082802546,
+      "grad_norm": 0.1022520437836647,
+      "learning_rate": 0.00019887941241367377,
+      "loss": 1.1095,
+      "step": 49
+    },
+    {
+      "epoch": 0.3184713375796178,
+      "grad_norm": 0.11470162868499756,
+      "learning_rate": 0.00019880095553370967,
+      "loss": 1.0859,
+      "step": 50
+    },
+    {
+      "epoch": 0.3248407643312102,
+      "grad_norm": 0.09845008701086044,
+      "learning_rate": 0.0001987198607875319,
+      "loss": 1.0941,
+      "step": 51
+    },
+    {
+      "epoch": 0.33121019108280253,
+      "grad_norm": 0.1080709770321846,
+      "learning_rate": 0.00019863613034027224,
+      "loss": 1.084,
+      "step": 52
+    },
+    {
+      "epoch": 0.3375796178343949,
+      "grad_norm": 0.11064234375953674,
+      "learning_rate": 0.0001985497664274326,
+      "loss": 1.1018,
+      "step": 53
+    },
+    {
+      "epoch": 0.34394904458598724,
+      "grad_norm": 0.10099776834249496,
+      "learning_rate": 0.0001984607713548251,
+      "loss": 1.0881,
+      "step": 54
+    },
+    {
+      "epoch": 0.3503184713375796,
+      "grad_norm": 0.11960357427597046,
+      "learning_rate": 0.0001983691474985108,
+      "loss": 1.0845,
+      "step": 55
+    },
+    {
+      "epoch": 0.35668789808917195,
+      "grad_norm": 0.10840114951133728,
+      "learning_rate": 0.00019827489730473596,
+      "loss": 1.131,
+      "step": 56
+    },
+    {
+      "epoch": 0.3630573248407643,
+      "grad_norm": 0.10177604109048843,
+      "learning_rate": 0.00019817802328986697,
+      "loss": 1.079,
+      "step": 57
+    },
+    {
+      "epoch": 0.36942675159235666,
+      "grad_norm": 0.11752859503030777,
+      "learning_rate": 0.00019807852804032305,
+      "loss": 1.0833,
+      "step": 58
+    },
+    {
+      "epoch": 0.37579617834394907,
+      "grad_norm": 0.11149834841489792,
+      "learning_rate": 0.00019797641421250725,
+      "loss": 1.1009,
+      "step": 59
+    },
+    {
+      "epoch": 0.3821656050955414,
+      "grad_norm": 0.10446681827306747,
+      "learning_rate": 0.00019787168453273544,
+      "loss": 1.1211,
+      "step": 60
+    },
+    {
+      "epoch": 0.3885350318471338,
+      "grad_norm": 0.12820479273796082,
+      "learning_rate": 0.00019776434179716366,
+      "loss": 1.1455,
+      "step": 61
+    },
+    {
+      "epoch": 0.39490445859872614,
+      "grad_norm": 0.10011500865221024,
+      "learning_rate": 0.00019765438887171327,
+      "loss": 1.0779,
+      "step": 62
+    },
+    {
+      "epoch": 0.4012738853503185,
+      "grad_norm": 0.11496227979660034,
+      "learning_rate": 0.0001975418286919947,
+      "loss": 1.1174,
+      "step": 63
+    },
+    {
+      "epoch": 0.40764331210191085,
+      "grad_norm": 0.10938404500484467,
+      "learning_rate": 0.00019742666426322876,
+      "loss": 1.0576,
+      "step": 64
+    },
+    {
+      "epoch": 0.4140127388535032,
+      "grad_norm": 0.12636032700538635,
+      "learning_rate": 0.0001973088986601667,
+      "loss": 1.083,
+      "step": 65
+    },
+    {
+      "epoch": 0.42038216560509556,
+      "grad_norm": 0.10620423406362534,
+      "learning_rate": 0.00019718853502700783,
+      "loss": 1.0728,
+      "step": 66
+    },
+    {
+      "epoch": 0.4267515923566879,
+      "grad_norm": 0.11206210404634476,
+      "learning_rate": 0.0001970655765773159,
+      "loss": 1.1107,
+      "step": 67
+    },
+    {
+      "epoch": 0.43312101910828027,
+      "grad_norm": 0.12613879144191742,
+      "learning_rate": 0.00019694002659393305,
+      "loss": 1.1065,
+      "step": 68
+    },
+    {
+      "epoch": 0.4394904458598726,
+      "grad_norm": 0.10636976361274719,
+      "learning_rate": 0.00019681188842889222,
+      "loss": 1.1192,
+      "step": 69
+    },
+    {
+      "epoch": 0.445859872611465,
+      "grad_norm": 0.11036239564418793,
+      "learning_rate": 0.00019668116550332766,
+      "loss": 1.1362,
+      "step": 70
+    },
+    {
+      "epoch": 0.45222929936305734,
+      "grad_norm": 0.11907072365283966,
+      "learning_rate": 0.0001965478613073837,
+      "loss": 1.1009,
+      "step": 71
+    },
+    {
+      "epoch": 0.4585987261146497,
+      "grad_norm": 0.11267364770174026,
+      "learning_rate": 0.00019641197940012137,
+      "loss": 1.0694,
+      "step": 72
+    },
+    {
+      "epoch": 0.46496815286624205,
+      "grad_norm": 0.10659351199865341,
+      "learning_rate": 0.00019627352340942353,
+      "loss": 1.0844,
+      "step": 73
+    },
+    {
+      "epoch": 0.4713375796178344,
+      "grad_norm": 0.12426211684942245,
+      "learning_rate": 0.00019613249703189796,
+      "loss": 1.1203,
+      "step": 74
+    },
+    {
+      "epoch": 0.47770700636942676,
+      "grad_norm": 0.11883872747421265,
+      "learning_rate": 0.00019598890403277864,
+      "loss": 1.0879,
+      "step": 75
+    },
+    {
+      "epoch": 0.4840764331210191,
+      "grad_norm": 0.11355262994766235,
+      "learning_rate": 0.0001958427482458253,
+      "loss": 1.1045,
+      "step": 76
+    },
+    {
+      "epoch": 0.49044585987261147,
+      "grad_norm": 0.11006154865026474,
+      "learning_rate": 0.0001956940335732209,
+      "loss": 1.1058,
+      "step": 77
+    },
+    {
+      "epoch": 0.4968152866242038,
+      "grad_norm": 0.11379122734069824,
+      "learning_rate": 0.00019554276398546768,
+      "loss": 1.1224,
+      "step": 78
+    },
+    {
+      "epoch": 0.5031847133757962,
+      "grad_norm": 0.11065732687711716,
+      "learning_rate": 0.000195388943521281,
+      "loss": 1.1033,
+      "step": 79
+    },
+    {
+      "epoch": 0.5095541401273885,
+      "grad_norm": 0.11113402247428894,
+      "learning_rate": 0.00019523257628748146,
+      "loss": 1.0912,
+      "step": 80
+    },
+    {
+      "epoch": 0.5095541401273885,
+      "eval_loss": 1.0586377382278442,
+      "eval_runtime": 19.2899,
+      "eval_samples_per_second": 56.61,
+      "eval_steps_per_second": 0.933,
+      "step": 80
+    },
+    {
+      "epoch": 0.5159235668789809,
+      "grad_norm": 0.11783529818058014,
+      "learning_rate": 0.00019507366645888543,
+      "loss": 1.0938,
+      "step": 81
+    },
+    {
+      "epoch": 0.5222929936305732,
+      "grad_norm": 0.12089723348617554,
+      "learning_rate": 0.00019491221827819347,
+      "loss": 1.1068,
+      "step": 82
+    },
+    {
+      "epoch": 0.5286624203821656,
+      "grad_norm": 0.10991813987493515,
+      "learning_rate": 0.00019474823605587703,
+      "loss": 1.1393,
+      "step": 83
+    },
+    {
+      "epoch": 0.535031847133758,
+      "grad_norm": 0.11100416630506516,
+      "learning_rate": 0.00019458172417006347,
+      "loss": 1.1081,
+      "step": 84
+    },
+    {
+      "epoch": 0.5414012738853503,
+      "grad_norm": 0.11886284500360489,
+      "learning_rate": 0.00019441268706641907,
+      "loss": 1.1168,
+      "step": 85
+    },
+    {
+      "epoch": 0.5477707006369427,
+      "grad_norm": 0.11771067976951599,
+      "learning_rate": 0.00019424112925803039,
+      "loss": 1.098,
+      "step": 86
+    },
+    {
+      "epoch": 0.554140127388535,
+      "grad_norm": 0.11022554337978363,
+      "learning_rate": 0.00019406705532528374,
+      "loss": 1.1179,
+      "step": 87
+    },
+    {
+      "epoch": 0.5605095541401274,
+      "grad_norm": 0.11891311407089233,
+      "learning_rate": 0.00019389046991574298,
+      "loss": 1.0866,
+      "step": 88
+    },
+    {
+      "epoch": 0.5668789808917197,
+      "grad_norm": 0.11594802141189575,
+      "learning_rate": 0.00019371137774402527,
+      "loss": 1.1146,
+      "step": 89
+    },
+    {
+      "epoch": 0.5732484076433121,
+      "grad_norm": 0.1181577518582344,
+      "learning_rate": 0.0001935297835916754,
+      "loss": 1.1213,
+      "step": 90
+    },
+    {
+      "epoch": 0.5796178343949044,
+      "grad_norm": 0.10821503400802612,
+      "learning_rate": 0.00019334569230703794,
+      "loss": 1.1121,
+      "step": 91
+    },
+    {
+      "epoch": 0.5859872611464968,
+      "grad_norm": 0.118013896048069,
+      "learning_rate": 0.0001931591088051279,
+      "loss": 1.117,
+      "step": 92
+    },
+    {
+      "epoch": 0.5923566878980892,
+      "grad_norm": 0.11678043752908707,
+      "learning_rate": 0.0001929700380674995,
+      "loss": 1.0974,
+      "step": 93
+    },
+    {
+      "epoch": 0.5987261146496815,
+      "grad_norm": 0.11073200404644012,
+      "learning_rate": 0.00019277848514211317,
+      "loss": 1.1059,
+      "step": 94
+    },
+    {
+      "epoch": 0.6050955414012739,
+      "grad_norm": 0.11440474539995193,
+      "learning_rate": 0.00019258445514320065,
+      "loss": 1.0913,
+      "step": 95
+    },
+    {
+      "epoch": 0.6114649681528662,
+      "grad_norm": 0.11020273715257645,
+      "learning_rate": 0.0001923879532511287,
+      "loss": 1.0836,
+      "step": 96
+    },
+    {
+      "epoch": 0.6178343949044586,
+      "grad_norm": 0.11285867542028427,
+      "learning_rate": 0.0001921889847122605,
+      "loss": 1.0842,
+      "step": 97
+    },
+    {
+      "epoch": 0.6242038216560509,
+      "grad_norm": 0.11981746554374695,
+      "learning_rate": 0.00019198755483881583,
+      "loss": 1.1062,
+      "step": 98
+    },
+    {
+      "epoch": 0.6305732484076433,
+      "grad_norm": 0.11882256716489792,
+      "learning_rate": 0.0001917836690087291,
+      "loss": 1.1012,
+      "step": 99
+    },
+    {
+      "epoch": 0.6369426751592356,
+      "grad_norm": 0.11642686277627945,
+      "learning_rate": 0.00019157733266550575,
+      "loss": 1.0823,
+      "step": 100
+    },
+    {
+      "epoch": 0.643312101910828,
+      "grad_norm": 0.11980683356523514,
+      "learning_rate": 0.00019136855131807705,
+      "loss": 1.105,
+      "step": 101
+    },
+    {
+      "epoch": 0.6496815286624203,
+      "grad_norm": 0.1147085651755333,
+      "learning_rate": 0.0001911573305406528,
+      "loss": 1.0794,
+      "step": 102
+    },
+    {
+      "epoch": 0.6560509554140127,
+      "grad_norm": 0.12037765234708786,
+      "learning_rate": 0.00019094367597257282,
+      "loss": 1.1059,
+      "step": 103
+    },
+    {
+      "epoch": 0.6624203821656051,
+      "grad_norm": 0.12135636061429977,
+      "learning_rate": 0.000190727593318156,
+      "loss": 1.118,
+      "step": 104
+    },
+    {
+      "epoch": 0.6687898089171974,
+      "grad_norm": 0.13285911083221436,
+      "learning_rate": 0.00019050908834654834,
+      "loss": 1.0817,
+      "step": 105
+    },
+    {
+      "epoch": 0.6751592356687898,
+      "grad_norm": 0.11360063403844833,
+      "learning_rate": 0.00019028816689156878,
+      "loss": 1.0711,
+      "step": 106
+    },
+    {
+      "epoch": 0.6815286624203821,
+      "grad_norm": 0.13178926706314087,
+      "learning_rate": 0.00019006483485155338,
+      "loss": 1.1266,
+      "step": 107
+    },
+    {
+      "epoch": 0.6878980891719745,
+      "grad_norm": 0.1290571093559265,
+      "learning_rate": 0.0001898390981891979,
+      "loss": 1.0776,
+      "step": 108
+    },
+    {
+      "epoch": 0.6942675159235668,
+      "grad_norm": 0.11376259475946426,
+      "learning_rate": 0.0001896109629313987,
+      "loss": 1.1026,
+      "step": 109
+    },
+    {
+      "epoch": 0.7006369426751592,
+      "grad_norm": 0.12076874077320099,
+      "learning_rate": 0.0001893804351690917,
+      "loss": 1.104,
+      "step": 110
+    },
+    {
+      "epoch": 0.7070063694267515,
+      "grad_norm": 0.12165362387895584,
+      "learning_rate": 0.0001891475210570898,
+      "loss": 1.0884,
+      "step": 111
+    },
+    {
+      "epoch": 0.7133757961783439,
+      "grad_norm": 0.10634943842887878,
+      "learning_rate": 0.00018891222681391851,
+      "loss": 1.0844,
+      "step": 112
+    },
+    {
+      "epoch": 0.7197452229299363,
+      "grad_norm": 0.11928383260965347,
+      "learning_rate": 0.00018867455872165008,
+      "loss": 1.1205,
+      "step": 113
+    },
+    {
+      "epoch": 0.7261146496815286,
+      "grad_norm": 0.1243489533662796,
+      "learning_rate": 0.00018843452312573554,
+      "loss": 1.0704,
+      "step": 114
+    },
+    {
+      "epoch": 0.732484076433121,
+      "grad_norm": 0.11439479887485504,
+      "learning_rate": 0.0001881921264348355,
+      "loss": 1.0809,
+      "step": 115
+    },
+    {
+      "epoch": 0.7388535031847133,
+      "grad_norm": 0.1184995099902153,
+      "learning_rate": 0.0001879473751206489,
+      "loss": 1.1619,
+      "step": 116
+    },
+    {
+      "epoch": 0.7452229299363057,
+      "grad_norm": 0.11846223473548889,
+      "learning_rate": 0.00018770027571774031,
+      "loss": 1.0835,
+      "step": 117
+    },
+    {
+      "epoch": 0.7515923566878981,
+      "grad_norm": 0.11566226184368134,
+      "learning_rate": 0.00018745083482336544,
+      "loss": 1.0658,
+      "step": 118
+    },
+    {
+      "epoch": 0.7579617834394905,
+      "grad_norm": 0.11553015559911728,
+      "learning_rate": 0.00018719905909729494,
+      "loss": 1.0773,
+      "step": 119
+    },
+    {
+      "epoch": 0.7643312101910829,
+      "grad_norm": 0.13605500757694244,
+      "learning_rate": 0.0001869449552616367,
+      "loss": 1.0727,
+      "step": 120
+    },
+    {
+      "epoch": 0.7643312101910829,
+      "eval_loss": 1.0301120281219482,
+      "eval_runtime": 19.2781,
+      "eval_samples_per_second": 56.645,
+      "eval_steps_per_second": 0.934,
+      "step": 120
+    },
+    {
+      "epoch": 0.7707006369426752,
+      "grad_norm": 0.1149601861834526,
+      "learning_rate": 0.00018668853010065634,
+      "loss": 1.0745,
+      "step": 121
+    },
+    {
+      "epoch": 0.7770700636942676,
+      "grad_norm": 0.11904130131006241,
+      "learning_rate": 0.00018642979046059593,
+      "loss": 1.0574,
+      "step": 122
+    },
+    {
+      "epoch": 0.7834394904458599,
+      "grad_norm": 0.11868870258331299,
+      "learning_rate": 0.00018616874324949159,
+      "loss": 1.0681,
+      "step": 123
+    },
+    {
+      "epoch": 0.7898089171974523,
+      "grad_norm": 0.11400648951530457,
+      "learning_rate": 0.00018590539543698854,
+      "loss": 1.0874,
+      "step": 124
+    },
+    {
+      "epoch": 0.7961783439490446,
+      "grad_norm": 0.12247481942176819,
+      "learning_rate": 0.0001856397540541554,
+      "loss": 1.0832,
+      "step": 125
+    },
+    {
+      "epoch": 0.802547770700637,
+      "grad_norm": 0.11855783313512802,
+      "learning_rate": 0.0001853718261932964,
+      "loss": 1.0775,
+      "step": 126
+    },
+    {
+      "epoch": 0.8089171974522293,
+      "grad_norm": 0.11434577405452728,
+      "learning_rate": 0.00018510161900776187,
+      "loss": 1.048,
+      "step": 127
+    },
+    {
+      "epoch": 0.8152866242038217,
+      "grad_norm": 0.12175115942955017,
+      "learning_rate": 0.00018482913971175737,
+      "loss": 1.0776,
+      "step": 128
+    },
+    {
+      "epoch": 0.821656050955414,
+      "grad_norm": 0.1237318217754364,
+      "learning_rate": 0.00018455439558015115,
+      "loss": 1.0977,
+      "step": 129
+    },
+    {
+      "epoch": 0.8280254777070064,
+      "grad_norm": 0.12041562050580978,
+      "learning_rate": 0.00018427739394827973,
+      "loss": 1.0477,
+      "step": 130
+    },
+    {
+      "epoch": 0.8343949044585988,
+      "grad_norm": 0.11855332553386688,
+      "learning_rate": 0.00018399814221175227,
+      "loss": 1.1026,
+      "step": 131
+    },
+    {
+      "epoch": 0.8407643312101911,
+      "grad_norm": 0.12020997703075409,
+      "learning_rate": 0.00018371664782625287,
+      "loss": 1.0484,
+      "step": 132
+    },
+    {
+      "epoch": 0.8471337579617835,
+      "grad_norm": 0.1116231232881546,
+      "learning_rate": 0.00018343291830734176,
+      "loss": 1.0772,
+      "step": 133
+    },
+    {
+      "epoch": 0.8535031847133758,
+      "grad_norm": 0.12280379235744476,
+      "learning_rate": 0.00018314696123025454,
+      "loss": 1.0829,
+      "step": 134
+    },
+    {
+      "epoch": 0.8598726114649682,
+      "grad_norm": 0.11589805781841278,
+      "learning_rate": 0.00018285878422969983,
+      "loss": 1.0636,
+      "step": 135
+    },
+    {
+      "epoch": 0.8662420382165605,
+      "grad_norm": 0.11667989194393158,
+      "learning_rate": 0.0001825683949996556,
+      "loss": 1.0783,
+      "step": 136
+    },
+    {
+      "epoch": 0.8726114649681529,
+      "grad_norm": 0.11666262894868851,
+      "learning_rate": 0.00018227580129316366,
+      "loss": 1.0587,
+      "step": 137
+    },
+    {
+      "epoch": 0.8789808917197452,
+      "grad_norm": 0.11791834235191345,
+      "learning_rate": 0.00018198101092212267,
+      "loss": 1.0955,
+      "step": 138
+    },
+    {
+      "epoch": 0.8853503184713376,
+      "grad_norm": 0.12023093551397324,
+      "learning_rate": 0.00018168403175707954,
+      "loss": 1.1133,
+      "step": 139
+    },
+    {
+      "epoch": 0.89171974522293,
+      "grad_norm": 0.12082846462726593,
+      "learning_rate": 0.0001813848717270195,
+      "loss": 1.1083,
+      "step": 140
+    },
+    {
+      "epoch": 0.8980891719745223,
+      "grad_norm": 0.1259888857603073,
+      "learning_rate": 0.00018108353881915402,
+      "loss": 1.0931,
+      "step": 141
+    },
+    {
+      "epoch": 0.9044585987261147,
+      "grad_norm": 0.11900565028190613,
+      "learning_rate": 0.00018078004107870797,
+      "loss": 1.0955,
+      "step": 142
+    },
+    {
+      "epoch": 0.910828025477707,
+      "grad_norm": 0.11422552168369293,
+      "learning_rate": 0.00018047438660870446,
+      "loss": 1.0473,
+      "step": 143
+    },
+    {
+      "epoch": 0.9171974522292994,
+      "grad_norm": 0.13001863658428192,
+      "learning_rate": 0.00018016658356974884,
+      "loss": 1.0273,
+      "step": 144
+    },
+    {
+      "epoch": 0.9235668789808917,
+      "grad_norm": 0.11941977590322495,
+      "learning_rate": 0.0001798566401798106,
+      "loss": 1.0774,
+      "step": 145
+    },
+    {
+      "epoch": 0.9299363057324841,
+      "grad_norm": 0.12032714486122131,
+      "learning_rate": 0.00017954456471400393,
+      "loss": 1.1162,
+      "step": 146
+    },
+    {
+      "epoch": 0.9363057324840764,
+      "grad_norm": 0.13784518837928772,
+      "learning_rate": 0.00017923036550436704,
+      "loss": 1.095,
+      "step": 147
+    },
+    {
+      "epoch": 0.9426751592356688,
+      "grad_norm": 0.12085068970918655,
+      "learning_rate": 0.00017891405093963938,
+      "loss": 1.1024,
+      "step": 148
+    },
+    {
+      "epoch": 0.9490445859872612,
+      "grad_norm": 0.11120469868183136,
+      "learning_rate": 0.00017859562946503788,
+      "loss": 1.0502,
+      "step": 149
+    },
+    {
+      "epoch": 0.9554140127388535,
+      "grad_norm": 0.1275676190853119,
+      "learning_rate": 0.00017827510958203147,
+      "loss": 1.0875,
+      "step": 150
+    },
+    {
+      "epoch": 0.9617834394904459,
+      "grad_norm": 0.13544359803199768,
+      "learning_rate": 0.00017795249984811396,
+      "loss": 1.0985,
+      "step": 151
+    },
+    {
+      "epoch": 0.9681528662420382,
+      "grad_norm": 0.11840228736400604,
+      "learning_rate": 0.00017762780887657574,
+      "loss": 1.059,
+      "step": 152
+    },
+    {
+      "epoch": 0.9745222929936306,
+      "grad_norm": 0.12622268497943878,
+      "learning_rate": 0.0001773010453362737,
+      "loss": 1.1034,
+      "step": 153
+    },
+    {
+      "epoch": 0.9808917197452229,
+      "grad_norm": 0.11485569179058075,
+      "learning_rate": 0.0001769722179513998,
+      "loss": 1.0639,
+      "step": 154
+    },
+    {
+      "epoch": 0.9872611464968153,
+      "grad_norm": 0.11948831379413605,
+      "learning_rate": 0.00017664133550124815,
+      "loss": 1.0635,
+      "step": 155
+    },
+    {
+      "epoch": 0.9936305732484076,
+      "grad_norm": 0.1214427575469017,
+      "learning_rate": 0.00017630840681998066,
+      "loss": 1.1361,
+      "step": 156
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.11713624000549316,
+      "learning_rate": 0.00017597344079639112,
+      "loss": 1.0619,
+      "step": 157
+    },
+    {
+      "epoch": 1.0063694267515924,
+      "grad_norm": 0.11573248356580734,
+      "learning_rate": 0.00017563644637366788,
+      "loss": 1.1141,
+      "step": 158
+    },
+    {
+      "epoch": 1.0127388535031847,
+      "grad_norm": 0.11592724919319153,
+      "learning_rate": 0.00017529743254915512,
+      "loss": 1.0569,
+      "step": 159
+    },
+    {
+      "epoch": 1.019108280254777,
+      "grad_norm": 0.12063013017177582,
+      "learning_rate": 0.0001749564083741126,
+      "loss": 1.0438,
+      "step": 160
+    },
+    {
+      "epoch": 1.019108280254777,
+      "eval_loss": 1.012627124786377,
+      "eval_runtime": 19.2811,
+      "eval_samples_per_second": 56.636,
+      "eval_steps_per_second": 0.934,
+      "step": 160
+    },
+    {
+      "epoch": 1.0254777070063694,
+      "grad_norm": 1.0452429056167603,
+      "learning_rate": 0.00017461338295347406,
+      "loss": 1.1776,
+      "step": 161
+    },
+    {
+      "epoch": 1.0063694267515924,
+      "grad_norm": 0.13684259355068207,
+      "learning_rate": 0.000174268365445604,
+      "loss": 1.0491,
+      "step": 162
+    },
+    {
+      "epoch": 1.0127388535031847,
+      "grad_norm": 0.13602878153324127,
+      "learning_rate": 0.0001739213650620533,
+      "loss": 1.0311,
+      "step": 163
+    },
+    {
+      "epoch": 1.019108280254777,
+      "grad_norm": 0.1506141573190689,
+      "learning_rate": 0.00017357239106731317,
+      "loss": 1.0233,
+      "step": 164
+    },
+    {
+      "epoch": 1.0254777070063694,
+      "grad_norm": 0.13961653411388397,
+      "learning_rate": 0.00017322145277856794,
+      "loss": 1.0515,
+      "step": 165
+    },
+    {
+      "epoch": 1.0318471337579618,
+      "grad_norm": 0.12875933945178986,
+      "learning_rate": 0.00017286855956544613,
+      "loss": 1.0643,
+      "step": 166
+    },
+    {
+      "epoch": 1.0382165605095541,
+      "grad_norm": 0.13839364051818848,
+      "learning_rate": 0.0001725137208497705,
+      "loss": 1.0853,
+      "step": 167
+    },
+    {
+      "epoch": 1.0445859872611465,
+      "grad_norm": 0.1401708722114563,
+      "learning_rate": 0.0001721569461053062,
+      "loss": 1.0608,
+      "step": 168
+    },
+    {
+      "epoch": 1.0509554140127388,
+      "grad_norm": 0.13666324317455292,
+      "learning_rate": 0.0001717982448575082,
+      "loss": 1.0186,
+      "step": 169
+    },
+    {
+      "epoch": 1.0573248407643312,
+      "grad_norm": 0.13511985540390015,
+      "learning_rate": 0.00017143762668326667,
+      "loss": 1.0775,
+      "step": 170
+    },
+    {
+      "epoch": 1.0636942675159236,
+      "grad_norm": 0.13401229679584503,
+      "learning_rate": 0.00017107510121065138,
+      "loss": 1.0587,
+      "step": 171
+    },
+    {
+      "epoch": 1.070063694267516,
+      "grad_norm": 0.13937029242515564,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 1.0655,
+      "step": 172
+    },
+    {
+      "epoch": 1.0764331210191083,
+      "grad_norm": 0.13978822529315948,
+      "learning_rate": 0.0001703443671369333,
+      "loss": 1.0417,
+      "step": 173
+    },
+    {
+      "epoch": 1.0828025477707006,
+      "grad_norm": 0.1328263282775879,
+      "learning_rate": 0.00016997617804554796,
+      "loss": 1.0609,
+      "step": 174
+    },
+    {
+      "epoch": 1.089171974522293,
+      "grad_norm": 0.13478587567806244,
+      "learning_rate": 0.00016960612067470288,
+      "loss": 1.0314,
+      "step": 175
+    },
+    {
+      "epoch": 1.0955414012738853,
+      "grad_norm": 0.12482774257659912,
+      "learning_rate": 0.00016923420490448296,
+      "loss": 1.0173,
+      "step": 176
+    },
+    {
+      "epoch": 1.1019108280254777,
+      "grad_norm": 0.12970109283924103,
+      "learning_rate": 0.0001688604406645903,
+      "loss": 1.0904,
+      "step": 177
+    },
+    {
+      "epoch": 1.10828025477707,
+      "grad_norm": 0.12363622337579727,
+      "learning_rate": 0.00016848483793407873,
+      "loss": 1.0434,
+      "step": 178
+    },
+    {
+      "epoch": 1.1146496815286624,
+      "grad_norm": 0.13114579021930695,
+      "learning_rate": 0.00016810740674108764,
+      "loss": 1.0456,
+      "step": 179
+    },
+    {
+      "epoch": 1.1210191082802548,
+      "grad_norm": 0.13814528286457062,
+      "learning_rate": 0.00016772815716257412,
+      "loss": 1.0845,
+      "step": 180
+    },
+    {
+      "epoch": 1.127388535031847,
+      "grad_norm": 0.12670482695102692,
+      "learning_rate": 0.00016734709932404403,
+      "loss": 1.0392,
+      "step": 181
+    },
+    {
+      "epoch": 1.1337579617834395,
+      "grad_norm": 0.13344614207744598,
+      "learning_rate": 0.00016696424339928152,
+      "loss": 1.0429,
+      "step": 182
+    },
+    {
+      "epoch": 1.1401273885350318,
+      "grad_norm": 0.14558671414852142,
+      "learning_rate": 0.00016657959961007747,
+      "loss": 1.0615,
+      "step": 183
+    },
+    {
+      "epoch": 1.1464968152866242,
+      "grad_norm": 0.13091522455215454,
+      "learning_rate": 0.00016619317822595667,
+      "loss": 1.0816,
+      "step": 184
+    },
+    {
+      "epoch": 1.1528662420382165,
+      "grad_norm": 0.1288042962551117,
+      "learning_rate": 0.00016580498956390342,
+      "loss": 1.0114,
+      "step": 185
+    },
+    {
+      "epoch": 1.1592356687898089,
+      "grad_norm": 0.12748295068740845,
+      "learning_rate": 0.00016541504398808631,
+      "loss": 1.096,
+      "step": 186
+    },
+    {
+      "epoch": 1.1656050955414012,
+      "grad_norm": 0.13045403361320496,
+      "learning_rate": 0.00016502335190958135,
+      "loss": 0.9977,
+      "step": 187
+    },
+    {
+      "epoch": 1.1719745222929936,
+      "grad_norm": 0.14281457662582397,
+      "learning_rate": 0.00016462992378609407,
+      "loss": 1.0434,
+      "step": 188
+    },
+    {
+      "epoch": 1.178343949044586,
+      "grad_norm": 0.1320338100194931,
+      "learning_rate": 0.00016423477012168038,
+      "loss": 1.0554,
+      "step": 189
+    },
+    {
+      "epoch": 1.1847133757961783,
+      "grad_norm": 0.12324702739715576,
+      "learning_rate": 0.00016383790146646588,
+      "loss": 1.0416,
+      "step": 190
+    },
+    {
+      "epoch": 1.1910828025477707,
+      "grad_norm": 0.1301770806312561,
+      "learning_rate": 0.00016343932841636456,
+      "loss": 1.0613,
+      "step": 191
+    },
+    {
+      "epoch": 1.197452229299363,
+      "grad_norm": 0.14009694755077362,
+      "learning_rate": 0.0001630390616127955,
+      "loss": 1.0139,
+      "step": 192
+    },
+    {
+      "epoch": 1.2038216560509554,
+      "grad_norm": 0.13656193017959595,
+      "learning_rate": 0.00016263711174239914,
+      "loss": 1.0632,
+      "step": 193
+    },
+    {
+      "epoch": 1.2101910828025477,
+      "grad_norm": 0.12946204841136932,
+      "learning_rate": 0.00016223348953675162,
+      "loss": 1.0458,
+      "step": 194
+    },
+    {
+      "epoch": 1.21656050955414,
+      "grad_norm": 0.1356847584247589,
+      "learning_rate": 0.00016182820577207842,
+      "loss": 1.0928,
+      "step": 195
+    },
+    {
+      "epoch": 1.2229299363057324,
+      "grad_norm": 0.1389479786157608,
+      "learning_rate": 0.0001614212712689668,
+      "loss": 1.0577,
+      "step": 196
+    },
+    {
+      "epoch": 1.2292993630573248,
+      "grad_norm": 0.1340690702199936,
+      "learning_rate": 0.00016101269689207655,
+      "loss": 1.0572,
+      "step": 197
+    },
+    {
+      "epoch": 1.2356687898089171,
+      "grad_norm": 0.13188521564006805,
+      "learning_rate": 0.00016060249354985025,
+      "loss": 1.0775,
+      "step": 198
+    },
+    {
+      "epoch": 1.2420382165605095,
+      "grad_norm": 0.12922795116901398,
+      "learning_rate": 0.00016019067219422178,
+      "loss": 1.0434,
+      "step": 199
+    },
+    {
+      "epoch": 1.2484076433121019,
+      "grad_norm": 0.12612590193748474,
+      "learning_rate": 0.0001597772438203241,
+      "loss": 1.0126,
+      "step": 200
+    },
+    {
+      "epoch": 1.2484076433121019,
+      "eval_loss": 1.00348961353302,
+      "eval_runtime": 19.3163,
+      "eval_samples_per_second": 56.533,
+      "eval_steps_per_second": 0.932,
+      "step": 200
+    },
+    {
+      "epoch": 1.2547770700636942,
+      "grad_norm": 0.1387277990579605,
+      "learning_rate": 0.0001593622194661956,
+      "loss": 1.0421,
+      "step": 201
+    },
+    {
+      "epoch": 1.2611464968152866,
+      "grad_norm": 0.13583126664161682,
+      "learning_rate": 0.00015894561021248535,
+      "loss": 1.0441,
+      "step": 202
+    },
+    {
+      "epoch": 1.267515923566879,
+      "grad_norm": 0.12996627390384674,
+      "learning_rate": 0.00015852742718215743,
+      "loss": 1.0342,
+      "step": 203
+    },
+    {
+      "epoch": 1.2738853503184713,
+      "grad_norm": 0.13653862476348877,
+      "learning_rate": 0.00015810768154019385,
+      "loss": 1.0108,
+      "step": 204
+    },
+    {
+      "epoch": 1.2802547770700636,
+      "grad_norm": 0.1289973258972168,
+      "learning_rate": 0.0001576863844932963,
+      "loss": 1.0523,
+      "step": 205
+    },
+    {
+      "epoch": 1.286624203821656,
+      "grad_norm": 0.13348506391048431,
+      "learning_rate": 0.00015726354728958736,
+      "loss": 1.0564,
+      "step": 206
+    },
+    {
+      "epoch": 1.2929936305732483,
+      "grad_norm": 0.12048185616731644,
+      "learning_rate": 0.0001568391812183097,
+      "loss": 1.0457,
+      "step": 207
+    },
+    {
+      "epoch": 1.2993630573248407,
+      "grad_norm": 0.12991134822368622,
+      "learning_rate": 0.00015641329760952513,
+      "loss": 1.05,
+      "step": 208
+    },
+    {
+      "epoch": 1.305732484076433,
+      "grad_norm": 0.13280436396598816,
+      "learning_rate": 0.00015598590783381163,
+      "loss": 1.0747,
+      "step": 209
+    },
+    {
+      "epoch": 1.3121019108280254,
+      "grad_norm": 0.13099676370620728,
+      "learning_rate": 0.00015555702330196023,
+      "loss": 1.0764,
+      "step": 210
+    },
+    {
+      "epoch": 1.3184713375796178,
+      "grad_norm": 0.1397230178117752,
+      "learning_rate": 0.00015512665546467007,
+      "loss": 1.0716,
+      "step": 211
+    },
+    {
+      "epoch": 1.3248407643312101,
+      "grad_norm": 0.13324333727359772,
+      "learning_rate": 0.00015469481581224272,
+      "loss": 1.0926,
+      "step": 212
+    },
+    {
+      "epoch": 1.3312101910828025,
+      "grad_norm": 0.1313484162092209,
+      "learning_rate": 0.00015426151587427547,
+      "loss": 1.0533,
+      "step": 213
+    },
+    {
+      "epoch": 1.3375796178343948,
+      "grad_norm": 0.1433049589395523,
+      "learning_rate": 0.00015382676721935345,
+      "loss": 1.055,
+      "step": 214
+    },
+    {
+      "epoch": 1.3439490445859872,
+      "grad_norm": 0.1309911012649536,
+      "learning_rate": 0.00015339058145474085,
+      "loss": 1.0536,
+      "step": 215
+    },
+    {
+      "epoch": 1.3503184713375795,
+      "grad_norm": 0.13482902944087982,
+      "learning_rate": 0.00015295297022607088,
+      "loss": 1.0176,
+      "step": 216
+    },
+    {
+      "epoch": 1.356687898089172,
+      "grad_norm": 0.13102853298187256,
+      "learning_rate": 0.00015251394521703494,
+      "loss": 1.0849,
+      "step": 217
+    },
+    {
+      "epoch": 1.3630573248407643,
+      "grad_norm": 0.13901150226593018,
+      "learning_rate": 0.00015207351814907068,
+      "loss": 1.0452,
+      "step": 218
+    },
+    {
+      "epoch": 1.3694267515923566,
+      "grad_norm": 0.13824929296970367,
+      "learning_rate": 0.000151631700781049,
+      "loss": 1.0083,
+      "step": 219
+    },
+    {
+      "epoch": 1.3757961783439492,
+      "grad_norm": 0.1309863179922104,
+      "learning_rate": 0.00015118850490896012,
+      "loss": 1.0517,
+      "step": 220
+    },
+    {
+      "epoch": 1.3821656050955413,
+      "grad_norm": 0.1359570473432541,
+      "learning_rate": 0.0001507439423655987,
+      "loss": 1.0452,
+      "step": 221
+    },
+    {
+      "epoch": 1.388535031847134,
+      "grad_norm": 0.13473795354366302,
+      "learning_rate": 0.00015029802502024788,
+      "loss": 1.0234,
+      "step": 222
+    },
+    {
+      "epoch": 1.394904458598726,
+      "grad_norm": 0.13787756860256195,
+      "learning_rate": 0.0001498507647783623,
+      "loss": 1.0811,
+      "step": 223
+    },
+    {
+      "epoch": 1.4012738853503186,
+      "grad_norm": 0.1334763914346695,
+      "learning_rate": 0.00014940217358125042,
+      "loss": 1.0363,
+      "step": 224
+    },
+    {
+      "epoch": 1.4076433121019107,
+      "grad_norm": 0.13535600900650024,
+      "learning_rate": 0.0001489522634057555,
+      "loss": 1.059,
+      "step": 225
+    },
+    {
+      "epoch": 1.4140127388535033,
+      "grad_norm": 0.1335124969482422,
+      "learning_rate": 0.00014850104626393598,
+      "loss": 1.0602,
+      "step": 226
+    },
+    {
+      "epoch": 1.4203821656050954,
+      "grad_norm": 0.13075490295886993,
+      "learning_rate": 0.00014804853420274472,
+      "loss": 1.0344,
+      "step": 227
+    },
+    {
+      "epoch": 1.426751592356688,
+      "grad_norm": 0.13887614011764526,
+      "learning_rate": 0.00014759473930370736,
+      "loss": 1.0728,
+      "step": 228
+    },
+    {
+      "epoch": 1.4331210191082802,
+      "grad_norm": 0.12808558344841003,
+      "learning_rate": 0.0001471396736825998,
+      "loss": 1.0158,
+      "step": 229
+    },
+    {
+      "epoch": 1.4394904458598727,
+      "grad_norm": 0.1339128464460373,
+      "learning_rate": 0.00014668334948912453,
+      "loss": 1.0647,
+      "step": 230
+    },
+    {
+      "epoch": 1.4458598726114649,
+      "grad_norm": 0.13178490102291107,
+      "learning_rate": 0.00014622577890658665,
+      "loss": 1.0684,
+      "step": 231
+    },
+    {
+      "epoch": 1.4522292993630574,
+      "grad_norm": 0.13547855615615845,
+      "learning_rate": 0.00014576697415156817,
+      "loss": 1.0712,
+      "step": 232
+    },
+    {
+      "epoch": 1.4585987261146496,
+      "grad_norm": 0.13795921206474304,
+      "learning_rate": 0.00014530694747360204,
+      "loss": 1.0776,
+      "step": 233
+    },
+    {
+      "epoch": 1.4649681528662422,
+      "grad_norm": 0.13771343231201172,
+      "learning_rate": 0.00014484571115484508,
+      "loss": 1.0517,
+      "step": 234
+    },
+    {
+      "epoch": 1.4713375796178343,
+      "grad_norm": 0.13231024146080017,
+      "learning_rate": 0.0001443832775097501,
+      "loss": 1.0776,
+      "step": 235
+    },
+    {
+      "epoch": 1.4777070063694269,
+      "grad_norm": 0.1319817453622818,
+      "learning_rate": 0.00014391965888473703,
+      "loss": 1.0494,
+      "step": 236
+    },
+    {
+      "epoch": 1.484076433121019,
+      "grad_norm": 0.13426139950752258,
+      "learning_rate": 0.0001434548676578634,
+      "loss": 1.001,
+      "step": 237
+    },
+    {
+      "epoch": 1.4904458598726116,
+      "grad_norm": 0.13087789714336395,
+      "learning_rate": 0.0001429889162384937,
+      "loss": 1.0588,
+      "step": 238
+    },
+    {
+      "epoch": 1.4968152866242037,
+      "grad_norm": 0.13652274012565613,
+      "learning_rate": 0.00014252181706696817,
+      "loss": 1.0124,
+      "step": 239
+    },
+    {
+      "epoch": 1.5031847133757963,
+      "grad_norm": 0.13933531939983368,
+      "learning_rate": 0.00014205358261427074,
+      "loss": 1.048,
+      "step": 240
+    },
+    {
+      "epoch": 1.5031847133757963,
+      "eval_loss": 0.9937697052955627,
+      "eval_runtime": 19.2892,
+      "eval_samples_per_second": 56.612,
+      "eval_steps_per_second": 0.933,
+      "step": 240
+    },
+    {
+      "epoch": 1.5095541401273884,
+      "grad_norm": 0.13970831036567688,
+      "learning_rate": 0.00014158422538169596,
+      "loss": 1.0433,
+      "step": 241
+    },
+    {
+      "epoch": 1.515923566878981,
+      "grad_norm": 0.13193373382091522,
+      "learning_rate": 0.0001411137579005151,
+      "loss": 1.0726,
+      "step": 242
+    },
+    {
+      "epoch": 1.5222929936305731,
+      "grad_norm": 0.14291027188301086,
+      "learning_rate": 0.0001406421927316419,
+      "loss": 1.0825,
+      "step": 243
+    },
+    {
+      "epoch": 1.5286624203821657,
+      "grad_norm": 0.14268159866333008,
+      "learning_rate": 0.00014016954246529696,
+      "loss": 1.0887,
+      "step": 244
+    },
+    {
+      "epoch": 1.5350318471337578,
+      "grad_norm": 0.13607299327850342,
+      "learning_rate": 0.00013969581972067164,
+      "loss": 1.0644,
+      "step": 245
+    },
+    {
+      "epoch": 1.5414012738853504,
+      "grad_norm": 0.13732877373695374,
+      "learning_rate": 0.0001392210371455913,
+      "loss": 1.0339,
+      "step": 246
+    },
+    {
+      "epoch": 1.5477707006369426,
+      "grad_norm": 0.13315479457378387,
+      "learning_rate": 0.00013874520741617735,
+      "loss": 1.0284,
+      "step": 247
+    },
+    {
+      "epoch": 1.5541401273885351,
+      "grad_norm": 0.13376399874687195,
+      "learning_rate": 0.000138268343236509,
+      "loss": 1.0279,
+      "step": 248
+    },
+    {
+      "epoch": 1.5605095541401273,
+      "grad_norm": 0.13698357343673706,
+      "learning_rate": 0.00013779045733828407,
+      "loss": 1.0884,
+      "step": 249
+    },
+    {
+      "epoch": 1.5668789808917198,
+      "grad_norm": 0.13575707376003265,
+      "learning_rate": 0.00013731156248047904,
+      "loss": 1.0383,
+      "step": 250
+    },
+    {
+      "epoch": 1.573248407643312,
+      "grad_norm": 0.14258643984794617,
+      "learning_rate": 0.00013683167144900834,
+      "loss": 1.0812,
+      "step": 251
+    },
+    {
+      "epoch": 1.5796178343949046,
+      "grad_norm": 0.1422533541917801,
+      "learning_rate": 0.00013635079705638298,
+      "loss": 1.0259,
+      "step": 252
+    },
+    {
+      "epoch": 1.5859872611464967,
+      "grad_norm": 0.13875292241573334,
+      "learning_rate": 0.00013586895214136874,
+      "loss": 1.0507,
+      "step": 253
+    },
+    {
+      "epoch": 1.5923566878980893,
+      "grad_norm": 0.1358788013458252,
+      "learning_rate": 0.00013538614956864296,
+      "loss": 1.066,
+      "step": 254
+    },
+    {
+      "epoch": 1.5987261146496814,
+      "grad_norm": 0.13774985074996948,
+      "learning_rate": 0.0001349024022284514,
+      "loss": 1.0485,
+      "step": 255
+    },
+    {
+      "epoch": 1.605095541401274,
+      "grad_norm": 0.13040746748447418,
+      "learning_rate": 0.00013441772303626387,
+      "loss": 1.0173,
+      "step": 256
+    },
+    {
+      "epoch": 1.611464968152866,
+      "grad_norm": 0.1312469244003296,
+      "learning_rate": 0.00013393212493242963,
+      "loss": 1.0489,
+      "step": 257
+    },
+    {
+      "epoch": 1.6178343949044587,
+      "grad_norm": 0.14885447919368744,
+      "learning_rate": 0.00013344562088183165,
+      "loss": 1.0403,
+      "step": 258
+    },
+    {
+      "epoch": 1.6242038216560508,
+      "grad_norm": 0.12916652858257294,
+      "learning_rate": 0.00013295822387354071,
+      "loss": 1.024,
+      "step": 259
+    },
+    {
+      "epoch": 1.6305732484076434,
+      "grad_norm": 0.14133484661579132,
+      "learning_rate": 0.00013246994692046836,
+      "loss": 1.0708,
+      "step": 260
+    },
+    {
+      "epoch": 1.6369426751592355,
+      "grad_norm": 0.1382388323545456,
+      "learning_rate": 0.0001319808030590197,
+      "loss": 1.0245,
+      "step": 261
+    },
+    {
+      "epoch": 1.643312101910828,
+      "grad_norm": 0.133922278881073,
+      "learning_rate": 0.0001314908053487452,
+      "loss": 1.0811,
+      "step": 262
+    },
+    {
+      "epoch": 1.6496815286624202,
+      "grad_norm": 0.13291525840759277,
+      "learning_rate": 0.00013099996687199203,
+      "loss": 1.0158,
+      "step": 263
+    },
+    {
+      "epoch": 1.6560509554140128,
+      "grad_norm": 0.13765017688274384,
+      "learning_rate": 0.00013050830073355488,
+      "loss": 1.051,
+      "step": 264
+    },
+    {
+      "epoch": 1.662420382165605,
+      "grad_norm": 0.13831576704978943,
+      "learning_rate": 0.000130015820060326,
+      "loss": 1.0277,
+      "step": 265
+    },
+    {
+      "epoch": 1.6687898089171975,
+      "grad_norm": 0.13770005106925964,
+      "learning_rate": 0.00012952253800094468,
+      "loss": 1.0982,
+      "step": 266
+    },
+    {
+      "epoch": 1.6751592356687897,
+      "grad_norm": 0.13507647812366486,
+      "learning_rate": 0.00012902846772544624,
+      "loss": 0.9961,
+      "step": 267
+    },
+    {
+      "epoch": 1.6815286624203822,
+      "grad_norm": 0.13365790247917175,
+      "learning_rate": 0.00012853362242491053,
+      "loss": 1.0191,
+      "step": 268
+    },
+    {
+      "epoch": 1.6878980891719744,
+      "grad_norm": 0.13307291269302368,
+      "learning_rate": 0.00012803801531110955,
+      "loss": 1.0709,
+      "step": 269
+    },
+    {
+      "epoch": 1.694267515923567,
+      "grad_norm": 0.1397312730550766,
+      "learning_rate": 0.0001275416596161548,
+      "loss": 1.0412,
+      "step": 270
+    },
+    {
+      "epoch": 1.700636942675159,
+      "grad_norm": 0.14555448293685913,
+      "learning_rate": 0.00012704456859214397,
+      "loss": 1.0675,
+      "step": 271
+    },
+    {
+      "epoch": 1.7070063694267517,
+      "grad_norm": 0.14011207222938538,
+      "learning_rate": 0.00012654675551080724,
+      "loss": 1.0632,
+      "step": 272
+    },
+    {
+      "epoch": 1.7133757961783438,
+      "grad_norm": 0.14120171964168549,
+      "learning_rate": 0.00012604823366315273,
+      "loss": 1.0307,
+      "step": 273
+    },
+    {
+      "epoch": 1.7197452229299364,
+      "grad_norm": 0.13699355721473694,
+      "learning_rate": 0.00012554901635911187,
+      "loss": 1.0482,
+      "step": 274
+    },
+    {
+      "epoch": 1.7261146496815285,
+      "grad_norm": 0.14194992184638977,
+      "learning_rate": 0.00012504911692718385,
+      "loss": 1.0944,
+      "step": 275
+    },
+    {
+      "epoch": 1.732484076433121,
+      "grad_norm": 0.13791659474372864,
+      "learning_rate": 0.00012454854871407994,
+      "loss": 1.0276,
+      "step": 276
+    },
+    {
+      "epoch": 1.7388535031847132,
+      "grad_norm": 0.1348312497138977,
+      "learning_rate": 0.0001240473250843669,
+      "loss": 1.0388,
+      "step": 277
+    },
+    {
+      "epoch": 1.7452229299363058,
+      "grad_norm": 0.13901084661483765,
+      "learning_rate": 0.00012354545942011057,
+      "loss": 1.0605,
+      "step": 278
+    },
+    {
+      "epoch": 1.7515923566878981,
+      "grad_norm": 0.13213810324668884,
+      "learning_rate": 0.00012304296512051814,
+      "loss": 1.0163,
+      "step": 279
+    },
+    {
+      "epoch": 1.7579617834394905,
+      "grad_norm": 0.13962963223457336,
+      "learning_rate": 0.00012253985560158062,
+      "loss": 1.0839,
+      "step": 280
+    },
+    {
+      "epoch": 1.7579617834394905,
+      "eval_loss": 0.9859278202056885,
+      "eval_runtime": 19.3296,
+      "eval_samples_per_second": 56.494,
+      "eval_steps_per_second": 0.931,
+      "step": 280
+    },
+    {
+      "epoch": 1.7643312101910829,
+      "grad_norm": 0.13637703657150269,
+      "learning_rate": 0.00012203614429571475,
+      "loss": 1.0466,
+      "step": 281
+    },
+    {
+      "epoch": 1.7707006369426752,
+      "grad_norm": 0.13617432117462158,
+      "learning_rate": 0.00012153184465140413,
+      "loss": 1.086,
+      "step": 282
+    },
+    {
+      "epoch": 1.7770700636942676,
+      "grad_norm": 0.1326834112405777,
+      "learning_rate": 0.00012102697013284034,
+      "loss": 1.0692,
+      "step": 283
+    },
+    {
+      "epoch": 1.78343949044586,
+      "grad_norm": 0.1359279453754425,
+      "learning_rate": 0.00012052153421956342,
+      "loss": 1.0337,
+      "step": 284
+    },
+    {
+      "epoch": 1.7898089171974523,
+      "grad_norm": 0.13754823803901672,
+      "learning_rate": 0.00012001555040610197,
+      "loss": 1.0377,
+      "step": 285
+    },
+    {
+      "epoch": 1.7961783439490446,
+      "grad_norm": 0.1381075084209442,
+      "learning_rate": 0.00011950903220161285,
+      "loss": 1.0311,
+      "step": 286
+    },
+    {
+      "epoch": 1.802547770700637,
+      "grad_norm": 0.14102081954479218,
+      "learning_rate": 0.00011900199312952047,
+      "loss": 1.0645,
+      "step": 287
+    },
+    {
+      "epoch": 1.8089171974522293,
+      "grad_norm": 0.14201205968856812,
+      "learning_rate": 0.00011849444672715586,
+      "loss": 1.043,
+      "step": 288
+    },
+    {
+      "epoch": 1.8152866242038217,
+      "grad_norm": 0.13808976113796234,
+      "learning_rate": 0.0001179864065453951,
+      "loss": 1.0263,
+      "step": 289
+    },
+    {
+      "epoch": 1.821656050955414,
+      "grad_norm": 0.13844111561775208,
+      "learning_rate": 0.00011747788614829758,
+      "loss": 1.0483,
+      "step": 290
+    },
+    {
+      "epoch": 1.8280254777070064,
+      "grad_norm": 0.13990408182144165,
+      "learning_rate": 0.00011696889911274393,
+      "loss": 1.0591,
+      "step": 291
+    },
+    {
+      "epoch": 1.8343949044585988,
+      "grad_norm": 0.14219453930854797,
+      "learning_rate": 0.00011645945902807341,
+      "loss": 1.0702,
+      "step": 292
+    },
+    {
+      "epoch": 1.8407643312101911,
+      "grad_norm": 0.13736915588378906,
+      "learning_rate": 0.0001159495794957211,
+      "loss": 1.0457,
+      "step": 293
+    },
+    {
+      "epoch": 1.8471337579617835,
+      "grad_norm": 0.1305588185787201,
+      "learning_rate": 0.00011543927412885489,
+      "loss": 1.0006,
+      "step": 294
+    },
+    {
+      "epoch": 1.8535031847133758,
+      "grad_norm": 0.1360785961151123,
+      "learning_rate": 0.0001149285565520119,
+      "loss": 1.0055,
+      "step": 295
+    },
+    {
+      "epoch": 1.8598726114649682,
+      "grad_norm": 0.13506443798542023,
+      "learning_rate": 0.00011441744040073468,
+      "loss": 1.0519,
+      "step": 296
+    },
+    {
+      "epoch": 1.8662420382165605,
+      "grad_norm": 0.1369323432445526,
+      "learning_rate": 0.0001139059393212074,
+      "loss": 1.0547,
+      "step": 297
+    },
+    {
+      "epoch": 1.872611464968153,
+      "grad_norm": 0.13333867490291595,
+      "learning_rate": 0.00011339406696989128,
+      "loss": 1.0537,
+      "step": 298
+    },
+    {
+      "epoch": 1.8789808917197452,
+      "grad_norm": 0.13622106611728668,
+      "learning_rate": 0.00011288183701315995,
+      "loss": 1.0517,
+      "step": 299
+    },
+    {
+      "epoch": 1.8853503184713376,
+      "grad_norm": 0.14023956656455994,
+      "learning_rate": 0.00011236926312693479,
+      "loss": 1.0559,
+      "step": 300
+    },
+    {
+      "epoch": 1.89171974522293,
+      "grad_norm": 0.1363711655139923,
+      "learning_rate": 0.00011185635899631963,
+      "loss": 1.0291,
+      "step": 301
+    },
+    {
+      "epoch": 1.8980891719745223,
+      "grad_norm": 0.13854017853736877,
+      "learning_rate": 0.00011134313831523547,
+      "loss": 1.0605,
+      "step": 302
+    },
+    {
+      "epoch": 1.9044585987261147,
+      "grad_norm": 0.13852174580097198,
+      "learning_rate": 0.00011082961478605475,
+      "loss": 1.0553,
+      "step": 303
+    },
+    {
+      "epoch": 1.910828025477707,
+      "grad_norm": 0.14662423729896545,
+      "learning_rate": 0.00011031580211923571,
+      "loss": 1.0619,
+      "step": 304
+    },
+    {
+      "epoch": 1.9171974522292994,
+      "grad_norm": 0.14127817749977112,
+      "learning_rate": 0.0001098017140329561,
+      "loss": 1.0698,
+      "step": 305
+    },
+    {
+      "epoch": 1.9235668789808917,
+      "grad_norm": 0.1394420862197876,
+      "learning_rate": 0.00010928736425274701,
+      "loss": 1.0433,
+      "step": 306
+    },
+    {
+      "epoch": 1.929936305732484,
+      "grad_norm": 0.1438218653202057,
+      "learning_rate": 0.00010877276651112662,
+      "loss": 1.0498,
+      "step": 307
+    },
+    {
+      "epoch": 1.9363057324840764,
+      "grad_norm": 0.1500382274389267,
+      "learning_rate": 0.00010825793454723325,
+      "loss": 1.0613,
+      "step": 308
+    },
+    {
+      "epoch": 1.9426751592356688,
+      "grad_norm": 0.14135834574699402,
+      "learning_rate": 0.00010774288210645862,
+      "loss": 1.0435,
+      "step": 309
+    },
+    {
+      "epoch": 1.9490445859872612,
+      "grad_norm": 0.1469028890132904,
+      "learning_rate": 0.00010722762294008106,
+      "loss": 1.0064,
+      "step": 310
+    },
+    {
+      "epoch": 1.9554140127388535,
+      "grad_norm": 0.14101552963256836,
+      "learning_rate": 0.00010671217080489814,
+      "loss": 1.0485,
+      "step": 311
+    },
+    {
+      "epoch": 1.9617834394904459,
+      "grad_norm": 0.1395803987979889,
+      "learning_rate": 0.00010619653946285947,
+      "loss": 1.0405,
+      "step": 312
+    },
+    {
+      "epoch": 1.9681528662420382,
+      "grad_norm": 0.1441717892885208,
+      "learning_rate": 0.00010568074268069928,
+      "loss": 1.0183,
+      "step": 313
+    },
+    {
+      "epoch": 1.9745222929936306,
+      "grad_norm": 0.14449232816696167,
+      "learning_rate": 0.00010516479422956882,
+      "loss": 1.035,
+      "step": 314
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 628,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 157,
+  "total_flos": 4.1002945779695616e+17,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-314/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3ec2d42f7297673d946070db22cc38c40ecdb7e5fb5b23a335c46b1268e0b80
+size 5816

checkpoint-471/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: openlm-research/open_llama_3b_v2
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-471/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openlm-research/open_llama_3b_v2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-471/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:baa55b451255c83d0891283afd3e6f477219e68ea384f95f826a0f641deb0764
+size 50899792

checkpoint-471/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70400a26bceece3d719fc45fa49ab8308aacf3e91467cfb7e2ad8f27c9326e80
+size 25871876

checkpoint-471/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5328f04f222a66b45931d6bc246721e0747decf9d78d167903d0547a248f78f0
+size 14244

checkpoint-471/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:093c4c8889f78cd7012edc9e826ff8c0bdb7f82e793242afd57bd6801520bc0c
+size 1064

checkpoint-471/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-471/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8
+size 511574

checkpoint-471/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-471/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3414 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.949044585987261,
+  "eval_steps": 40,
+  "global_step": 471,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006369426751592357,
+      "grad_norm": 0.1806156039237976,
+      "learning_rate": 1e-05,
+      "loss": 1.3031,
+      "step": 1
+    },
+    {
+      "epoch": 0.006369426751592357,
+      "eval_loss": 1.5003942251205444,
+      "eval_runtime": 19.6641,
+      "eval_samples_per_second": 55.533,
+      "eval_steps_per_second": 0.915,
+      "step": 1
+    },
+    {
+      "epoch": 0.012738853503184714,
+      "grad_norm": 0.1688886284828186,
+      "learning_rate": 2e-05,
+      "loss": 1.3305,
+      "step": 2
+    },
+    {
+      "epoch": 0.01910828025477707,
+      "grad_norm": 0.20123907923698425,
+      "learning_rate": 3e-05,
+      "loss": 1.324,
+      "step": 3
+    },
+    {
+      "epoch": 0.025477707006369428,
+      "grad_norm": 0.18879620730876923,
+      "learning_rate": 4e-05,
+      "loss": 1.3638,
+      "step": 4
+    },
+    {
+      "epoch": 0.03184713375796178,
+      "grad_norm": 0.20348915457725525,
+      "learning_rate": 5e-05,
+      "loss": 1.3686,
+      "step": 5
+    },
+    {
+      "epoch": 0.03821656050955414,
+      "grad_norm": 0.212239071726799,
+      "learning_rate": 6e-05,
+      "loss": 1.2865,
+      "step": 6
+    },
+    {
+      "epoch": 0.044585987261146494,
+      "grad_norm": 0.19280897080898285,
+      "learning_rate": 7e-05,
+      "loss": 1.313,
+      "step": 7
+    },
+    {
+      "epoch": 0.050955414012738856,
+      "grad_norm": 0.1767151653766632,
+      "learning_rate": 8e-05,
+      "loss": 1.3207,
+      "step": 8
+    },
+    {
+      "epoch": 0.05732484076433121,
+      "grad_norm": 0.20014327764511108,
+      "learning_rate": 9e-05,
+      "loss": 1.3143,
+      "step": 9
+    },
+    {
+      "epoch": 0.06369426751592357,
+      "grad_norm": 0.18035855889320374,
+      "learning_rate": 0.0001,
+      "loss": 1.252,
+      "step": 10
+    },
+    {
+      "epoch": 0.07006369426751592,
+      "grad_norm": 0.19993054866790771,
+      "learning_rate": 0.00011000000000000002,
+      "loss": 1.302,
+      "step": 11
+    },
+    {
+      "epoch": 0.07643312101910828,
+      "grad_norm": 0.18973341584205627,
+      "learning_rate": 0.00012,
+      "loss": 1.2608,
+      "step": 12
+    },
+    {
+      "epoch": 0.08280254777070063,
+      "grad_norm": 0.19669465720653534,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 1.2329,
+      "step": 13
+    },
+    {
+      "epoch": 0.08917197452229299,
+      "grad_norm": 0.1886417716741562,
+      "learning_rate": 0.00014,
+      "loss": 1.241,
+      "step": 14
+    },
+    {
+      "epoch": 0.09554140127388536,
+      "grad_norm": 0.19076582789421082,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.2539,
+      "step": 15
+    },
+    {
+      "epoch": 0.10191082802547771,
+      "grad_norm": 0.16027267277240753,
+      "learning_rate": 0.00016,
+      "loss": 1.2123,
+      "step": 16
+    },
+    {
+      "epoch": 0.10828025477707007,
+      "grad_norm": 0.16112814843654633,
+      "learning_rate": 0.00017,
+      "loss": 1.2465,
+      "step": 17
+    },
+    {
+      "epoch": 0.11464968152866242,
+      "grad_norm": 0.15539830923080444,
+      "learning_rate": 0.00018,
+      "loss": 1.1717,
+      "step": 18
+    },
+    {
+      "epoch": 0.12101910828025478,
+      "grad_norm": 0.15739695727825165,
+      "learning_rate": 0.00019,
+      "loss": 1.1412,
+      "step": 19
+    },
+    {
+      "epoch": 0.12738853503184713,
+      "grad_norm": 0.15658576786518097,
+      "learning_rate": 0.0002,
+      "loss": 1.1731,
+      "step": 20
+    },
+    {
+      "epoch": 0.1337579617834395,
+      "grad_norm": 0.1474328637123108,
+      "learning_rate": 0.00019999866506037345,
+      "loss": 1.2051,
+      "step": 21
+    },
+    {
+      "epoch": 0.14012738853503184,
+      "grad_norm": 0.11234907805919647,
+      "learning_rate": 0.00019999466027713507,
+      "loss": 1.1803,
+      "step": 22
+    },
+    {
+      "epoch": 0.1464968152866242,
+      "grad_norm": 0.1053839772939682,
+      "learning_rate": 0.00019998798575720776,
+      "loss": 1.1436,
+      "step": 23
+    },
+    {
+      "epoch": 0.15286624203821655,
+      "grad_norm": 0.1049942821264267,
+      "learning_rate": 0.00019997864167879312,
+      "loss": 1.1881,
+      "step": 24
+    },
+    {
+      "epoch": 0.1592356687898089,
+      "grad_norm": 0.11039146035909653,
+      "learning_rate": 0.00019996662829136676,
+      "loss": 1.1528,
+      "step": 25
+    },
+    {
+      "epoch": 0.16560509554140126,
+      "grad_norm": 0.09678228944540024,
+      "learning_rate": 0.0001999519459156716,
+      "loss": 1.1496,
+      "step": 26
+    },
+    {
+      "epoch": 0.17197452229299362,
+      "grad_norm": 0.09857058525085449,
+      "learning_rate": 0.0001999345949437094,
+      "loss": 1.1304,
+      "step": 27
+    },
+    {
+      "epoch": 0.17834394904458598,
+      "grad_norm": 0.10835567116737366,
+      "learning_rate": 0.0001999145758387301,
+      "loss": 1.2262,
+      "step": 28
+    },
+    {
+      "epoch": 0.18471337579617833,
+      "grad_norm": 0.09927600622177124,
+      "learning_rate": 0.0001998918891352197,
+      "loss": 1.1382,
+      "step": 29
+    },
+    {
+      "epoch": 0.1910828025477707,
+      "grad_norm": 0.09861327707767487,
+      "learning_rate": 0.00019986653543888568,
+      "loss": 1.1987,
+      "step": 30
+    },
+    {
+      "epoch": 0.19745222929936307,
+      "grad_norm": 0.09174010157585144,
+      "learning_rate": 0.00019983851542664126,
+      "loss": 1.127,
+      "step": 31
+    },
+    {
+      "epoch": 0.20382165605095542,
+      "grad_norm": 0.08863182365894318,
+      "learning_rate": 0.00019980782984658683,
+      "loss": 1.211,
+      "step": 32
+    },
+    {
+      "epoch": 0.21019108280254778,
+      "grad_norm": 0.08810263872146606,
+      "learning_rate": 0.00019977447951799034,
+      "loss": 1.1476,
+      "step": 33
+    },
+    {
+      "epoch": 0.21656050955414013,
+      "grad_norm": 0.08641776442527771,
+      "learning_rate": 0.00019973846533126533,
+      "loss": 1.1497,
+      "step": 34
+    },
+    {
+      "epoch": 0.2229299363057325,
+      "grad_norm": 0.09637051075696945,
+      "learning_rate": 0.00019969978824794707,
+      "loss": 1.1471,
+      "step": 35
+    },
+    {
+      "epoch": 0.22929936305732485,
+      "grad_norm": 0.09402573108673096,
+      "learning_rate": 0.000199658449300667,
+      "loss": 1.0976,
+      "step": 36
+    },
+    {
+      "epoch": 0.2356687898089172,
+      "grad_norm": 0.09077832847833633,
+      "learning_rate": 0.00019961444959312508,
+      "loss": 1.1119,
+      "step": 37
+    },
+    {
+      "epoch": 0.24203821656050956,
+      "grad_norm": 0.08864310383796692,
+      "learning_rate": 0.0001995677903000604,
+      "loss": 1.1157,
+      "step": 38
+    },
+    {
+      "epoch": 0.2484076433121019,
+      "grad_norm": 0.09867957979440689,
+      "learning_rate": 0.0001995184726672197,
+      "loss": 1.1656,
+      "step": 39
+    },
+    {
+      "epoch": 0.25477707006369427,
+      "grad_norm": 0.09343115240335464,
+      "learning_rate": 0.00019946649801132427,
+      "loss": 1.1084,
+      "step": 40
+    },
+    {
+      "epoch": 0.25477707006369427,
+      "eval_loss": 1.1224156618118286,
+      "eval_runtime": 19.2915,
+      "eval_samples_per_second": 56.605,
+      "eval_steps_per_second": 0.933,
+      "step": 40
+    },
+    {
+      "epoch": 0.2611464968152866,
+      "grad_norm": 0.09474795311689377,
+      "learning_rate": 0.00019941186772003464,
+      "loss": 1.1486,
+      "step": 41
+    },
+    {
+      "epoch": 0.267515923566879,
+      "grad_norm": 0.09726471453905106,
+      "learning_rate": 0.00019935458325191365,
+      "loss": 1.1499,
+      "step": 42
+    },
+    {
+      "epoch": 0.27388535031847133,
+      "grad_norm": 0.09273070096969604,
+      "learning_rate": 0.0001992946461363874,
+      "loss": 1.1361,
+      "step": 43
+    },
+    {
+      "epoch": 0.2802547770700637,
+      "grad_norm": 0.10344096273183823,
+      "learning_rate": 0.0001992320579737045,
+      "loss": 1.0999,
+      "step": 44
+    },
+    {
+      "epoch": 0.28662420382165604,
+      "grad_norm": 0.09499648213386536,
+      "learning_rate": 0.00019916682043489336,
+      "loss": 1.0919,
+      "step": 45
+    },
+    {
+      "epoch": 0.2929936305732484,
+      "grad_norm": 0.09483088552951813,
+      "learning_rate": 0.00019909893526171745,
+      "loss": 1.0992,
+      "step": 46
+    },
+    {
+      "epoch": 0.29936305732484075,
+      "grad_norm": 0.10382100939750671,
+      "learning_rate": 0.00019902840426662895,
+      "loss": 1.1093,
+      "step": 47
+    },
+    {
+      "epoch": 0.3057324840764331,
+      "grad_norm": 0.10187891870737076,
+      "learning_rate": 0.00019895522933272028,
+      "loss": 1.1063,
+      "step": 48
+    },
+    {
+      "epoch": 0.31210191082802546,
+      "grad_norm": 0.1022520437836647,
+      "learning_rate": 0.00019887941241367377,
+      "loss": 1.1095,
+      "step": 49
+    },
+    {
+      "epoch": 0.3184713375796178,
+      "grad_norm": 0.11470162868499756,
+      "learning_rate": 0.00019880095553370967,
+      "loss": 1.0859,
+      "step": 50
+    },
+    {
+      "epoch": 0.3248407643312102,
+      "grad_norm": 0.09845008701086044,
+      "learning_rate": 0.0001987198607875319,
+      "loss": 1.0941,
+      "step": 51
+    },
+    {
+      "epoch": 0.33121019108280253,
+      "grad_norm": 0.1080709770321846,
+      "learning_rate": 0.00019863613034027224,
+      "loss": 1.084,
+      "step": 52
+    },
+    {
+      "epoch": 0.3375796178343949,
+      "grad_norm": 0.11064234375953674,
+      "learning_rate": 0.0001985497664274326,
+      "loss": 1.1018,
+      "step": 53
+    },
+    {
+      "epoch": 0.34394904458598724,
+      "grad_norm": 0.10099776834249496,
+      "learning_rate": 0.0001984607713548251,
+      "loss": 1.0881,
+      "step": 54
+    },
+    {
+      "epoch": 0.3503184713375796,
+      "grad_norm": 0.11960357427597046,
+      "learning_rate": 0.0001983691474985108,
+      "loss": 1.0845,
+      "step": 55
+    },
+    {
+      "epoch": 0.35668789808917195,
+      "grad_norm": 0.10840114951133728,
+      "learning_rate": 0.00019827489730473596,
+      "loss": 1.131,
+      "step": 56
+    },
+    {
+      "epoch": 0.3630573248407643,
+      "grad_norm": 0.10177604109048843,
+      "learning_rate": 0.00019817802328986697,
+      "loss": 1.079,
+      "step": 57
+    },
+    {
+      "epoch": 0.36942675159235666,
+      "grad_norm": 0.11752859503030777,
+      "learning_rate": 0.00019807852804032305,
+      "loss": 1.0833,
+      "step": 58
+    },
+    {
+      "epoch": 0.37579617834394907,
+      "grad_norm": 0.11149834841489792,
+      "learning_rate": 0.00019797641421250725,
+      "loss": 1.1009,
+      "step": 59
+    },
+    {
+      "epoch": 0.3821656050955414,
+      "grad_norm": 0.10446681827306747,
+      "learning_rate": 0.00019787168453273544,
+      "loss": 1.1211,
+      "step": 60
+    },
+    {
+      "epoch": 0.3885350318471338,
+      "grad_norm": 0.12820479273796082,
+      "learning_rate": 0.00019776434179716366,
+      "loss": 1.1455,
+      "step": 61
+    },
+    {
+      "epoch": 0.39490445859872614,
+      "grad_norm": 0.10011500865221024,
+      "learning_rate": 0.00019765438887171327,
+      "loss": 1.0779,
+      "step": 62
+    },
+    {
+      "epoch": 0.4012738853503185,
+      "grad_norm": 0.11496227979660034,
+      "learning_rate": 0.0001975418286919947,
+      "loss": 1.1174,
+      "step": 63
+    },
+    {
+      "epoch": 0.40764331210191085,
+      "grad_norm": 0.10938404500484467,
+      "learning_rate": 0.00019742666426322876,
+      "loss": 1.0576,
+      "step": 64
+    },
+    {
+      "epoch": 0.4140127388535032,
+      "grad_norm": 0.12636032700538635,
+      "learning_rate": 0.0001973088986601667,
+      "loss": 1.083,
+      "step": 65
+    },
+    {
+      "epoch": 0.42038216560509556,
+      "grad_norm": 0.10620423406362534,
+      "learning_rate": 0.00019718853502700783,
+      "loss": 1.0728,
+      "step": 66
+    },
+    {
+      "epoch": 0.4267515923566879,
+      "grad_norm": 0.11206210404634476,
+      "learning_rate": 0.0001970655765773159,
+      "loss": 1.1107,
+      "step": 67
+    },
+    {
+      "epoch": 0.43312101910828027,
+      "grad_norm": 0.12613879144191742,
+      "learning_rate": 0.00019694002659393305,
+      "loss": 1.1065,
+      "step": 68
+    },
+    {
+      "epoch": 0.4394904458598726,
+      "grad_norm": 0.10636976361274719,
+      "learning_rate": 0.00019681188842889222,
+      "loss": 1.1192,
+      "step": 69
+    },
+    {
+      "epoch": 0.445859872611465,
+      "grad_norm": 0.11036239564418793,
+      "learning_rate": 0.00019668116550332766,
+      "loss": 1.1362,
+      "step": 70
+    },
+    {
+      "epoch": 0.45222929936305734,
+      "grad_norm": 0.11907072365283966,
+      "learning_rate": 0.0001965478613073837,
+      "loss": 1.1009,
+      "step": 71
+    },
+    {
+      "epoch": 0.4585987261146497,
+      "grad_norm": 0.11267364770174026,
+      "learning_rate": 0.00019641197940012137,
+      "loss": 1.0694,
+      "step": 72
+    },
+    {
+      "epoch": 0.46496815286624205,
+      "grad_norm": 0.10659351199865341,
+      "learning_rate": 0.00019627352340942353,
+      "loss": 1.0844,
+      "step": 73
+    },
+    {
+      "epoch": 0.4713375796178344,
+      "grad_norm": 0.12426211684942245,
+      "learning_rate": 0.00019613249703189796,
+      "loss": 1.1203,
+      "step": 74
+    },
+    {
+      "epoch": 0.47770700636942676,
+      "grad_norm": 0.11883872747421265,
+      "learning_rate": 0.00019598890403277864,
+      "loss": 1.0879,
+      "step": 75
+    },
+    {
+      "epoch": 0.4840764331210191,
+      "grad_norm": 0.11355262994766235,
+      "learning_rate": 0.0001958427482458253,
+      "loss": 1.1045,
+      "step": 76
+    },
+    {
+      "epoch": 0.49044585987261147,
+      "grad_norm": 0.11006154865026474,
+      "learning_rate": 0.0001956940335732209,
+      "loss": 1.1058,
+      "step": 77
+    },
+    {
+      "epoch": 0.4968152866242038,
+      "grad_norm": 0.11379122734069824,
+      "learning_rate": 0.00019554276398546768,
+      "loss": 1.1224,
+      "step": 78
+    },
+    {
+      "epoch": 0.5031847133757962,
+      "grad_norm": 0.11065732687711716,
+      "learning_rate": 0.000195388943521281,
+      "loss": 1.1033,
+      "step": 79
+    },
+    {
+      "epoch": 0.5095541401273885,
+      "grad_norm": 0.11113402247428894,
+      "learning_rate": 0.00019523257628748146,
+      "loss": 1.0912,
+      "step": 80
+    },
+    {
+      "epoch": 0.5095541401273885,
+      "eval_loss": 1.0586377382278442,
+      "eval_runtime": 19.2899,
+      "eval_samples_per_second": 56.61,
+      "eval_steps_per_second": 0.933,
+      "step": 80
+    },
+    {
+      "epoch": 0.5159235668789809,
+      "grad_norm": 0.11783529818058014,
+      "learning_rate": 0.00019507366645888543,
+      "loss": 1.0938,
+      "step": 81
+    },
+    {
+      "epoch": 0.5222929936305732,
+      "grad_norm": 0.12089723348617554,
+      "learning_rate": 0.00019491221827819347,
+      "loss": 1.1068,
+      "step": 82
+    },
+    {
+      "epoch": 0.5286624203821656,
+      "grad_norm": 0.10991813987493515,
+      "learning_rate": 0.00019474823605587703,
+      "loss": 1.1393,
+      "step": 83
+    },
+    {
+      "epoch": 0.535031847133758,
+      "grad_norm": 0.11100416630506516,
+      "learning_rate": 0.00019458172417006347,
+      "loss": 1.1081,
+      "step": 84
+    },
+    {
+      "epoch": 0.5414012738853503,
+      "grad_norm": 0.11886284500360489,
+      "learning_rate": 0.00019441268706641907,
+      "loss": 1.1168,
+      "step": 85
+    },
+    {
+      "epoch": 0.5477707006369427,
+      "grad_norm": 0.11771067976951599,
+      "learning_rate": 0.00019424112925803039,
+      "loss": 1.098,
+      "step": 86
+    },
+    {
+      "epoch": 0.554140127388535,
+      "grad_norm": 0.11022554337978363,
+      "learning_rate": 0.00019406705532528374,
+      "loss": 1.1179,
+      "step": 87
+    },
+    {
+      "epoch": 0.5605095541401274,
+      "grad_norm": 0.11891311407089233,
+      "learning_rate": 0.00019389046991574298,
+      "loss": 1.0866,
+      "step": 88
+    },
+    {
+      "epoch": 0.5668789808917197,
+      "grad_norm": 0.11594802141189575,
+      "learning_rate": 0.00019371137774402527,
+      "loss": 1.1146,
+      "step": 89
+    },
+    {
+      "epoch": 0.5732484076433121,
+      "grad_norm": 0.1181577518582344,
+      "learning_rate": 0.0001935297835916754,
+      "loss": 1.1213,
+      "step": 90
+    },
+    {
+      "epoch": 0.5796178343949044,
+      "grad_norm": 0.10821503400802612,
+      "learning_rate": 0.00019334569230703794,
+      "loss": 1.1121,
+      "step": 91
+    },
+    {
+      "epoch": 0.5859872611464968,
+      "grad_norm": 0.118013896048069,
+      "learning_rate": 0.0001931591088051279,
+      "loss": 1.117,
+      "step": 92
+    },
+    {
+      "epoch": 0.5923566878980892,
+      "grad_norm": 0.11678043752908707,
+      "learning_rate": 0.0001929700380674995,
+      "loss": 1.0974,
+      "step": 93
+    },
+    {
+      "epoch": 0.5987261146496815,
+      "grad_norm": 0.11073200404644012,
+      "learning_rate": 0.00019277848514211317,
+      "loss": 1.1059,
+      "step": 94
+    },
+    {
+      "epoch": 0.6050955414012739,
+      "grad_norm": 0.11440474539995193,
+      "learning_rate": 0.00019258445514320065,
+      "loss": 1.0913,
+      "step": 95
+    },
+    {
+      "epoch": 0.6114649681528662,
+      "grad_norm": 0.11020273715257645,
+      "learning_rate": 0.0001923879532511287,
+      "loss": 1.0836,
+      "step": 96
+    },
+    {
+      "epoch": 0.6178343949044586,
+      "grad_norm": 0.11285867542028427,
+      "learning_rate": 0.0001921889847122605,
+      "loss": 1.0842,
+      "step": 97
+    },
+    {
+      "epoch": 0.6242038216560509,
+      "grad_norm": 0.11981746554374695,
+      "learning_rate": 0.00019198755483881583,
+      "loss": 1.1062,
+      "step": 98
+    },
+    {
+      "epoch": 0.6305732484076433,
+      "grad_norm": 0.11882256716489792,
+      "learning_rate": 0.0001917836690087291,
+      "loss": 1.1012,
+      "step": 99
+    },
+    {
+      "epoch": 0.6369426751592356,
+      "grad_norm": 0.11642686277627945,
+      "learning_rate": 0.00019157733266550575,
+      "loss": 1.0823,
+      "step": 100
+    },
+    {
+      "epoch": 0.643312101910828,
+      "grad_norm": 0.11980683356523514,
+      "learning_rate": 0.00019136855131807705,
+      "loss": 1.105,
+      "step": 101
+    },
+    {
+      "epoch": 0.6496815286624203,
+      "grad_norm": 0.1147085651755333,
+      "learning_rate": 0.0001911573305406528,
+      "loss": 1.0794,
+      "step": 102
+    },
+    {
+      "epoch": 0.6560509554140127,
+      "grad_norm": 0.12037765234708786,
+      "learning_rate": 0.00019094367597257282,
+      "loss": 1.1059,
+      "step": 103
+    },
+    {
+      "epoch": 0.6624203821656051,
+      "grad_norm": 0.12135636061429977,
+      "learning_rate": 0.000190727593318156,
+      "loss": 1.118,
+      "step": 104
+    },
+    {
+      "epoch": 0.6687898089171974,
+      "grad_norm": 0.13285911083221436,
+      "learning_rate": 0.00019050908834654834,
+      "loss": 1.0817,
+      "step": 105
+    },
+    {
+      "epoch": 0.6751592356687898,
+      "grad_norm": 0.11360063403844833,
+      "learning_rate": 0.00019028816689156878,
+      "loss": 1.0711,
+      "step": 106
+    },
+    {
+      "epoch": 0.6815286624203821,
+      "grad_norm": 0.13178926706314087,
+      "learning_rate": 0.00019006483485155338,
+      "loss": 1.1266,
+      "step": 107
+    },
+    {
+      "epoch": 0.6878980891719745,
+      "grad_norm": 0.1290571093559265,
+      "learning_rate": 0.0001898390981891979,
+      "loss": 1.0776,
+      "step": 108
+    },
+    {
+      "epoch": 0.6942675159235668,
+      "grad_norm": 0.11376259475946426,
+      "learning_rate": 0.0001896109629313987,
+      "loss": 1.1026,
+      "step": 109
+    },
+    {
+      "epoch": 0.7006369426751592,
+      "grad_norm": 0.12076874077320099,
+      "learning_rate": 0.0001893804351690917,
+      "loss": 1.104,
+      "step": 110
+    },
+    {
+      "epoch": 0.7070063694267515,
+      "grad_norm": 0.12165362387895584,
+      "learning_rate": 0.0001891475210570898,
+      "loss": 1.0884,
+      "step": 111
+    },
+    {
+      "epoch": 0.7133757961783439,
+      "grad_norm": 0.10634943842887878,
+      "learning_rate": 0.00018891222681391851,
+      "loss": 1.0844,
+      "step": 112
+    },
+    {
+      "epoch": 0.7197452229299363,
+      "grad_norm": 0.11928383260965347,
+      "learning_rate": 0.00018867455872165008,
+      "loss": 1.1205,
+      "step": 113
+    },
+    {
+      "epoch": 0.7261146496815286,
+      "grad_norm": 0.1243489533662796,
+      "learning_rate": 0.00018843452312573554,
+      "loss": 1.0704,
+      "step": 114
+    },
+    {
+      "epoch": 0.732484076433121,
+      "grad_norm": 0.11439479887485504,
+      "learning_rate": 0.0001881921264348355,
+      "loss": 1.0809,
+      "step": 115
+    },
+    {
+      "epoch": 0.7388535031847133,
+      "grad_norm": 0.1184995099902153,
+      "learning_rate": 0.0001879473751206489,
+      "loss": 1.1619,
+      "step": 116
+    },
+    {
+      "epoch": 0.7452229299363057,
+      "grad_norm": 0.11846223473548889,
+      "learning_rate": 0.00018770027571774031,
+      "loss": 1.0835,
+      "step": 117
+    },
+    {
+      "epoch": 0.7515923566878981,
+      "grad_norm": 0.11566226184368134,
+      "learning_rate": 0.00018745083482336544,
+      "loss": 1.0658,
+      "step": 118
+    },
+    {
+      "epoch": 0.7579617834394905,
+      "grad_norm": 0.11553015559911728,
+      "learning_rate": 0.00018719905909729494,
+      "loss": 1.0773,
+      "step": 119
+    },
+    {
+      "epoch": 0.7643312101910829,
+      "grad_norm": 0.13605500757694244,
+      "learning_rate": 0.0001869449552616367,
+      "loss": 1.0727,
+      "step": 120
+    },
+    {
+      "epoch": 0.7643312101910829,
+      "eval_loss": 1.0301120281219482,
+      "eval_runtime": 19.2781,
+      "eval_samples_per_second": 56.645,
+      "eval_steps_per_second": 0.934,
+      "step": 120
+    },
+    {
+      "epoch": 0.7707006369426752,
+      "grad_norm": 0.1149601861834526,
+      "learning_rate": 0.00018668853010065634,
+      "loss": 1.0745,
+      "step": 121
+    },
+    {
+      "epoch": 0.7770700636942676,
+      "grad_norm": 0.11904130131006241,
+      "learning_rate": 0.00018642979046059593,
+      "loss": 1.0574,
+      "step": 122
+    },
+    {
+      "epoch": 0.7834394904458599,
+      "grad_norm": 0.11868870258331299,
+      "learning_rate": 0.00018616874324949159,
+      "loss": 1.0681,
+      "step": 123
+    },
+    {
+      "epoch": 0.7898089171974523,
+      "grad_norm": 0.11400648951530457,
+      "learning_rate": 0.00018590539543698854,
+      "loss": 1.0874,
+      "step": 124
+    },
+    {
+      "epoch": 0.7961783439490446,
+      "grad_norm": 0.12247481942176819,
+      "learning_rate": 0.0001856397540541554,
+      "loss": 1.0832,
+      "step": 125
+    },
+    {
+      "epoch": 0.802547770700637,
+      "grad_norm": 0.11855783313512802,
+      "learning_rate": 0.0001853718261932964,
+      "loss": 1.0775,
+      "step": 126
+    },
+    {
+      "epoch": 0.8089171974522293,
+      "grad_norm": 0.11434577405452728,
+      "learning_rate": 0.00018510161900776187,
+      "loss": 1.048,
+      "step": 127
+    },
+    {
+      "epoch": 0.8152866242038217,
+      "grad_norm": 0.12175115942955017,
+      "learning_rate": 0.00018482913971175737,
+      "loss": 1.0776,
+      "step": 128
+    },
+    {
+      "epoch": 0.821656050955414,
+      "grad_norm": 0.1237318217754364,
+      "learning_rate": 0.00018455439558015115,
+      "loss": 1.0977,
+      "step": 129
+    },
+    {
+      "epoch": 0.8280254777070064,
+      "grad_norm": 0.12041562050580978,
+      "learning_rate": 0.00018427739394827973,
+      "loss": 1.0477,
+      "step": 130
+    },
+    {
+      "epoch": 0.8343949044585988,
+      "grad_norm": 0.11855332553386688,
+      "learning_rate": 0.00018399814221175227,
+      "loss": 1.1026,
+      "step": 131
+    },
+    {
+      "epoch": 0.8407643312101911,
+      "grad_norm": 0.12020997703075409,
+      "learning_rate": 0.00018371664782625287,
+      "loss": 1.0484,
+      "step": 132
+    },
+    {
+      "epoch": 0.8471337579617835,
+      "grad_norm": 0.1116231232881546,
+      "learning_rate": 0.00018343291830734176,
+      "loss": 1.0772,
+      "step": 133
+    },
+    {
+      "epoch": 0.8535031847133758,
+      "grad_norm": 0.12280379235744476,
+      "learning_rate": 0.00018314696123025454,
+      "loss": 1.0829,
+      "step": 134
+    },
+    {
+      "epoch": 0.8598726114649682,
+      "grad_norm": 0.11589805781841278,
+      "learning_rate": 0.00018285878422969983,
+      "loss": 1.0636,
+      "step": 135
+    },
+    {
+      "epoch": 0.8662420382165605,
+      "grad_norm": 0.11667989194393158,
+      "learning_rate": 0.0001825683949996556,
+      "loss": 1.0783,
+      "step": 136
+    },
+    {
+      "epoch": 0.8726114649681529,
+      "grad_norm": 0.11666262894868851,
+      "learning_rate": 0.00018227580129316366,
+      "loss": 1.0587,
+      "step": 137
+    },
+    {
+      "epoch": 0.8789808917197452,
+      "grad_norm": 0.11791834235191345,
+      "learning_rate": 0.00018198101092212267,
+      "loss": 1.0955,
+      "step": 138
+    },
+    {
+      "epoch": 0.8853503184713376,
+      "grad_norm": 0.12023093551397324,
+      "learning_rate": 0.00018168403175707954,
+      "loss": 1.1133,
+      "step": 139
+    },
+    {
+      "epoch": 0.89171974522293,
+      "grad_norm": 0.12082846462726593,
+      "learning_rate": 0.0001813848717270195,
+      "loss": 1.1083,
+      "step": 140
+    },
+    {
+      "epoch": 0.8980891719745223,
+      "grad_norm": 0.1259888857603073,
+      "learning_rate": 0.00018108353881915402,
+      "loss": 1.0931,
+      "step": 141
+    },
+    {
+      "epoch": 0.9044585987261147,
+      "grad_norm": 0.11900565028190613,
+      "learning_rate": 0.00018078004107870797,
+      "loss": 1.0955,
+      "step": 142
+    },
+    {
+      "epoch": 0.910828025477707,
+      "grad_norm": 0.11422552168369293,
+      "learning_rate": 0.00018047438660870446,
+      "loss": 1.0473,
+      "step": 143
+    },
+    {
+      "epoch": 0.9171974522292994,
+      "grad_norm": 0.13001863658428192,
+      "learning_rate": 0.00018016658356974884,
+      "loss": 1.0273,
+      "step": 144
+    },
+    {
+      "epoch": 0.9235668789808917,
+      "grad_norm": 0.11941977590322495,
+      "learning_rate": 0.0001798566401798106,
+      "loss": 1.0774,
+      "step": 145
+    },
+    {
+      "epoch": 0.9299363057324841,
+      "grad_norm": 0.12032714486122131,
+      "learning_rate": 0.00017954456471400393,
+      "loss": 1.1162,
+      "step": 146
+    },
+    {
+      "epoch": 0.9363057324840764,
+      "grad_norm": 0.13784518837928772,
+      "learning_rate": 0.00017923036550436704,
+      "loss": 1.095,
+      "step": 147
+    },
+    {
+      "epoch": 0.9426751592356688,
+      "grad_norm": 0.12085068970918655,
+      "learning_rate": 0.00017891405093963938,
+      "loss": 1.1024,
+      "step": 148
+    },
+    {
+      "epoch": 0.9490445859872612,
+      "grad_norm": 0.11120469868183136,
+      "learning_rate": 0.00017859562946503788,
+      "loss": 1.0502,
+      "step": 149
+    },
+    {
+      "epoch": 0.9554140127388535,
+      "grad_norm": 0.1275676190853119,
+      "learning_rate": 0.00017827510958203147,
+      "loss": 1.0875,
+      "step": 150
+    },
+    {
+      "epoch": 0.9617834394904459,
+      "grad_norm": 0.13544359803199768,
+      "learning_rate": 0.00017795249984811396,
+      "loss": 1.0985,
+      "step": 151
+    },
+    {
+      "epoch": 0.9681528662420382,
+      "grad_norm": 0.11840228736400604,
+      "learning_rate": 0.00017762780887657574,
+      "loss": 1.059,
+      "step": 152
+    },
+    {
+      "epoch": 0.9745222929936306,
+      "grad_norm": 0.12622268497943878,
+      "learning_rate": 0.0001773010453362737,
+      "loss": 1.1034,
+      "step": 153
+    },
+    {
+      "epoch": 0.9808917197452229,
+      "grad_norm": 0.11485569179058075,
+      "learning_rate": 0.0001769722179513998,
+      "loss": 1.0639,
+      "step": 154
+    },
+    {
+      "epoch": 0.9872611464968153,
+      "grad_norm": 0.11948831379413605,
+      "learning_rate": 0.00017664133550124815,
+      "loss": 1.0635,
+      "step": 155
+    },
+    {
+      "epoch": 0.9936305732484076,
+      "grad_norm": 0.1214427575469017,
+      "learning_rate": 0.00017630840681998066,
+      "loss": 1.1361,
+      "step": 156
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.11713624000549316,
+      "learning_rate": 0.00017597344079639112,
+      "loss": 1.0619,
+      "step": 157
+    },
+    {
+      "epoch": 1.0063694267515924,
+      "grad_norm": 0.11573248356580734,
+      "learning_rate": 0.00017563644637366788,
+      "loss": 1.1141,
+      "step": 158
+    },
+    {
+      "epoch": 1.0127388535031847,
+      "grad_norm": 0.11592724919319153,
+      "learning_rate": 0.00017529743254915512,
+      "loss": 1.0569,
+      "step": 159
+    },
+    {
+      "epoch": 1.019108280254777,
+      "grad_norm": 0.12063013017177582,
+      "learning_rate": 0.0001749564083741126,
+      "loss": 1.0438,
+      "step": 160
+    },
+    {
+      "epoch": 1.019108280254777,
+      "eval_loss": 1.012627124786377,
+      "eval_runtime": 19.2811,
+      "eval_samples_per_second": 56.636,
+      "eval_steps_per_second": 0.934,
+      "step": 160
+    },
+    {
+      "epoch": 1.0254777070063694,
+      "grad_norm": 1.0452429056167603,
+      "learning_rate": 0.00017461338295347406,
+      "loss": 1.1776,
+      "step": 161
+    },
+    {
+      "epoch": 1.0063694267515924,
+      "grad_norm": 0.13684259355068207,
+      "learning_rate": 0.000174268365445604,
+      "loss": 1.0491,
+      "step": 162
+    },
+    {
+      "epoch": 1.0127388535031847,
+      "grad_norm": 0.13602878153324127,
+      "learning_rate": 0.0001739213650620533,
+      "loss": 1.0311,
+      "step": 163
+    },
+    {
+      "epoch": 1.019108280254777,
+      "grad_norm": 0.1506141573190689,
+      "learning_rate": 0.00017357239106731317,
+      "loss": 1.0233,
+      "step": 164
+    },
+    {
+      "epoch": 1.0254777070063694,
+      "grad_norm": 0.13961653411388397,
+      "learning_rate": 0.00017322145277856794,
+      "loss": 1.0515,
+      "step": 165
+    },
+    {
+      "epoch": 1.0318471337579618,
+      "grad_norm": 0.12875933945178986,
+      "learning_rate": 0.00017286855956544613,
+      "loss": 1.0643,
+      "step": 166
+    },
+    {
+      "epoch": 1.0382165605095541,
+      "grad_norm": 0.13839364051818848,
+      "learning_rate": 0.0001725137208497705,
+      "loss": 1.0853,
+      "step": 167
+    },
+    {
+      "epoch": 1.0445859872611465,
+      "grad_norm": 0.1401708722114563,
+      "learning_rate": 0.0001721569461053062,
+      "loss": 1.0608,
+      "step": 168
+    },
+    {
+      "epoch": 1.0509554140127388,
+      "grad_norm": 0.13666324317455292,
+      "learning_rate": 0.0001717982448575082,
+      "loss": 1.0186,
+      "step": 169
+    },
+    {
+      "epoch": 1.0573248407643312,
+      "grad_norm": 0.13511985540390015,
+      "learning_rate": 0.00017143762668326667,
+      "loss": 1.0775,
+      "step": 170
+    },
+    {
+      "epoch": 1.0636942675159236,
+      "grad_norm": 0.13401229679584503,
+      "learning_rate": 0.00017107510121065138,
+      "loss": 1.0587,
+      "step": 171
+    },
+    {
+      "epoch": 1.070063694267516,
+      "grad_norm": 0.13937029242515564,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 1.0655,
+      "step": 172
+    },
+    {
+      "epoch": 1.0764331210191083,
+      "grad_norm": 0.13978822529315948,
+      "learning_rate": 0.0001703443671369333,
+      "loss": 1.0417,
+      "step": 173
+    },
+    {
+      "epoch": 1.0828025477707006,
+      "grad_norm": 0.1328263282775879,
+      "learning_rate": 0.00016997617804554796,
+      "loss": 1.0609,
+      "step": 174
+    },
+    {
+      "epoch": 1.089171974522293,
+      "grad_norm": 0.13478587567806244,
+      "learning_rate": 0.00016960612067470288,
+      "loss": 1.0314,
+      "step": 175
+    },
+    {
+      "epoch": 1.0955414012738853,
+      "grad_norm": 0.12482774257659912,
+      "learning_rate": 0.00016923420490448296,
+      "loss": 1.0173,
+      "step": 176
+    },
+    {
+      "epoch": 1.1019108280254777,
+      "grad_norm": 0.12970109283924103,
+      "learning_rate": 0.0001688604406645903,
+      "loss": 1.0904,
+      "step": 177
+    },
+    {
+      "epoch": 1.10828025477707,
+      "grad_norm": 0.12363622337579727,
+      "learning_rate": 0.00016848483793407873,
+      "loss": 1.0434,
+      "step": 178
+    },
+    {
+      "epoch": 1.1146496815286624,
+      "grad_norm": 0.13114579021930695,
+      "learning_rate": 0.00016810740674108764,
+      "loss": 1.0456,
+      "step": 179
+    },
+    {
+      "epoch": 1.1210191082802548,
+      "grad_norm": 0.13814528286457062,
+      "learning_rate": 0.00016772815716257412,
+      "loss": 1.0845,
+      "step": 180
+    },
+    {
+      "epoch": 1.127388535031847,
+      "grad_norm": 0.12670482695102692,
+      "learning_rate": 0.00016734709932404403,
+      "loss": 1.0392,
+      "step": 181
+    },
+    {
+      "epoch": 1.1337579617834395,
+      "grad_norm": 0.13344614207744598,
+      "learning_rate": 0.00016696424339928152,
+      "loss": 1.0429,
+      "step": 182
+    },
+    {
+      "epoch": 1.1401273885350318,
+      "grad_norm": 0.14558671414852142,
+      "learning_rate": 0.00016657959961007747,
+      "loss": 1.0615,
+      "step": 183
+    },
+    {
+      "epoch": 1.1464968152866242,
+      "grad_norm": 0.13091522455215454,
+      "learning_rate": 0.00016619317822595667,
+      "loss": 1.0816,
+      "step": 184
+    },
+    {
+      "epoch": 1.1528662420382165,
+      "grad_norm": 0.1288042962551117,
+      "learning_rate": 0.00016580498956390342,
+      "loss": 1.0114,
+      "step": 185
+    },
+    {
+      "epoch": 1.1592356687898089,
+      "grad_norm": 0.12748295068740845,
+      "learning_rate": 0.00016541504398808631,
+      "loss": 1.096,
+      "step": 186
+    },
+    {
+      "epoch": 1.1656050955414012,
+      "grad_norm": 0.13045403361320496,
+      "learning_rate": 0.00016502335190958135,
+      "loss": 0.9977,
+      "step": 187
+    },
+    {
+      "epoch": 1.1719745222929936,
+      "grad_norm": 0.14281457662582397,
+      "learning_rate": 0.00016462992378609407,
+      "loss": 1.0434,
+      "step": 188
+    },
+    {
+      "epoch": 1.178343949044586,
+      "grad_norm": 0.1320338100194931,
+      "learning_rate": 0.00016423477012168038,
+      "loss": 1.0554,
+      "step": 189
+    },
+    {
+      "epoch": 1.1847133757961783,
+      "grad_norm": 0.12324702739715576,
+      "learning_rate": 0.00016383790146646588,
+      "loss": 1.0416,
+      "step": 190
+    },
+    {
+      "epoch": 1.1910828025477707,
+      "grad_norm": 0.1301770806312561,
+      "learning_rate": 0.00016343932841636456,
+      "loss": 1.0613,
+      "step": 191
+    },
+    {
+      "epoch": 1.197452229299363,
+      "grad_norm": 0.14009694755077362,
+      "learning_rate": 0.0001630390616127955,
+      "loss": 1.0139,
+      "step": 192
+    },
+    {
+      "epoch": 1.2038216560509554,
+      "grad_norm": 0.13656193017959595,
+      "learning_rate": 0.00016263711174239914,
+      "loss": 1.0632,
+      "step": 193
+    },
+    {
+      "epoch": 1.2101910828025477,
+      "grad_norm": 0.12946204841136932,
+      "learning_rate": 0.00016223348953675162,
+      "loss": 1.0458,
+      "step": 194
+    },
+    {
+      "epoch": 1.21656050955414,
+      "grad_norm": 0.1356847584247589,
+      "learning_rate": 0.00016182820577207842,
+      "loss": 1.0928,
+      "step": 195
+    },
+    {
+      "epoch": 1.2229299363057324,
+      "grad_norm": 0.1389479786157608,
+      "learning_rate": 0.0001614212712689668,
+      "loss": 1.0577,
+      "step": 196
+    },
+    {
+      "epoch": 1.2292993630573248,
+      "grad_norm": 0.1340690702199936,
+      "learning_rate": 0.00016101269689207655,
+      "loss": 1.0572,
+      "step": 197
+    },
+    {
+      "epoch": 1.2356687898089171,
+      "grad_norm": 0.13188521564006805,
+      "learning_rate": 0.00016060249354985025,
+      "loss": 1.0775,
+      "step": 198
+    },
+    {
+      "epoch": 1.2420382165605095,
+      "grad_norm": 0.12922795116901398,
+      "learning_rate": 0.00016019067219422178,
+      "loss": 1.0434,
+      "step": 199
+    },
+    {
+      "epoch": 1.2484076433121019,
+      "grad_norm": 0.12612590193748474,
+      "learning_rate": 0.0001597772438203241,
+      "loss": 1.0126,
+      "step": 200
+    },
+    {
+      "epoch": 1.2484076433121019,
+      "eval_loss": 1.00348961353302,
+      "eval_runtime": 19.3163,
+      "eval_samples_per_second": 56.533,
+      "eval_steps_per_second": 0.932,
+      "step": 200
+    },
+    {
+      "epoch": 1.2547770700636942,
+      "grad_norm": 0.1387277990579605,
+      "learning_rate": 0.0001593622194661956,
+      "loss": 1.0421,
+      "step": 201
+    },
+    {
+      "epoch": 1.2611464968152866,
+      "grad_norm": 0.13583126664161682,
+      "learning_rate": 0.00015894561021248535,
+      "loss": 1.0441,
+      "step": 202
+    },
+    {
+      "epoch": 1.267515923566879,
+      "grad_norm": 0.12996627390384674,
+      "learning_rate": 0.00015852742718215743,
+      "loss": 1.0342,
+      "step": 203
+    },
+    {
+      "epoch": 1.2738853503184713,
+      "grad_norm": 0.13653862476348877,
+      "learning_rate": 0.00015810768154019385,
+      "loss": 1.0108,
+      "step": 204
+    },
+    {
+      "epoch": 1.2802547770700636,
+      "grad_norm": 0.1289973258972168,
+      "learning_rate": 0.0001576863844932963,
+      "loss": 1.0523,
+      "step": 205
+    },
+    {
+      "epoch": 1.286624203821656,
+      "grad_norm": 0.13348506391048431,
+      "learning_rate": 0.00015726354728958736,
+      "loss": 1.0564,
+      "step": 206
+    },
+    {
+      "epoch": 1.2929936305732483,
+      "grad_norm": 0.12048185616731644,
+      "learning_rate": 0.0001568391812183097,
+      "loss": 1.0457,
+      "step": 207
+    },
+    {
+      "epoch": 1.2993630573248407,
+      "grad_norm": 0.12991134822368622,
+      "learning_rate": 0.00015641329760952513,
+      "loss": 1.05,
+      "step": 208
+    },
+    {
+      "epoch": 1.305732484076433,
+      "grad_norm": 0.13280436396598816,
+      "learning_rate": 0.00015598590783381163,
+      "loss": 1.0747,
+      "step": 209
+    },
+    {
+      "epoch": 1.3121019108280254,
+      "grad_norm": 0.13099676370620728,
+      "learning_rate": 0.00015555702330196023,
+      "loss": 1.0764,
+      "step": 210
+    },
+    {
+      "epoch": 1.3184713375796178,
+      "grad_norm": 0.1397230178117752,
+      "learning_rate": 0.00015512665546467007,
+      "loss": 1.0716,
+      "step": 211
+    },
+    {
+      "epoch": 1.3248407643312101,
+      "grad_norm": 0.13324333727359772,
+      "learning_rate": 0.00015469481581224272,
+      "loss": 1.0926,
+      "step": 212
+    },
+    {
+      "epoch": 1.3312101910828025,
+      "grad_norm": 0.1313484162092209,
+      "learning_rate": 0.00015426151587427547,
+      "loss": 1.0533,
+      "step": 213
+    },
+    {
+      "epoch": 1.3375796178343948,
+      "grad_norm": 0.1433049589395523,
+      "learning_rate": 0.00015382676721935345,
+      "loss": 1.055,
+      "step": 214
+    },
+    {
+      "epoch": 1.3439490445859872,
+      "grad_norm": 0.1309911012649536,
+      "learning_rate": 0.00015339058145474085,
+      "loss": 1.0536,
+      "step": 215
+    },
+    {
+      "epoch": 1.3503184713375795,
+      "grad_norm": 0.13482902944087982,
+      "learning_rate": 0.00015295297022607088,
+      "loss": 1.0176,
+      "step": 216
+    },
+    {
+      "epoch": 1.356687898089172,
+      "grad_norm": 0.13102853298187256,
+      "learning_rate": 0.00015251394521703494,
+      "loss": 1.0849,
+      "step": 217
+    },
+    {
+      "epoch": 1.3630573248407643,
+      "grad_norm": 0.13901150226593018,
+      "learning_rate": 0.00015207351814907068,
+      "loss": 1.0452,
+      "step": 218
+    },
+    {
+      "epoch": 1.3694267515923566,
+      "grad_norm": 0.13824929296970367,
+      "learning_rate": 0.000151631700781049,
+      "loss": 1.0083,
+      "step": 219
+    },
+    {
+      "epoch": 1.3757961783439492,
+      "grad_norm": 0.1309863179922104,
+      "learning_rate": 0.00015118850490896012,
+      "loss": 1.0517,
+      "step": 220
+    },
+    {
+      "epoch": 1.3821656050955413,
+      "grad_norm": 0.1359570473432541,
+      "learning_rate": 0.0001507439423655987,
+      "loss": 1.0452,
+      "step": 221
+    },
+    {
+      "epoch": 1.388535031847134,
+      "grad_norm": 0.13473795354366302,
+      "learning_rate": 0.00015029802502024788,
+      "loss": 1.0234,
+      "step": 222
+    },
+    {
+      "epoch": 1.394904458598726,
+      "grad_norm": 0.13787756860256195,
+      "learning_rate": 0.0001498507647783623,
+      "loss": 1.0811,
+      "step": 223
+    },
+    {
+      "epoch": 1.4012738853503186,
+      "grad_norm": 0.1334763914346695,
+      "learning_rate": 0.00014940217358125042,
+      "loss": 1.0363,
+      "step": 224
+    },
+    {
+      "epoch": 1.4076433121019107,
+      "grad_norm": 0.13535600900650024,
+      "learning_rate": 0.0001489522634057555,
+      "loss": 1.059,
+      "step": 225
+    },
+    {
+      "epoch": 1.4140127388535033,
+      "grad_norm": 0.1335124969482422,
+      "learning_rate": 0.00014850104626393598,
+      "loss": 1.0602,
+      "step": 226
+    },
+    {
+      "epoch": 1.4203821656050954,
+      "grad_norm": 0.13075490295886993,
+      "learning_rate": 0.00014804853420274472,
+      "loss": 1.0344,
+      "step": 227
+    },
+    {
+      "epoch": 1.426751592356688,
+      "grad_norm": 0.13887614011764526,
+      "learning_rate": 0.00014759473930370736,
+      "loss": 1.0728,
+      "step": 228
+    },
+    {
+      "epoch": 1.4331210191082802,
+      "grad_norm": 0.12808558344841003,
+      "learning_rate": 0.0001471396736825998,
+      "loss": 1.0158,
+      "step": 229
+    },
+    {
+      "epoch": 1.4394904458598727,
+      "grad_norm": 0.1339128464460373,
+      "learning_rate": 0.00014668334948912453,
+      "loss": 1.0647,
+      "step": 230
+    },
+    {
+      "epoch": 1.4458598726114649,
+      "grad_norm": 0.13178490102291107,
+      "learning_rate": 0.00014622577890658665,
+      "loss": 1.0684,
+      "step": 231
+    },
+    {
+      "epoch": 1.4522292993630574,
+      "grad_norm": 0.13547855615615845,
+      "learning_rate": 0.00014576697415156817,
+      "loss": 1.0712,
+      "step": 232
+    },
+    {
+      "epoch": 1.4585987261146496,
+      "grad_norm": 0.13795921206474304,
+      "learning_rate": 0.00014530694747360204,
+      "loss": 1.0776,
+      "step": 233
+    },
+    {
+      "epoch": 1.4649681528662422,
+      "grad_norm": 0.13771343231201172,
+      "learning_rate": 0.00014484571115484508,
+      "loss": 1.0517,
+      "step": 234
+    },
+    {
+      "epoch": 1.4713375796178343,
+      "grad_norm": 0.13231024146080017,
+      "learning_rate": 0.0001443832775097501,
+      "loss": 1.0776,
+      "step": 235
+    },
+    {
+      "epoch": 1.4777070063694269,
+      "grad_norm": 0.1319817453622818,
+      "learning_rate": 0.00014391965888473703,
+      "loss": 1.0494,
+      "step": 236
+    },
+    {
+      "epoch": 1.484076433121019,
+      "grad_norm": 0.13426139950752258,
+      "learning_rate": 0.0001434548676578634,
+      "loss": 1.001,
+      "step": 237
+    },
+    {
+      "epoch": 1.4904458598726116,
+      "grad_norm": 0.13087789714336395,
+      "learning_rate": 0.0001429889162384937,
+      "loss": 1.0588,
+      "step": 238
+    },
+    {
+      "epoch": 1.4968152866242037,
+      "grad_norm": 0.13652274012565613,
+      "learning_rate": 0.00014252181706696817,
+      "loss": 1.0124,
+      "step": 239
+    },
+    {
+      "epoch": 1.5031847133757963,
+      "grad_norm": 0.13933531939983368,
+      "learning_rate": 0.00014205358261427074,
+      "loss": 1.048,
+      "step": 240
+    },
+    {
+      "epoch": 1.5031847133757963,
+      "eval_loss": 0.9937697052955627,
+      "eval_runtime": 19.2892,
+      "eval_samples_per_second": 56.612,
+      "eval_steps_per_second": 0.933,
+      "step": 240
+    },
+    {
+      "epoch": 1.5095541401273884,
+      "grad_norm": 0.13970831036567688,
+      "learning_rate": 0.00014158422538169596,
+      "loss": 1.0433,
+      "step": 241
+    },
+    {
+      "epoch": 1.515923566878981,
+      "grad_norm": 0.13193373382091522,
+      "learning_rate": 0.0001411137579005151,
+      "loss": 1.0726,
+      "step": 242
+    },
+    {
+      "epoch": 1.5222929936305731,
+      "grad_norm": 0.14291027188301086,
+      "learning_rate": 0.0001406421927316419,
+      "loss": 1.0825,
+      "step": 243
+    },
+    {
+      "epoch": 1.5286624203821657,
+      "grad_norm": 0.14268159866333008,
+      "learning_rate": 0.00014016954246529696,
+      "loss": 1.0887,
+      "step": 244
+    },
+    {
+      "epoch": 1.5350318471337578,
+      "grad_norm": 0.13607299327850342,
+      "learning_rate": 0.00013969581972067164,
+      "loss": 1.0644,
+      "step": 245
+    },
+    {
+      "epoch": 1.5414012738853504,
+      "grad_norm": 0.13732877373695374,
+      "learning_rate": 0.0001392210371455913,
+      "loss": 1.0339,
+      "step": 246
+    },
+    {
+      "epoch": 1.5477707006369426,
+      "grad_norm": 0.13315479457378387,
+      "learning_rate": 0.00013874520741617735,
+      "loss": 1.0284,
+      "step": 247
+    },
+    {
+      "epoch": 1.5541401273885351,
+      "grad_norm": 0.13376399874687195,
+      "learning_rate": 0.000138268343236509,
+      "loss": 1.0279,
+      "step": 248
+    },
+    {
+      "epoch": 1.5605095541401273,
+      "grad_norm": 0.13698357343673706,
+      "learning_rate": 0.00013779045733828407,
+      "loss": 1.0884,
+      "step": 249
+    },
+    {
+      "epoch": 1.5668789808917198,
+      "grad_norm": 0.13575707376003265,
+      "learning_rate": 0.00013731156248047904,
+      "loss": 1.0383,
+      "step": 250
+    },
+    {
+      "epoch": 1.573248407643312,
+      "grad_norm": 0.14258643984794617,
+      "learning_rate": 0.00013683167144900834,
+      "loss": 1.0812,
+      "step": 251
+    },
+    {
+      "epoch": 1.5796178343949046,
+      "grad_norm": 0.1422533541917801,
+      "learning_rate": 0.00013635079705638298,
+      "loss": 1.0259,
+      "step": 252
+    },
+    {
+      "epoch": 1.5859872611464967,
+      "grad_norm": 0.13875292241573334,
+      "learning_rate": 0.00013586895214136874,
+      "loss": 1.0507,
+      "step": 253
+    },
+    {
+      "epoch": 1.5923566878980893,
+      "grad_norm": 0.1358788013458252,
+      "learning_rate": 0.00013538614956864296,
+      "loss": 1.066,
+      "step": 254
+    },
+    {
+      "epoch": 1.5987261146496814,
+      "grad_norm": 0.13774985074996948,
+      "learning_rate": 0.0001349024022284514,
+      "loss": 1.0485,
+      "step": 255
+    },
+    {
+      "epoch": 1.605095541401274,
+      "grad_norm": 0.13040746748447418,
+      "learning_rate": 0.00013441772303626387,
+      "loss": 1.0173,
+      "step": 256
+    },
+    {
+      "epoch": 1.611464968152866,
+      "grad_norm": 0.1312469244003296,
+      "learning_rate": 0.00013393212493242963,
+      "loss": 1.0489,
+      "step": 257
+    },
+    {
+      "epoch": 1.6178343949044587,
+      "grad_norm": 0.14885447919368744,
+      "learning_rate": 0.00013344562088183165,
+      "loss": 1.0403,
+      "step": 258
+    },
+    {
+      "epoch": 1.6242038216560508,
+      "grad_norm": 0.12916652858257294,
+      "learning_rate": 0.00013295822387354071,
+      "loss": 1.024,
+      "step": 259
+    },
+    {
+      "epoch": 1.6305732484076434,
+      "grad_norm": 0.14133484661579132,
+      "learning_rate": 0.00013246994692046836,
+      "loss": 1.0708,
+      "step": 260
+    },
+    {
+      "epoch": 1.6369426751592355,
+      "grad_norm": 0.1382388323545456,
+      "learning_rate": 0.0001319808030590197,
+      "loss": 1.0245,
+      "step": 261
+    },
+    {
+      "epoch": 1.643312101910828,
+      "grad_norm": 0.133922278881073,
+      "learning_rate": 0.0001314908053487452,
+      "loss": 1.0811,
+      "step": 262
+    },
+    {
+      "epoch": 1.6496815286624202,
+      "grad_norm": 0.13291525840759277,
+      "learning_rate": 0.00013099996687199203,
+      "loss": 1.0158,
+      "step": 263
+    },
+    {
+      "epoch": 1.6560509554140128,
+      "grad_norm": 0.13765017688274384,
+      "learning_rate": 0.00013050830073355488,
+      "loss": 1.051,
+      "step": 264
+    },
+    {
+      "epoch": 1.662420382165605,
+      "grad_norm": 0.13831576704978943,
+      "learning_rate": 0.000130015820060326,
+      "loss": 1.0277,
+      "step": 265
+    },
+    {
+      "epoch": 1.6687898089171975,
+      "grad_norm": 0.13770005106925964,
+      "learning_rate": 0.00012952253800094468,
+      "loss": 1.0982,
+      "step": 266
+    },
+    {
+      "epoch": 1.6751592356687897,
+      "grad_norm": 0.13507647812366486,
+      "learning_rate": 0.00012902846772544624,
+      "loss": 0.9961,
+      "step": 267
+    },
+    {
+      "epoch": 1.6815286624203822,
+      "grad_norm": 0.13365790247917175,
+      "learning_rate": 0.00012853362242491053,
+      "loss": 1.0191,
+      "step": 268
+    },
+    {
+      "epoch": 1.6878980891719744,
+      "grad_norm": 0.13307291269302368,
+      "learning_rate": 0.00012803801531110955,
+      "loss": 1.0709,
+      "step": 269
+    },
+    {
+      "epoch": 1.694267515923567,
+      "grad_norm": 0.1397312730550766,
+      "learning_rate": 0.0001275416596161548,
+      "loss": 1.0412,
+      "step": 270
+    },
+    {
+      "epoch": 1.700636942675159,
+      "grad_norm": 0.14555448293685913,
+      "learning_rate": 0.00012704456859214397,
+      "loss": 1.0675,
+      "step": 271
+    },
+    {
+      "epoch": 1.7070063694267517,
+      "grad_norm": 0.14011207222938538,
+      "learning_rate": 0.00012654675551080724,
+      "loss": 1.0632,
+      "step": 272
+    },
+    {
+      "epoch": 1.7133757961783438,
+      "grad_norm": 0.14120171964168549,
+      "learning_rate": 0.00012604823366315273,
+      "loss": 1.0307,
+      "step": 273
+    },
+    {
+      "epoch": 1.7197452229299364,
+      "grad_norm": 0.13699355721473694,
+      "learning_rate": 0.00012554901635911187,
+      "loss": 1.0482,
+      "step": 274
+    },
+    {
+      "epoch": 1.7261146496815285,
+      "grad_norm": 0.14194992184638977,
+      "learning_rate": 0.00012504911692718385,
+      "loss": 1.0944,
+      "step": 275
+    },
+    {
+      "epoch": 1.732484076433121,
+      "grad_norm": 0.13791659474372864,
+      "learning_rate": 0.00012454854871407994,
+      "loss": 1.0276,
+      "step": 276
+    },
+    {
+      "epoch": 1.7388535031847132,
+      "grad_norm": 0.1348312497138977,
+      "learning_rate": 0.0001240473250843669,
+      "loss": 1.0388,
+      "step": 277
+    },
+    {
+      "epoch": 1.7452229299363058,
+      "grad_norm": 0.13901084661483765,
+      "learning_rate": 0.00012354545942011057,
+      "loss": 1.0605,
+      "step": 278
+    },
+    {
+      "epoch": 1.7515923566878981,
+      "grad_norm": 0.13213810324668884,
+      "learning_rate": 0.00012304296512051814,
+      "loss": 1.0163,
+      "step": 279
+    },
+    {
+      "epoch": 1.7579617834394905,
+      "grad_norm": 0.13962963223457336,
+      "learning_rate": 0.00012253985560158062,
+      "loss": 1.0839,
+      "step": 280
+    },
+    {
+      "epoch": 1.7579617834394905,
+      "eval_loss": 0.9859278202056885,
+      "eval_runtime": 19.3296,
+      "eval_samples_per_second": 56.494,
+      "eval_steps_per_second": 0.931,
+      "step": 280
+    },
+    {
+      "epoch": 1.7643312101910829,
+      "grad_norm": 0.13637703657150269,
+      "learning_rate": 0.00012203614429571475,
+      "loss": 1.0466,
+      "step": 281
+    },
+    {
+      "epoch": 1.7707006369426752,
+      "grad_norm": 0.13617432117462158,
+      "learning_rate": 0.00012153184465140413,
+      "loss": 1.086,
+      "step": 282
+    },
+    {
+      "epoch": 1.7770700636942676,
+      "grad_norm": 0.1326834112405777,
+      "learning_rate": 0.00012102697013284034,
+      "loss": 1.0692,
+      "step": 283
+    },
+    {
+      "epoch": 1.78343949044586,
+      "grad_norm": 0.1359279453754425,
+      "learning_rate": 0.00012052153421956342,
+      "loss": 1.0337,
+      "step": 284
+    },
+    {
+      "epoch": 1.7898089171974523,
+      "grad_norm": 0.13754823803901672,
+      "learning_rate": 0.00012001555040610197,
+      "loss": 1.0377,
+      "step": 285
+    },
+    {
+      "epoch": 1.7961783439490446,
+      "grad_norm": 0.1381075084209442,
+      "learning_rate": 0.00011950903220161285,
+      "loss": 1.0311,
+      "step": 286
+    },
+    {
+      "epoch": 1.802547770700637,
+      "grad_norm": 0.14102081954479218,
+      "learning_rate": 0.00011900199312952047,
+      "loss": 1.0645,
+      "step": 287
+    },
+    {
+      "epoch": 1.8089171974522293,
+      "grad_norm": 0.14201205968856812,
+      "learning_rate": 0.00011849444672715586,
+      "loss": 1.043,
+      "step": 288
+    },
+    {
+      "epoch": 1.8152866242038217,
+      "grad_norm": 0.13808976113796234,
+      "learning_rate": 0.0001179864065453951,
+      "loss": 1.0263,
+      "step": 289
+    },
+    {
+      "epoch": 1.821656050955414,
+      "grad_norm": 0.13844111561775208,
+      "learning_rate": 0.00011747788614829758,
+      "loss": 1.0483,
+      "step": 290
+    },
+    {
+      "epoch": 1.8280254777070064,
+      "grad_norm": 0.13990408182144165,
+      "learning_rate": 0.00011696889911274393,
+      "loss": 1.0591,
+      "step": 291
+    },
+    {
+      "epoch": 1.8343949044585988,
+      "grad_norm": 0.14219453930854797,
+      "learning_rate": 0.00011645945902807341,
+      "loss": 1.0702,
+      "step": 292
+    },
+    {
+      "epoch": 1.8407643312101911,
+      "grad_norm": 0.13736915588378906,
+      "learning_rate": 0.0001159495794957211,
+      "loss": 1.0457,
+      "step": 293
+    },
+    {
+      "epoch": 1.8471337579617835,
+      "grad_norm": 0.1305588185787201,
+      "learning_rate": 0.00011543927412885489,
+      "loss": 1.0006,
+      "step": 294
+    },
+    {
+      "epoch": 1.8535031847133758,
+      "grad_norm": 0.1360785961151123,
+      "learning_rate": 0.0001149285565520119,
+      "loss": 1.0055,
+      "step": 295
+    },
+    {
+      "epoch": 1.8598726114649682,
+      "grad_norm": 0.13506443798542023,
+      "learning_rate": 0.00011441744040073468,
+      "loss": 1.0519,
+      "step": 296
+    },
+    {
+      "epoch": 1.8662420382165605,
+      "grad_norm": 0.1369323432445526,
+      "learning_rate": 0.0001139059393212074,
+      "loss": 1.0547,
+      "step": 297
+    },
+    {
+      "epoch": 1.872611464968153,
+      "grad_norm": 0.13333867490291595,
+      "learning_rate": 0.00011339406696989128,
+      "loss": 1.0537,
+      "step": 298
+    },
+    {
+      "epoch": 1.8789808917197452,
+      "grad_norm": 0.13622106611728668,
+      "learning_rate": 0.00011288183701315995,
+      "loss": 1.0517,
+      "step": 299
+    },
+    {
+      "epoch": 1.8853503184713376,
+      "grad_norm": 0.14023956656455994,
+      "learning_rate": 0.00011236926312693479,
+      "loss": 1.0559,
+      "step": 300
+    },
+    {
+      "epoch": 1.89171974522293,
+      "grad_norm": 0.1363711655139923,
+      "learning_rate": 0.00011185635899631963,
+      "loss": 1.0291,
+      "step": 301
+    },
+    {
+      "epoch": 1.8980891719745223,
+      "grad_norm": 0.13854017853736877,
+      "learning_rate": 0.00011134313831523547,
+      "loss": 1.0605,
+      "step": 302
+    },
+    {
+      "epoch": 1.9044585987261147,
+      "grad_norm": 0.13852174580097198,
+      "learning_rate": 0.00011082961478605475,
+      "loss": 1.0553,
+      "step": 303
+    },
+    {
+      "epoch": 1.910828025477707,
+      "grad_norm": 0.14662423729896545,
+      "learning_rate": 0.00011031580211923571,
+      "loss": 1.0619,
+      "step": 304
+    },
+    {
+      "epoch": 1.9171974522292994,
+      "grad_norm": 0.14127817749977112,
+      "learning_rate": 0.0001098017140329561,
+      "loss": 1.0698,
+      "step": 305
+    },
+    {
+      "epoch": 1.9235668789808917,
+      "grad_norm": 0.1394420862197876,
+      "learning_rate": 0.00010928736425274701,
+      "loss": 1.0433,
+      "step": 306
+    },
+    {
+      "epoch": 1.929936305732484,
+      "grad_norm": 0.1438218653202057,
+      "learning_rate": 0.00010877276651112662,
+      "loss": 1.0498,
+      "step": 307
+    },
+    {
+      "epoch": 1.9363057324840764,
+      "grad_norm": 0.1500382274389267,
+      "learning_rate": 0.00010825793454723325,
+      "loss": 1.0613,
+      "step": 308
+    },
+    {
+      "epoch": 1.9426751592356688,
+      "grad_norm": 0.14135834574699402,
+      "learning_rate": 0.00010774288210645862,
+      "loss": 1.0435,
+      "step": 309
+    },
+    {
+      "epoch": 1.9490445859872612,
+      "grad_norm": 0.1469028890132904,
+      "learning_rate": 0.00010722762294008106,
+      "loss": 1.0064,
+      "step": 310
+    },
+    {
+      "epoch": 1.9554140127388535,
+      "grad_norm": 0.14101552963256836,
+      "learning_rate": 0.00010671217080489814,
+      "loss": 1.0485,
+      "step": 311
+    },
+    {
+      "epoch": 1.9617834394904459,
+      "grad_norm": 0.1395803987979889,
+      "learning_rate": 0.00010619653946285947,
+      "loss": 1.0405,
+      "step": 312
+    },
+    {
+      "epoch": 1.9681528662420382,
+      "grad_norm": 0.1441717892885208,
+      "learning_rate": 0.00010568074268069928,
+      "loss": 1.0183,
+      "step": 313
+    },
+    {
+      "epoch": 1.9745222929936306,
+      "grad_norm": 0.14449232816696167,
+      "learning_rate": 0.00010516479422956882,
+      "loss": 1.035,
+      "step": 314
+    },
+    {
+      "epoch": 1.980891719745223,
+      "grad_norm": 0.14461494982242584,
+      "learning_rate": 0.00010464870788466873,
+      "loss": 1.0911,
+      "step": 315
+    },
+    {
+      "epoch": 1.9872611464968153,
+      "grad_norm": 0.13796767592430115,
+      "learning_rate": 0.00010413249742488131,
+      "loss": 1.0425,
+      "step": 316
+    },
+    {
+      "epoch": 1.9936305732484076,
+      "grad_norm": 0.1377556025981903,
+      "learning_rate": 0.00010361617663240253,
+      "loss": 1.0773,
+      "step": 317
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.14611703157424927,
+      "learning_rate": 0.00010309975929237408,
+      "loss": 1.0333,
+      "step": 318
+    },
+    {
+      "epoch": 2.0063694267515926,
+      "grad_norm": 0.1386556774377823,
+      "learning_rate": 0.00010258325919251537,
+      "loss": 0.9997,
+      "step": 319
+    },
+    {
+      "epoch": 2.0127388535031847,
+      "grad_norm": 0.13707421720027924,
+      "learning_rate": 0.00010206669012275545,
+      "loss": 1.0817,
+      "step": 320
+    },
+    {
+      "epoch": 2.0127388535031847,
+      "eval_loss": 0.9801168441772461,
+      "eval_runtime": 19.3185,
+      "eval_samples_per_second": 56.526,
+      "eval_steps_per_second": 0.932,
+      "step": 320
+    },
+    {
+      "epoch": 2.019108280254777,
+      "grad_norm": 0.13602325320243835,
+      "learning_rate": 0.00010155006587486469,
+      "loss": 1.0337,
+      "step": 321
+    },
+    {
+      "epoch": 2.0254777070063694,
+      "grad_norm": 0.6095888018608093,
+      "learning_rate": 0.00010103340024208674,
+      "loss": 1.1783,
+      "step": 322
+    },
+    {
+      "epoch": 2.0063694267515926,
+      "grad_norm": 0.1478217989206314,
+      "learning_rate": 0.00010051670701877012,
+      "loss": 1.0114,
+      "step": 323
+    },
+    {
+      "epoch": 2.0127388535031847,
+      "grad_norm": 0.1493089348077774,
+      "learning_rate": 0.0001,
+      "loss": 1.0167,
+      "step": 324
+    },
+    {
+      "epoch": 2.0191082802547773,
+      "grad_norm": 0.15172579884529114,
+      "learning_rate": 9.948329298122988e-05,
+      "loss": 1.0161,
+      "step": 325
+    },
+    {
+      "epoch": 2.0254777070063694,
+      "grad_norm": 0.14516572654247284,
+      "learning_rate": 9.89665997579133e-05,
+      "loss": 1.0375,
+      "step": 326
+    },
+    {
+      "epoch": 2.031847133757962,
+      "grad_norm": 0.15873479843139648,
+      "learning_rate": 9.844993412513532e-05,
+      "loss": 1.0214,
+      "step": 327
+    },
+    {
+      "epoch": 2.038216560509554,
+      "grad_norm": 0.1409904658794403,
+      "learning_rate": 9.793330987724459e-05,
+      "loss": 1.001,
+      "step": 328
+    },
+    {
+      "epoch": 2.0445859872611467,
+      "grad_norm": 0.15463833510875702,
+      "learning_rate": 9.741674080748464e-05,
+      "loss": 1.0325,
+      "step": 329
+    },
+    {
+      "epoch": 2.050955414012739,
+      "grad_norm": 0.15082892775535583,
+      "learning_rate": 9.690024070762596e-05,
+      "loss": 1.0292,
+      "step": 330
+    },
+    {
+      "epoch": 2.0573248407643314,
+      "grad_norm": 0.1486956775188446,
+      "learning_rate": 9.638382336759749e-05,
+      "loss": 0.9867,
+      "step": 331
+    },
+    {
+      "epoch": 2.0636942675159236,
+      "grad_norm": 0.14457155764102936,
+      "learning_rate": 9.586750257511867e-05,
+      "loss": 1.0255,
+      "step": 332
+    },
+    {
+      "epoch": 2.070063694267516,
+      "grad_norm": 0.15076091885566711,
+      "learning_rate": 9.535129211533129e-05,
+      "loss": 1.005,
+      "step": 333
+    },
+    {
+      "epoch": 2.0764331210191083,
+      "grad_norm": 0.1533278077840805,
+      "learning_rate": 9.483520577043121e-05,
+      "loss": 0.9923,
+      "step": 334
+    },
+    {
+      "epoch": 2.082802547770701,
+      "grad_norm": 0.14563091099262238,
+      "learning_rate": 9.431925731930078e-05,
+      "loss": 1.0548,
+      "step": 335
+    },
+    {
+      "epoch": 2.089171974522293,
+      "grad_norm": 0.14984974265098572,
+      "learning_rate": 9.380346053714055e-05,
+      "loss": 1.0417,
+      "step": 336
+    },
+    {
+      "epoch": 2.0955414012738856,
+      "grad_norm": 0.14393095672130585,
+      "learning_rate": 9.328782919510185e-05,
+      "loss": 1.0163,
+      "step": 337
+    },
+    {
+      "epoch": 2.1019108280254777,
+      "grad_norm": 0.14722150564193726,
+      "learning_rate": 9.277237705991894e-05,
+      "loss": 1.0418,
+      "step": 338
+    },
+    {
+      "epoch": 2.1082802547770703,
+      "grad_norm": 0.1614050716161728,
+      "learning_rate": 9.225711789354137e-05,
+      "loss": 0.9721,
+      "step": 339
+    },
+    {
+      "epoch": 2.1146496815286624,
+      "grad_norm": 0.15309081971645355,
+      "learning_rate": 9.174206545276677e-05,
+      "loss": 1.0198,
+      "step": 340
+    },
+    {
+      "epoch": 2.121019108280255,
+      "grad_norm": 0.1521766036748886,
+      "learning_rate": 9.122723348887339e-05,
+      "loss": 1.0196,
+      "step": 341
+    },
+    {
+      "epoch": 2.127388535031847,
+      "grad_norm": 0.15561290085315704,
+      "learning_rate": 9.0712635747253e-05,
+      "loss": 1.0537,
+      "step": 342
+    },
+    {
+      "epoch": 2.1337579617834397,
+      "grad_norm": 0.1435907781124115,
+      "learning_rate": 9.019828596704394e-05,
+      "loss": 0.9999,
+      "step": 343
+    },
+    {
+      "epoch": 2.140127388535032,
+      "grad_norm": 0.15161436796188354,
+      "learning_rate": 8.968419788076431e-05,
+      "loss": 1.0154,
+      "step": 344
+    },
+    {
+      "epoch": 2.1464968152866244,
+      "grad_norm": 0.14869481325149536,
+      "learning_rate": 8.917038521394526e-05,
+      "loss": 0.9847,
+      "step": 345
+    },
+    {
+      "epoch": 2.1528662420382165,
+      "grad_norm": 0.14515374600887299,
+      "learning_rate": 8.865686168476457e-05,
+      "loss": 1.0069,
+      "step": 346
+    },
+    {
+      "epoch": 2.159235668789809,
+      "grad_norm": 0.14634117484092712,
+      "learning_rate": 8.81436410036804e-05,
+      "loss": 1.008,
+      "step": 347
+    },
+    {
+      "epoch": 2.1656050955414012,
+      "grad_norm": 0.15083561837673187,
+      "learning_rate": 8.763073687306524e-05,
+      "loss": 0.9891,
+      "step": 348
+    },
+    {
+      "epoch": 2.171974522292994,
+      "grad_norm": 0.16083382070064545,
+      "learning_rate": 8.71181629868401e-05,
+      "loss": 1.0291,
+      "step": 349
+    },
+    {
+      "epoch": 2.178343949044586,
+      "grad_norm": 0.14988526701927185,
+      "learning_rate": 8.660593303010876e-05,
+      "loss": 1.0217,
+      "step": 350
+    },
+    {
+      "epoch": 2.1847133757961785,
+      "grad_norm": 0.15332889556884766,
+      "learning_rate": 8.609406067879258e-05,
+      "loss": 1.0502,
+      "step": 351
+    },
+    {
+      "epoch": 2.1910828025477707,
+      "grad_norm": 0.1509145349264145,
+      "learning_rate": 8.558255959926533e-05,
+      "loss": 1.0021,
+      "step": 352
+    },
+    {
+      "epoch": 2.1974522292993632,
+      "grad_norm": 0.14952103793621063,
+      "learning_rate": 8.507144344798814e-05,
+      "loss": 1.0052,
+      "step": 353
+    },
+    {
+      "epoch": 2.2038216560509554,
+      "grad_norm": 0.14787828922271729,
+      "learning_rate": 8.456072587114515e-05,
+      "loss": 0.929,
+      "step": 354
+    },
+    {
+      "epoch": 2.210191082802548,
+      "grad_norm": 0.1580924093723297,
+      "learning_rate": 8.405042050427891e-05,
+      "loss": 1.0071,
+      "step": 355
+    },
+    {
+      "epoch": 2.21656050955414,
+      "grad_norm": 0.15078537166118622,
+      "learning_rate": 8.35405409719266e-05,
+      "loss": 0.999,
+      "step": 356
+    },
+    {
+      "epoch": 2.2229299363057327,
+      "grad_norm": 0.1510494202375412,
+      "learning_rate": 8.303110088725608e-05,
+      "loss": 1.0222,
+      "step": 357
+    },
+    {
+      "epoch": 2.229299363057325,
+      "grad_norm": 0.15881799161434174,
+      "learning_rate": 8.252211385170242e-05,
+      "loss": 1.0053,
+      "step": 358
+    },
+    {
+      "epoch": 2.2356687898089174,
+      "grad_norm": 0.15945011377334595,
+      "learning_rate": 8.201359345460496e-05,
+      "loss": 1.0145,
+      "step": 359
+    },
+    {
+      "epoch": 2.2420382165605095,
+      "grad_norm": 0.15022100508213043,
+      "learning_rate": 8.150555327284417e-05,
+      "loss": 1.0115,
+      "step": 360
+    },
+    {
+      "epoch": 2.2420382165605095,
+      "eval_loss": 0.9787687659263611,
+      "eval_runtime": 19.3111,
+      "eval_samples_per_second": 56.548,
+      "eval_steps_per_second": 0.932,
+      "step": 360
+    },
+    {
+      "epoch": 2.248407643312102,
+      "grad_norm": 0.15681840479373932,
+      "learning_rate": 8.099800687047958e-05,
+      "loss": 1.0398,
+      "step": 361
+    },
+    {
+      "epoch": 2.254777070063694,
+      "grad_norm": 0.14988869428634644,
+      "learning_rate": 8.049096779838719e-05,
+      "loss": 0.9828,
+      "step": 362
+    },
+    {
+      "epoch": 2.261146496815287,
+      "grad_norm": 0.1477968841791153,
+      "learning_rate": 7.998444959389803e-05,
+      "loss": 1.0276,
+      "step": 363
+    },
+    {
+      "epoch": 2.267515923566879,
+      "grad_norm": 0.1529729813337326,
+      "learning_rate": 7.947846578043659e-05,
+      "loss": 1.0172,
+      "step": 364
+    },
+    {
+      "epoch": 2.2738853503184715,
+      "grad_norm": 0.15440192818641663,
+      "learning_rate": 7.897302986715967e-05,
+      "loss": 1.0387,
+      "step": 365
+    },
+    {
+      "epoch": 2.2802547770700636,
+      "grad_norm": 0.16899262368679047,
+      "learning_rate": 7.846815534859591e-05,
+      "loss": 1.0352,
+      "step": 366
+    },
+    {
+      "epoch": 2.286624203821656,
+      "grad_norm": 0.15240244567394257,
+      "learning_rate": 7.796385570428526e-05,
+      "loss": 1.0264,
+      "step": 367
+    },
+    {
+      "epoch": 2.2929936305732483,
+      "grad_norm": 0.15043969452381134,
+      "learning_rate": 7.74601443984194e-05,
+      "loss": 0.9838,
+      "step": 368
+    },
+    {
+      "epoch": 2.299363057324841,
+      "grad_norm": 0.16128422319889069,
+      "learning_rate": 7.695703487948189e-05,
+      "loss": 1.0317,
+      "step": 369
+    },
+    {
+      "epoch": 2.305732484076433,
+      "grad_norm": 0.159554123878479,
+      "learning_rate": 7.645454057988942e-05,
+      "loss": 1.0122,
+      "step": 370
+    },
+    {
+      "epoch": 2.3121019108280256,
+      "grad_norm": 0.15471342206001282,
+      "learning_rate": 7.59526749156331e-05,
+      "loss": 1.0077,
+      "step": 371
+    },
+    {
+      "epoch": 2.3184713375796178,
+      "grad_norm": 0.1584528684616089,
+      "learning_rate": 7.54514512859201e-05,
+      "loss": 1.0074,
+      "step": 372
+    },
+    {
+      "epoch": 2.3248407643312103,
+      "grad_norm": 0.16024437546730042,
+      "learning_rate": 7.495088307281618e-05,
+      "loss": 1.0066,
+      "step": 373
+    },
+    {
+      "epoch": 2.3312101910828025,
+      "grad_norm": 0.15269304811954498,
+      "learning_rate": 7.445098364088815e-05,
+      "loss": 1.0083,
+      "step": 374
+    },
+    {
+      "epoch": 2.337579617834395,
+      "grad_norm": 0.15843960642814636,
+      "learning_rate": 7.395176633684726e-05,
+      "loss": 0.982,
+      "step": 375
+    },
+    {
+      "epoch": 2.343949044585987,
+      "grad_norm": 0.155010387301445,
+      "learning_rate": 7.34532444891928e-05,
+      "loss": 1.019,
+      "step": 376
+    },
+    {
+      "epoch": 2.3503184713375798,
+      "grad_norm": 0.16114681959152222,
+      "learning_rate": 7.295543140785603e-05,
+      "loss": 1.0056,
+      "step": 377
+    },
+    {
+      "epoch": 2.356687898089172,
+      "grad_norm": 0.15911327302455902,
+      "learning_rate": 7.245834038384522e-05,
+      "loss": 1.0172,
+      "step": 378
+    },
+    {
+      "epoch": 2.3630573248407645,
+      "grad_norm": 0.1528182476758957,
+      "learning_rate": 7.196198468889047e-05,
+      "loss": 0.989,
+      "step": 379
+    },
+    {
+      "epoch": 2.3694267515923566,
+      "grad_norm": 0.1665172576904297,
+      "learning_rate": 7.146637757508949e-05,
+      "loss": 1.0249,
+      "step": 380
+    },
+    {
+      "epoch": 2.375796178343949,
+      "grad_norm": 0.16792528331279755,
+      "learning_rate": 7.097153227455379e-05,
+      "loss": 1.0017,
+      "step": 381
+    },
+    {
+      "epoch": 2.3821656050955413,
+      "grad_norm": 0.15666238963603973,
+      "learning_rate": 7.047746199905536e-05,
+      "loss": 1.021,
+      "step": 382
+    },
+    {
+      "epoch": 2.388535031847134,
+      "grad_norm": 0.15589690208435059,
+      "learning_rate": 6.998417993967403e-05,
+      "loss": 1.0325,
+      "step": 383
+    },
+    {
+      "epoch": 2.394904458598726,
+      "grad_norm": 0.15027858316898346,
+      "learning_rate": 6.949169926644514e-05,
+      "loss": 1.0215,
+      "step": 384
+    },
+    {
+      "epoch": 2.4012738853503186,
+      "grad_norm": 0.15848621726036072,
+      "learning_rate": 6.9000033128008e-05,
+      "loss": 1.0324,
+      "step": 385
+    },
+    {
+      "epoch": 2.4076433121019107,
+      "grad_norm": 0.15717780590057373,
+      "learning_rate": 6.850919465125482e-05,
+      "loss": 0.9591,
+      "step": 386
+    },
+    {
+      "epoch": 2.4140127388535033,
+      "grad_norm": 0.16189338266849518,
+      "learning_rate": 6.801919694098033e-05,
+      "loss": 0.9963,
+      "step": 387
+    },
+    {
+      "epoch": 2.4203821656050954,
+      "grad_norm": 0.16102810204029083,
+      "learning_rate": 6.753005307953167e-05,
+      "loss": 1.0205,
+      "step": 388
+    },
+    {
+      "epoch": 2.426751592356688,
+      "grad_norm": 0.15888215601444244,
+      "learning_rate": 6.70417761264593e-05,
+      "loss": 0.9904,
+      "step": 389
+    },
+    {
+      "epoch": 2.43312101910828,
+      "grad_norm": 0.16042260825634003,
+      "learning_rate": 6.655437911816838e-05,
+      "loss": 1.0394,
+      "step": 390
+    },
+    {
+      "epoch": 2.4394904458598727,
+      "grad_norm": 0.17097777128219604,
+      "learning_rate": 6.60678750675704e-05,
+      "loss": 1.0279,
+      "step": 391
+    },
+    {
+      "epoch": 2.445859872611465,
+      "grad_norm": 0.1708676666021347,
+      "learning_rate": 6.558227696373616e-05,
+      "loss": 0.9913,
+      "step": 392
+    },
+    {
+      "epoch": 2.4522292993630574,
+      "grad_norm": 0.16568711400032043,
+      "learning_rate": 6.509759777154864e-05,
+      "loss": 1.0058,
+      "step": 393
+    },
+    {
+      "epoch": 2.4585987261146496,
+      "grad_norm": 0.16431483626365662,
+      "learning_rate": 6.461385043135704e-05,
+      "loss": 1.0528,
+      "step": 394
+    },
+    {
+      "epoch": 2.464968152866242,
+      "grad_norm": 0.15568807721138,
+      "learning_rate": 6.413104785863128e-05,
+      "loss": 1.044,
+      "step": 395
+    },
+    {
+      "epoch": 2.4713375796178343,
+      "grad_norm": 0.1519811451435089,
+      "learning_rate": 6.3649202943617e-05,
+      "loss": 1.0269,
+      "step": 396
+    },
+    {
+      "epoch": 2.477707006369427,
+      "grad_norm": 0.159037247300148,
+      "learning_rate": 6.316832855099173e-05,
+      "loss": 1.0221,
+      "step": 397
+    },
+    {
+      "epoch": 2.484076433121019,
+      "grad_norm": 0.15528284013271332,
+      "learning_rate": 6.2688437519521e-05,
+      "loss": 0.985,
+      "step": 398
+    },
+    {
+      "epoch": 2.4904458598726116,
+      "grad_norm": 0.15442180633544922,
+      "learning_rate": 6.220954266171596e-05,
+      "loss": 1.0071,
+      "step": 399
+    },
+    {
+      "epoch": 2.4968152866242037,
+      "grad_norm": 0.15721610188484192,
+      "learning_rate": 6.173165676349103e-05,
+      "loss": 1.0356,
+      "step": 400
+    },
+    {
+      "epoch": 2.4968152866242037,
+      "eval_loss": 0.9729906916618347,
+      "eval_runtime": 19.5681,
+      "eval_samples_per_second": 55.805,
+      "eval_steps_per_second": 0.92,
+      "step": 400
+    },
+    {
+      "epoch": 2.5031847133757963,
+      "grad_norm": 0.16139522194862366,
+      "learning_rate": 6.125479258382268e-05,
+      "loss": 1.0247,
+      "step": 401
+    },
+    {
+      "epoch": 2.5095541401273884,
+      "grad_norm": 0.16305910050868988,
+      "learning_rate": 6.077896285440874e-05,
+      "loss": 1.0617,
+      "step": 402
+    },
+    {
+      "epoch": 2.515923566878981,
+      "grad_norm": 0.1648111492395401,
+      "learning_rate": 6.030418027932835e-05,
+      "loss": 1.0367,
+      "step": 403
+    },
+    {
+      "epoch": 2.522292993630573,
+      "grad_norm": 0.1614786684513092,
+      "learning_rate": 5.983045753470308e-05,
+      "loss": 1.0159,
+      "step": 404
+    },
+    {
+      "epoch": 2.5286624203821657,
+      "grad_norm": 0.15744082629680634,
+      "learning_rate": 5.935780726835811e-05,
+      "loss": 1.0317,
+      "step": 405
+    },
+    {
+      "epoch": 2.535031847133758,
+      "grad_norm": 0.16021069884300232,
+      "learning_rate": 5.888624209948495e-05,
+      "loss": 1.0468,
+      "step": 406
+    },
+    {
+      "epoch": 2.5414012738853504,
+      "grad_norm": 0.16327615082263947,
+      "learning_rate": 5.841577461830407e-05,
+      "loss": 1.0017,
+      "step": 407
+    },
+    {
+      "epoch": 2.5477707006369426,
+      "grad_norm": 0.16415022313594818,
+      "learning_rate": 5.794641738572925e-05,
+      "loss": 0.9745,
+      "step": 408
+    },
+    {
+      "epoch": 2.554140127388535,
+      "grad_norm": 0.16266576945781708,
+      "learning_rate": 5.747818293303184e-05,
+      "loss": 0.9881,
+      "step": 409
+    },
+    {
+      "epoch": 2.5605095541401273,
+      "grad_norm": 0.1699274182319641,
+      "learning_rate": 5.7011083761506344e-05,
+      "loss": 1.041,
+      "step": 410
+    },
+    {
+      "epoch": 2.56687898089172,
+      "grad_norm": 0.16479632258415222,
+      "learning_rate": 5.6545132342136634e-05,
+      "loss": 0.9983,
+      "step": 411
+    },
+    {
+      "epoch": 2.573248407643312,
+      "grad_norm": 0.1586018204689026,
+      "learning_rate": 5.608034111526298e-05,
+      "loss": 1.0322,
+      "step": 412
+    },
+    {
+      "epoch": 2.5796178343949046,
+      "grad_norm": 0.16549105942249298,
+      "learning_rate": 5.561672249024988e-05,
+      "loss": 0.9938,
+      "step": 413
+    },
+    {
+      "epoch": 2.5859872611464967,
+      "grad_norm": 0.16589821875095367,
+      "learning_rate": 5.515428884515494e-05,
+      "loss": 1.0039,
+      "step": 414
+    },
+    {
+      "epoch": 2.5923566878980893,
+      "grad_norm": 0.16915416717529297,
+      "learning_rate": 5.469305252639796e-05,
+      "loss": 1.0109,
+      "step": 415
+    },
+    {
+      "epoch": 2.5987261146496814,
+      "grad_norm": 0.1557624787092209,
+      "learning_rate": 5.423302584843186e-05,
+      "loss": 0.9918,
+      "step": 416
+    },
+    {
+      "epoch": 2.605095541401274,
+      "grad_norm": 0.16412273049354553,
+      "learning_rate": 5.377422109341332e-05,
+      "loss": 1.016,
+      "step": 417
+    },
+    {
+      "epoch": 2.611464968152866,
+      "grad_norm": 0.163206547498703,
+      "learning_rate": 5.331665051087549e-05,
+      "loss": 1.0528,
+      "step": 418
+    },
+    {
+      "epoch": 2.6178343949044587,
+      "grad_norm": 0.16450117528438568,
+      "learning_rate": 5.286032631740023e-05,
+      "loss": 1.0314,
+      "step": 419
+    },
+    {
+      "epoch": 2.624203821656051,
+      "grad_norm": 0.16499797999858856,
+      "learning_rate": 5.240526069629265e-05,
+      "loss": 0.985,
+      "step": 420
+    },
+    {
+      "epoch": 2.6305732484076434,
+      "grad_norm": 0.17124028503894806,
+      "learning_rate": 5.19514657972553e-05,
+      "loss": 0.9932,
+      "step": 421
+    },
+    {
+      "epoch": 2.6369426751592355,
+      "grad_norm": 0.1563197821378708,
+      "learning_rate": 5.149895373606405e-05,
+      "loss": 1.0221,
+      "step": 422
+    },
+    {
+      "epoch": 2.643312101910828,
+      "grad_norm": 0.16314244270324707,
+      "learning_rate": 5.104773659424453e-05,
+      "loss": 1.0274,
+      "step": 423
+    },
+    {
+      "epoch": 2.6496815286624202,
+      "grad_norm": 0.16842271387577057,
+      "learning_rate": 5.059782641874962e-05,
+      "loss": 0.9844,
+      "step": 424
+    },
+    {
+      "epoch": 2.656050955414013,
+      "grad_norm": 0.16713784635066986,
+      "learning_rate": 5.0149235221637724e-05,
+      "loss": 1.0368,
+      "step": 425
+    },
+    {
+      "epoch": 2.662420382165605,
+      "grad_norm": 0.16381680965423584,
+      "learning_rate": 4.970197497975216e-05,
+      "loss": 0.9824,
+      "step": 426
+    },
+    {
+      "epoch": 2.6687898089171975,
+      "grad_norm": 0.15927736461162567,
+      "learning_rate": 4.92560576344013e-05,
+      "loss": 1.0187,
+      "step": 427
+    },
+    {
+      "epoch": 2.6751592356687897,
+      "grad_norm": 0.17020700871944427,
+      "learning_rate": 4.8811495091039926e-05,
+      "loss": 1.0174,
+      "step": 428
+    },
+    {
+      "epoch": 2.6815286624203822,
+      "grad_norm": 0.1601068079471588,
+      "learning_rate": 4.836829921895103e-05,
+      "loss": 1.0177,
+      "step": 429
+    },
+    {
+      "epoch": 2.6878980891719744,
+      "grad_norm": 0.15823066234588623,
+      "learning_rate": 4.792648185092937e-05,
+      "loss": 1.0062,
+      "step": 430
+    },
+    {
+      "epoch": 2.694267515923567,
+      "grad_norm": 0.15921784937381744,
+      "learning_rate": 4.748605478296507e-05,
+      "loss": 1.0211,
+      "step": 431
+    },
+    {
+      "epoch": 2.700636942675159,
+      "grad_norm": 0.1611403077840805,
+      "learning_rate": 4.704702977392914e-05,
+      "loss": 0.9955,
+      "step": 432
+    },
+    {
+      "epoch": 2.7070063694267517,
+      "grad_norm": 0.16314539313316345,
+      "learning_rate": 4.660941854525917e-05,
+      "loss": 0.9749,
+      "step": 433
+    },
+    {
+      "epoch": 2.713375796178344,
+      "grad_norm": 0.161176398396492,
+      "learning_rate": 4.617323278064657e-05,
+      "loss": 1.0242,
+      "step": 434
+    },
+    {
+      "epoch": 2.7197452229299364,
+      "grad_norm": 0.16189491748809814,
+      "learning_rate": 4.573848412572458e-05,
+      "loss": 1.0461,
+      "step": 435
+    },
+    {
+      "epoch": 2.7261146496815285,
+      "grad_norm": 0.16615349054336548,
+      "learning_rate": 4.530518418775733e-05,
+      "loss": 1.0072,
+      "step": 436
+    },
+    {
+      "epoch": 2.732484076433121,
+      "grad_norm": 0.16004875302314758,
+      "learning_rate": 4.4873344535329976e-05,
+      "loss": 0.9512,
+      "step": 437
+    },
+    {
+      "epoch": 2.738853503184713,
+      "grad_norm": 0.16015282273292542,
+      "learning_rate": 4.444297669803981e-05,
+      "loss": 1.0274,
+      "step": 438
+    },
+    {
+      "epoch": 2.745222929936306,
+      "grad_norm": 0.16400328278541565,
+      "learning_rate": 4.401409216618837e-05,
+      "loss": 1.0202,
+      "step": 439
+    },
+    {
+      "epoch": 2.7515923566878984,
+      "grad_norm": 0.16385433077812195,
+      "learning_rate": 4.35867023904749e-05,
+      "loss": 0.992,
+      "step": 440
+    },
+    {
+      "epoch": 2.7515923566878984,
+      "eval_loss": 0.9724947810173035,
+      "eval_runtime": 19.3397,
+      "eval_samples_per_second": 56.464,
+      "eval_steps_per_second": 0.931,
+      "step": 440
+    },
+    {
+      "epoch": 2.7579617834394905,
+      "grad_norm": 0.15915533900260925,
+      "learning_rate": 4.316081878169028e-05,
+      "loss": 0.9942,
+      "step": 441
+    },
+    {
+      "epoch": 2.7643312101910826,
+      "grad_norm": 0.1640160232782364,
+      "learning_rate": 4.273645271041265e-05,
+      "loss": 1.0249,
+      "step": 442
+    },
+    {
+      "epoch": 2.770700636942675,
+      "grad_norm": 0.16595910489559174,
+      "learning_rate": 4.231361550670368e-05,
+      "loss": 0.9731,
+      "step": 443
+    },
+    {
+      "epoch": 2.777070063694268,
+      "grad_norm": 0.16304102540016174,
+      "learning_rate": 4.189231845980618e-05,
+      "loss": 0.9744,
+      "step": 444
+    },
+    {
+      "epoch": 2.78343949044586,
+      "grad_norm": 0.16614365577697754,
+      "learning_rate": 4.147257281784257e-05,
+      "loss": 1.0322,
+      "step": 445
+    },
+    {
+      "epoch": 2.789808917197452,
+      "grad_norm": 0.16803008317947388,
+      "learning_rate": 4.105438978751465e-05,
+      "loss": 1.0433,
+      "step": 446
+    },
+    {
+      "epoch": 2.7961783439490446,
+      "grad_norm": 0.16848289966583252,
+      "learning_rate": 4.063778053380446e-05,
+      "loss": 1.0024,
+      "step": 447
+    },
+    {
+      "epoch": 2.802547770700637,
+      "grad_norm": 0.1626000553369522,
+      "learning_rate": 4.022275617967591e-05,
+      "loss": 1.0139,
+      "step": 448
+    },
+    {
+      "epoch": 2.8089171974522293,
+      "grad_norm": 0.16323398053646088,
+      "learning_rate": 3.980932780577826e-05,
+      "loss": 0.9812,
+      "step": 449
+    },
+    {
+      "epoch": 2.8152866242038215,
+      "grad_norm": 0.16567878425121307,
+      "learning_rate": 3.939750645014977e-05,
+      "loss": 1.0059,
+      "step": 450
+    },
+    {
+      "epoch": 2.821656050955414,
+      "grad_norm": 0.16283658146858215,
+      "learning_rate": 3.8987303107923456e-05,
+      "loss": 0.975,
+      "step": 451
+    },
+    {
+      "epoch": 2.8280254777070066,
+      "grad_norm": 0.162727952003479,
+      "learning_rate": 3.857872873103322e-05,
+      "loss": 1.0087,
+      "step": 452
+    },
+    {
+      "epoch": 2.8343949044585988,
+      "grad_norm": 0.16250313818454742,
+      "learning_rate": 3.817179422792159e-05,
+      "loss": 1.0225,
+      "step": 453
+    },
+    {
+      "epoch": 2.840764331210191,
+      "grad_norm": 0.17140831053256989,
+      "learning_rate": 3.776651046324843e-05,
+      "loss": 1.0445,
+      "step": 454
+    },
+    {
+      "epoch": 2.8471337579617835,
+      "grad_norm": 0.16704216599464417,
+      "learning_rate": 3.7362888257600895e-05,
+      "loss": 1.0108,
+      "step": 455
+    },
+    {
+      "epoch": 2.853503184713376,
+      "grad_norm": 0.16444292664527893,
+      "learning_rate": 3.69609383872045e-05,
+      "loss": 1.0357,
+      "step": 456
+    },
+    {
+      "epoch": 2.859872611464968,
+      "grad_norm": 0.163473442196846,
+      "learning_rate": 3.6560671583635467e-05,
+      "loss": 1.0211,
+      "step": 457
+    },
+    {
+      "epoch": 2.8662420382165603,
+      "grad_norm": 0.1613229364156723,
+      "learning_rate": 3.616209853353409e-05,
+      "loss": 1.0095,
+      "step": 458
+    },
+    {
+      "epoch": 2.872611464968153,
+      "grad_norm": 0.16236041486263275,
+      "learning_rate": 3.576522987831965e-05,
+      "loss": 1.0443,
+      "step": 459
+    },
+    {
+      "epoch": 2.8789808917197455,
+      "grad_norm": 0.1598891168832779,
+      "learning_rate": 3.53700762139059e-05,
+      "loss": 1.0239,
+      "step": 460
+    },
+    {
+      "epoch": 2.8853503184713376,
+      "grad_norm": 0.159934863448143,
+      "learning_rate": 3.4976648090418685e-05,
+      "loss": 0.9633,
+      "step": 461
+    },
+    {
+      "epoch": 2.8917197452229297,
+      "grad_norm": 0.16620267927646637,
+      "learning_rate": 3.4584956011913696e-05,
+      "loss": 1.0087,
+      "step": 462
+    },
+    {
+      "epoch": 2.8980891719745223,
+      "grad_norm": 0.15960000455379486,
+      "learning_rate": 3.419501043609662e-05,
+      "loss": 1.0176,
+      "step": 463
+    },
+    {
+      "epoch": 2.904458598726115,
+      "grad_norm": 0.16119760274887085,
+      "learning_rate": 3.380682177404335e-05,
+      "loss": 0.9979,
+      "step": 464
+    },
+    {
+      "epoch": 2.910828025477707,
+      "grad_norm": 0.16766443848609924,
+      "learning_rate": 3.342040038992253e-05,
+      "loss": 1.0253,
+      "step": 465
+    },
+    {
+      "epoch": 2.917197452229299,
+      "grad_norm": 0.16117678582668304,
+      "learning_rate": 3.303575660071852e-05,
+      "loss": 0.9819,
+      "step": 466
+    },
+    {
+      "epoch": 2.9235668789808917,
+      "grad_norm": 0.16629359126091003,
+      "learning_rate": 3.2652900675956e-05,
+      "loss": 1.0317,
+      "step": 467
+    },
+    {
+      "epoch": 2.9299363057324843,
+      "grad_norm": 0.15698491036891937,
+      "learning_rate": 3.227184283742591e-05,
+      "loss": 1.0077,
+      "step": 468
+    },
+    {
+      "epoch": 2.9363057324840764,
+      "grad_norm": 0.16256022453308105,
+      "learning_rate": 3.1892593258912406e-05,
+      "loss": 1.0033,
+      "step": 469
+    },
+    {
+      "epoch": 2.9426751592356686,
+      "grad_norm": 0.16279557347297668,
+      "learning_rate": 3.1515162065921276e-05,
+      "loss": 1.0041,
+      "step": 470
+    },
+    {
+      "epoch": 2.949044585987261,
+      "grad_norm": 0.1671912968158722,
+      "learning_rate": 3.113955933540973e-05,
+      "loss": 1.02,
+      "step": 471
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 628,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 157,
+  "total_flos": 6.144816845642957e+17,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-471/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3ec2d42f7297673d946070db22cc38c40ecdb7e5fb5b23a335c46b1268e0b80
+size 5816

checkpoint-628/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: openlm-research/open_llama_3b_v2
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-628/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openlm-research/open_llama_3b_v2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-628/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66693d9662800d6ea9cbdbbe104239ccf19ffc74a7ae9d3c84ccf1eb4619e903
+size 50899792

checkpoint-628/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1888d6063aba805ea1e66fc00e3ee8489d1c9a9670a5f92a6ca9d9d01f2c797
+size 25871876

checkpoint-628/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9afbf853352cfbcfea61884ff6a2ddcd2aee1ce8618589cf5b56912c1b160011
+size 14244

checkpoint-628/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:27b5ff522ca195b4d045a3bbae9ec1a3713d4653eb5dd22b56677a79af604b26
+size 1064

checkpoint-628/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-628/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8
+size 511574

checkpoint-628/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-628/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-628/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3ec2d42f7297673d946070db22cc38c40ecdb7e5fb5b23a335c46b1268e0b80
+size 5816

config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "_name_or_path": "openlm-research/open_llama_3b_v2",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 3200,
+  "initializer_range": 0.02,
+  "intermediate_size": 8640,
+  "max_position_embeddings": 2048,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 26,
+  "num_key_value_heads": 32,
+  "pad_token_id": 0,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "_load_in_4bit": false,
+    "_load_in_8bit": true,
+    "bnb_4bit_compute_dtype": "float32",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "fp4",
+    "bnb_4bit_use_double_quant": false,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": false,
+    "load_in_8bit": true,
+    "quant_method": "bitsandbytes"
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.40.2",
+  "use_cache": false,
+  "vocab_size": 32000
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8
+size 511574