{ "adapter_path": "jumbo_adapters", "batch_size": 16, "config": null, "data": "data/", "fine_tune_type": "lora", "grad_checkpoint": true, "iters": 3000, "learning_rate": 5e-05, "lora_parameters": { "keys": [ "mlp.gate_proj", "mlp.down_proj", "self_attn.q_proj", "mlp.up_proj", "self_attn.v_proj", "self_attn.k_proj" ], "rank": 64, "alpha": 64, "dropout": 0.1, "scale": 16.0 }, "lr_schedule": { "name": "cosine_decay", "warmup": 500, "warmup_init": 1e-07, "arguments": [ 1e-05, 500, 1e-07 ] }, "max_seq_length": 512, "model": "Qwen/Qwen2.5-3B", "num_layers": 36, "resume_adapter_file": null, "save_every": 50, "seed": 24, "steps_per_eval": 50, "steps_per_report": 10, "test": true, "test_batches": 200, "testfile": "test.jsonl", "train": true, "val_batches": 50 }