- mode=train | |
- task=countdown2345 | |
- algorithm=grpo | |
- algorithm.training.curriculum_schedule=classic | |
- model=llama | |
- algorithm.training.per_device_train_batch_size=2 | |
- algorithm.training.scheduler_params.mu_exp=0.5 | |
- algorithm.training.scheduler_params.sigma=0.5 | |
- algorithm.training.max_steps=1600 | |