---
# Run configuration: experiment/dataset settings, output naming, LoRA
# options, model selection, task data, and algorithm training arguments.
# `${...}` values are interpolations (OmegaConf-style; `oc.env:` reads an
# environment variable) and are resolved by the config loader, not by YAML.
#
# NOTE(review): this file was restored from a copy whose line breaks and
# indentation had been collapsed. Nesting confirmed by the interpolation
# paths used in this file: model.*, task.name, algorithm.name,
# algorithm.training.{curriculum_schedule,max_steps,scheduler_params.*}.
# Nesting that is inferred and should be confirmed against the consumer:
#   - `output`, `lora`, `occupy_gpu_memory*`, `gpu_device` at top level
#     (they could instead belong under `experiment`);
#   - the first `training` / `inference` sections under `task`.

mode: train

experiment:
  dataset_size: 6000
  dataset_seed: 1234
  test_size: 0.1
  # The interpolation supplies `null` as the default for HF_TOKEN.
  hf_token: ${oc.env:HF_TOKEN,null}

output:
  # No default is supplied here, so ROOT_PATH presumably must be set.
  root_path: ${oc.env:ROOT_PATH}
  # Built from the model, task, algorithm, curriculum schedule, scheduler
  # parameters and max_steps defined further down in this file.
  run_name: ${model.trim}_${task.name}_${algorithm.name}_${algorithm.training.curriculum_schedule}_${algorithm.training.scheduler_params.mu_exp}_${algorithm.training.scheduler_params.sigma}_${algorithm.training.scheduler_params.min_prob}_${algorithm.training.max_steps}

lora:
  r: 32
  alpha: 64
  dropout: 0.1
  target_modules:
    - q_proj
    - v_proj
  task_type: CAUSAL_LM

occupy_gpu_memory: false
occupy_gpu_memory_gb: 50
gpu_device: cuda:0

model:
  family: meta-llama
  trim: Llama-3.2-3B-Instruct
  # Resolves to "<family>/<trim>".
  name: ${model.family}/${model.trim}
  trust_remote_code: true
  torch_dtype: bfloat16
  attn_implementation: flash_attention_2

task:
  name: blocksworld1246
  data_files:
    - data/blocksworld/train_set-1-complete-correct.json
    - data/blocksworld/train_set-2-complete-correct.json
    - data/blocksworld/train_set-4-complete-correct.json
    - data/blocksworld/train_set-6-complete-correct.json
  icl_examples_file: data/blocksworld/train_set-2-more_with_trace.json
  use_icl_examples: false
  training:
    max_prompt_length: 1600
    max_completion_length: 512
  inference:
    # NOTE(review): 1200 is larger than algorithm.training.max_steps (300)
    # in this file; confirm this checkpoint refers to a different run.
    checkpoint: 1200
    steps: 4
    temperature: 0.0
    sc_num: 1
    use_icl: false
    icl_num: 2
    prompt_path: prompts/blocksworld/pool_prompt_v1.json
    # `{steps}` is a literal placeholder in the path, presumably filled in
    # by the consuming code (it is not a `${...}` interpolation).
    data_path: data/blocksworld/split_v1/split_v1_step_{steps}_data.json
    config_file: data/blocksworld/bw_config.yaml
    domain_file: data/blocksworld/generated_domain.pddl
    pass_at_k: 1
    num_shot: 4
    resume: 0
    max_new_tokens: 512
    max_batch_size: 64

algorithm:
  name: sgrpo
  training:
    learning_rate: 1.0e-06
    lr_scheduler_type: cosine
    logging_steps: 10
    max_steps: 300
    curriculum: true
    curriculum_schedule: classic
    scheduler_params:
      mu_exp: 0.5
      sigma: 0.5
      min_prob: true
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 4
    gradient_checkpointing: true
    bf16: true
    num_generations: 8
    beta: 0.001
    use_vllm: true
    vllm_gpu_memory_utilization: 0.5
    report_to:
      - wandb
    # NOTE(review): experiment.hf_token defaults to null when HF_TOKEN is
    # unset; confirm pushing to the hub works without it.
    push_to_hub: true
    save_strategy: steps
    # Tied to max_steps, so the save interval equals the full run length.
    save_steps: ${algorithm.training.max_steps}
    eval_strategy: steps
    tf32: true