# Model architecture and parallel layout for the model class named by `target`
# at the end of this file.
# NOTE(review): the meaning of individual keys is inferred from their names;
# confirm against the code that consumes this config.
# Both restore keys are null, so no checkpoint is selected through them here.
restore_from_path: null
restore_from_ckpt: null
mcore_gpt: true
# Batch sizes and model-parallel degrees. context_parallel_size (2) is set
# separately near the end of this file.
micro_batch_size: 1
global_batch_size: 288
tensor_model_parallel_size: 8
pipeline_model_parallel_size: 18
virtual_pipeline_model_parallel_size: null
# 17408 = 16384 + 1024, i.e. grpo.length_params.max_length plus
# grpo.inference_backend.max_input_len; data.seq_length uses the same value.
# Presumably these must be changed together — TODO confirm.
encoder_seq_length: 17408
max_position_embeddings: 17408
# 162 is evenly divisible by pipeline_model_parallel_size (18), presumably
# giving 9 layers per pipeline stage — confirm.
num_layers: 162
hidden_size: 16384
# NOTE(review): heterogeneous_layers_config_path is set later in this file, so
# per-layer sizes may come from that config instead of this value — confirm.
ffn_hidden_size: 11008
num_attention_heads: 128
# Weight initialization.
init_method_std: 0.02
use_scaled_init_method: true
# All three dropout rates are 0.0.
hidden_dropout: 0.0
attention_dropout: 0.0
ffn_dropout: 0.0
kv_channels: null
apply_query_key_layer_scaling: true
# Normalization settings.
normalization: rmsnorm
layernorm_epsilon: 1.0e-05
do_layer_norm_weight_decay: false
make_vocab_size_divisible_by: 128
pre_process: true
post_process: true
persist_layer_norm: true
bias: false
activation: fast-swiglu
headscale: false
transformer_block_type: pre_ln
openai_gelu: false
normalize_attention_scores: true
# Rotary position embeddings; rotary_base is set later in this file.
position_embedding_type: rope
rotary_percentage: 1.0
attention_type: multihead
share_embeddings_and_output_weights: false
# Pipeline point-to-point communication toggles.
overlap_p2p_comm: false
batch_p2p_comm: true
# 128 attention heads / 8 query groups = 16 heads per group, assuming this key
# selects grouped-query attention — confirm (attention_type is `multihead`).
num_query_groups: 8
# Tokenizer selection. `library` is `huggingface`; `type` is presumably a
# Hugging Face model identifier resolved by that library — confirm.
tokenizer:
  library: huggingface
  type: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
  use_fast: true
# Mixed-precision, fusion, activation-checkpointing and FP8 settings.
# NOTE(review): key meanings are inferred from names; confirm with the
# consuming code.
# native_amp_init_scale is 2**32. NOTE(review): `precision` is bf16 later in
# this file, so these loss-scaling keys may be unused — confirm.
native_amp_init_scale: 4294967296
native_amp_growth_interval: 1000
hysteresis: 2
fp32_residual_connection: false
fp16_lm_cross_entropy: false
megatron_amp_O2: true
# Gradient all-reduce options.
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: true
# Fusion toggles: only masked_softmax_fusion is enabled; the gradient
# accumulation, bias-activation, bias-dropout-add and RoPE fusions are off.
gradient_accumulation_fusion: false
bias_activation_fusion: false
bias_dropout_add_fusion: false
masked_softmax_fusion: true
get_attention_mask_from_fusion: true
apply_rope_fusion: false
seed: 1234
resume_from_checkpoint: null
use_cpu_initialization: false
onnx_safe: false
apex_transformer_log_level: 30
gradient_as_bucket_view: true
sync_batch_comm: false
# Activation checkpointing: `full` granularity, `uniform` method, 1 layer.
activations_checkpoint_granularity: full
activations_checkpoint_method: uniform
activations_checkpoint_num_layers: 1
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: true
transformer_engine: true
# fp8 is false; the remaining fp8_* keys are presumably inert while it stays
# false — confirm before relying on them.
fp8: false
fp8_e4m3: false
fp8_hybrid: true
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1024
fp8_amax_compute_algo: max
reduce_amax: true
use_emha: false
# Dataset configuration. Inputs are JSONL files (data_impl: jsonl) referenced
# by absolute paths, so this stanza is specific to the filesystem it was
# written for.
data:
  index_mapping_dir: null
  data_impl: jsonl
  splits_string: null
  # Same value as the top-level encoder_seq_length (17408).
  seq_length: 17408
  skip_warmup: true
  num_workers: 0
  dataloader_type: single
  reset_position_ids: false
  reset_attention_mask: false
  eod_mask_loss: false
  validation_drop_last: true
  no_seqlen_plus_one_input_tokens: false
  pad_samples_to_global_batch_size: false
  # NOTE(review): shuffle_documents is true while shuffle_train_data is false;
  # how the two interact is not visible here — confirm.
  shuffle_documents: true
  # NOTE(review): apply_chat_template is false while system_prompt is set
  # below; confirm whether the system prompt is still applied.
  apply_chat_template: false
  prompt_file: null
  system_prompt_file: null
  shuffle_train_data: false
  # Parsed as the single string `detailed thinking off`; the trailing `off` is
  # part of a multi-word plain scalar, not a YAML boolean.
  system_prompt: detailed thinking off
  # One file per split. `validation` and `test` point at the same file.
  data_prefix:
    train:
    - /lustre/fsw/portfolios/llmservice/users/jiaqiz/data/reinforce/hs2/hs2.multiturn.rl.sys12.train.jsonl
    validation:
    - /lustre/fsw/portfolios/llmservice/users/jiaqiz/data/reinforce/hs2/hs2.multiturn.rl.sys12.val.jsonl
    test:
    - /lustre/fsw/portfolios/llmservice/users/jiaqiz/data/reinforce/hs2/hs2.multiturn.rl.sys12.val.jsonl
# Profiling stanza (presumably NVIDIA Nsight Systems, going by the key name).
# Disabled here, so the step window and rank list below are presumably unused
# until `enabled` is set to true — confirm.
nsys_profile:
  enabled: false
  # start_step and end_step are both 10.
  start_step: 10
  end_step: 10
  # Single-element list containing rank 0.
  ranks: [0]
  gen_shape: false
# Optimizer and learning-rate schedule.
optim:
  name: distributed_fused_adam
  # lr exceeds sched.min_lr (3.0e-07) by only 1e-12, so the range the
  # scheduler can move through is negligible — presumably an effectively
  # constant learning rate; confirm this is intentional before changing
  # either value.
  lr: 3.00001e-07
  weight_decay: 0.1
  betas: [0.9, 0.98]
  sched:
    name: CosineAnnealing
    warmup_steps: 10
    constant_steps: 1000
    min_lr: 3.0e-07
    max_steps: 3802
  bucket_cap_mb: 200
  # Both gradient-sync and parameter-sync overlap are turned off.
  overlap_grad_sync: false
  overlap_param_sync: false
  contiguous_grad_buffer: true
# Rotary-embedding scaling, model identity and external model locations.
rotary_base: 500000.0
scale_positional_embedding: true
seq_len_interpolation_factor: null
# Absolute path under one user's home directory; presumably describes the
# per-layer structure for the `heterogeneous_gpt` model named below — confirm.
# Must exist on the machine that loads this config.
heterogeneous_layers_config_path: /home/boryiings/lustre/aligner/253B/checkpoints/sft_step1800_nemo/NeMo/config.json
name: heterogeneous_gpt
precision: bf16
# Absolute path to a directory named `HF/step21` inside a results folder;
# presumably a Hugging Face-format checkpoint or config directory — confirm.
hf_model_name_or_configs_dir: /lustre/fs1/portfolios/llmservice/users/jiaqiz/results/253b-id-step14-diffrank-evelina-gpqa-scp116k-aops-llmjudge-prompt2-long-16klen-lr5e7-72nodes/checkpoints/HF/step21
# GRPO settings: sampling, generation length and inference backend.
# NOTE(review): key meanings are inferred from names; confirm with the
# consuming code.
grpo:
  # Directory under /dev/shm; the numeric suffix looks run-specific
  # (presumably a job id) — confirm before reusing this file for another run.
  share_dir: /dev/shm/checkpoints_2248637
  forward_micro_batch_size: 1
  offload_adam_states: true
  ratio_eps: 0.2
  # Sampling: greedy decoding is off, temperature is 1, top_k is -1 and top_p
  # is 1.0 (presumably no top-k/top-p truncation — confirm).
  sampling_params:
    use_greedy: false
    temperature: 1
    top_k: -1
    top_p: 1.0
    repetition_penalty: 1.0
    add_BOS: false
    all_probs: false
    compute_logprob: false
    # Two end strings, written as plain (unquoted) scalars.
    end_strings:
    - <|endoftext|>
    - <extra_id_1>
  # max_length (16384) + inference_backend.max_input_len (1024) = 17408, the
  # value used for encoder_seq_length and data.seq_length elsewhere in this
  # file.
  length_params:
    max_length: 16384
    min_length: 1
  generation_rollout_mbs: 16
  # NOTE(review): trt_llm is disabled below, so this directory is presumably
  # unused — confirm.
  trt_model_dir: /tmp/trt_llm_model
  initial_policy_kl_penalty: 0.0001
  # Inference backend: `type` is vllm and only config.vllm has enable: true;
  # trt_llm and trt_llm_pytorch are both disabled.
  inference_backend:
    type: vllm
    enable: true
    seed: 1234
    max_input_len: 1024
    reshard: true
    config:
      trt_llm:
        enable: false
        model_type: llama
        unload_engine_train: false
      vllm:
        enable: true
        port: 4321
        # Specific hostname rather than localhost; presumably has to be
        # updated for each deployment — confirm.
        ip: cw-dfw-h100-001-262-012
      trt_llm_pytorch:
        enable: false
        # Same port number as the vllm entry above.
        port: 4321
        ip: localhost
# Parameter-efficient fine-tuning stanza.
# peft_scheme is `none`, so the lora_tuning values below are presumably unused
# in this run — confirm with the consuming code.
peft:
  peft_scheme: none
  # No PEFT checkpoint is selected: every restore field is null.
  restore_from_path: null
  restore_from_ckpt:
    checkpoint_dir: null
    checkpoint_name: null
  lora_tuning:
    # Single-element list of target modules.
    target_modules: [attention_qkv]
    adapter_dim: 32
    adapter_dropout: 0.0
    column_init_method: xavier
    row_init_method: zero
    layer_selection: null
    weight_tying: false
    position_embedding_strategy: null
# Context parallelism, distributed-checkpoint options and model class.
# tensor_model_parallel_size (8) and pipeline_model_parallel_size (18) are set
# at the top of this file; together with this value the product is
# 8 * 18 * 2 = 288.
context_parallel_size: 2
# Distributed checkpoint settings. Parallel save is on, parallel load is off.
# NOTE(review): exact semantics of each dist_ckpt_* key are not visible here —
# confirm with the checkpointing code.
dist_ckpt_format: torch_dist
dist_ckpt_load_on_device: true
dist_ckpt_parallel_save: true
dist_ckpt_parallel_save_within_dp: false
dist_ckpt_parallel_load: false
dist_ckpt_torch_dist_multiproc: 2
dist_ckpt_assume_constant_structure: false
dist_ckpt_parallel_dist_opt: true
dist_ckpt_load_strictness: log_all
deallocate_pipeline_outputs: false
# Dotted Python path of the model class this config is written for.
target: nemo_aligner.experimental.grpo.models.nlp.gpt.megatron_gpt_grpo_actor.MegatronGPTActorModel
# Plain string (not a number); presumably the NeMo version that produced this
# file — confirm.
nemo_version: 2.2.0rc0