hydra:
  searchpath:
    - file://../../verl/trainer/config

defaults:
  - ppo_trainer
  - _self_

data:
  tokenizer: null
  train_files: ~/data/rlhf/gsm8k/train.parquet
  val_files: ~/data/rlhf/gsm8k/test.parquet
  train_max_samples: -1
  val_max_samples: -1
  prompt_key: prompt
  max_prompt_length: 512
  max_response_length: 512
  train_batch_size: 1024
  val_batch_size: null
  return_raw_input_ids: False
  return_raw_chat: False
  return_full_prompt: False
  shuffle: True
  seed: 42

actor_rollout_ref:
  hybrid_engine: True
  model:
    path: ~/models/deepseek-llm-7b-chat
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: False
  actor:
    strategy: fsdp
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: null
    use_dynamic_bsz: False
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    clip_ratio: 0.2
    entropy_coeff: 0.0
    use_kl_loss: False
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 1
    optim:
      lr: 1e-6
      lr_warmup_steps: -1
      lr_warmup_steps_ratio: 0.
      min_lr_ratio: null
      lr_scheduler_type: constant
      total_training_steps: -1
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      param_offload: False
      optimizer_offload: False
      fsdp_size: -1
  ref:
    fsdp_config:
      param_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1
    top_p: 1
    prompt_length: ${data.max_prompt_length}
    response_length: ${data.max_response_length}
    # for vllm rollout
    dtype: bfloat16
    gpu_memory_utilization: 0.5
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_dtensor
    tensor_model_parallel_size: 2
    max_num_batched_tokens: 8192
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    disable_log_stats: True
    enable_chunked_prefill: True
    # for hf rollout
    do_sample: True
    # number of responses (i.e. num sample times)
    n: 1

critic:
  strategy: fsdp
  optim:
    lr: 1e-5
    lr_warmup_steps_ratio: 0.
    min_lr_ratio: null
    lr_scheduler_type: constant
    total_training_steps: -1
  model:
    path: ~/models/deepseek-llm-7b-chat
    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config: { }
    external_lib: ${actor_rollout_ref.model.external_lib}
    enable_gradient_checkpointing: True
    use_remove_padding: False
    fsdp_config:
      param_offload: False
      optimizer_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      fsdp_size: -1
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: null
  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
  ulysses_sequence_parallel_size: 1
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
  shuffle: ${actor_rollout_ref.actor.shuffle}
  grad_clip: 1.0
  cliprange_value: 0.5

reward_model:
  enable: False
  strategy: fsdp
  model:
    input_tokenizer: ${actor_rollout_ref.model.path}
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: ${actor_rollout_ref.model.external_lib}
    use_remove_padding: False
    fsdp_config:
      min_num_params: 0
      param_offload: False
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: ${critic.use_dynamic_bsz}
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
  reward_manager: naive

algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  use_kl_in_reward: False
  kl_penalty: kl
  kl_ctrl:
    type: fixed
    kl_coef: 0.001

trainer:
  total_epochs: 30
  total_training_steps: null
  project_name: verl_examples
  experiment_name: gsm8k
  logger: [ 'console', 'wandb' ]
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  # auto: find the last ckpt to resume; if none is found, start from scratch
  resume_mode: auto
  resume_from_path: null
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}

ray_kwargs:
  ray_init:
    num_cpus: null