hydra:
  searchpath:
    - file://../../verl/trainer/config

defaults:
  - ppo_trainer
  - _self_

data:
  tokenizer: null
  train_files: ~/data/rlhf/gsm8k/train.parquet
  val_files: ~/data/rlhf/gsm8k/test.parquet
  train_max_samples: -1
  val_max_samples: -1
  prompt_key: prompt
  max_prompt_length: 512
  max_response_length: 512
  train_batch_size: 1024
  val_batch_size: null
  return_raw_input_ids: False
  return_raw_chat: False
  return_full_prompt: False
  shuffle: True
  seed: 42

actor_rollout_ref:
  hybrid_engine: True
  model:
    path: ~/models/deepseek-llm-7b-chat
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: False
  actor:
    strategy: fsdp
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: null
    use_dynamic_bsz: False
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    clip_ratio: 0.2
    entropy_coeff: 0.0
    use_kl_loss: False
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 1
    optim:
      lr: 1e-6
      lr_warmup_steps: -1
      lr_warmup_steps_ratio: 0.
      min_lr_ratio: null
      lr_scheduler_type: constant
      total_training_steps: -1
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      param_offload: False
      optimizer_offload: False
      fsdp_size: -1
  ref:
    fsdp_config:
      param_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1
    top_p: 1
    prompt_length: ${data.max_prompt_length}
    response_length: ${data.max_response_length}
    # for vllm rollout
    dtype: bfloat16
    gpu_memory_utilization: 0.5
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_dtensor
    tensor_model_parallel_size: 2
    max_num_batched_tokens: 8192
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: null
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    disable_log_stats: True
    enable_chunked_prefill: True
    # for hf rollout
    do_sample: True
    # number of responses (i.e. num sample times)
    n: 1

critic:
  strategy: fsdp
  optim:
    lr: 1e-5
    lr_warmup_steps_ratio: 0.
    min_lr_ratio: null
    lr_scheduler_type: constant
    total_training_steps: -1
  model:
    path: ~/models/deepseek-llm-7b-chat
    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config: { }
    external_lib: ${actor_rollout_ref.model.external_lib}
    enable_gradient_checkpointing: True
    use_remove_padding: False
    fsdp_config:
      param_offload: False
      optimizer_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      fsdp_size: -1
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: null
  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
  ulysses_sequence_parallel_size: 1
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
  shuffle: ${actor_rollout_ref.actor.shuffle}
  grad_clip: 1.0
  cliprange_value: 0.5

reward_model:
  enable: False
  strategy: fsdp
  model:
    input_tokenizer: ${actor_rollout_ref.model.path}
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: ${actor_rollout_ref.model.external_lib}
    use_remove_padding: False
    fsdp_config:
      min_num_params: 0
      param_offload: False
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: ${critic.use_dynamic_bsz}
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
  reward_manager: naive

algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  use_kl_in_reward: False
  kl_penalty: kl
  kl_ctrl:
    type: fixed
    kl_coef: 0.001

trainer:
  total_epochs: 30
  total_training_steps: null
  project_name: verl_examples
  experiment_name: gsm8k
  logger: [ 'console', 'wandb' ]
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  # auto: find the last ckpt to resume; if none is found, start from scratch
  resume_mode: auto
  resume_from_path: null
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}

ray_kwargs:
  ray_init:
    num_cpus: null