Lekr0 committed · Commit 8765573 · verified · Parent(s): 5de3d77

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50)
  1. ICL/DAPO/verl-recipe/char_count/README.md +59 -0
  2. ICL/DAPO/verl-recipe/char_count/create_dataset.py +198 -0
  3. ICL/DAPO/verl-recipe/char_count/reward_function.py +34 -0
  4. ICL/DAPO/verl-recipe/char_count/train_grpo.sh +45 -0
  5. ICL/DAPO/verl-recipe/char_count/train_sft.sh +97 -0
  6. ICL/DAPO/verl-recipe/collabllm/README.md +74 -0
  7. ICL/DAPO/verl-recipe/collabllm/utils.py +280 -0
  8. ICL/DAPO/verl-recipe/dapo/run_dapo_qwen3_8b_base_npu.sh +138 -0
  9. ICL/DAPO/verl-recipe/deepeyes/deepeyes.py +408 -0
  10. ICL/DAPO/verl-recipe/fault_recover/async_llm.py +84 -0
  11. ICL/DAPO/verl-recipe/flash_rl_ascend/README.md +121 -0
  12. ICL/DAPO/verl-recipe/flowrl/README.md +182 -0
  13. ICL/DAPO/verl-recipe/flowrl/__init__.py +17 -0
  14. ICL/DAPO/verl-recipe/flowrl/flowrl_fsdp_worker.py +495 -0
  15. ICL/DAPO/verl-recipe/flowrl/main_flowrl.py +185 -0
  16. ICL/DAPO/verl-recipe/flowrl/run_flowrl_qwen2.5_7b.sh +134 -0
  17. ICL/DAPO/verl-recipe/infigui-g1/README.md +56 -0
  18. ICL/DAPO/verl-recipe/langgraph_agent/__init__.py +13 -0
  19. ICL/DAPO/verl-recipe/langgraph_agent/chat_model.py +393 -0
  20. ICL/DAPO/verl-recipe/langgraph_agent/react_agent_loop.py +188 -0
  21. ICL/DAPO/verl-recipe/langgraph_agent/test_react_agent_loop.py +202 -0
  22. ICL/DAPO/verl-recipe/minicpmo/rl_dataset.py +571 -0
  23. ICL/DAPO/verl-recipe/prime/__init__.py +13 -0
  24. ICL/DAPO/verl-recipe/prime/prime_core_algos.py +147 -0
  25. ICL/DAPO/verl-recipe/prime/run_prime_qwen_code.sh +61 -0
  26. ICL/DAPO/verl-recipe/r1/run_r1_distill_qwen.sh +33 -0
  27. ICL/DAPO/verl-recipe/r1_ascend/Dockerfile.vllm_ascend.mindspeed.deepseekV3 +82 -0
  28. ICL/DAPO/verl-recipe/r1_ascend/README.md +119 -0
  29. ICL/DAPO/verl-recipe/r1_ascend/README_zh.md +119 -0
  30. ICL/DAPO/verl-recipe/r1_ascend/ray_start_grpo_npu.sh +82 -0
  31. ICL/DAPO/verl-recipe/r1_ascend/vllm_rollout_spmd.py +347 -0
  32. ICL/DAPO/verl-recipe/rep_exp/README.md +71 -0
  33. ICL/DAPO/verl-recipe/rep_exp/eval.sh +83 -0
  34. ICL/DAPO/verl-recipe/rep_exp/main_rep_exp.py +483 -0
  35. ICL/DAPO/verl-recipe/rep_exp/metric_utils.py +382 -0
  36. ICL/DAPO/verl-recipe/rep_exp/model_merge.sh +6 -0
  37. ICL/DAPO/verl-recipe/rep_exp/plot_pass_at_k.py +241 -0
  38. ICL/DAPO/verl-recipe/rep_exp/rep_exp_trainer.py +739 -0
  39. ICL/DAPO/verl-recipe/spin/core_algos.py +206 -0
  40. ICL/DAPO/verl-recipe/spin/main_spin.py +168 -0
  41. ICL/DAPO/verl-recipe/spin/spin_trainer.py +1312 -0
  42. ICL/LV/code/README.md +66 -0
  43. ICL/LV/code/SFT/__pycache__/dataset.cpython-310.pyc +0 -0
  44. ICL/LV/code/SFT/build_icl_eval_sharegpt.py +437 -0
  45. ICL/LV/code/SFT/check_kshot_ret_ans.py +319 -0
  46. ICL/LV/code/SFT/cuda-keyring_1.1-1_all.deb +0 -0
  47. ICL/LV/code/SFT/prepare_dataset.py +56 -0
  48. ICL/LV/code/adapters/gemma3_adapter.py +27 -0
  49. ICL/LV/code/adapters/qwen3vl_adapter.py +27 -0
  50. ICL/LV/code/attn map/attn map/attn map/__pycache__/token_attention_utils.cpython-313.pyc +0 -0
ICL/DAPO/verl-recipe/char_count/README.md ADDED
@@ -0,0 +1,59 @@
+ # Char Count
+ ## Introduction
+ Char count is a simple NLP task. We created it so beginners can grasp the idea of RLVR. The task can be trained with a tiny model (e.g., https://huggingface.co/HuggingFaceTB/SmolLM2-135M) on a consumer GPU with only 8GB of memory.
+
+ ## Problem formulation
+ The prompt is: "How many {char} are there in {word}?". To help the LLM answer this question, we create an SFT dataset with intermediate steps. For example,
+
+ ```text
+ Question: How many n are there in n-i-n-e?
+ Answer:
+ n = n
+ i != n
+ n = n
+ e != n
+ \boxed{2}
+ ```
+
+ Note that
+ - We add a dash between the individual chars to make the task easier, because each individual char is then tokenized to the same token by most tokenizers.
+ - In the SFT dataset, we create a CoT by listing every individual char and whether it equals the target. At the end, it outputs the final answer inside the box.
+ - The task can be verified.
+ - The word is not always meaningful. Each char is sampled uniformly from a to z. We make the total length and the answer uniformly distributed within a range.
+
+ ## Scripts
+ Installation
+
+ ```bash
+ pip install verl==0.6.1
+ ```
+
+
+ To create the dataset, run
+ ```bash
+ python3 create_dataset.py
+ ```
+ We create a train set and a val set; both are used for SFT and RL. You can specify the total number of examples, the min/max length, and the data path.
+
+ To run SFT
+ ```bash
+ BACKEND=fsdp bash train_sft.sh # use fsdp
+ BACKEND=megatron bash train_sft.sh # use megatron
+ ```
+ We train SFT for 1 epoch. After 1 epoch, the validation score is around 0.435.
+
+ Merge the checkpoint trained from SFT
+ ```bash
+ # sft
+ export CKPT_PATH=$HOME/experiments/char_count/models/sft/fsdp/global_step_140
+ python3 -m verl.model_merger merge --backend fsdp --local_dir $CKPT_PATH --target_dir $CKPT_PATH/huggingface/
+ # megatron
+ export CKPT_PATH=$HOME/experiments/char_count/models/sft/megatron/global_step_140
+ python3 -m verl.model_merger merge --backend megatron --local_dir $CKPT_PATH --target_dir $CKPT_PATH/huggingface/
+ ```
+
+ To run GRPO
+ ```bash
+ bash train_grpo.sh
+ ```
+ We train GRPO for 2 epochs. After 2 epochs, the validation score is around 0.6.
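
To sanity-check the generated data, you can load the SFT split with pandas (a minimal sketch, assuming the default `~/data/char_count` output path from `create_dataset.py`):

```python
import os
import pandas as pd

# Load the SFT train split written by create_dataset.py.
df = pd.read_parquet(os.path.expanduser("~/data/char_count/sft/train.parquet"))

# Each row holds a two-message chat: the user prompt and the assistant CoT answer.
messages = df.iloc[0]["messages"]
print(messages[0]["content"])  # e.g. "How many k are there in word a-k-b-k-c?"
print(messages[1]["content"])  # per-char comparisons ending in \boxed{N}
```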
ICL/DAPO/verl-recipe/char_count/create_dataset.py ADDED
@@ -0,0 +1,198 @@
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ Task description:
+ Given a random word and a random char, count the number of occurrences of the char in the word.
+
+ Create a CoT dataset that splits the word into separate chars, then lists each char and counts the occurrences.
+
+ The words are random char sequences rather than natural words.
+ """
+
+ import os.path
+ import random
+
+ prompt_template = "How many {} are there in word {}?"
+
+
+ def generate_random_char():
+     return chr(97 + random.randint(0, 25))
+
+
+ def create_prompt_response(min_length=3, max_length=5):
+     # randomly generate a length
+     word_length = random.randint(min_length, max_length)
+     # randomly generate a target count. This makes the answer uniformly distributed in [1, word_length].
+     target_count_number = random.randint(1, word_length)
+
+     char_lst = []
+     # generate the word
+     # step 1: place the target char target_count_number times
+     target_char = generate_random_char()
+
+     for _ in range(target_count_number):
+         char_lst.append(target_char)
+
+     # step 2: generate the remaining non-target chars
+     for _ in range(word_length - target_count_number):
+         while True:
+             char = generate_random_char()
+             if char != target_char:
+                 char_lst.append(char)
+                 break
+
+     # step 3: randomly permute char_lst
+     random.shuffle(char_lst)
+
+     word = "-".join(char_lst)
+
+     prompt = prompt_template.format(target_char, word)
+     final_answer = []
+
+     # cot
+     number = 0
+     for i, char in enumerate(char_lst):
+         cot = f"{char}"
+         if char != target_char:
+             cot += " != "
+         else:
+             cot += " = "
+             number += 1
+         cot += f"{target_char}."
+
+         final_answer.append(cot)
+
+     conclusion = f"\\boxed{{{number}}} {target_char} in {word}."
+
+     final_answer.append(conclusion)
+
+     final_answer = "\n".join(final_answer)
+
+     return prompt, final_answer
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--total_number", type=int, default=10000)
+     parser.add_argument("--min_length", type=int, default=5)
+     parser.add_argument("--max_length", type=int, default=20)
+     parser.add_argument("--data_path", type=str, default="~/data/char_count")
+
+     args = vars(parser.parse_args())
+
+     total_number = args["total_number"]
+     min_length = args["min_length"]
+     max_length = args["max_length"]
+     data_path = args["data_path"]
+     data_path = os.path.expanduser(data_path)
+
+     full_output = []
+     for _ in range(total_number):
+         output = create_prompt_response(min_length=min_length, max_length=max_length)
+         full_output.append(output)
+
+     # random reorder
+     random.shuffle(full_output)
+
+     # split into train and test
+     train_split_len = int(0.9 * len(full_output))
+     train_outputs = full_output[:train_split_len]
+     test_output = full_output[train_split_len:]
+
+     sft_train_dataset = {"messages": []}
+
+     for o in train_outputs:
+         messages = [
+             {"role": "user", "content": o[0]},
+             {"role": "assistant", "content": o[1]},
+         ]
+
+         sft_train_dataset["messages"].append(messages)
+
+     sft_test_dataset = {"messages": []}
+
+     for o in test_output:
+         messages = [
+             {"role": "user", "content": o[0]},
+             {"role": "assistant", "content": o[1]},
+         ]
+         sft_test_dataset["messages"].append(messages)
+
+     import pandas as pd
+
+     sft_train_dataset = pd.DataFrame(data=sft_train_dataset)
+     sft_test_dataset = pd.DataFrame(data=sft_test_dataset)
+
+     folder = os.path.join(data_path, "sft")
+
+     os.makedirs(folder, exist_ok=True)
+
+     sft_train_dataset.to_parquet(os.path.join(folder, "train.parquet"))
+     sft_test_dataset.to_parquet(os.path.join(folder, "test.parquet"))
+
+     # build RL dataset
+     rl_train_dataset = {"prompt": [], "data_source": [], "ability": [], "reward_model": [], "extra_info": []}
+
+     rl_test_dataset = {"prompt": [], "data_source": [], "ability": [], "reward_model": [], "extra_info": []}
+
+     from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed
+
+     for o in train_outputs:
+         prompt = o[0]
+         response = o[1]
+         prompt_with_template = [
+             {
+                 "role": "user",
+                 "content": prompt,
+             }
+         ]
+
+         rl_train_dataset["prompt"].append(prompt_with_template)
+         rl_train_dataset["data_source"].append("char_count")
+         rl_train_dataset["ability"].append("other")
+         rl_train_dataset["reward_model"].append(
+             {"style": "rule", "ground_truth": remove_boxed(last_boxed_only_string(response))}
+         )
+         rl_train_dataset["extra_info"].append({"response": response})
+
+     for o in test_output:
+         prompt = o[0]
+         response = o[1]
+         prompt_with_template = [
+             {
+                 "role": "user",
+                 "content": prompt,
+             }
+         ]
+
+         rl_test_dataset["prompt"].append(prompt_with_template)
+         rl_test_dataset["data_source"].append("char_count")
+         rl_test_dataset["ability"].append("other")
+         rl_test_dataset["reward_model"].append(
+             {"style": "rule", "ground_truth": remove_boxed(last_boxed_only_string(response))}
+         )
+         rl_test_dataset["extra_info"].append({"response": response})
+
+     rl_train_dataset = pd.DataFrame(data=rl_train_dataset)
+     rl_test_dataset = pd.DataFrame(data=rl_test_dataset)
+
+     folder = os.path.join(data_path, "rl")
+
+     os.makedirs(folder, exist_ok=True)
+
+     rl_train_dataset.to_parquet(os.path.join(folder, "train.parquet"))
+     rl_test_dataset.to_parquet(os.path.join(folder, "test.parquet"))
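
The README claims the total length and the answer are made uniformly distributed; a quick tally over generated samples shows the shape of the answer distribution (a sketch, assuming it is run from this directory so `create_dataset` is importable):

```python
from collections import Counter

from create_dataset import create_prompt_response

samples = [create_prompt_response(min_length=5, max_length=20) for _ in range(10000)]
# The ground-truth count is the N inside the final \boxed{N}.
answers = [int(ans.split("\\boxed{")[1].split("}")[0]) for _, ans in samples]
print(sorted(Counter(answers).items()))
```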
ICL/DAPO/verl-recipe/char_count/reward_function.py ADDED
@@ -0,0 +1,34 @@
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ Reward function
+ """
+
+ from verl.utils.reward_score import math_reward
+
+
+ def char_count_reward_function(data_source, solution_str, ground_truth, extra_info=None):
+     try:
+         last_boxed_string = math_reward.last_boxed_only_string(solution_str)
+         if last_boxed_string is None:
+             return 0
+         solution = math_reward.remove_boxed(last_boxed_string)
+         if solution == ground_truth:
+             return 1
+         else:
+             return 0
+     except Exception:
+         print(ground_truth, solution_str)
+         return 0
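
For intuition, the reward is 1 only when the last `\boxed{...}` in the completion matches the ground truth exactly, and 0 otherwise (a usage sketch, assuming verl is installed and this file is importable):

```python
from reward_function import char_count_reward_function

print(char_count_reward_function("char_count", "a != b.\nb = b.\n\\boxed{1}", "1"))  # 1
print(char_count_reward_function("char_count", "\\boxed{3}", "1"))                   # 0
print(char_count_reward_function("char_count", "no boxed answer here", "1"))         # 0
```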
ICL/DAPO/verl-recipe/char_count/train_grpo.sh ADDED
@@ -0,0 +1,45 @@
+ set -x
+
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/char_count/rl/train.parquet \
+     data.val_files=$HOME/data/char_count/rl/test.parquet \
+     data.train_batch_size=128 \
+     data.max_prompt_length=128 \
+     data.max_response_length=128 \
+     data.filter_overlong_prompts=False \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=$HOME/experiments/char_count/models/sft/megatron/global_step_140/huggingface \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=16 \
+     actor_rollout_ref.actor.use_dynamic_bsz=True \
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=5000 \
+     actor_rollout_ref.actor.use_kl_loss=False \
+     actor_rollout_ref.actor.kl_loss_coef=0.0 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=True \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
+     actor_rollout_ref.rollout.n=8 \
+     actor_rollout_ref.rollout.enforce_eager=True \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","tensorboard"]' \
+     trainer.project_name='verl_example' \
+     trainer.experiment_name='smol135m_grpo-1128a1' \
+     trainer.val_before_train=True \
+     trainer.n_gpus_per_node=1 \
+     trainer.nnodes=1 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=5 \
+     trainer.use_legacy_worker_impl=disable \
+     custom_reward_function.path=./reward_function.py \
+     custom_reward_function.name=char_count_reward_function
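
The script sets `algorithm.adv_estimator=grpo` with `rollout.n=8`, so each prompt is sampled 8 times and the 0/1 rewards are converted to group-relative advantages. A minimal sketch of that estimator (illustrative only, not verl's actual implementation):

```python
import numpy as np

def grpo_advantages(group_rewards, eps=1e-6):
    """Normalize each rollout's reward by its group's mean and std."""
    r = np.asarray(group_rewards, dtype=np.float64)
    return (r - r.mean()) / (r.std() + eps)

# 8 rollouts of one prompt, scored 0/1 by char_count_reward_function:
print(grpo_advantages([1, 0, 0, 1, 1, 0, 0, 0]))  # correct rollouts get positive advantage
```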
ICL/DAPO/verl-recipe/char_count/train_sft.sh ADDED
@@ -0,0 +1,97 @@
+ #!/usr/bin/env bash
+ set -xeuo pipefail
+
+ ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"}
+
+ TRAIN_FILES=${TRAIN_FILES:-$HOME/data/char_count/sft/train.parquet}
+ TEST_FILES=${TEST_FILES:-$HOME/data/char_count/sft/test.parquet}
+
+ backend=${BACKEND:-fsdp}
+
+ project_name=char_count-sft
+
+ RESUME_MODE=auto
+ MODEL_ID=${MODEL_ID:-HuggingFaceTB/SmolLM2-135M-Instruct}
+
+ SP_SIZE=${SP_SIZE:-1}
+ FSDP_SIZE=${FSDP_SIZE:-1}
+ FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"}
+
+ TP_SIZE=${TP_SIZE:-1}
+ PP_SIZE=${PP_SIZE:-1}
+ VPP_SIZE=${VPP_SIZE:-null}
+ CP_SIZE=${CP_SIZE:-1}
+
+ PAD_MODE=${PAD_MODE:-no_padding}
+
+ USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True}
+
+ FSDP_ENGINE_CONFIG="\
+     engine=${backend} \
+     optim=${backend} \
+     optim.lr=2e-5 \
+     optim.lr_warmup_steps_ratio=0.01 \
+     optim.weight_decay=0.1 \
+     optim.betas="[0.9,0.95]" \
+     optim.clip_grad=1.0 \
+     optim.min_lr_ratio=0.1 \
+     optim.warmup_style=cosine \
+     engine.ulysses_sequence_parallel_size=${SP_SIZE} \
+     engine.strategy=${FSDP_STRATEGY} \
+     engine.fsdp_size=${FSDP_SIZE}"
+
+
+ MEGATRON_ENGINE_CONFIG="\
+     engine=${backend} \
+     optim=${backend} \
+     optim.lr=2e-5 \
+     optim.lr_warmup_steps_ratio=0.01 \
+     optim.weight_decay=0.1 \
+     optim.betas="[0.9,0.95]" \
+     optim.clip_grad=1.0 \
+     optim.lr_warmup_init=0 \
+     optim.lr_decay_style=cosine \
+     optim.min_lr=2e-6 \
+     engine.tensor_model_parallel_size=${TP_SIZE} \
+     engine.pipeline_model_parallel_size=${PP_SIZE} \
+     engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \
+     engine.context_parallel_size=${CP_SIZE} \
+     engine.use_mbridge=False"
+
+ if [ "$backend" = "fsdp" ]; then
+     ENGINE_CONFIG="$FSDP_ENGINE_CONFIG"
+     echo "Using fsdp engine"
+     exp_name=char_count-sft-SmolLM2-135M-Instruct-fsdp
+ else
+     ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG"
+     echo "Using megatron engine"
+     exp_name=char_count-sft-SmolLM2-135M-Instruct-megatron
+ fi
+
+ CKPT_HOME=${CKPT_HOME:-$HOME/experiments/char_count/models/sft/$backend}
+ mkdir -p "${CKPT_HOME}"
+
+ torchrun --standalone --nnodes=1 --nproc-per-node=${NUM_TRAINERS:-1} \
+     ${ENTRYPOINT} \
+     data.train_files="${TRAIN_FILES}" \
+     data.train_batch_size=64 \
+     data.val_files="${TEST_FILES}" \
+     data.max_length=256 \
+     data.pad_mode=${PAD_MODE} \
+     data.truncation=error \
+     data.use_dynamic_bsz=True \
+     data.max_token_len_per_gpu=1792 \
+     data.messages_key=messages \
+     model.path=$MODEL_ID \
+     model.use_remove_padding=${USE_REMOVE_PADDING} \
+     ${ENGINE_CONFIG} \
+     trainer.test_freq=-1 \
+     trainer.save_freq=70 \
+     trainer.logger=['console'] \
+     trainer.project_name="${project_name}" \
+     trainer.experiment_name="${exp_name}" \
+     trainer.total_epochs=1 \
+     trainer.default_local_dir="${CKPT_HOME}" \
+     trainer.resume_mode=${RESUME_MODE} \
+     trainer.max_ckpt_to_keep=5 \
+     checkpoint.save_contents=[model,optimizer,extra]
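
Both engine configs request linear warmup (`lr_warmup_steps_ratio=0.01`) followed by cosine decay to a floor (`min_lr_ratio=0.1` for FSDP, `min_lr=2e-6` for Megatron). A sketch of that schedule shape (illustrative; the step counts are made up and this is not verl's scheduler code):

```python
import math

def lr_at(step, total_steps, base_lr=2e-5, warmup_ratio=0.01, min_lr_ratio=0.1):
    """Linear warmup, then cosine decay from base_lr down to min_lr_ratio * base_lr."""
    warmup_steps = max(1, int(total_steps * warmup_ratio))
    if step < warmup_steps:
        return base_lr * step / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    min_lr = base_lr * min_lr_ratio
    return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(math.pi * progress))

print([f"{lr_at(s, 1000):.2e}" for s in (0, 5, 10, 500, 1000)])
```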
ICL/DAPO/verl-recipe/collabllm/README.md ADDED
@@ -0,0 +1,74 @@
+ # CollabLLM
+
+ This repository implements [CollabLLM](https://arxiv.org/pdf/2502.00640) (ICML 2025) using the verl framework. For the original implementation, see the [CollabLLM repository](https://github.com/Wuyxin/collabllm).
+
+
+ CollabLLM is a method for training language models to collaborate effectively in multi-turn conversations. This implementation adapts the original one to work with the verl training framework.
+
+ ## Quick start
+
+ ### 0. Environment
+ Make sure the required packages for `verl` are installed. Additionally, install `litellm` and export the required API keys. The API model will be used for user simulators and, optionally, LLM judges (see the Configuration section below).
+
+ ### 1. Prepare Your Dataset
+
+ First, process your dataset using the provided script:
+
+ ```bash
+ python process_dataset.py --dataset <> ... --dataset_type <sft or rl>
+ ```
+
+
+ **Requirements:**
+ - Input: A Hugging Face multiturn dataset. Existing datasets: `collabllm/collabllm-multiturn-$DATASET`, with `DATASET` one of [`math-hard(-large)`, `medium(-large)`, `bigcodebench(-large)`] (the *-large variants are the datasets used in the CollabLLM paper)
+ - Example format: See [collabllm-multiturn-math-hard](https://huggingface.co/datasets/collabllm/collabllm-multiturn-math-hard)
+ - To generate your own dataset: Use [build_dataset.py](https://github.com/Wuyxin/collabllm/blob/main/scripts/engine/build_dataset.py) from the original CollabLLM repository
+
+ *Note: Check `process_dataset.py` for example commands and usage.*
+
+ ### 2. Train Your Model
+
+ **(Optional) For Supervised Fine-Tuning (SFT):**
+ ```bash
+ bash train_sft_collabllm.sh
+ ```
+
+ **For Reinforcement Learning (RL):**
+
+ ```bash
+ bash train_rl_collabllm.sh
+ ```
+
+ The RL script shows an example of training CollabLLM on `math-hard-large`.
+
+ - The config for sampling future conversations is in `recipe/collabllm/config/collabllm_interaction_config.yaml`.
+ - The Multiturn-aware Reward is aggregated from these three conversation-level rewards:
+
+ ```
+ +reward_model.reward_kwargs.metric_weights.accuracy=1 \
+ +reward_model.reward_kwargs.metric_weights.interactivity=1 \
+ +reward_model.reward_kwargs.metric_weights.token_amount=-0.0001 \
+ ```
+
+ You can remove, add, or modify the weights depending on your task. A list of already-implemented metrics you can add is under `recipe/collabllm/metrics`. For example, on `medium-large` you can replace `accuracy` with `bleu_score` via
+ ```
+ +reward_model.reward_kwargs.metric_weights.bleu_score=1
+ ```
+ which will instead apply the BLEU score to the sampled future conversations.
+
+ ## Configuration
+ Read the [docs](https://verl.readthedocs.io/en/latest/) for detailed configuration options.
+
+ ## Citation
+ If you find CollabLLM useful in your research, please cite the following:
+
+ ```bibtex
+ @inproceedings{collabllm2025,
+   title={CollabLLM: From Passive Responders to Active Collaborators},
+   author={Shirley Wu and Michel Galley and Baolin Peng and Hao Cheng and
+           Gavin Li and Yao Dou and Weixin Cai and James Zou and
+           Jure Leskovec and Jianfeng Gao},
+   booktitle={International Conference on Machine Learning (ICML)},
+   year={2025}
+ }
+ ```
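
Conceptually, the Multiturn-aware Reward above is a weighted sum of conversation-level metric scores computed on the sampled future conversations. A schematic of what the `metric_weights` express (a sketch with hypothetical scores; the real computation lives in this recipe's reward code):

```python
def multiturn_aware_reward(metric_scores: dict, metric_weights: dict) -> float:
    """Weighted sum over conversation-level metrics."""
    return sum(metric_weights[name] * metric_scores[name] for name in metric_weights)

weights = {"accuracy": 1, "interactivity": 1, "token_amount": -0.0001}
scores = {"accuracy": 1.0, "interactivity": 0.6, "token_amount": 3500}  # hypothetical rollout
print(multiturn_aware_reward(scores, weights))  # 1.0 + 0.6 - 0.35 = 1.25
```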
ICL/DAPO/verl-recipe/collabllm/utils.py ADDED
@@ -0,0 +1,280 @@
+ # Copyright 2025 CollabLLM team and/or its affiliates
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import logging
+ import os
+ import re
+
+ logger = logging.getLogger(__file__)
+ logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+
+ def parse_messages(messages, strip_sys_prompt=True):
+     """
+     Args:
+         messages: List[dict]
+             List of dictionaries with keys 'role' and 'content'
+             Example: messages = [{'role': 'user', 'content': 'Hello!'},
+                                  {'role': 'assistant', 'content': 'Hi!'}, ...]
+     """
+     if messages is None:
+         return ""
+
+     if strip_sys_prompt:
+         messages = strip_system_prompt(messages)
+
+     chat = "\n".join(f"**{m.role.capitalize()}**: {m.content}" for m in messages)
+
+     return chat
+
+
+ def strip_system_prompt(messages):
+     """
+     Args:
+         messages: List[dict]
+             List of dictionaries with keys 'role' and 'content'
+             Example: messages = [{'role': 'user', 'content': 'Hello!'},
+                                  {'role': 'assistant', 'content': 'Hi!'}, ...]
+     """
+     return [msg for msg in messages if msg.role != "system"]
+
+
+ def extract_json(s):
+     def convert_value(value):
+         true_values = {"true": True, "false": False, "null": None}
+         value_lower = value.lower()
+         if value_lower in true_values:
+             return true_values[value_lower]
+         try:
+             if "." in value or "e" in value.lower():
+                 return float(value)
+             else:
+                 return int(value)
+         except ValueError:
+             return value  # Return as string if not a number
+
+     def parse_number(s, pos):
+         start = pos
+         while pos < len(s) and s[pos] in "-+0123456789.eE":
+             pos += 1
+         num_str = s[start:pos]
+         try:
+             if "." in num_str or "e" in num_str.lower():
+                 return float(num_str), pos
+             else:
+                 return int(num_str), pos
+         except ValueError:
+             logger.error(f"Invalid number at position {start}: {num_str}")
+             raise
+
+     def skip_whitespace(s, pos):
+         while pos < len(s) and s[pos] in " \t\n\r":
+             pos += 1
+         return pos
+
+     def parse_string(s, pos):
+         quote_char = s[pos]
+         assert quote_char in ('"', "'")
+         pos += 1
+         result = ""
+         while pos < len(s):
+             c = s[pos]
+             if c == "\\":
+                 pos += 1
+                 if pos >= len(s):
+                     raise ValueError("Invalid escape sequence")
+                 c = s[pos]
+                 escape_sequences = {"n": "\n", "t": "\t", "r": "\r", "\\": "\\", quote_char: quote_char}
+                 result += escape_sequences.get(c, c)
+             elif c == quote_char:
+                 pos += 1
+                 # Attempt to convert to a number if possible
+                 converted_value = convert_value(result)
+                 return converted_value, pos
+             else:
+                 result += c
+             pos += 1
+         raise ValueError("Unterminated string")
+
+     def parse_key(s, pos):
+         pos = skip_whitespace(s, pos)
+         if s[pos] in ('"', "'"):
+             key, pos = parse_string(s, pos)
+             return key, pos
+         else:
+             raise ValueError(f"Expected string for key at position {pos}")
+
+     def parse_object(s, pos):
+         obj = {}
+         assert s[pos] == "{"
+         pos += 1
+         pos = skip_whitespace(s, pos)
+         while pos < len(s) and s[pos] != "}":
+             pos = skip_whitespace(s, pos)
+             key, pos = parse_key(s, pos)
+             pos = skip_whitespace(s, pos)
+             if pos >= len(s) or s[pos] != ":":
+                 raise ValueError(f'Expected ":" at position {pos}')
+             pos += 1
+             pos = skip_whitespace(s, pos)
+             value, pos = parse_value(s, pos)
+             obj[key] = value
+             pos = skip_whitespace(s, pos)
+             if pos < len(s) and s[pos] == ",":
+                 pos += 1
+                 pos = skip_whitespace(s, pos)
+             elif pos < len(s) and s[pos] == "}":
+                 break
+             elif pos < len(s) and s[pos] != "}":
+                 raise ValueError(f'Expected "," or "}}" at position {pos}')
+         if pos >= len(s) or s[pos] != "}":
+             raise ValueError(f'Expected "}}" at position {pos}')
+         pos += 1
+         return obj, pos
+
+     def parse_array(s, pos):
+         lst = []
+         assert s[pos] == "["
+         pos += 1
+         pos = skip_whitespace(s, pos)
+         while pos < len(s) and s[pos] != "]":
+             value, pos = parse_value(s, pos)
+             lst.append(value)
+             pos = skip_whitespace(s, pos)
+             if pos < len(s) and s[pos] == ",":
+                 pos += 1
+                 pos = skip_whitespace(s, pos)
+             elif pos < len(s) and s[pos] == "]":
+                 break
+             elif pos < len(s) and s[pos] != "]":
+                 raise ValueError(f'Expected "," or "]" at position {pos}')
+         if pos >= len(s) or s[pos] != "]":
+             raise ValueError(f'Expected "]" at position {pos}')
+         pos += 1
+         return lst, pos
+
+     def parse_triple_quoted_string(s, pos):
+         if s[pos : pos + 3] == "'''":
+             quote_str = "'''"
+         elif s[pos : pos + 3] == '"""':
+             quote_str = '"""'
+         else:
+             raise ValueError(f"Expected triple quotes at position {pos}")
+         pos += 3
+         result = ""
+         while pos < len(s):
+             if s[pos : pos + 3] == quote_str:
+                 pos += 3
+                 # Attempt to convert to a number if possible
+                 converted_value = convert_value(result)
+                 return converted_value, pos
+             else:
+                 result += s[pos]
+                 pos += 1
+         raise ValueError("Unterminated triple-quoted string")
+
+     def parse_value(s, pos):
+         pos = skip_whitespace(s, pos)
+         if pos >= len(s):
+             raise ValueError("Unexpected end of input")
+         if s[pos] == "{":
+             return parse_object(s, pos)
+         elif s[pos] == "[":
+             return parse_array(s, pos)
+         elif s[pos : pos + 3] in ("'''", '"""'):
+             return parse_triple_quoted_string(s, pos)
+         elif s[pos] in ('"', "'"):
+             return parse_string(s, pos)
+         elif s[pos : pos + 4].lower() == "true":
+             return True, pos + 4
+         elif s[pos : pos + 5].lower() == "false":
+             return False, pos + 5
+         elif s[pos : pos + 4].lower() == "null":
+             return None, pos + 4
+         elif s[pos] in "-+0123456789.":
+             return parse_number(s, pos)
+         else:
+             raise ValueError(f"Unexpected character at position {pos}: {s[pos]}")
+
+     json_start = s.index("{")
+     json_end = s.rfind("}")
+     s = s[json_start : json_end + 1]
+
+     s = s.strip()
+     result, pos = parse_value(s, 0)
+     pos = skip_whitespace(s, pos)
+     if pos != len(s):
+         raise ValueError(f"Unexpected content at position {pos}")
+     return result
+
+
+ def remove_think_block(msg: dict):
+     """
+     Remove <think>.*?</think> from content
+     """
+     if "content" in msg and isinstance(msg["content"], str):
+         msg["content"] = re.sub(r"<think>.*?</think>", "", msg["content"], flags=re.DOTALL).strip()
+     return msg
+
+
+ def is_valid_messages(msg: dict) -> bool:
+     """
+     Check whether the message is valid:
+     1. <think> is paired with </think>
+     2. content is not empty inside or outside <think>
+     3. blocks are not nested, and at most one <think> block is allowed
+     4. content is not empty after removing a trailing "<|im_end|>"
+     """
+     content = msg.get("content")
+     if not isinstance(content, str):
+         return True
+
+     # Base case: empty or whitespace-only content is invalid.
+     if not content.strip():
+         return False
+
+     num_think_open = content.count("<think>")
+     num_think_close = content.count("</think>")
+
+     # Rule 1: Check for paired tags.
+     if num_think_open != num_think_close:
+         return False
+
+     # Rule 3: Allow at most one think block.
+     if num_think_open > 1:
+         return False
+
+     # Case 1: No <think> blocks.
+     if num_think_open == 0:
+         visible_content = content
+     # Case 2: Exactly one <think> block.
+     else:
+         # Rule 2: Check for empty content inside the think block.
+         match = re.search(r"<think>(.*?)</think>", content, re.DOTALL)
+         if not match or not match.group(1).strip():
+             return False
+
+         # The "visible" content is what's outside the think block.
+         visible_content = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL)
+
+     visible_content = visible_content.strip()
+
+     # Rule 4 & 2 (outside): Check if visible content is empty after handling <|im_end|>.
+     if visible_content.endswith("<|im_end|>"):
+         visible_content = visible_content[: -len("<|im_end|>")]
+
+     if not visible_content.strip():
+         return False
+
+     return True
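
A quick usage sketch of the two main helpers (assuming this module is importable as `utils`): `extract_json` tolerates single quotes and coerces `'true'` and numeric strings, and `is_valid_messages` rejects messages with nothing visible outside the `<think>` block.

```python
from utils import extract_json, is_valid_messages

raw = "The judge replied: {'accuracy': '0.8', 'pass': 'true'}"
print(extract_json(raw))  # {'accuracy': 0.8, 'pass': True}

print(is_valid_messages({"content": "<think>reasoning</think>final answer"}))  # True
print(is_valid_messages({"content": "<think>x</think><|im_end|>"}))            # False: nothing visible
```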
ICL/DAPO/verl-recipe/dapo/run_dapo_qwen3_8b_base_npu.sh ADDED
@@ -0,0 +1,138 @@
+ #!/bin/bash
+ project_name='DAPO'
+ exp_name='DAPO-Qwen3-8B-Base'
+
+ adv_estimator=grpo
+
+ use_kl_in_reward=False
+ kl_coef=0.0
+ use_kl_loss=False
+ kl_loss_coef=0.0
+
+ clip_ratio_low=0.2
+ clip_ratio_high=0.28
+
+ max_prompt_length=$((1024 * 2))
+ max_response_length=$((1024 * 20))
+ enable_overlong_buffer=True
+ overlong_buffer_len=$((1024 * 4))
+ overlong_penalty_factor=1.0
+
+ loss_agg_mode="token-mean"
+
+ enable_filter_groups=False
+ filter_groups_metric=acc
+ max_num_gen_batches=10
+ train_prompt_bsz=16
+ gen_prompt_bsz=$((train_prompt_bsz * 3))
+ n_resp_per_prompt=16
+ train_prompt_mini_bsz=1
+
+ # Ray
+ RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+ WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+ RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+ NNODES=${NNODES:-1}
+ # Paths
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-8B-Base"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+ # Algorithm
+ temperature=1.0
+ top_p=1.0
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+
+ # Performance Related Parameter
+ sp_size=2
+ use_dynamic_bsz=True
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
+ offload=True
+ gen_tp=2
+
+ ray job submit --runtime-env="${RUNTIME_ENV}" \
+     -- python3 -m recipe.dapo.main_dapo \
+     data.train_files="${TRAIN_FILE}" \
+     data.val_files="${TEST_FILE}" \
+     data.prompt_key=prompt \
+     data.truncation='left' \
+     data.max_prompt_length=${max_prompt_length} \
+     data.max_response_length=${max_response_length} \
+     data.gen_batch_size=${gen_prompt_bsz} \
+     data.train_batch_size=${train_prompt_bsz} \
+     actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+     algorithm.adv_estimator=${adv_estimator} \
+     algorithm.use_kl_in_reward=${use_kl_in_reward} \
+     algorithm.kl_ctrl.kl_coef=${kl_coef} \
+     actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+     actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+     actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+     actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+     actor_rollout_ref.actor.clip_ratio_c=10.0 \
+     algorithm.filter_groups.enable=${enable_filter_groups} \
+     algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} \
+     algorithm.filter_groups.metric=${filter_groups_metric} \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+     actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+     actor_rollout_ref.model.path="${MODEL_PATH}" \
+     +actor_rollout_ref.model.override_config.attention_dropout=0. \
+     +actor_rollout_ref.model.override_config.embd_pdrop=0. \
+     +actor_rollout_ref.model.override_config.resid_pdrop=0. \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+     actor_rollout_ref.actor.optim.weight_decay=0.1 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+     actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.grad_clip=1.0 \
+     actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+     actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.90 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+     actor_rollout_ref.rollout.temperature=${temperature} \
+     actor_rollout_ref.rollout.top_p=${top_p} \
+     actor_rollout_ref.rollout.top_k="${top_k}" \
+     actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+     actor_rollout_ref.rollout.val_kwargs.top_p=${top_p} \
+     actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+     actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+     actor_rollout_ref.rollout.val_kwargs.n=1 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+     actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+     actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
+     reward_model.reward_manager=dapo \
+     reward_model.overlong_buffer.enable=${enable_overlong_buffer} \
+     reward_model.overlong_buffer.len=${overlong_buffer_len} \
+     reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
+     trainer.logger=['console'] \
+     trainer.project_name="${project_name}" \
+     trainer.experiment_name="${exp_name}" \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes="${NNODES}" \
+     trainer.val_before_train=False \
+     trainer.test_freq=10 \
+     trainer.save_freq=20 \
+     trainer.total_epochs=1 \
+     trainer.total_training_steps=100 \
+     trainer.default_local_dir="${CKPTS_DIR}" \
+     trainer.resume_mode=auto \
+     data.shuffle=False \
+     actor_rollout_ref.actor.use_torch_compile=False \
+     actor_rollout_ref.ref.use_torch_compile=False \
+     actor_rollout_ref.actor.entropy_checkpointing=True \
+     actor_rollout_ref.ref.entropy_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.forward_prefetch=True \
+     actor_rollout_ref.ref.fsdp_config.forward_prefetch=True
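
The `overlong_buffer` settings enable DAPO's soft overlong punishment: with `max_response_length` of 20480 tokens and a 4096-token buffer, responses are penalized linearly once they enter the buffer zone. A sketch of that shaping as described in the DAPO paper (illustrative only; the actual logic is in verl's `dapo` reward manager):

```python
def overlong_penalty(response_len, max_response_len=20480, buffer_len=4096, penalty_factor=1.0):
    """Zero inside the safe zone, linear penalty in the buffer, capped at -penalty_factor."""
    expected_len = max_response_len - buffer_len
    if response_len <= expected_len:
        return 0.0
    exceed = response_len - expected_len
    return -min(exceed / buffer_len, 1.0) * penalty_factor

print(overlong_penalty(10000))  # 0.0  (well inside the safe zone)
print(overlong_penalty(18432))  # -0.5 (halfway into the buffer)
```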
ICL/DAPO/verl-recipe/deepeyes/deepeyes.py ADDED
@@ -0,0 +1,408 @@
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import io
+ import logging
+ import os
+ import random
+ import re
+
+ import requests
+ from openai import OpenAI
+ from PIL import Image
+
+ import verl.utils.torch_functional as verl_F
+ from verl.utils.dataset.rl_dataset import RLHFDataset
+ from verl.utils.model import compute_position_id_with_mask
+
+ logger = logging.getLogger(__name__)
+
+ openai_api_key = "EMPTY"
+ openai_api_base = os.environ.get("LLM_AS_A_JUDGE_BASE", "http://10.1.100.71:18901/v1")
+
+ client = OpenAI(
+     api_key=openai_api_key,
+     base_url=openai_api_base,
+ )
+
+ model_name = ""
+ if openai_api_base:
+     try:
+         response = requests.get(f"{openai_api_base}/models")
+         response.raise_for_status()
+         models = response.json()
+         if models.get("data"):
+             model_name = models["data"][0]["id"]
+         else:
+             logger.warning("No models found at the specified API base for reward scoring.")
+     except (requests.exceptions.RequestException, KeyError, IndexError) as e:
+         logger.warning(f"Failed to get model from {openai_api_base}: {e}. Reward scoring will be disabled.")
+
+
+ class CustomRLHFDataset(RLHFDataset):
+     def __getitem__(self, item):
+         """
+         Note that we also return the raw_input_ids so that they can be combined with other chat templates
+         """
+         row_dict: dict = self.dataframe[item]
+         row_dict[self.prompt_key] = [
+             {
+                 "role": "system",
+                 # We don't need a tool description, because custom_chat_template will add it.
+                 "content": (
+                     "You are a helpful assistant. You can call functions to assist with the user query. "
+                     "Important: You must call only one function at a time. After each function call, "
+                     "wait for the execution result before making the next function call if needed."
+                 ),
+             },
+             {
+                 "role": "user",
+                 "content": row_dict[self.prompt_key][1]["content"],
+             },
+         ]
+
+         images = []
+         row_dict_images = row_dict.get(self.image_key, None)
+         if row_dict_images:
+             images = [Image.open(io.BytesIO(image["bytes"])) for image in row_dict_images]
+         messages = self._build_messages(row_dict)
+
+         if self.processor is not None:
+             raw_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+             model_inputs = self.processor(text=[raw_prompt], images=images, return_tensors="pt")
+
+             input_ids = model_inputs.pop("input_ids")
+             attention_mask = model_inputs.pop("attention_mask")
+
+             if "second_per_grid_ts" in model_inputs:
+                 model_inputs.pop("second_per_grid_ts")
+
+         else:
+             raw_prompt = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+             model_inputs = self.tokenizer(raw_prompt, return_tensors="pt", add_special_tokens=False)
+             input_ids = model_inputs.pop("input_ids")
+             attention_mask = model_inputs.pop("attention_mask")
+
+         input_ids, attention_mask = verl_F.postprocess_data(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             max_length=self.max_prompt_length,
+             pad_token_id=self.tokenizer.pad_token_id,
+             left_pad=True,
+             truncation=self.truncation,
+         )
+
+         if self.processor is not None and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__:
+             from verl.models.transformers.qwen2_vl import get_rope_index
+
+             position_ids = [
+                 get_rope_index(
+                     self.processor,
+                     input_ids=input_ids[0],
+                     image_grid_thw=model_inputs.get("image_grid_thw"),
+                     video_grid_thw=model_inputs.get("video_grid_thw"),
+                     second_per_grid_ts=model_inputs.get("second_per_grid_ts"),
+                     attention_mask=attention_mask[0],
+                 )
+             ]  # (1, 3, seq_len)
+
+         else:
+             position_ids = compute_position_id_with_mask(attention_mask)
+
+         row_dict["input_ids"] = input_ids[0]
+         row_dict["attention_mask"] = attention_mask[0]
+         row_dict["position_ids"] = position_ids[0]
+
+         raw_prompt_ids = self.tokenizer.encode(raw_prompt, add_special_tokens=False)
+         if len(raw_prompt_ids) > self.max_prompt_length:
+             if self.truncation == "left":
+                 raw_prompt_ids = raw_prompt_ids[-self.max_prompt_length :]
+             elif self.truncation == "right":
+                 raw_prompt_ids = raw_prompt_ids[: self.max_prompt_length]
+             elif self.truncation == "middle":
+                 left_half = self.max_prompt_length // 2
+                 right_half = self.max_prompt_length - left_half
+                 raw_prompt_ids = raw_prompt_ids[:left_half] + raw_prompt_ids[-right_half:]
+             elif self.truncation == "error":
+                 raise RuntimeError(f"Prompt length {len(raw_prompt_ids)} is longer than {self.max_prompt_length}.")
+
+         row_dict["raw_prompt_ids"] = raw_prompt_ids
+         # encode prompts without chat template
+         if self.return_raw_chat:
+             row_dict["raw_prompt"] = messages
+
+         # get prompts with chat template
+         if self.return_full_prompt:
+             row_dict["full_prompts"] = raw_prompt  # array of strings
+
+         # add index for each prompt
+         index = row_dict.get("extra_info", {}).get("index", 0)
+         tools_kwargs = {
+             "image_zoom_in_tool": {
+                 "create_kwargs": {"image": images[0]},
+                 # "execute_kwargs": {},
+                 # "calc_reward_kwargs": {},
+                 # "release_kwargs": {},
+             }
+         }
+         row_dict["index"] = index
+         row_dict["tools_kwargs"] = tools_kwargs
+         row_dict["agent_name"] = "tool_agent"
+         return row_dict
+
+
+ def compute_score(data_source: str, solution_str: str, ground_truth: str, extra_info=None) -> float:
+     """
+     Compute reward score for model solutions with robust handling of various formats.
+
+     Returns a weighted combination of:
+     - Accuracy reward (0.8 weight): Whether the answer is semantically correct
+     - Format reward (0.2 weight): Whether the output follows the expected format
+     - Tool reward (1.2 weight): Whether tools were used when the answer is correct
+     """
+
+     # Initialize tracking variables
+     is_format_error = False
+
+     # 1. Check <think> tag format
+     count_think_1 = solution_str.count("<think>")
+     count_think_2 = solution_str.count("</think>")
+     if count_think_1 != count_think_2:
+         is_format_error = True
+
+     # 2. Check vision tokens (skip this since the tokenizer removes special tokens)
+     # We'll use <tool_call> and <tool_response> instead to detect tool usage
+
+     # 3. Extract answer text with multiple fallback strategies
+     answer_text = ""
+
+     # Strategy 1: Try to extract from <answer> tags first
+     predict_no_think = (
+         solution_str.split("</think>")[-1].strip() if "</think>" in solution_str else solution_str.strip()
+     )
+
+     # Check <answer> tag format
+     count_answer_1 = predict_no_think.count("<answer>")
+     count_answer_2 = predict_no_think.count("</answer>")
+     if count_answer_1 != count_answer_2:
+         is_format_error = True
+
+     # Try to extract from <answer> tags
+     answer_match = re.search(r"<answer>(.*?)</answer>", predict_no_think, re.DOTALL)
+     if answer_match:
+         answer_text = answer_match.group(1).strip()
+     else:
+         # No proper <answer> tags found - this is a format error
+         is_format_error = True
+
+         # Strategy 2: If no <answer> tags, extract content after tool responses
+         # Look for pattern: <tool_response>...</tool_response>assistant\n[actual_answer]
+         tool_response_match = re.search(
+             r"</tool_response>\s*assistant\s*\n(.*?)$", predict_no_think, re.DOTALL | re.MULTILINE
+         )
+         if tool_response_match:
+             answer_text = tool_response_match.group(1).strip()
+         else:
+             # Strategy 3: If no tool responses, look for content after </think>
+             if "</think>" in solution_str:
+                 # Remove any remaining tool-related tags and extract meaningful content
+                 remaining_content = predict_no_think
+                 # Remove tool calls and responses
+                 remaining_content = re.sub(r"<tool_call>.*?</tool_call>", "", remaining_content, flags=re.DOTALL)
+                 remaining_content = re.sub(
+                     r"<tool_response>.*?</tool_response>", "", remaining_content, flags=re.DOTALL
+                 )
+                 # Remove user/assistant markers
+                 remaining_content = re.sub(r"\b(user|assistant)\b", "", remaining_content)
+                 answer_text = remaining_content.strip()
+             else:
+                 # Strategy 4: Use the entire solution_str as a fallback
+                 answer_text = solution_str.strip()
+
+     # Clean up the answer text
+     answer_text = answer_text.strip()
+
+     # If the answer is still empty after all strategies, mark it as a format error
+     if not answer_text:
+         is_format_error = True
+         answer_text = solution_str.strip()  # Use the full text as a last resort
+
+     # 4. Evaluate correctness using the LLM judge
+     question_text = extra_info.get("question", "") if extra_info else ""
+
+     if not client or not model_name:
+         logger.warning("Reward function client not initialized or model name not found.")
+         return 0.0
+
+     system_prompt = (
+         "You are an expert evaluator. Your task is to determine if a model's answer is semantically equivalent to a "
+         "provided standard answer, given a specific question.\n"
+         "Your evaluation must be strict. The model's answer is only correct if it fully matches the meaning of the "
+         "standard answer.\n"
+         'You must provide your final judgement as a single word: either "CORRECT" or "INCORRECT". Do not provide '
+         "any explanation or other text."
+     )
+
+     user_prompt = (
+         f"I will provide a question, a standard answer, and a model's answer. You must evaluate if the model's "
+         f"answer is correct.\n\n"
+         f"---\n"
+         f"**Example 1:**\n"
+         f"[Question]: Is the countertop tan or blue?\n"
+         f"[Standard Answer]: The countertop is tan.\n"
+         f"[Model's Answer]: tan\n"
+         f"[Your Judgement]: CORRECT\n"
+         f"---\n"
+         f"**Example 2:**\n"
+         f"[Question]: Is the man phone both blue and closed?\n"
+         f"[Standard Answer]: Yes, the man phone is both blue and closed.\n"
+         f"[Model's Answer]: No.\n"
+         f"[Your Judgement]: INCORRECT\n"
+         f"---\n"
+         f"**Task:**\n"
+         f"[Question]: {question_text}\n"
+         f"[Standard Answer]: {ground_truth}\n"
+         f"[Model's Answer]: {answer_text}\n"
+         f"[Your Judgement]:"
+     )
+
+     try:
+         chat_response = client.chat.completions.create(
+             model=model_name,
+             messages=[
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt},
+             ],
+             seed=random.randint(0, 1000000),
+             temperature=0.1,  # Lower temperature for more deterministic judgement
+             extra_body={
+                 "chat_template_kwargs": {"enable_thinking": False},
+             },
+         )
+         response = chat_response.choices[0].message.content.strip()
+     except Exception as e:
+         logger.warning(f" [WARNING] Chat completion request failed: {e}")
+         return 0.0
+
+     # Parse the LLM judge response
+     if re.search(r"\bCORRECT\b", response, re.IGNORECASE):
+         acc_reward = 1.0
+     elif re.search(r"\bINCORRECT\b", response, re.IGNORECASE):
+         acc_reward = 0.0
+     else:
+         logger.warning(
+             f" [WARNING] Judgement format error. Expected 'CORRECT' or 'INCORRECT'.\n"
+             f"Response: '{response}'\n"
+             f"Model Answer: '{answer_text}'\n"
+             f"Ground Truth: '{ground_truth}'"
+         )
+         acc_reward = 0.0
+
+     # Penalize excessively long answers (potential judge hacking)
+     if len(answer_text) >= 1000:
+         acc_reward = 0.0
+         is_format_error = True
+
+     # 5. Check tool usage - look for tool_call/tool_response patterns instead of vision tokens
+     has_tool_usage = bool(
+         re.search(r"<tool_call>.*?</tool_call>", solution_str, re.DOTALL)
+         or re.search(r"<tool_response>.*?</tool_response>", solution_str, re.DOTALL)
+     )
+
+     # Tool reward: only given if tools were used AND the answer is correct
+     tool_reward = 1.0 if has_tool_usage and acc_reward > 0.5 else 0.0
+
+     # Format reward: penalty for format errors
+     format_reward = -1.0 if is_format_error else 0.0
+
+     # Log debug information for problematic cases
+     if is_format_error or not answer_text:
+         logger.debug(
+             f"Format issue detected:\n"
+             f"Solution: {solution_str[:200]}...\n"
+             f"Extracted answer: '{answer_text}'\n"
+             f"Format error: {is_format_error}\n"
+             f"Tool usage: {has_tool_usage}"
+         )
+
+     # Final weighted score
+     final_score = 0.8 * acc_reward + 0.2 * format_reward + 1.2 * tool_reward
+
+     return final_score
+
+
+ if __name__ == "__main__":
+     # Test case 1: Original test case
+     predict_str = "The answer is 2 + 2 = 4 </think> <answer> right </answer> <answer> left </answer>"
+     ground_truth = "left"
+     extra_info = {
+         "answer": "The woman is to the left of the man who is holding the camera.",
+         "id": 0,
+         "image": "/cpfs/user/honglingyi/DATA/LLM/Vstar/gqa/images/713270.jpg",
+         "pred_ans": "The woman is to the right of the man who is holding the camera.",
+         "question": "Is the woman to the left or to the right of the man who is holding the camera?",
+     }
+     print("=== Test Case 1: Original test ===")
+     import time
+
+     time_start = time.time()
+     score = compute_score("common_reasoning", predict_str, ground_truth, extra_info)
+     print(f"Score: {score}")
+     time_end = time.time()
+     print(f"Time: {time_end - time_start}")
+
+     # Test case 2: Problematic case mentioned by user
+     problematic_solution = """<tool_call>
+ {"name": "image_zoom_in_tool", "arguments": {"bbox_2d": [226, 399, 265, 464], "label": "white van"}}
+ </tool_call>user
+ <tool_response>
+ Zoomed in on the image to the region [226, 399, 265, 464] with label white van.
+ </tool_response>
+ assistant
+ The white van is visible in the lower section of the image, near the diagonal road."""
+
+     problematic_ground_truth = "Yes, the white van is indeed situated in the bottom part of the picture."
+     problematic_extra_info = {
+         "question": "Is the white van in the bottom part of the picture?",
+     }
+
+     print("\n=== Test Case 2: Problematic case (no answer tags) ===")
+     print(f"Solution: {problematic_solution}")
+     print(f"Ground truth: {problematic_ground_truth}")
+
+     time_start = time.time()
+     score2 = compute_score("common_reasoning", problematic_solution, problematic_ground_truth, problematic_extra_info)
+     print(f"Score: {score2}")
+     time_end = time.time()
+     print(f"Time: {time_end - time_start}")
+
+     # Test case 3: Well-formatted case with tools
+     well_formatted_solution = """<think>
+ I need to use the image zoom tool to get a better look at the specific area.
+ </think>
+ <tool_call>
+ {"name": "image_zoom_in_tool", "arguments": {"bbox_2d": [226, 399, 265, 464], "label": "white van"}}
+ </tool_call>
+ <tool_response>
+ Zoomed in on the image to the region [226, 399, 265, 464] with label white van.
+ </tool_response>
+ <answer>Yes, the white van is indeed situated in the bottom part of the picture.</answer>"""
+
+     print("\n=== Test Case 3: Well-formatted case ===")
+     time_start = time.time()
+     score3 = compute_score(
+         "common_reasoning", well_formatted_solution, problematic_ground_truth, problematic_extra_info
+     )
+     print(f"Score: {score3}")
+     time_end = time.time()
+     print(f"Time: {time_end - time_start}")
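
Given the 0.8/0.2/1.2 weighting, the reachable scores separate cleanly; the arithmetic below just re-evaluates the final formula for the three common outcomes:

```python
def final_score(acc, fmt_err, used_tools):
    tool = 1.0 if used_tools and acc > 0.5 else 0.0
    fmt = -1.0 if fmt_err else 0.0
    return 0.8 * acc + 0.2 * fmt + 1.2 * tool

print(final_score(1.0, False, True))   #  2.0  correct, clean format, tools used
print(final_score(1.0, True, False))   #  0.6  correct but missing <answer> tags, no tools
print(final_score(0.0, True, False))   # -0.2  wrong and malformed
```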
ICL/DAPO/verl-recipe/fault_recover/async_llm.py ADDED
@@ -0,0 +1,84 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ import asyncio
4
+
5
+ import numpy as np
6
+ from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
7
+ from vllm.utils import cdiv
8
+ from vllm.v1.engine.async_llm import AsyncLLM, logger
9
+ from vllm.v1.metrics.stats import IterationStats
10
+
11
+
12
+ class AsyncFaultRecoverLLM(AsyncLLM):
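+ """AsyncLLM variant for fault recovery: in addition to the normal output path,
+ the output handler streams each request's new token ids and finished flag into
+ a shared Ray queue (obtained via get_tokens_queue at the bottom of this file),
+ so in-flight generations can be recovered after a failure."""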
13
+ def _run_output_handler(self):
14
+ """Background loop: pulls from EngineCore and pushes to AsyncStreams."""
15
+
16
+ if self.output_handler is not None:
17
+ return
18
+
19
+ # Ensure that the task doesn't have a circular ref back to the AsyncLLM
20
+ # object, or else it won't be garbage collected and cleaned up properly.
21
+ engine_core = self.engine_core
22
+ output_processor = self.output_processor
23
+ log_stats = self.log_stats
24
+ logger_manager = self.logger_manager
25
+
26
+ async def output_handler(q):
27
+ try:
28
+ while True:
29
+ # 1) Pull EngineCoreOutputs from the EngineCore.
30
+ outputs = await engine_core.get_output_async()
31
+
32
+ if q is not None:
33
+ req_info = {}
34
+ for output in outputs.outputs:
35
+ req_info[output.request_id] = {}
36
+ req_info[output.request_id]["new_token_ids"] = output.new_token_ids
37
+ req_info[output.request_id]["finished"] = output.finished
38
+ await q.put.remote(req_info)
39
+
40
+ num_outputs = len(outputs.outputs)
41
+
42
+ iteration_stats = IterationStats() if (log_stats and num_outputs) else None
43
+
44
+ # Split outputs into chunks of at most
45
+ # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the
46
+ # event loop for too long.
47
+ if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE:
48
+ slices = (outputs.outputs,)
49
+ else:
50
+ slices = np.array_split(outputs.outputs, cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE))
51
+
52
+ for i, outputs_slice in enumerate(slices):
53
+ # 2) Process EngineCoreOutputs.
54
+ processed_outputs = output_processor.process_outputs(
55
+ outputs_slice, outputs.timestamp, iteration_stats
56
+ )
57
+ # NOTE: RequestOutputs are pushed to their queues.
58
+ assert not processed_outputs.request_outputs
59
+
60
+ # Allow other asyncio tasks to run between chunks
61
+ if i + 1 < len(slices):
62
+ await asyncio.sleep(0)
63
+
64
+ # 3) Abort any reqs that finished due to stop strings.
65
+ await engine_core.abort_requests_async(processed_outputs.reqs_to_abort)
66
+
67
+ # 4) Logging.
68
+ # TODO(rob): make into a coroutine and launch it in
69
+ # background thread once Prometheus overhead is non-trivial.
70
+ if logger_manager:
71
+ logger_manager.record(
72
+ engine_idx=outputs.engine_index,
73
+ scheduler_stats=outputs.scheduler_stats,
74
+ iteration_stats=iteration_stats,
75
+ )
76
+ except Exception as e:
77
+ logger.exception("AsyncLLM output_handler failed.")
78
+ output_processor.propagate_error(e)
79
+
80
+ from recipe.fault_recover.fault_manager import get_tokens_queue
81
+
82
+ tokens_queue = get_tokens_queue()
83
+
84
+ self.output_handler = asyncio.create_task(output_handler(tokens_queue))
ICL/DAPO/verl-recipe/flash_rl_ascend/README.md ADDED
@@ -0,0 +1,121 @@
1
+ ## Online Weight Quantization
2
+
3
+ This recipe describes how to use the [Flash-RL](https://github.com/yaof20/Flash-RL) tool on Ascend devices: it patches the inference backend and performs online quantization of weights and activations by comparing the INT8 model against the BF16 model. The walkthrough below uses Qwen3-30B INT8 as an example to run the workflow end to end on NPU.
4
+
5
+ ### Environment Dependencies
6
+
7
+
8
+ | PyTorch version | torch_npu version | CANN version | Python version |
9
+ | ------------ |-----------| ---------- | ---------- |
10
+ | 2.7.1 | 2.7.1 | 8.5.0 | Python3.10 |
11
+
12
+
13
+ #### 1. Install vLLM and vllm-ascend
14
+ ```bash
15
+ # vllm==0.10.1
16
+ git clone https://github.com/vllm-project/vllm.git
17
+ cd vllm
18
+ git checkout e03940762b43812fccd3c214bda60201cff9d16a
19
+ pip install -r requirements/build.txt
20
+ VLLM_TARGET_DEVICE=empty pip install -v .
21
+ cd ..
22
+
23
+ # vllm-ascend==0.10.1
24
+ git clone https://github.com/vllm-project/vllm-ascend.git
25
+ cd vllm-ascend
26
+ git checkout 7e16b4a7cdb15723c63c1c0efe58672a056eace8
27
+ pip install -r requirements.txt
28
+ export COMPILE_CUSTOM_KERNELS=1
29
+ python setup.py install
30
+ cd ..
31
+
32
+ # Install transformers from source
33
+ git clone -b v4.57.6 https://github.com/huggingface/transformers.git
34
+ cd transformers
35
+ pip install -e .
36
+ ```
37
+
38
+ #### 2. Install MindSpeed and Megatron
39
+ ```bash
40
+ # MindSpeed
41
+ git clone https://gitcode.com/Ascend/MindSpeed.git
42
+ cd MindSpeed
43
+ git checkout 1cdd0abd75e40936ad31721c092f57c695dd72c4
44
+ pip install -e .
45
+ cd ..
46
+
47
+ # Megatron
48
+ pip install git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.1
49
+ ```
50
+
51
+ #### 3. Install verl
52
+ ```bash
53
+ # verl
54
+ git clone https://github.com/volcengine/verl.git
55
+ cd verl
56
+ pip install -e .
57
+ cd ..
58
+ ```
59
+
60
+ ### Usage
61
+
62
+ #### 1. Install the package
63
+
64
+ ```
65
+ pip install flash-llm-rl  # must be installed on all nodes for multi-node training
66
+ ```
67
+
68
+ #### 2. Apply the patch
69
+
70
+ After installing FlashRL, automatic patching is used by default. We recommend switching to manual patching to reduce errors along the way:
71
+
72
+ 1. Add `import flash_rl` to `verl/verl/__init__.py` (see the snippet below);
73
+ 2. Add `flashrl cleanup` to your shell script, which disables automatic patching.
74
+
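+ For example, the first edit is a single import at the top of `verl/verl/__init__.py` (an illustrative sketch of the manual patch):
+
+ ```python
+ import flash_rl  # manual patch: importing flash_rl installs its hooks into the rollout backend
+ ```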
75
+ #### 3. Generate the profile file
76
+
77
+ Specifically, the profile compares the bf16 model against the int8 model to determine how to perform online quantization of the updated model:
78
+
79
+ ```
80
+ flashrl profile -m Qwen3-30B-A3B -q Qwen3-30B-A3B-w8a8 -o ${PROFILE_PATH:-"$HOME/profile.30b.pt"} --fn int8
81
+ ```
82
+
83
+ `-m` takes the bf16 model path, `-q` the int8 model path, and `-o` the output file path.
84
+ [RedHatAI](https://huggingface.co/RedHatAI/collections) provides a wide range of quantized models.
85
+
86
+ #### 4. Generate the config file
87
+
88
+ Generate the yaml config file used by the patch program with the following command:
89
+
90
+ ```
91
+ flashrl setup -m Qwen3-30B-A3B-w8a8 -p $HOME/profile.30b.pt --fn int8 -o ${CONFIG_PATH:-"$HOME/.flashrl_config.30b.yaml"}
92
+ ```
93
+
94
+ `-m` takes the int8 model path, `-p` the profile file path, and `-o` the output file path.
95
+
96
+ (Optional) To narrow the gap between rollout generation and gradient computation, FlashRL can mix 16-bit and 8-bit rollout generation across DP workers. Concretely, the following command appends a second configuration to the existing yaml config file.
97
+
98
+ ```
99
+ flashrl setup -a --fn bf16 -o ${CONFIG_PATH:-"$HOME/.flashrl_config.30b.yaml"}
100
+ ```
101
+
102
+ #### 5. Start training
103
+
104
+ Add the following environment variables to your script:
105
+
106
+ ```
107
+ # Print verbose logs to verify that the patch was applied successfully:
108
+ export FLASHRL_LOGGING_LEVEL=DEBUG
109
+ # Specify the config file:
110
+ export FLASHRL_CONFIG=$HOME/.flashrl_config.30b.yaml
111
+ # Keep the lm-head in higher precision (fp32) to reduce precision loss:
112
+ export FLASHRL_LMHEAD_FP32=1
113
+ ```
114
+
115
+ The steps above are demonstrated in `test_qwen3-30b_int8_npu.sh`; update the model paths in the script and it will run automatically. If you hit issues, troubleshoot against the steps above.
116
+
117
+ Fill in the machine IPs and network interfaces in `run.sh`, then launch training with:
118
+
119
+ ```
120
+ bash ./run.sh
121
+ ```
ICL/DAPO/verl-recipe/flowrl/README.md ADDED
@@ -0,0 +1,182 @@
1
+ <h1 align="center" style="color:#1976D2; font-size:42px; font-weight:bold; margin-bottom:0;">
2
+ FlowRL
3
+ </h1>
4
+
5
+ <p align="center" style="color:#42A5F5; font-size:16px; margin-top:0;">
6
+ Matching Reward Distributions via Flow Balance
7
+ </p>
8
+ <p align="center" style="color:#42A5F5; font-size:15px; margin-top:4px;">
9
+ <a href="https://arxiv.org/abs/2509.15207" target="_blank">📄 arXiv Paper</a> |
10
+ <a href="https://huggingface.co/papers/2509.15207" target="_blank">🤗 #1 Paper of the Day</a>
11
+ </p>
12
+ <p align="center" style="color:#42A5F5; font-size:14px; margin-top:4px;">
13
+ <a href="https://x.com/RoverHM/status/1969113890878259518" target="_blank">𝕏 Post 1</a> |
14
+ <a href="https://x.com/zdhnarsil/status/1969049940774023428" target="_blank">𝕏 Post 2</a> |
15
+ <a href="https://x.com/_akhaliq/status/1968901977376505929" target="_blank">𝕏 Post 3</a> |
16
+ <a href="https://x.com/zhu_xuekai/status/1968942580197941563" target="_blank">𝕏 Post 4</a>
17
+ </p>
18
+
19
+ <p align="center">
20
+ <img src="figures/flowrl.png" alt="FlowRL Overview" width="95%"/>
21
+ </p>
22
+
23
+ ## Table of Contents
24
+
25
+ - [FlowRL Objective](#flowrl-objective)
26
+ - [Trained Models & Experiment Logs](#trained-models--experiment-logs)
27
+ - [Quick Start](#quick-start)
28
+ - [Option 1: Original Paper Reproduction (verl 0.4.0)](#option-1-original-paper-reproduction-verl-040--recommended)
29
+ - [Step 1: Installation](#step-1-installation)
30
+ - [Step 2: Data Preparation](#step-2-data-preparation)
31
+ - [Step 3: Model Preparation](#step-3-model-preparation)
32
+ - [Step 4: Training Scripts](#step-4-training-scripts)
33
+ - [Option 2: Latest verl Recipe FlowRL](#option-2-latest-verl-recipe-flowrl)
34
+ - [Step 1: Prepare Data and Model](#step-1-prepare-data-and-model)
35
+ - [Step 2: Run Training](#step-2-run-training)
36
+ - [Option 3: Implement FlowRL Yourself](#option-3-implement-flowrl-yourself)
37
+ - [Testing](#testing)
38
+ - [Citation](#citation)
39
+
40
+ ## FlowRL Objective
41
+
42
+ $$
43
+ \mathcal{L}_{\text{FlowRL}} = w \cdot \left( \log Z_{\phi}(x) + \frac{1}{|y|} \log \pi_{\theta}(y \mid x) - \beta \hat{r}(x, y) - \frac{1}{|y|} \log \pi_{\text{ref}}(y \mid x) \right)^2
44
+ $$
45
+
46
+ FlowRL is a flow-balanced reinforcement learning method that matches full reward distributions instead of maximizing rewards, promoting diverse exploration and generalizable reasoning trajectories in LLMs.
47
+
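+ As a minimal sketch of this objective (our own illustration, not this recipe's actual code; the tensor names below are assumptions):
+
+ ```python
+ import torch
+
+ def flowrl_loss(log_z, logp_sum, ref_logp_sum, resp_len, reward, beta, w=1.0):
+     # log_z:        (B,) learned estimate of log Z_phi(x) for each prompt x
+     # logp_sum:     (B,) sum of policy log-probs over the response tokens of y
+     # ref_logp_sum: (B,) sum of reference-policy log-probs over the same tokens
+     # resp_len:     (B,) response lengths |y| (as a float tensor)
+     # reward:       (B,) shaped rewards r_hat(x, y); beta and w as in the formula
+     residual = log_z + logp_sum / resp_len - beta * reward - ref_logp_sum / resp_len
+     return (w * residual.pow(2)).mean()
+ ```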
48
+ ## Trained Models & Experiment Logs
49
+
50
+ | Base Model | Domain | WandB Logs | Hugging Face Model |
51
+ |-------|--------|------------|-------------------|
52
+ | Qwen2.5-7B | Math | [🔗 View Run](https://wandb.ai/xuekaizhu0/FlowRL/runs/pa62rs4x?nw=nwuserxuekaizhu0) | [🤗 Model](https://huggingface.co/xuekai/FlowRL-Qwen2.5-7B-math) |
53
+ | DeepSeek-7B | Code | [🔗 View Run](https://wandb.ai/xuekaizhu0/FlowRL/runs/wbw72gdv?nw=nwuserxuekaizhu0) | [🤗 Model](https://huggingface.co/xuekai/FlowRL-DeepSeek-7B-code) |
54
+ | Qwen2.5-32B | Math | - | [🤗 Model](https://huggingface.co/xuekai/FlowRL-Qwen2.5-32B-math) |
55
+
56
+ ## Quick Start
57
+
58
+ There are three ways to use FlowRL:
59
+
60
+ ---
61
+
62
+ **⭐ We recommend using Option 1 as the default choice.** Since verl updates frequently, the newest versions can introduce instabilities such as mismatches between training and inference. Option 1 uses verl 0.4.0, which is stable and has been thoroughly validated against our paper results.
63
+
64
+ ---
65
+
66
+ ### Option 1: Original Paper Reproduction (verl 0.4.0) ⭐ Recommended
67
+
68
+ For exact reproduction of results from the paper, use the original repository with verl 0.4.0:
69
+
70
+ 👉 **Original Code:** [https://github.com/Xuekai-Zhu/FlowRL](https://github.com/Xuekai-Zhu/FlowRL)
71
+
72
+ #### Step 1: Installation
73
+
74
+ Install [verl](https://github.com/volcengine/verl) before using FlowRL.
75
+
76
+ #### Step 2: Data Preparation
77
+
78
+ ```bash
79
+ # Option A: Download our pre-processed datasets directly
80
+ bash preprocess/down_load_dataset.sh
81
+ # Move data to default directory
82
+ mv data/xuekai/flowrl-data-collection/math_data data/math_data
83
+ mv data/xuekai/flowrl-data-collection/code_data data/code_data
84
+ ```
85
+
86
+ ```bash
87
+ # Option B: Process data from original sources
88
+ # For detailed processing instructions, see data/README.md
89
+ ```
90
+
91
+ #### Step 3: Model Preparation
92
+
93
+ For Math Tasks: `Qwen/Qwen2.5-7B` (default in script) ; `Qwen/Qwen2.5-32B`
94
+
95
+ For Code Tasks: `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B`
96
+
97
+ ```bash
98
+ # Download default model (Qwen2.5-7B for math)
99
+ bash preprocess/down_load_model.sh
100
+
101
+ # For other models, modify MODEL_NAME in the script before running
102
+ ```
103
+
104
+ #### Step 4: Training Scripts
105
+
106
+ ```bash
107
+ cd verl_FlowRL
108
+
109
+ # For 7B math training
110
+ bash command/training/math/flowrl_7B_math.sh
111
+
112
+ # For 32B math training
113
+ bash command/training/math/flowrl_32B_math.sh
114
+
115
+ # For 7B code training
116
+ bash command/training/code/flowrl_7B_code.sh
117
+ ```
118
+ ----
119
+ ### Option 2: Latest verl Recipe FlowRL
120
+
121
+ For running FlowRL using the latest verl framework:
122
+
123
+ **Latest verl:**
124
+
125
+ - verl recipe: [https://github.com/volcengine/verl/tree/main/recipe/flowrl](https://github.com/volcengine/verl/tree/main/recipe/flowrl)
126
+
127
+ #### Step 1: Prepare Data and Model
128
+
129
+ ```bash
130
+ # Prepare dataset
131
+ bash recipe/flowrl/prepare/prepare_data.sh
132
+
133
+ # Prepare model
134
+ bash recipe/flowrl/prepare/prepare_model.sh
135
+ ```
136
+
137
+ #### Step 2: Run Training
138
+
139
+ ```bash
140
+ # Train FlowRL with Qwen2.5-7B
141
+ bash recipe/flowrl/run_flowrl_qwen2.5_7b.sh
142
+ ```
143
+ ----
144
+ ### Option 3: Implement FlowRL Yourself
145
+
146
+ If you want to implement FlowRL in your own codebase, we provide a detailed implementation guide:
147
+
148
+ 📖 **[FlowRL Implementation Guide](FLOWRL_SIMPLE_GUIDE.md)**
149
+
150
+ This guide walks you through the key components and steps needed to integrate FlowRL into your existing training pipeline.
151
+
152
+ ## Testing
153
+
154
+ After training your FlowRL models, you can evaluate them using the following commands:
155
+
156
+ ```bash
157
+ cd verl_Test
158
+
159
+ # First merge the model
160
+ bash command/eval/merge_model.sh
161
+
162
+ # For math testing
163
+ bash command/eval/math/flowrl_math_test.sh
164
+
165
+ # For code testing
166
+ bash command/eval/code/flowrl_code_test.sh
167
+ ```
168
+
169
+ **Reference:** For the verl v0.5.0.dev model-merge script, see [merge_model.sh](https://github.com/Xuekai-Zhu/verl_FlowRL/blob/flowrl-v0.5.0.dev/recipe/flowrl/eval/merge_model.sh)
170
+
171
+ ## Citation
172
+
173
+ If this repo helps you, please consider citing our paper:
174
+
175
+ ```bibtex
176
+ @article{zhu2025flowrl,
177
+ title={FlowRL: Matching Reward Distributions for LLM Reasoning},
178
+ author={Zhu, Xuekai and Cheng, Daixuan and Zhang, Dinghuai and Li, Hengli and Zhang, Kaiyan and Jiang, Che and Sun, Youbang and Hua, Ermo and Zuo, Yuxin and Lv, Xingtai and others},
179
+ journal={arXiv preprint arXiv:2509.15207},
180
+ year={2025}
181
+ }
182
+ ```
ICL/DAPO/verl-recipe/flowrl/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """FlowRL recipe package."""
16
+
17
+ __all__ = []
ICL/DAPO/verl-recipe/flowrl/flowrl_fsdp_worker.py ADDED
@@ -0,0 +1,495 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """FlowRL FSDP Worker that uses FlowRLActor instead of standard DPActor."""
16
+
17
+ import logging
18
+ import os
19
+ import warnings
20
+
21
+ import torch
22
+ import torch.distributed
23
+ from peft import LoraConfig, TaskType, get_peft_model
24
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
25
+
26
+ try:
27
+ # for torch 2.5+
28
+ from torch.distributed.tensor import DTensor
29
+ except ImportError:
30
+ from torch.distributed._tensor import DTensor
31
+
32
+ from recipe.flowrl.flowrl_actor import FlowRLActor, ProjZModule
33
+
34
+ from verl.models.transformers.monkey_patch import apply_monkey_patch
35
+ from verl.single_controller.base.decorator import Dispatch, register
36
+ from verl.utils import hf_processor, hf_tokenizer
37
+ from verl.utils.activation_offload import enable_activation_offloading
38
+ from verl.utils.config import omega_conf_to_dataclass
39
+ from verl.utils.device import (
40
+ get_device_id,
41
+ get_torch_device,
42
+ set_expandable_segments,
43
+ )
44
+ from verl.utils.fsdp_utils import (
45
+ CPUOffloadPolicy,
46
+ MixedPrecisionPolicy,
47
+ apply_fsdp2,
48
+ collect_lora_params,
49
+ fsdp2_load_full_state_dict,
50
+ get_fsdp_wrap_policy,
51
+ get_init_weight_context_manager,
52
+ get_shard_placement_fn,
53
+ init_fn,
54
+ load_fsdp_model_to_gpu,
55
+ offload_fsdp_model_to_cpu,
56
+ replace_lora_wrapper,
57
+ )
58
+ from verl.utils.memory_utils import aggressive_empty_cache
59
+ from verl.utils.model import convert_weight_keys
60
+ from verl.utils.profiler import log_gpu_memory_usage
61
+ from verl.utils.py_functional import convert_to_regular_types
62
+ from verl.workers.config import FSDPEngineConfig
63
+ from verl.workers.fsdp_workers import ActorRolloutRefWorker, get_sharding_strategy, get_vl_model_vision_tower
64
+
65
+ logger = logging.getLogger(__file__)
66
+ logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
67
+
68
+
69
+ class FlowRLActorRolloutRefWorker(ActorRolloutRefWorker):
70
+ """
71
+ FlowRL version of ActorRolloutRefWorker that uses FlowRLActor.
72
+
73
+ This worker adds FlowRL-specific modifications:
74
+ - ProjZModule for log Z estimation (added in _build_model_optimizer)
75
+ - FlowRLActor with trajectory balance loss (replaces standard DPActor)
76
+ """
77
+
78
+ def _build_model_optimizer(
79
+ self,
80
+ model_path,
81
+ fsdp_config: FSDPEngineConfig,
82
+ optim_config,
83
+ override_model_config,
84
+ use_remove_padding=False,
85
+ use_fused_kernels=False,
86
+ enable_gradient_checkpointing=False,
87
+ trust_remote_code=False,
88
+ use_liger=False,
89
+ role="actor",
90
+ enable_activation_offload=False,
91
+ ):
92
+ from torch import optim
93
+ from torch.distributed.fsdp import CPUOffload, MixedPrecision
94
+ from transformers import (
95
+ AutoConfig,
96
+ AutoModel,
97
+ AutoModelForCausalLM,
98
+ AutoModelForImageTextToText,
99
+ AutoModelForVision2Seq,
100
+ )
101
+
102
+ from verl.utils.model import get_generation_config, print_model_size, update_model_config
103
+ from verl.utils.torch_dtypes import PrecisionType
104
+
105
+ assert role in ["actor", "ref"]
106
+
107
+ log_gpu_memory_usage(f"Before init {role} from HF AutoModel", logger=logger)
108
+ local_path = model_path
109
+
110
+ # note that we have to create the model in fp32. Otherwise, the optimizer is in bf16, which is incorrect
111
+ # TODO(zhangchi.usc1992): 1. support create from random initialized model. 2. Support init with FSDP directly
112
+ self.tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
113
+ self.processor = hf_processor(local_path, trust_remote_code=trust_remote_code)
114
+
115
+ if self.config.model.get("custom_chat_template", None) is not None:
116
+ if self.processor is not None:
117
+ self.processor.chat_template = self.config.model.custom_chat_template
118
+ else:
119
+ self.tokenizer.chat_template = self.config.model.custom_chat_template
120
+
121
+ vllm_dtype = PrecisionType.to_dtype(self.config.rollout.dtype)
122
+ torch_dtype = fsdp_config.get("model_dtype", None)
123
+ if torch_dtype is None:
124
+ torch_dtype = torch.float32 if self._is_actor else vllm_dtype
125
+ else:
126
+ torch_dtype = PrecisionType.to_dtype(torch_dtype)
127
+
128
+ # override model kwargs
129
+ actor_model_config = AutoConfig.from_pretrained(
130
+ local_path, trust_remote_code=trust_remote_code, attn_implementation="flash_attention_2"
131
+ )
132
+ # TODO: VL models use VisionAttention, which directly uses flash_attention in transformers>=4.53
133
+ # which will be patched by _ulysses_flash_attention_forward, but erroneously misses position_ids
134
+ # Maybe support Ulysses in VisionAttention in the future and remove this patch
135
+ if self.ulysses_sequence_parallel_size > 1 and hasattr(actor_model_config, "vision_config"):
136
+ actor_model_config.vision_config._attn_implementation = "eager"
137
+
138
+ # patch for kimi-vl
139
+ if getattr(actor_model_config, "model_type", None) == "kimi_vl":
140
+ actor_model_config.text_config.topk_method = "greedy"
141
+
142
+ self.generation_config = get_generation_config(local_path, trust_remote_code=trust_remote_code)
143
+
144
+ override_config_kwargs = {
145
+ "bos_token_id": self.tokenizer.bos_token_id,
146
+ "eos_token_id": self.tokenizer.eos_token_id,
147
+ "pad_token_id": self.tokenizer.pad_token_id,
148
+ }
149
+ override_config_kwargs.update(override_model_config)
150
+ update_model_config(actor_model_config, override_config_kwargs=override_config_kwargs)
151
+ if self.rank == 0:
152
+ print(f"Model config after override: {actor_model_config}")
153
+
154
+ # NOTE(fix me): tie_word_embedding causes meta_tensor init to hang
155
+ init_context = get_init_weight_context_manager(
156
+ use_meta_tensor=not actor_model_config.tie_word_embeddings, mesh=self.device_mesh
157
+ )
158
+
159
+ with init_context(), warnings.catch_warnings():
160
+ warnings.simplefilter("ignore")
161
+ has_remote_code = hasattr(actor_model_config, "auto_map") and any(
162
+ actor_model_config.architectures[0] in val for val in actor_model_config.auto_map.values()
163
+ )
164
+ if has_remote_code:
165
+ auto_class = next(
166
+ k for k, v in actor_model_config.auto_map.items() if actor_model_config.architectures[0] in v
167
+ )
168
+ match auto_class:
169
+ case "AutoModelForVision2Seq":
170
+ actor_module_class = AutoModelForVision2Seq
171
+ case "AutoModelForCausalLM":
172
+ actor_module_class = AutoModelForCausalLM
173
+ case "AutoModelForImageTextToText":
174
+ actor_module_class = AutoModelForImageTextToText
175
+ case _:
176
+ actor_module_class = AutoModel
177
+ else:
178
+ if type(actor_model_config) in AutoModelForVision2Seq._model_mapping.keys():
179
+ actor_module_class = AutoModelForVision2Seq
180
+ elif type(actor_model_config) in AutoModelForCausalLM._model_mapping.keys():
181
+ actor_module_class = AutoModelForCausalLM
182
+ elif type(actor_model_config) in AutoModelForImageTextToText._model_mapping.keys():
183
+ actor_module_class = AutoModelForImageTextToText
184
+ else:
185
+ actor_module_class = AutoModel
186
+
187
+ actor_module = actor_module_class.from_pretrained(
188
+ pretrained_model_name_or_path=local_path,
189
+ torch_dtype=torch_dtype,
190
+ config=actor_model_config,
191
+ trust_remote_code=trust_remote_code,
192
+ )
193
+
194
+ # ==== FlowRL: inject ProjZ BEFORE FSDP wrap ====
195
+ if role == "actor" and self._is_actor:
196
+ n_dim = actor_module.config.hidden_size
197
+ proj_layers = getattr(self.config.actor, "proj_layer", 3)
198
+ actor_module.add_module("proj_z", ProjZModule(n_dim, num_layers=proj_layers))
199
+
200
+ if self.rank == 0:
201
+ print(f"[FlowRL] Added proj_z (layers={proj_layers}, hidden={n_dim}) BEFORE FSDP wrap")
202
+ # ===============================================
203
+
204
+ # Apply Liger kernel to the model if use_liger is set to True
205
+ if use_liger:
206
+ from liger_kernel.transformers.monkey_patch import _apply_liger_kernel_to_instance
207
+
208
+ _apply_liger_kernel_to_instance(model=actor_module)
209
+
210
+ fused_kernel_options = self.config.model.get("fused_kernel_options", None)
211
+ fused_kernels_backend = (
212
+ fused_kernel_options.get("impl_backend", None) if fused_kernel_options is not None else None
213
+ )
214
+
215
+ apply_monkey_patch(
216
+ model=actor_module,
217
+ use_remove_padding=use_remove_padding,
218
+ ulysses_sp_size=self.ulysses_sequence_parallel_size,
219
+ use_fused_kernels=use_fused_kernels,
220
+ fused_kernels_backend=fused_kernels_backend,
221
+ )
222
+
223
+ # some parameters may not in torch_dtype. TODO(zhangchi.usc1992) remove this after we switch to fsdp2
224
+ actor_module.to(torch_dtype)
225
+
226
+ if enable_gradient_checkpointing:
227
+ actor_module.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
228
+ if self._is_lora:
229
+ print("Applying LoRA to actor module")
230
+ actor_module.enable_input_require_grads()
231
+ # Convert config to regular Python types before creating PEFT model
232
+ lora_config = {
233
+ "task_type": TaskType.CAUSAL_LM,
234
+ "r": self.config.model.lora_rank,
235
+ "lora_alpha": self.config.model.lora_alpha,
236
+ "target_modules": convert_to_regular_types(self.config.model.target_modules),
237
+ "exclude_modules": convert_to_regular_types(self.config.model.exclude_modules),
238
+ "bias": "none",
239
+ }
240
+ actor_module = get_peft_model(actor_module, LoraConfig(**lora_config))
241
+
242
+ self.use_orig_params = fsdp_config.get("use_orig_params", False)
243
+ if self.config.actor.get("freeze_vision_tower", False):
244
+ vision_tower = get_vl_model_vision_tower(actor_module)
245
+ if vision_tower is not None:
246
+ vision_tower.requires_grad_(False)
247
+ self.use_orig_params = True
248
+ if self.rank == 0:
249
+ print("[actor model] Vision tower is set to not trainable.")
250
+ else:
251
+ if self.rank == 0:
252
+ print("[actor model] No vision tower found.")
253
+
254
+ torch.distributed.barrier()
255
+
256
+ if self.rank == 0:
257
+ print_model_size(actor_module)
258
+
259
+ log_gpu_memory_usage(f"After init {role} from HF AutoModel", logger=logger)
260
+
261
+ # We wrap FSDP for rollout as well
262
+ mixed_precision_config = fsdp_config.get("mixed_precision", None)
263
+ if mixed_precision_config is not None:
264
+ param_dtype = PrecisionType.to_dtype(mixed_precision_config.get("param_dtype", "bf16"))
265
+ reduce_dtype = PrecisionType.to_dtype(mixed_precision_config.get("reduce_dtype", "fp32"))
266
+ buffer_dtype = PrecisionType.to_dtype(mixed_precision_config.get("buffer_dtype", "fp32"))
267
+ else:
268
+ param_dtype = PrecisionType.to_dtype(self.config.actor.get("dtype", "bfloat16"))
269
+ reduce_dtype = torch.float32
270
+ buffer_dtype = torch.float32
271
+
272
+ mixed_precision = MixedPrecision(param_dtype=param_dtype, reduce_dtype=reduce_dtype, buffer_dtype=buffer_dtype)
273
+
274
+ auto_wrap_policy = get_fsdp_wrap_policy(
275
+ module=actor_module,
276
+ config=fsdp_config.get("wrap_policy", None),
277
+ is_lora=self.config.model.get("lora_rank", 0) > 0,
278
+ )
279
+
280
+ if self._is_rollout and self.config.rollout.name == "hf":
281
+ # TODO(zhangchi.usc1992, shengguangming) fix me. Current, auto_wrap_policy causes HFRollout to hang in Gemma
282
+ auto_wrap_policy = None
283
+
284
+ if self.rank == 0:
285
+ print(f"wrap_policy: {auto_wrap_policy}")
286
+
287
+ fsdp_mesh = self.device_mesh
288
+ sharding_strategy = get_sharding_strategy(fsdp_mesh)
289
+
290
+ # TODO: add transformer policy
291
+ # We force reference policy to use CPUOffload to save memory.
292
+ # We force turn off CPUOffload for actor because it causes incorrect results when using grad accumulation
293
+ cpu_offload = None if role == "actor" else CPUOffload(offload_params=True)
294
+ fsdp_strategy = self.config.actor.strategy
295
+ if fsdp_strategy == "fsdp":
296
+ actor_module_fsdp = FSDP(
297
+ actor_module,
298
+ cpu_offload=cpu_offload,
299
+ param_init_fn=init_fn,
300
+ auto_wrap_policy=auto_wrap_policy,
301
+ device_id=get_device_id(),
302
+ sharding_strategy=sharding_strategy, # zero3
303
+ mixed_precision=mixed_precision,
304
+ sync_module_states=True,
305
+ device_mesh=self.device_mesh,
306
+ use_orig_params=self.use_orig_params,
307
+ forward_prefetch=fsdp_config.get("forward_prefetch", False),
308
+ )
309
+ elif fsdp_strategy == "fsdp2":
310
+ assert CPUOffloadPolicy is not None, "PyTorch version >= 2.4 is required for using fully_shard API (FSDP2)"
311
+ mp_policy = MixedPrecisionPolicy(
312
+ param_dtype=param_dtype, reduce_dtype=reduce_dtype, cast_forward_inputs=True
313
+ )
314
+ if role == "actor" and fsdp_config.offload_policy:
315
+ cpu_offload = CPUOffloadPolicy(pin_memory=True)
316
+ self._is_offload_param = False
317
+ self._is_offload_optimizer = False
318
+ else:
319
+ cpu_offload = None if role == "actor" else CPUOffloadPolicy(pin_memory=True)
320
+
321
+ fsdp_kwargs = {
322
+ "mesh": fsdp_mesh,
323
+ "mp_policy": mp_policy,
324
+ "offload_policy": cpu_offload,
325
+ "reshard_after_forward": fsdp_config.reshard_after_forward,
326
+ "shard_placement_fn": get_shard_placement_fn(fsdp_size=self.device_mesh.shape[-1]),
327
+ }
328
+ full_state = actor_module.state_dict()
329
+ apply_fsdp2(actor_module, fsdp_kwargs, fsdp_config)
330
+ fsdp2_load_full_state_dict(actor_module, full_state, fsdp_mesh, cpu_offload)
331
+ actor_module_fsdp = actor_module
332
+ else:
333
+ raise NotImplementedError(f"not implement {fsdp_strategy}")
334
+
335
+ if enable_activation_offload:
336
+ enable_activation_offloading(actor_module_fsdp, fsdp_strategy, enable_gradient_checkpointing)
337
+
338
+ log_gpu_memory_usage(f"After {role} FSDP init", logger=logger)
339
+
340
+ # TODO: add more optimizer args into config
341
+ if role == "actor" and optim_config is not None:
342
+ from verl.utils.torch_functional import get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup
343
+
344
+ actor_optimizer = optim.AdamW(
345
+ actor_module_fsdp.parameters(),
346
+ lr=optim_config.lr,
347
+ betas=optim_config.get("betas", (0.9, 0.999)),
348
+ weight_decay=optim_config.get("weight_decay", 1e-2),
349
+ )
350
+
351
+ total_steps = optim_config.get("total_training_steps", 0)
352
+ num_warmup_steps = int(optim_config.get("lr_warmup_steps", -1))
353
+ warmup_style = optim_config.get("warmup_style", "constant")
354
+ min_lr_ratio = optim_config.get("min_lr_ratio", 0.0)
355
+ num_cycles = optim_config.get("num_cycles", 0.5)
356
+ if num_warmup_steps < 0:
357
+ num_warmup_steps_ratio = optim_config.get("lr_warmup_steps_ratio", 0.0)
358
+ num_warmup_steps = int(num_warmup_steps_ratio * total_steps)
359
+
360
+ if self.rank == 0:
361
+ print(f"Total steps: {total_steps}, num_warmup_steps: {num_warmup_steps}")
362
+
363
+ if warmup_style == "constant":
364
+ actor_lr_scheduler = get_constant_schedule_with_warmup(
365
+ optimizer=actor_optimizer, num_warmup_steps=num_warmup_steps
366
+ )
367
+ elif warmup_style == "cosine":
368
+ actor_lr_scheduler = get_cosine_schedule_with_warmup(
369
+ optimizer=actor_optimizer,
370
+ num_warmup_steps=num_warmup_steps,
371
+ num_training_steps=total_steps,
372
+ min_lr_ratio=min_lr_ratio,
373
+ num_cycles=num_cycles,
374
+ )
375
+ else:
376
+ raise NotImplementedError(f"Warmup style {warmup_style} is not supported")
377
+
378
+ log_gpu_memory_usage(f"After {role} optimizer init", logger=logger)
379
+ else:
380
+ actor_optimizer = None
381
+ actor_lr_scheduler = None
382
+
383
+ return actor_module_fsdp, actor_optimizer, actor_lr_scheduler, actor_model_config
384
+
385
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
386
+ def init_model(self):
387
+ """Override init_model to use FlowRLActor instead of DataParallelPPOActor."""
388
+ # Call parent's init_model to set up the FSDP model (with proj_z already added)
389
+ super().init_model()
390
+
391
+ # Replace the actor with FlowRLActor if this worker is an actor
392
+ if self._is_actor:
393
+ if self.rank == 0:
394
+ print("[FlowRL] Replacing DataParallelPPOActor with FlowRLActor")
395
+
396
+ # Convert actor config to dataclass
397
+ actor_cfg = omega_conf_to_dataclass(self.config.actor)
398
+
399
+ # Create FlowRLActor with trajectory balance loss
400
+ self.actor = FlowRLActor(
401
+ config=actor_cfg, actor_module=self.actor_module_fsdp, actor_optimizer=self.actor_optimizer
402
+ )
403
+
404
+ async def rollout_mode(self):
405
+ """
406
+ Override rollout_mode to filter out proj_z parameters before syncing to vLLM.
407
+
408
+ FlowRL's proj_z module is only needed during training for estimating log Z.
409
+ It should not be loaded into vLLM since vLLM is only used for rollout generation.
410
+ """
411
+ aggressive_empty_cache(force_sync=True)
412
+
413
+ log_gpu_memory_usage("Before load_fsdp_model_to_gpu", logger=logger)
414
+ if self._is_offload_param:
415
+ load_fsdp_model_to_gpu(self.actor_module_fsdp)
416
+ log_gpu_memory_usage("After load_fsdp_model_to_gpu", logger=logger)
417
+
418
+ peft_config = None
419
+ peft_model = getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp)
420
+ if hasattr(peft_model, "peft_config"): # LoRA
421
+ peft_config = peft_model.peft_config.get("default", None)
422
+ params = collect_lora_params(
423
+ module=self.actor_module_fsdp,
424
+ layered_summon=self.config.rollout.get("layered_summon", False),
425
+ base_sync_done=self.base_sync_done,
426
+ )
427
+ if not self.base_sync_done:
428
+ params = {replace_lora_wrapper(k, peft_config): v for k, v in params.items()}
429
+ else:
430
+ params = self.actor_module_fsdp.state_dict()
431
+
432
+ # ==== FlowRL: Filter out proj_z parameters ====
433
+ params = {k: v for k, v in params.items() if not k.startswith("proj_z")}
434
+ num_proj_z_filtered = len([k for k in self.actor_module_fsdp.state_dict().keys() if k.startswith("proj_z")])
435
+ if num_proj_z_filtered > 0 and self.rank == 0:
436
+ print(f"[FlowRL] Filtered {num_proj_z_filtered} proj_z parameters before syncing to vLLM")
437
+ # ===============================================
438
+
439
+ params = convert_weight_keys(
440
+ params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp)
441
+ )
442
+
443
+ # Special handling for LoRA with sleep_level=2:
444
+ if peft_config is not None and getattr(self.rollout, "sleep_level", None) == 2:
445
+ base_model_params = collect_lora_params(
446
+ module=self.actor_module_fsdp,
447
+ layered_summon=self.layered_summon,
448
+ base_sync_done=False,
449
+ )
450
+ base_model_params = {replace_lora_wrapper(k, peft_config): v for k, v in base_model_params.items()}
451
+ # Filter proj_z from base model params as well
452
+ base_model_params = {k: v for k, v in base_model_params.items() if not k.startswith("proj_z")}
453
+ base_model_params = convert_weight_keys(
454
+ base_model_params, getattr(self.actor_module_fsdp, "_fsdp_wrapped_module", self.actor_module_fsdp)
455
+ )
456
+
457
+ log_gpu_memory_usage("Before offload_fsdp_model_to_cpu", logger=logger)
458
+ if self._is_offload_param:
459
+ offload_fsdp_model_to_cpu(self.actor_module_fsdp)
460
+ log_gpu_memory_usage("After offload_fsdp_model_to_cpu", logger=logger)
461
+
462
+ set_expandable_segments(False)
463
+
464
+ if peft_config is not None and self.base_sync_done:
465
+ per_tensor_param = params
466
+ else:
467
+ device = get_device_id()
468
+ per_tensor_param = (
469
+ (name, param.to(device, non_blocking=True).full_tensor() if isinstance(param, DTensor) else param)
470
+ for name, param in params.items()
471
+ )
472
+
473
+ if self.config.rollout.free_cache_engine:
474
+ await self.rollout.resume(tags=["weights"])
475
+ log_gpu_memory_usage("After resume weights", logger=logger)
476
+
477
+ if peft_config is not None and getattr(self.rollout, "sleep_level", None) == 2:
478
+ per_tensor_base_params = (
479
+ (name, param.to(device, non_blocking=True).full_tensor() if isinstance(param, DTensor) else param)
480
+ for name, param in base_model_params.items()
481
+ )
482
+ await self.rollout.update_weights(per_tensor_base_params, base_sync_done=False)
483
+ del base_model_params, per_tensor_base_params
484
+
485
+ await self.rollout.update_weights(per_tensor_param, peft_config=peft_config, base_sync_done=self.base_sync_done)
486
+ log_gpu_memory_usage("After update_weights", logger=logger)
487
+ del params, per_tensor_param
488
+ aggressive_empty_cache(force_sync=True)
489
+ if self.config.rollout.free_cache_engine:
490
+ await self.rollout.resume(tags=["kv_cache"])
491
+ log_gpu_memory_usage("After resume kv_cache", logger=logger)
492
+
493
+ self.base_sync_done = True
494
+ self.torch_random_states = get_torch_device().get_rng_state()
495
+ get_torch_device().set_rng_state(self.gen_random_states)
ICL/DAPO/verl-recipe/flowrl/main_flowrl.py ADDED
@@ -0,0 +1,185 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Main training script for FlowRL algorithm."""
17
+
18
+ import os
19
+ import socket
20
+
21
+ import hydra
22
+ import ray
23
+ from omegaconf import OmegaConf
24
+
25
+ from verl.trainer.ppo.reward import load_reward_manager
26
+ from verl.utils.device import is_cuda_available
27
+
28
+
29
+ @hydra.main(config_path="config", config_name="flowrl_trainer", version_base=None)
30
+ def main(config):
31
+ run_flowrl(config)
32
+
33
+
34
+ def run_flowrl(config) -> None:
35
+ if not ray.is_initialized():
36
+ # this is for local ray cluster
37
+ default_runtime_env = {
38
+ "env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN", "VLLM_LOGGING_LEVEL": "WARN"}
39
+ }
40
+ ray_init_kwargs = config.ray_kwargs.get("ray_init", {})
41
+ runtime_env_kwargs = ray_init_kwargs.get("runtime_env", {})
42
+ runtime_env = OmegaConf.merge(default_runtime_env, runtime_env_kwargs)
43
+ ray_init_kwargs = OmegaConf.create({**ray_init_kwargs, "runtime_env": runtime_env})
44
+ print(f"ray init kwargs: {ray_init_kwargs}")
45
+ ray.init(**OmegaConf.to_container(ray_init_kwargs))
46
+
47
+ try:
48
+ if (
49
+ is_cuda_available
50
+ and config.global_profiler.tool == "nsys"
51
+ and OmegaConf.select(config.global_profiler, "steps") is not None
52
+ and len(OmegaConf.select(config.global_profiler, "steps")) > 0
53
+ ):
54
+ nsight_options = OmegaConf.to_container(
55
+ config.global_profiler.global_tool_config.nsys.controller_nsight_options
56
+ )
57
+ runner = TaskRunner.options(runtime_env={"nsight": nsight_options}).remote()
58
+ else:
59
+ runner = TaskRunner.remote()
60
+ ray.get(runner.run.remote(config))
61
+ finally:
62
+ if ray.is_initialized():
63
+ ray.shutdown()
64
+
65
+
66
+ @ray.remote(num_cpus=1) # please make sure main_task is not scheduled on head
67
+ class TaskRunner:
68
+ def run(self, config):
69
+ # print initial config
70
+ from pprint import pprint
71
+
72
+ from omegaconf import OmegaConf
73
+
74
+ from verl.utils.fs import copy_to_local
75
+
76
+ print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}")
77
+
78
+ pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values
79
+ OmegaConf.resolve(config)
80
+
81
+ # download the checkpoint from hdfs
82
+ local_path = copy_to_local(config.actor_rollout_ref.model.path)
83
+
84
+ # instantiate tokenizer
85
+ from verl.utils import hf_processor, hf_tokenizer
86
+
87
+ tokenizer = hf_tokenizer(local_path)
88
+ processor = hf_processor(local_path, use_fast=True)  # used for multimodal LLMs; may be None
89
+
90
+ from verl.single_controller.ray import RayWorkerGroup
91
+
92
+ # define worker classes
93
+ if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}:
94
+ assert config.critic.strategy in {"fsdp", "fsdp2"}
95
+
96
+ # Use FlowRL custom worker instead of standard worker
97
+ from recipe.flowrl.flowrl_fsdp_worker import FlowRLActorRolloutRefWorker
98
+
99
+ from verl.workers.fsdp_workers import CriticWorker # , ActorRolloutRefWorker
100
+
101
+ ActorRolloutRefWorker = FlowRLActorRolloutRefWorker
102
+ ray_worker_group_cls = RayWorkerGroup
103
+
104
+ elif config.actor_rollout_ref.actor.strategy == "megatron":
105
+ assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
106
+ from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker
107
+
108
+ ray_worker_group_cls = RayWorkerGroup
109
+
110
+ else:
111
+ raise NotImplementedError
112
+
113
+ from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
114
+
115
+ role_worker_mapping = {
116
+ Role.ActorRollout: ray.remote(ActorRolloutRefWorker),
117
+ Role.Critic: ray.remote(CriticWorker),
118
+ }
119
+
120
+ global_pool_id = "global_pool"
121
+ resource_pool_spec = {
122
+ global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
123
+ }
124
+ mapping = {
125
+ Role.ActorRollout: global_pool_id,
126
+ Role.Critic: global_pool_id,
127
+ }
128
+
129
+ # we should adopt a multi-source reward function here
130
+ # - for rule-based rm, we directly call a reward score
131
+ # - for model-based rm, we call a model
132
+ # - for code related prompt, we send to a sandbox if there are test cases
133
+ # - finally, we combine all the rewards together
134
+ # - The reward type depends on the tag of the data
135
+ if config.reward_model.enable:
136
+ if config.reward_model.strategy in {"fsdp", "fsdp2"}:
137
+ from verl.workers.fsdp_workers import RewardModelWorker
138
+ elif config.reward_model.strategy == "megatron":
139
+ from verl.workers.megatron_workers import RewardModelWorker
140
+ else:
141
+ raise NotImplementedError
142
+ role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
143
+ mapping[Role.RewardModel] = global_pool_id
144
+
145
+ # reference model
146
+ if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
147
+ role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
148
+ mapping[Role.RefPolicy] = global_pool_id
149
+
150
+ reward_fn = load_reward_manager(
151
+ config,
152
+ tokenizer,
153
+ 0,
154
+ max_resp_len=config.data.max_response_length,
155
+ overlong_buffer_cfg=config.reward_model.overlong_buffer,
156
+ )
157
+
158
+ # Note that we always use function-based RM for validation
159
+ val_reward_fn = load_reward_manager(
160
+ config,
161
+ tokenizer,
162
+ 1,
163
+ max_resp_len=config.data.max_response_length,
164
+ overlong_buffer_cfg=config.reward_model.overlong_buffer,
165
+ )
166
+ resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
167
+
168
+ from recipe.flowrl.flowrl_ray_trainer import RayFlowRLTrainer
169
+
170
+ trainer = RayFlowRLTrainer(
171
+ config=config,
172
+ tokenizer=tokenizer,
173
+ processor=processor,
174
+ role_worker_mapping=role_worker_mapping,
175
+ resource_pool_manager=resource_pool_manager,
176
+ ray_worker_group_cls=ray_worker_group_cls,
177
+ reward_fn=reward_fn,
178
+ val_reward_fn=val_reward_fn,
179
+ )
180
+ trainer.init_workers()
181
+ trainer.fit()
182
+
183
+
184
+ if __name__ == "__main__":
185
+ main()
ICL/DAPO/verl-recipe/flowrl/run_flowrl_qwen2.5_7b.sh ADDED
@@ -0,0 +1,134 @@
1
+ #!/usr/bin/env bash
2
+ set -xeuo pipefail
3
+
4
+ project_name='FlowRL'
5
+ exp_name='FlowRL-Qwen2.5-7B'
6
+
7
+ # Algorithm settings
8
+ adv_estimator=grpo
9
+
10
+ # KL settings (ref policy needed for FlowRL, but KL penalty disabled)
11
+ use_kl_in_reward=False # KL-in-reward disabled; the ref policy is still enabled via use_kl_loss below (ref_log_prob is needed for the FlowRL loss)
12
+ kl_coef=0.0
13
+ use_kl_loss=True
14
+ kl_loss_coef=0.0
15
+
16
+ # Clip parameters
17
+ clip_ratio_low=0.2
18
+ clip_ratio_high=0.28
19
+
20
+ # Sequence lengths
21
+ max_prompt_length=$((1024 * 2))
22
+ max_response_length=$((1024 * 8))
23
+
24
+ # Overlong buffer for very long responses
25
+ enable_overlong_buffer=True
26
+ overlong_buffer_len=$((1024 * 4))
27
+ overlong_penalty_factor=1.0
28
+
29
+ # Batch sizes
30
+ train_prompt_bsz=512
31
+ gen_prompt_bsz=$((train_prompt_bsz * 3))
32
+ n_resp_per_prompt=8
33
+ train_prompt_mini_bsz=32
34
+
35
+ # Checkpoint saving frequency (-1 to disable periodic saves)
36
+ save_freq=-1
37
+
38
+ # Ray
39
+ RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
40
+ WORKING_DIR=${WORKING_DIR:-"${PWD}"}
41
+ RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
42
+ NNODES=${NNODES:-1}
43
+
44
+ # Paths
45
+ MODEL_PATH=${MODEL_PATH:-"${WORKING_DIR}/downloads/models/Qwen/Qwen2.5-7B"}
46
+ CKPTS_DIR=${CKPTS_DIR:-"${WORKING_DIR}/outputs/ckpts/${project_name}/${exp_name}"}
47
+ TRAIN_FILE=${TRAIN_FILE:-"${WORKING_DIR}/downloads/data/dapo-math-17k.parquet"}
48
+ TEST_FILE=${TEST_FILE:-"${WORKING_DIR}/downloads/data/aime-2024.parquet"}
49
+
50
+ # Sampling
51
+ temperature=1.0
52
+ top_p=1.0
53
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
54
+ val_top_p=0.7
55
+
56
+ # Performance Related Parameter
57
+ n_gpus=8
58
+ sp_size=1
59
+ use_dynamic_bsz=True
60
+ actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
61
+ infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
62
+ offload=False
63
+ gen_tp=1
64
+
65
+
66
+ python3 -m recipe.flowrl.main_flowrl \
67
+ data.train_files="${TRAIN_FILE}" \
68
+ data.val_files="${TEST_FILE}" \
69
+ data.prompt_key=prompt \
70
+ data.truncation='left' \
71
+ data.max_prompt_length=${max_prompt_length} \
72
+ data.max_response_length=${max_response_length} \
73
+ data.gen_batch_size=${gen_prompt_bsz} \
74
+ data.train_batch_size=${train_prompt_bsz} \
75
+ actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
76
+ algorithm.adv_estimator=${adv_estimator} \
77
+ algorithm.use_kl_in_reward=${use_kl_in_reward} \
78
+ algorithm.kl_ctrl.kl_coef=${kl_coef} \
79
+ actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
80
+ actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
81
+ actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
82
+ actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
83
+ actor_rollout_ref.actor.clip_ratio_c=10.0 \
84
+ actor_rollout_ref.model.use_remove_padding=True \
85
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
86
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
87
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
88
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
89
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
90
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
91
+ actor_rollout_ref.model.path="${MODEL_PATH}" \
92
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
93
+ actor_rollout_ref.actor.optim.lr=1e-6 \
94
+ actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
95
+ actor_rollout_ref.actor.optim.warmup_style='constant' \
96
+ actor_rollout_ref.actor.optim.weight_decay=0.1 \
97
+ actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
98
+ actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
99
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
100
+ actor_rollout_ref.actor.entropy_coeff=0 \
101
+ actor_rollout_ref.actor.grad_clip=1.0 \
102
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
103
+ actor_rollout_ref.rollout.calculate_log_probs=True \
104
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
105
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
106
+ actor_rollout_ref.rollout.enable_chunked_prefill=True \
107
+ actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
108
+ actor_rollout_ref.rollout.temperature=${temperature} \
109
+ actor_rollout_ref.rollout.top_p=${top_p} \
110
+ actor_rollout_ref.rollout.top_k="${top_k}" \
111
+ actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
112
+ actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
113
+ actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
114
+ actor_rollout_ref.rollout.val_kwargs.do_sample=True \
115
+ actor_rollout_ref.rollout.val_kwargs.n=1 \
116
+ actor_rollout_ref.rollout.name=vllm \
117
+ actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
118
+ actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
119
+ actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
120
+ reward_model.reward_manager=dapo \
121
+ reward_model.overlong_buffer.enable=${enable_overlong_buffer} \
122
+ reward_model.overlong_buffer.len=${overlong_buffer_len} \
123
+ reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
124
+ trainer.logger='["console","wandb"]' \
125
+ trainer.project_name="${project_name}" \
126
+ trainer.experiment_name="${exp_name}" \
127
+ trainer.n_gpus_per_node=${n_gpus} \
128
+ trainer.nnodes="${NNODES}" \
129
+ trainer.val_before_train=True \
130
+ trainer.test_freq=10 \
131
+ trainer.save_freq=${save_freq} \
132
+ trainer.total_epochs=1 \
133
+ trainer.default_local_dir="${CKPTS_DIR}" \
134
+ trainer.resume_mode=auto
ICL/DAPO/verl-recipe/infigui-g1/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # Recipe for InfiGUI-G1
2
+
3
+ This directory contains the official implementation for the paper [InfiGUI-G1: Advancing GUI Grounding with Adaptive Exploration Policy Optimization](https://arxiv.org/abs/2508.05731).
4
+
5
+ This work introduces Adaptive Exploration Policy Optimization (AEPO), a policy optimization framework designed to enhance GUI grounding in Multimodal Large Language Models (MLLMs). AEPO improves exploration efficiency by employing a multi-answer generation strategy and a theoretically grounded Adaptive Exploration Reward (AER) function. This approach effectively addresses the challenge of semantic alignment in complex GUI grounding tasks.
6
+
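+ To make the idea concrete, here is a toy sketch of a multi-answer exploration reward (our illustration only; the paper derives the actual AER function theoretically, so consult the paper for the real formula):
+
+ ```python
+ def toy_exploration_reward(is_correct: list[bool], alpha: float = 0.5) -> float:
+     """Reward the first correct candidate among the model's multiple proposed
+     answers, discounted by rank so earlier (higher-confidence) hits score more."""
+     for rank, ok in enumerate(is_correct):
+         if ok:
+             return 1.0 / (1.0 + alpha * rank)
+     return 0.0
+ ```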
7
+ We provide training scripts for both 3B and 7B models, configured for a single machine with 8 GPUs by default.
8
+
9
+ ## Environment Setup
10
+
11
+ Please follow the main environment setup guide for `verl`.
12
+
13
+ The provided scripts use the following Docker image: `verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2`
14
+
15
+ ## Data Preparation
16
+
17
+ Before starting the training, you need to download the example dataset. This dataset is a filtered version of [omniact](https://huggingface.co/datasets/Writer/omniact), containing only grounding tasks and excluding easy samples.
18
+
19
+ The data is hosted on the Hugging Face Hub. You can download it using the `huggingface-cli`:
20
+
21
+ ```bash
22
+ huggingface-cli download --repo-type dataset --resume-download InfiX-ai/omniact_grounding_filtered --local-dir data/omniact_grounding_filtered
23
+ ```
24
+
25
+ This command will download the training and validation parquet files into the `data/omniact_grounding_filtered` directory, which is the default path used by the scripts.
26
+
27
+ ## Training
28
+
29
+ We provide scripts to train the 3B and 7B models. Please run them from the root directory of `verl`.
30
+
31
+ - **Train the 3B model:**
32
+
33
+ ```bash
34
+ bash recipe/infigui-g1/run_3b.sh
35
+ ```
36
+
37
+ - **Train the 7B model:**
38
+
39
+ ```bash
40
+ bash recipe/infigui-g1/run_7b.sh
41
+ ```
42
+
43
+ ## Using Custom Data
44
+
45
+ If you wish to train on your own dataset, please format your data to match the structure of the example files located in `data/omniact_grounding_filtered`.
46
+
47
+ Once your data is ready, you need to update the data path arguments in the training script.
48
+
49
+ In `run_3b.sh` or `run_7b.sh`, modify the following lines:
50
+
51
+ ```bash
52
+ data.train_files=./path/to/your/train_data.parquet \
53
+ data.val_files=./path/to/your/val_data.parquet \
54
+ ```
55
+
56
+ Replace the paths with the location of your custom data files.
ICL/DAPO/verl-recipe/langgraph_agent/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
ICL/DAPO/verl-recipe/langgraph_agent/chat_model.py ADDED
@@ -0,0 +1,393 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Ref: https://python.langchain.com/docs/how_to/custom_chat_model/
16
+ """
17
+
18
+ import asyncio
19
+ import json
20
+ import logging
21
+ import os
22
+ import uuid
23
+ from typing import Any, Optional
24
+
25
+ from langchain_core.language_models import BaseChatModel
26
+ from langchain_core.language_models.base import LanguageModelInput
27
+ from langchain_core.messages import (
28
+ AIMessage,
29
+ BaseMessage,
30
+ convert_to_openai_messages,
31
+ )
32
+ from langchain_core.messages.tool import InvalidToolCall, ToolCall
33
+ from langchain_core.outputs import ChatGeneration, ChatResult
34
+ from langchain_core.runnables import Runnable, RunnableConfig
35
+ from langchain_core.tools import StructuredTool
36
+ from langchain_core.utils.function_calling import convert_to_openai_tool
37
+ from pydantic import Field
38
+
39
+ from verl.experimental.agent_loop.agent_loop import AgentLoopOutput, AsyncLLMServerManager
40
+ from verl.experimental.agent_loop.tool_parser import ToolParser
41
+ from verl.experimental.agent_loop.utils import add_generation_prompt_for_gpt_oss, format_gpt_oss_tool_response_manually
42
+
43
+ logger = logging.getLogger(__file__)
44
+ logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
45
+
46
+
47
+ class MaxTokenExceededError(Exception):
48
+ """Indicate that history chat messages + tool message exceeds LLM max_tokens."""
49
+
50
+ pass
51
+
52
+
53
+ class ChatModel(BaseChatModel):
54
+ model_name: str = Field(alias="model")
55
+ """The name of the model"""
56
+
57
+ client: AsyncLLMServerManager
58
+ """AsyncLLM server manager"""
59
+
60
+ tokenizer: Any
61
+ """Tokenizer for the model"""
62
+
63
+ max_tokens: int
64
+ """Max tokens to generate"""
65
+
66
+ tool_parser: str = "hermes"
67
+ """Tool parser for the model"""
68
+
69
+ max_parallel_calls: int = 1
70
+ """Max parallel tool calls"""
71
+
72
+ temperature: float = 1.0
73
+ """Temperature for sampling"""
74
+
75
+ top_p: float = 1.0
76
+ """Top p for sampling"""
77
+
78
+ repetition_penalty: float = 1.0
79
+ """Repetition penalty for sampling"""
80
+
81
+ def bind_tools(self, tools, **kwargs) -> Runnable[LanguageModelInput, BaseMessage]:
82
+ """Bind tools to the model.
83
+
84
+ Args:
85
+ tools: Sequence of tools to bind to the model.
86
+
87
+ Returns:
88
+ A Runnable that returns a message.
89
+ """
90
+ formatted_tools: list = [convert_to_openai_tool(tool) for tool in tools]
91
+
92
+ # used to remove system prompt prefix when encoding tool response
93
+ system_prompt = self.tokenizer.apply_chat_template([{}], add_generation_prompt=False, tokenize=True)
94
+ kwargs["system_prompt"] = system_prompt
95
+
96
+ return self.bind(tools=formatted_tools, **kwargs)
97
+
98
+ def with_structured_output(
99
+ self,
100
+ schema: dict | type,
101
+ *,
102
+ include_raw: bool = False,
103
+ **kwargs: Any,
104
+ ) -> Runnable[LanguageModelInput, dict | BaseChatModel]:
105
+ """Ref: https://langchain-ai.github.io/langgraph/how-tos/react-agent-structured-output/"""
106
+ raise NotImplementedError
107
+
108
+ def _generate(
109
+ self,
110
+ messages: list[BaseMessage],
111
+ stop: Optional[list[str]] = None,
112
+ **kwargs: Any,
113
+ ) -> ChatResult:
114
+ raise NotImplementedError
115
+
116
+ async def _agenerate(
117
+ self,
118
+ messages: list[BaseMessage],
119
+ stop: Optional[list[str]] = None,
120
+ **kwargs: Any,
121
+ ) -> ChatResult:
122
+ """Asynchronously generate chat completion message.
123
+
124
+ Args:
125
+ messages (list[BaseMessage]): List of list of messages.
126
+ stop (Optional[list[str]], optional): Stop words to use when generating. Model output is cut off at the
127
+ first occurrence of any of these substrings. Defaults to None.
128
+
129
+ Returns:
130
+ ChatResult: Chat result.
131
+ """
132
+ request_id, prompt_ids, response_mask = await self._preprocess(messages, **kwargs)
133
+
134
+ sampling_params = {
135
+ "temperature": self.temperature,
136
+ "top_p": self.top_p,
137
+ "repetition_penalty": self.repetition_penalty,
138
+ }
139
+ if "sampling_params" in kwargs:
140
+ sampling_params.update(kwargs["sampling_params"])
141
+
142
+ output = await self.client.generate(
143
+ request_id=request_id, prompt_ids=prompt_ids, sampling_params=sampling_params
144
+ )
145
+
146
+ message = await self._postprocess(request_id, prompt_ids, response_mask, output.token_ids, **kwargs)
147
+ generation = ChatGeneration(message=message)
148
+ return ChatResult(generations=[generation])
149
+
150
+ @property
151
+ def _llm_type(self) -> str:
152
+ """Get the type of language model used by this chat model."""
153
+ return self.model_name
154
+
155
+ async def _preprocess(self, messages: list[BaseMessage], **kwargs: Any) -> tuple[str, list[int], list[int]]:
156
+ """Preprocess messages for chat completion.
157
+
158
+ To ensure strong consistency with policy model, AsyncLLM server generate response with token in token out
159
+ instead of messages list.
160
+
161
+ But all agent frameworks use messages list to represent chat history. To mitigate the gap, we store trajectory
162
+ (prompt_ids, response_mask) in lastest AIMessage.response_metadata.
163
+
164
+ 1. Encode ToolMessage to token ids.
165
+ 2. Retrieve trajectory (prompt_ids, response_mask) from lastest AIMessage.response_metadata.
166
+ 3. Append ToolMessage token ids to prompt_ids, and append 0 to response_mask.
167
+
168
+ Ref: https://python.langchain.com/docs/concepts/chat_history/
169
+
170
+ Args:
171
+ messages (list[BaseMessage]): List of messages.
172
+
173
+ Returns:
174
+ tuple[str, list[int], list[int]]: Request id, prompt ids, response mask.
175
+ """
176
+ # messages: [system], human, ai, human|tool, ai, human|tool, ...
177
+ assert messages[-1].type in ["human", "tool"], (
178
+ f"Last message must be human or tool, but got {messages[-1].type}"
179
+ )
180
+ loop = asyncio.get_running_loop()
181
+
182
+ # Case 1: initial chat completion: [system], human
183
+ if messages[-1].type == "human" and (len(messages) == 1 or messages[-2].type != "ai"):
184
+ prompt_ids = await loop.run_in_executor(
185
+ None,
186
+ lambda: self.tokenizer.apply_chat_template(
187
+ convert_to_openai_messages(messages),
188
+ tools=kwargs.get("tools"),
189
+ add_generation_prompt=True,
190
+ tokenize=True,
191
+ ),
192
+ )
193
+ return str(uuid.uuid4()), prompt_ids, []
194
+
195
+ # Case 2: follow up chat completion with tool/human response: [system], human, ai, human|tool, ...
196
+ for i in range(len(messages) - 1, -1, -1):
197
+ if messages[i].type == "ai":
198
+ break
199
+ assert "prompt_ids" in messages[i].response_metadata, "Last message must have prompt_ids in response_metadata"
200
+ assert "response_mask" in messages[i].response_metadata, (
201
+ "Last message must have response_mask in response_metadata"
202
+ )
203
+
204
+ # encode tool response
205
+ tool_responses = convert_to_openai_messages(messages[i + 1 :])
206
+ if self.tool_parser == "hermes":
207
+ tool_response_ids = await loop.run_in_executor(
208
+ None,
209
+ lambda messages=tool_responses: self.tokenizer.apply_chat_template(
210
+ messages, add_generation_prompt=True, tokenize=True
211
+ ),
212
+ )
213
+ tool_response_ids = tool_response_ids[len(kwargs["system_prompt"]) :]
214
+ elif self.tool_parser == "gpt-oss":
215
+ # Format tool responses manually
216
+ # since gpt-oss chat template requires tool call messages to parse tool response messages
217
+ # we need to format the tool response messages manually
218
+ tool_response_texts = []
219
+ for tool_msg in tool_responses:
220
+ if tool_msg["role"] == "tool":
221
+ # Use tool message's name if available (for multiple tool calls)
222
+ actual_tool_name = tool_msg.get("name", "unknown")
223
+ if actual_tool_name == "unknown":
224
+ logger.error(f"actual_tool_name: {actual_tool_name}")
225
+ formatted = format_gpt_oss_tool_response_manually(tool_msg["content"], actual_tool_name)
226
+ tool_response_texts.append(formatted)
227
+
228
+ # Tokenize the manually formatted tool responses
229
+ tool_response_text = "".join(tool_response_texts)
230
+ # need to add generation tokens for gpt-oss manually since add_generation_prompt is True
231
+ tool_response_text = add_generation_prompt_for_gpt_oss(tool_response_text)
232
+ logger.debug(f"tool_response_text: {tool_response_text}")
233
+
234
+ tool_response_ids = await loop.run_in_executor(
235
+ None, lambda: self.tokenizer.encode(tool_response_text, add_special_tokens=False)
236
+ )
237
+ else:
238
+ raise ValueError(f"Unsupported tool parser: {self.tool_parser}")
239
+
240
+ # stop generation if response length exceeds max response length
241
+ if len(messages[i].response_metadata["response_mask"]) + len(tool_response_ids) >= self.max_tokens:
242
+ raise MaxTokenExceededError(f"Max response length {self.max_tokens} exceeded")
243
+
244
+ # append tool response to prompt
245
+ request_id = messages[i].response_metadata.pop("request_id")
246
+ prompt_ids = messages[i].response_metadata.pop("prompt_ids")
247
+ response_mask = messages[i].response_metadata.pop("response_mask")
248
+ prompt_ids += tool_response_ids
249
+ response_mask += [0] * len(tool_response_ids)
250
+
251
+ return request_id, prompt_ids, response_mask
252
+
253
+ async def _postprocess(
254
+ self, request_id: str, prompt_ids: list[int], response_mask: list[int], response_ids: list[int], **kwargs: Any
255
+ ) -> AIMessage:
256
+ """Postprocess response_ids when chat completion is done.
257
+
258
+ 1. Decode response_ids, parse tool calls to AIMessage.
259
+ 2. Append response_ids to prompt_ids, and append 1 to response_mask.
260
+ 3. Store trajectory (prompt_ids, response_mask) in AIMessage.response_metadata.
261
+
262
+ Args:
263
+ request_id (str): Unique request id.
264
+ prompt_ids (list[int]): Input prompt token ids in this chat completion.
265
+ response_mask (list[int]): Response mask before this chat completion.
266
+ response_ids (list[int]): LLM generated token ids in this chat completion.
267
+
268
+ Returns:
269
+ AIMessage: Postprocessed message.
270
+ """
271
+ prompt_ids += response_ids
272
+ response_mask += [1] * len(response_ids)
273
+
274
+ tool_parser = ToolParser.get_tool_parser(self.tool_parser, self.tokenizer)
275
+ content, function_calls = await tool_parser.extract_tool_calls(response_ids)
276
+
277
+ tool_calls, invalid_tool_calls = [], []
278
+
279
+ for function_call in function_calls:
280
+ error = None
281
+ try:
282
+ args = json.loads(function_call.arguments)
283
+ if not isinstance(args, dict):
284
+ error = f"Tool arguments must be a JSON object, got {type(args).__name__}"
285
+ except json.JSONDecodeError as e:
286
+ error = f"Invalid JSON tool arguments: {e}"
287
+
288
+ if error:
289
+ logger.warning(error)
290
+ invalid_tool_calls.append(
291
+ InvalidToolCall(
292
+ name=function_call.name,
293
+ args=function_call.arguments,
294
+ id=str(uuid.uuid4()),
295
+ error=error,
296
+ )
297
+ )
298
+ else:
299
+ tool_calls.append(
300
+ ToolCall(
301
+ name=function_call.name,
302
+ args=args,
303
+ id=str(uuid.uuid4()),
304
+ )
305
+ )
306
+
307
+ message = AIMessage(
308
+ content=content,
309
+ tool_calls=tool_calls[: self.max_parallel_calls],
310
+ invalid_tool_calls=invalid_tool_calls[: self.max_parallel_calls],
311
+ response_metadata={
312
+ "request_id": request_id,
313
+ "prompt_ids": prompt_ids,
314
+ "response_mask": response_mask,
315
+ },
316
+ )
317
+ return message
318
+
319
+
320
+ class TruncateStructuredTool(StructuredTool):
321
+ """Structured tool with response truncation."""
322
+
323
+ tool_response_truncate_side: str
324
+ """truncate side of tool response: left, middle, right"""
325
+
326
+ max_tool_response_length: int
327
+ """max length of tool response"""
328
+
329
+ async def _arun(
330
+ self,
331
+ *args: Any,
332
+ config: RunnableConfig,
333
+ **kwargs: Any,
334
+ ) -> Any:
335
+ tool_response = await super()._arun(*args, config=config, **kwargs)
336
+ tool_response = str(tool_response)
337
+
338
+ if len(tool_response) > self.max_tool_response_length:
339
+ if self.tool_response_truncate_side == "left":
340
+ tool_response = tool_response[: self.max_tool_response_length] + "...(truncated)"
341
+ elif self.tool_response_truncate_side == "right":
342
+ tool_response = "(truncated)..." + tool_response[-self.max_tool_response_length :]
343
+ else:
344
+ length = self.max_tool_response_length // 2
345
+ tool_response = tool_response[:length] + "...(truncated)..." + tool_response[-length:]
346
+
347
+ return tool_response
348
+
349
+
350
+ def convert_to_agent_output(messages: list[BaseMessage], response_length: int) -> AgentLoopOutput:
351
+ """Convert messages to AgentLoopOutput.
352
+
353
+ Args:
354
+ messages (List[BaseMessage]): List of messages, last message must be assistant
355
+ with response_metadata containing `prompt_ids` and `response_mask`.
356
+ response_length (int): Max length of response.
357
+
358
+ Returns:
359
+ AgentLoopOutput: agent loop output trajectory used for training.
360
+ """
361
+ # skip last tool calls
362
+ for i in range(len(messages) - 1, -1, -1):
363
+ if messages[i].type != "tool":
364
+ break
365
+ last_message = messages[i]
366
+ assert last_message.type == "ai", f"Last message must be assistant, but got {last_message.type}"
367
+ assert "prompt_ids" in last_message.response_metadata, "Last message must have prompt_ids in response_metadata"
368
+ assert "response_mask" in last_message.response_metadata, (
369
+ "Last message must have response_mask in response_metadata"
370
+ )
371
+
372
+ num_turns = 0
373
+ for i in range(len(messages)):
374
+ if messages[i].type == "system":
375
+ continue
376
+ # parallel tool calls are in single turn
377
+ if i == 0 or messages[i].type != messages[i - 1].type:
378
+ num_turns += 1
379
+
380
+ prompt_ids = last_message.response_metadata["prompt_ids"]
381
+ response_mask = last_message.response_metadata["response_mask"]
382
+
383
+ response_ids = prompt_ids[-len(response_mask) :]
384
+ prompt_ids = prompt_ids[: len(prompt_ids) - len(response_mask)]
385
+
386
+ output = AgentLoopOutput(
387
+ prompt_ids=prompt_ids,
388
+ response_ids=response_ids[:response_length],
389
+ response_mask=response_mask[:response_length],
390
+ num_turns=num_turns,
391
+ metrics={},
392
+ )
393
+ return output
ICL/DAPO/verl-recipe/langgraph_agent/react_agent_loop.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ LangGraph React Agent Loop.
16
+
17
+ This implementation is exact same as `ToolAgentLoop`.
18
+
19
+ Ref: https://langchain-ai.github.io/langgraph/tutorials/workflows/
20
+ """
21
+
22
+ import logging
23
+ from typing import Any, Literal
24
+
25
+ from langchain_core.messages import AIMessage
26
+ from langchain_core.runnables import RunnableConfig
27
+ from langgraph.graph import END, MessagesState, StateGraph
28
+ from langgraph.prebuilt import ToolNode
29
+ from recipe.langgraph_agent.chat_model import (
30
+ ChatModel,
31
+ MaxTokenExceededError,
32
+ convert_to_agent_output,
33
+ )
34
+
35
+ from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
+ async def call_model(state: MessagesState, config: RunnableConfig):
41
+ model = config["configurable"]["model"]
42
+ sampling_params = config["configurable"]["sampling_params"]
43
+ try:
44
+ message = await model.ainvoke(state["messages"], sampling_params=sampling_params)
45
+ return {"messages": [message]}
46
+ except MaxTokenExceededError:
47
+ # last message is ToolMessage
48
+ return {"messages": []}
49
+
50
+
51
+ def should_continue(state: MessagesState, config: RunnableConfig) -> Literal["tools", END]:
52
+ # Safely extract max_assistant_turns from config
53
+ max_assistant_turns = None
54
+ try:
55
+ if config and "configurable" in config:
56
+ max_assistant_turns = config["configurable"].get("max_assistant_turns")
57
+ except Exception as e:
58
+ logger.warning(f"Failed to extract max_assistant_turns from config: {e}")
59
+
60
+ num_assistant_turns = 0
61
+ for message in state["messages"]:
62
+ if message.type == "ai":
63
+ num_assistant_turns += 1
64
+
65
+ last_message = state["messages"][-1]
66
+
67
+ # LLM call failed, e.g: max response length exceeded
68
+ if last_message.type == "tool":
69
+ return END
70
+
71
+ # max assistant turns exceeded
72
+ # Use a reasonable default limit (25) if max_assistant_turns is not set
73
+ # This prevents infinite loops
74
+ effective_max_turns = max_assistant_turns if max_assistant_turns is not None else 25
75
+ if num_assistant_turns >= effective_max_turns:
76
+ return END
77
+
78
+ # no tool calls
79
+ if not getattr(last_message, "tool_calls", None):
80
+ return END
81
+
82
+ return "tools"
83
+
84
+
85
+ class ReactAgentLoop(AgentLoopBase):
86
+ # Recursion limit calculation constants
87
+ DEFAULT_MAX_ASSISTANT_TURNS = 25
88
+ MIN_RECURSION_LIMIT = 50
89
+ NODES_PER_TURN = 2 # Each AI turn involves agent + tools nodes
90
+ RECURSION_LIMIT_SAFETY_FACTOR = 1.5 # 50% buffer for edge cases
91
+
92
+ @classmethod
93
+ def init_class(cls, config, tokenizer, **kwargs):
94
+ if cls._class_initialized:
95
+ return
96
+ cls._class_initialized = True
97
+ print("Performing class-level ReactAgentLoop initialization")
98
+
99
+ # build graph
100
+ cls.graph = cls.build_graph()
101
+
102
+ @classmethod
103
+ def build_graph(cls) -> StateGraph:
104
+ workflow = StateGraph(MessagesState)
105
+
106
+ workflow.add_node("agent", call_model)
107
+ workflow.add_node("tools", ToolNode(cls.tools))
108
+ workflow.set_entry_point("agent")
109
+ workflow.add_conditional_edges(
110
+ "agent",
111
+ should_continue,
112
+ {
113
+ "tools": "tools",
114
+ END: END,
115
+ },
116
+ )
117
+
118
+ workflow.add_edge("tools", "agent")
119
+ graph = workflow.compile()
120
+ return graph
121
+
122
+ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
123
+ messages = list(kwargs["raw_prompt"])
124
+
125
+ model_path = self.config.actor_rollout_ref.model.path
126
+ model_name = "/".join(model_path.split("/")[-2:])
127
+
128
+ rollout = self.config.actor_rollout_ref.rollout
129
+ model = ChatModel(
130
+ model=model_name,
131
+ client=self.server_manager,
132
+ tokenizer=self.tokenizer,
133
+ max_tokens=rollout.response_length,
134
+ max_parallel_calls=rollout.multi_turn.max_parallel_calls,
135
+ tool_parser=rollout.multi_turn.format,
136
+ )
137
+
138
+ model = model.bind_tools(self.tools, tool_choice="any")
139
+
140
+ # Calculate recursion_limit dynamically based on max_assistant_turns
141
+ max_assistant_turns = (
142
+ rollout.multi_turn.max_assistant_turns
143
+ if rollout.multi_turn.max_assistant_turns
144
+ else self.DEFAULT_MAX_ASSISTANT_TURNS
145
+ )
146
+
147
+ # Formula: nodes_per_turn * max_turns * safety_buffer, with minimum threshold
148
+ recursion_limit = max(
149
+ self.MIN_RECURSION_LIMIT,
150
+ int(max_assistant_turns * self.NODES_PER_TURN * self.RECURSION_LIMIT_SAFETY_FACTOR),
151
+ )
152
+ logger.info(f"Configured recursion_limit={recursion_limit} (max_assistant_turns={max_assistant_turns})")
153
+
154
+ config = {
155
+ "configurable": {
156
+ "model": model,
157
+ "sampling_params": sampling_params,
158
+ "max_user_turns": rollout.multi_turn.max_user_turns,
159
+ "max_assistant_turns": rollout.multi_turn.max_assistant_turns,
160
+ },
161
+ "recursion_limit": recursion_limit,
162
+ }
163
+
164
+ # TODO: how to handle multiple trajectories in an graph invocation?
165
+ # Each graph node may has its own LLM calls and state, e.g:
166
+ # https://github.com/google-gemini/gemini-fullstack-langgraph-quickstart
167
+ try:
168
+ state = await self.graph.ainvoke(input={"messages": messages}, config=config)
169
+ except Exception as e:
170
+ logger.error(f"Agent loop execution failed: {type(e).__name__}: {e}")
171
+ logger.error("Falling back to a minimal dummy trajectory.")
172
+
173
+ # Fallback to a minimal assistant message so that
174
+ # convert_to_agent_output and downstream padding logic
175
+ # can still run without crashing.
176
+ dummy_id = 0
177
+ fallback_message = AIMessage(
178
+ content="[Agent execution failed - no valid trajectory]",
179
+ response_metadata={
180
+ "request_id": "fallback",
181
+ "prompt_ids": [dummy_id, dummy_id],
182
+ "response_mask": [1],
183
+ },
184
+ )
185
+ state = {"messages": [fallback_message]}
186
+
187
+ output = convert_to_agent_output(state["messages"], rollout.response_length)
188
+ return output
ICL/DAPO/verl-recipe/langgraph_agent/test_react_agent_loop.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import json
15
+ import os
16
+
17
+ import numpy as np
18
+ import pytest
19
+ import ray
20
+ from langchain_core.tools import tool
21
+ from omegaconf import DictConfig
22
+ from recipe.langgraph_agent.react_agent_loop import ReactAgentLoop
23
+ from tests.experimental.agent_loop.agent_utils import init_agent_loop_manager
24
+
25
+ from verl.protocol import DataProto
26
+ from verl.utils import hf_tokenizer
27
+
28
+
29
+ @pytest.fixture
30
+ def init_config() -> DictConfig:
31
+ from hydra import compose, initialize_config_dir
32
+
33
+ with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
34
+ config = compose(config_name="ppo_trainer")
35
+ model_path = "Qwen/Qwen2.5-1.5B-Instruct"
36
+ config.actor_rollout_ref.model.path = model_path
37
+ config.actor_rollout_ref.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
38
+ config.actor_rollout_ref.rollout.mode = "async"
39
+ config.actor_rollout_ref.rollout.prompt_length = 4096
40
+ config.actor_rollout_ref.rollout.response_length = 4096
41
+ config.actor_rollout_ref.rollout.n = 4
42
+ config.actor_rollout_ref.rollout.agent.num_workers = 2
43
+
44
+ config.actor_rollout_ref.actor.use_dynamic_bsz = True
45
+ # test sleep/wake_up with fsdp offload
46
+ config.actor_rollout_ref.actor.fsdp_config.param_offload = True
47
+ config.actor_rollout_ref.actor.fsdp_config.optimizer_offload = True
48
+
49
+ return config
50
+
51
+
52
+ @tool(parse_docstring=True)
53
+ def get_current_temperature(location: str, unit: str = "celsius"):
54
+ """Get current temperature at a location.
55
+
56
+ Args:
57
+ location: The location to get the temperature for, in the format "City, State, Country".
58
+ unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"])
59
+
60
+ Returns:
61
+ the temperature, the location, and the unit in a dict
62
+ """
63
+ print(f"[DEBUG] get_current_temperature: {location}, {unit}")
64
+ return {
65
+ "temperature": 26.1,
66
+ "location": location,
67
+ "unit": unit,
68
+ }
69
+
70
+
71
+ @tool(parse_docstring=True)
72
+ def get_temperature_date(location: str, date: str, unit: str = "celsius"):
73
+ """Get temperature at a location and date.
74
+
75
+ Args:
76
+ location: The location to get the temperature for, in the format "City, State, Country".
77
+ date: The date to get the temperature for, in the format "Year-Month-Day".
78
+ unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"])
79
+
80
+ Returns:
81
+ the temperature, the location, the date and the unit in a dict
82
+ """
83
+ print(f"[DEBUG] get_temperature_date: {location}, {date}, {unit}")
84
+ return {
85
+ "temperature": 25.9,
86
+ "location": location,
87
+ "date": date,
88
+ "unit": unit,
89
+ }
90
+
91
+
92
+ class TestReactAgentLoop(ReactAgentLoop):
93
+ @classmethod
94
+ def init_class(cls, config, tokenizer, **kwargs):
95
+ # TODO: find better way to configure tools
96
+ cls.tools = [get_current_temperature, get_temperature_date]
97
+ super().init_class(config, tokenizer, **kwargs)
98
+
99
+
100
+ def test_react_agent(init_config):
101
+ ray.init(
102
+ runtime_env={
103
+ "env_vars": {
104
+ "TOKENIZERS_PARALLELISM": "true",
105
+ "NCCL_DEBUG": "WARN",
106
+ "VLLM_LOGGING_LEVEL": "INFO",
107
+ "VLLM_USE_V1": "1",
108
+ }
109
+ }
110
+ )
111
+
112
+ # =========================== 1. Init rollout manager ===========================
113
+ agent_loop_config = [
114
+ {
115
+ "_target_": "recipe.langgraph_agent.test_react_agent_loop.TestReactAgentLoop",
116
+ "name": "react_agent",
117
+ },
118
+ ]
119
+ agent_loop_config_path = "/tmp/agent_loop_config.json"
120
+ with open(agent_loop_config_path, "w") as f:
121
+ json.dump(agent_loop_config, f)
122
+
123
+ n = 2
124
+ init_config.actor_rollout_ref.rollout.n = n
125
+ # init_config.actor_rollout_ref.rollout.multi_turn.tool_config_path = tool_config_path
126
+ init_config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls = 2
127
+ init_config.actor_rollout_ref.rollout.agent.agent_loop_config_path = agent_loop_config_path
128
+ agent_loop_manager = init_agent_loop_manager(init_config)
129
+
130
+ # =========================== 2. Generate sequences ===========================
131
+ raw_prompts = [
132
+ [
133
+ {"role": "user", "content": "How are you?"},
134
+ ],
135
+ [
136
+ {"role": "user", "content": "What's the temperature in Los Angeles now?"},
137
+ ],
138
+ [
139
+ {"role": "user", "content": "What's the temperature in New York now?"},
140
+ ],
141
+ [
142
+ {
143
+ "role": "system",
144
+ "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\n\n"
145
+ "Current Date: 2024-09-30",
146
+ },
147
+ {"role": "user", "content": "What's the temperature in San Francisco now? How about tomorrow?"},
148
+ ],
149
+ ]
150
+ batch = DataProto(
151
+ non_tensor_batch={
152
+ "raw_prompt": np.array([np.array(prompt) for prompt in raw_prompts], dtype=object),
153
+ "agent_name": np.array(["react_agent"] * len(raw_prompts)),
154
+ "data_source": np.array(["openai/gsm8k"] * len(raw_prompts)),
155
+ "reward_model": np.array([{"style": "rule", "ground_truth": "1.0"}] * len(raw_prompts)),
156
+ },
157
+ )
158
+ batch = batch.repeat(n)
159
+ result = agent_loop_manager.generate_sequences(prompts=batch)
160
+ assert len(result) == len(raw_prompts) * n
161
+
162
+ # Check turns
163
+ num_turns = result.non_tensor_batch["__num_turns__"]
164
+ print(f"num_turns: {num_turns}")
165
+ for i in range(len(num_turns)):
166
+ if i // n == 0:
167
+ # [user, assistant]
168
+ assert num_turns[i] == 2
169
+ else:
170
+ # [user, assistant, tool, assistant]
171
+ assert num_turns[i] == 4
172
+
173
+ # Check response_mask
174
+ tokenizer = hf_tokenizer(init_config.actor_rollout_ref.model.path)
175
+ responses = result.batch["responses"]
176
+ response_mask = result.batch["response_mask"]
177
+ attention_mask = result.batch["attention_mask"]
178
+ assert responses.size() == response_mask.size(), f"{responses.size()} != {response_mask.size()}"
179
+ response_length = response_mask.size(1)
180
+
181
+ for i in range(len(responses)):
182
+ # response with tool response
183
+ valid_tokens = responses[i][attention_mask[i][-response_length:].bool()]
184
+ response_with_obs = tokenizer.decode(valid_tokens)
185
+
186
+ # response without tool response
187
+ valid_tokens = responses[i][response_mask[i].bool()]
188
+ response_without_obs = tokenizer.decode(valid_tokens)
189
+
190
+ assert "<tool_response>" not in response_without_obs, (
191
+ f"found <tool_response> in response: {response_without_obs}"
192
+ )
193
+ assert "</tool_response>" not in response_without_obs, (
194
+ f"found </tool_response> in response: {response_without_obs}"
195
+ )
196
+ print("=========================")
197
+ print(response_with_obs)
198
+ print("---")
199
+ print(response_without_obs)
200
+
201
+ print("Test passed!")
202
+ ray.shutdown()
ICL/DAPO/verl-recipe/minicpmo/rl_dataset.py ADDED
@@ -0,0 +1,571 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023-2024 SGLang Team
3
+ # Copyright 2025 ModelBest Inc. and/or its affiliates
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import copy
18
+ import logging
19
+ import math
20
+ import os
21
+ import re
22
+ from typing import Optional
23
+
24
+ import datasets
25
+ import torch
26
+ from omegaconf import DictConfig, ListConfig
27
+ from PIL import Image
28
+ from torch.utils.data import Dataset
29
+ from torchvision import transforms
30
+ from transformers import PreTrainedTokenizer, ProcessorMixin
31
+
32
+ import verl.utils.torch_functional as verl_F
33
+ from verl.utils.dataset.vision_utils import process_image
34
+ from verl.utils.model import compute_position_id_with_mask
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+
39
+ def build_transform():
40
+ IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) # timm.data.IMAGENET_INCEPTION_MEAN
41
+ IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) # timm.data.IMAGENET_INCEPTION_STD
42
+ return transforms.Compose(
43
+ [
44
+ transforms.ToTensor(),
45
+ transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD),
46
+ ]
47
+ )
48
+
49
+
50
+ def build_image_bound(input_ids, tokenizer, new_schema=True, logger=None):
51
+ if new_schema:
52
+ start_cond = (input_ids == tokenizer.im_start_id) | (input_ids == tokenizer.slice_start_id)
53
+ end_cond = (input_ids == tokenizer.im_end_id) | (input_ids == tokenizer.slice_end_id)
54
+ else:
55
+ start_cond = input_ids == tokenizer.im_start_id
56
+ end_cond = input_ids == tokenizer.im_end_id
57
+ image_start_tokens = torch.where(start_cond)[0]
58
+ image_start_tokens += 1
59
+ image_end_tokens = torch.where(end_cond)[0]
60
+ if len(image_start_tokens) != len(image_end_tokens):
61
+ logger.error("image start token != image end tokens")
62
+ raise Exception("image start token != image end tokens")
63
+ if len(image_start_tokens) > 0:
64
+ image_bound = torch.hstack([image_start_tokens.unsqueeze(-1), image_end_tokens.unsqueeze(-1)])
65
+ else:
66
+ image_bound = []
67
+ return image_bound
68
+
69
+
70
+ def preprocess(
71
+ images_dict,
72
+ conversations,
73
+ tokenizer,
74
+ transform,
75
+ query_nums=64,
76
+ slice_config=None,
77
+ llm_type=None,
78
+ patch_size=14,
79
+ batch_vision=False,
80
+ max_length=2048,
81
+ truncation="error",
82
+ apply_chat_template_kwargs=None,
83
+ logger=None,
84
+ ):
85
+ """
86
+ single(multi) image(s) preprocess, the image(s) will be placed at the top of the conversation
87
+ """
88
+ conversations = copy.deepcopy(conversations)
89
+ assert conversations[0]["role"] == "user", "the first role must be user"
90
+
91
+ if slice_config is not None:
92
+ assert isinstance(slice_config, dict)
93
+ assert "patch_size" in slice_config
94
+ assert "max_slice_nums" in slice_config
95
+ assert "scale_resolution" in slice_config
96
+ default_image_placeholder = tokenizer.im_start + tokenizer.unk_token * query_nums + tokenizer.im_end
97
+ new_schema = False
98
+ use_image_id = False
99
+ if llm_type == "qwen":
100
+ new_schema = True
101
+ use_image_id = True
102
+ image_placeholder_dict = {}
103
+ images = []
104
+ image_id_cnt = 0
105
+ for img_name, image in images_dict.items():
106
+ if slice_config:
107
+ source_image, patches, best_grid = slice_image(
108
+ image,
109
+ slice_config["max_slice_nums"],
110
+ slice_config["scale_resolution"],
111
+ slice_config["patch_size"],
112
+ )
113
+ images.append(source_image)
114
+ image_placeholder = default_image_placeholder
115
+ if len(patches) > 0:
116
+ for i in range(len(patches)):
117
+ for j in range(len(patches[0])):
118
+ images.append(patches[i][j])
119
+ if use_image_id:
120
+ image_placeholder = (
121
+ f"{tokenizer.im_id_start}{image_id_cnt}{tokenizer.im_id_end}" + image_placeholder
122
+ )
123
+ image_id_cnt += 1
124
+ image_placeholder += get_grid_placeholder(tokenizer, best_grid, query_nums, new_schema=new_schema)
125
+ image_placeholder_dict[img_name] = image_placeholder
126
+ else:
127
+ images.append(image)
128
+ if use_image_id:
129
+ image_placeholder = f"{tokenizer.im_id_start}{image_id_cnt}{tokenizer.im_id_end}" + image_placeholder
130
+ image_id_cnt += 1
131
+ else:
132
+ image_placeholder = default_image_placeholder
133
+ image_placeholder_dict[img_name] = image_placeholder
134
+
135
+ images = [transform(i) for i in images]
136
+
137
+ if len(images_dict) == 1 and "<image>" in images_dict:
138
+ if "<image>" in conversations[0]["content"]:
139
+ conversations[0]["content"] = conversations[0]["content"].replace("<image>", image_placeholder)
140
+ else:
141
+ conversations[0]["content"] = image_placeholder + "\n" + conversations[0]["content"]
142
+ else:
143
+ pattern = r"<image_\d+>"
144
+ new_conversations = []
145
+ for conversation in conversations:
146
+ content = conversation["content"]
147
+ parts = re.split(f"({pattern})", content)
148
+ for i, part in enumerate(parts):
149
+ if not part.strip():
150
+ continue
151
+ if re.match(pattern, part):
152
+ if part in image_placeholder_dict:
153
+ parts[i] = image_placeholder_dict[part]
154
+ else:
155
+ raise Exception(f"not found {part} in image dict")
156
+ conversation["content"] = "\n".join(parts)
157
+ new_conversations.append(conversation)
158
+ conversations = new_conversations
159
+
160
+ # TODO change role in conversation for different llm
161
+ prompt_with_chat_template = tokenizer.apply_chat_template(
162
+ conversations, add_generation_prompt=True, tokenize=False, **(apply_chat_template_kwargs or {})
163
+ )
164
+
165
+ input_ids, attention_mask = verl_F.tokenize_and_postprocess_data(
166
+ prompt=prompt_with_chat_template,
167
+ tokenizer=tokenizer,
168
+ max_length=max_length,
169
+ pad_token_id=tokenizer.pad_token_id,
170
+ left_pad=True,
171
+ truncation=truncation,
172
+ )
173
+ position_ids = compute_position_id_with_mask(attention_mask)
174
+ image_bound = build_image_bound(input_ids[0], tokenizer, new_schema, logger)
175
+
176
+ input_dict = {
177
+ "input_ids": input_ids[0],
178
+ "attention_mask": attention_mask[0],
179
+ "position_ids": position_ids[0],
180
+ "image_bound": image_bound,
181
+ }
182
+
183
+ if batch_vision:
184
+ tgt_sizes = []
185
+ reshape_images = []
186
+ for image in images:
187
+ H, W = image.shape[1:]
188
+ reshape_image = reshape_by_patch(image, patch_size)
189
+ reshape_images.append(reshape_image)
190
+ tgt_sizes.append([H // patch_size, W // patch_size])
191
+ if tgt_sizes:
192
+ tgt_sizes = torch.Tensor(tgt_sizes).type(torch.int32)
193
+
194
+ input_dict["pixel_values"] = reshape_images
195
+ input_dict["tgt_sizes"] = tgt_sizes
196
+
197
+ else:
198
+ input_dict["pixel_values"] = images
199
+ input_dict["tgt_sizes"] = []
200
+
201
+ return input_dict
202
+
203
+
204
+ def slice_image(image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False):
205
+ original_size = image.size
206
+ original_width, original_height = original_size
207
+ log_ratio = math.log(original_width / original_height)
208
+ ratio = original_width * original_height / (scale_resolution * scale_resolution)
209
+ multiple = min(math.ceil(ratio), max_slice_nums)
210
+
211
+ source_image = None
212
+ best_grid = None
213
+ patches = []
214
+
215
+ if multiple <= 1 or never_split:
216
+ # dont need to slice, upsample
217
+ best_size = find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=True)
218
+ source_image = image.resize(best_size, Image.Resampling.BICUBIC)
219
+ else:
220
+ candidate_split_grids_nums = []
221
+ for i in [multiple - 1, multiple, multiple + 1]:
222
+ if i == 1 or i > max_slice_nums:
223
+ continue
224
+ candidate_split_grids_nums.append(i)
225
+
226
+ # source image, down-sampling and ensure divided by patch_size
227
+ best_resize = find_best_resize(original_size, scale_resolution, patch_size)
228
+ source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
229
+ candidate_grids = []
230
+
231
+ # find best grid
232
+ for split_grids_nums in candidate_split_grids_nums:
233
+ m = 1
234
+ while m <= split_grids_nums:
235
+ if split_grids_nums % m == 0:
236
+ candidate_grids.append([m, split_grids_nums // m])
237
+ m += 1
238
+
239
+ best_grid = [1, 1]
240
+ min_error = float("inf")
241
+ for grid in candidate_grids:
242
+ error = abs(log_ratio - math.log(grid[0] / grid[1]))
243
+ if error < min_error:
244
+ best_grid = grid
245
+ min_error = error
246
+
247
+ refine_size = get_refine_size(original_size, best_grid, scale_resolution, patch_size, allow_upscale=True)
248
+
249
+ refine_image = image.resize(refine_size, Image.Resampling.BICUBIC)
250
+ patches = split_to_patches(refine_image, best_grid)
251
+
252
+ return source_image, patches, best_grid
253
+
254
+
255
+ def ensure_divide(length, patch_size):
256
+ return max(round(length / patch_size) * patch_size, patch_size)
257
+
258
+
259
+ def find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=False):
260
+ width, height = original_size
261
+ if (width * height > scale_resolution * scale_resolution) or allow_upscale:
262
+ r = width / height
263
+ height = int(scale_resolution / math.sqrt(r))
264
+ width = int(height * r)
265
+ best_width = ensure_divide(width, patch_size)
266
+ best_height = ensure_divide(height, patch_size)
267
+ return (best_width, best_height)
268
+
269
+
270
+ def get_refine_size(original_size, grid, scale_resolution, patch_size, allow_upscale=False):
271
+ width, height = original_size
272
+ grid_x, grid_y = grid
273
+
274
+ refine_width = ensure_divide(width, grid_x)
275
+ refine_height = ensure_divide(height, grid_y)
276
+
277
+ grid_width = refine_width / grid_x
278
+ grid_height = refine_height / grid_y
279
+
280
+ best_grid_size = find_best_resize(
281
+ (grid_width, grid_height),
282
+ scale_resolution,
283
+ patch_size,
284
+ allow_upscale=allow_upscale,
285
+ )
286
+
287
+ refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y)
288
+
289
+ return refine_size
290
+
291
+
292
+ def split_to_patches(image, grid):
293
+ patches = []
294
+ width, height = image.size
295
+ grid_x = int(width / grid[0])
296
+ grid_y = int(height / grid[1])
297
+
298
+ for i in range(0, height, grid_y):
299
+ images = []
300
+ for j in range(0, width, grid_x):
301
+ box = (j, i, j + grid_x, i + grid_y)
302
+ patch = image.crop(box)
303
+ images.append(patch)
304
+ patches.append(images)
305
+
306
+ return patches
307
+
308
+
309
+ def get_grid_placeholder(tokenizer, grid, query_num, new_schema=False):
310
+ if new_schema:
311
+ image_placeholder = tokenizer.slice_start + tokenizer.unk_token * query_num + tokenizer.slice_end
312
+ else:
313
+ image_placeholder = tokenizer.im_start + tokenizer.unk_token * query_num + tokenizer.im_end
314
+
315
+ cols = grid[0]
316
+ rows = grid[1]
317
+ slices = []
318
+ for i in range(rows):
319
+ lines = []
320
+ for j in range(cols):
321
+ lines.append(image_placeholder)
322
+ slices.append("".join(lines))
323
+ if new_schema:
324
+ slice_placeholder = "\n".join(slices)
325
+ else:
326
+ slice_placeholder = tokenizer.slice_start + "\n".join(slices) + tokenizer.slice_end
327
+ return slice_placeholder
328
+
329
+
330
+ def reshape_by_patch(image_tensor, patch_size):
331
+ """
332
+ :param image_tensor: shape [3, H, W]
333
+ :param patch_size:
334
+ :return: [3, patch_size, HW/patch_size]
335
+ """
336
+ patches = torch.nn.functional.unfold(image_tensor, (patch_size, patch_size), stride=(patch_size, patch_size))
337
+
338
+ patches = patches.reshape(image_tensor.size(0), patch_size, patch_size, -1)
339
+ patches = patches.permute(0, 1, 3, 2).reshape(image_tensor.size(0), patch_size, -1)
340
+ return patches
341
+
342
+
343
+ def init_minicpmo_config(processor, config):
344
+ """Initialize MiniCPM-o specific configuration"""
345
+ minicpmo_config = {
346
+ "transform": build_transform(),
347
+ "patch_size": config.get("patch_size", 14),
348
+ "query_nums": config.get("query_nums", 64),
349
+ "slice_config": config.get(
350
+ "slice_config", {"max_slice_nums": 9, "patch_size": config.get("patch_size", 14), "scale_resolution": 448}
351
+ ),
352
+ "llm_type": config.get("llm_type", "qwen"),
353
+ "batch_vision": config.get("batch_vision", True),
354
+ }
355
+ return minicpmo_config
356
+
357
+
358
+ def process_minicpmo_data(
359
+ row_dict,
360
+ messages,
361
+ tokenizer,
362
+ minicpmo_config,
363
+ image_key,
364
+ max_prompt_length,
365
+ truncation,
366
+ apply_chat_template_kwargs,
367
+ logger,
368
+ ):
369
+ """Process data for MiniCPM-o model"""
370
+ if len(row_dict[image_key]) == 1:
371
+ multi_modal_data = {}
372
+ image = process_image(row_dict.pop(image_key)[0])
373
+ multi_modal_data["image"] = [image]
374
+ images_dict = {"<image>": image}
375
+ else:
376
+ raise NotImplementedError
377
+
378
+ model_inputs = preprocess(
379
+ images_dict,
380
+ messages,
381
+ tokenizer,
382
+ minicpmo_config["transform"],
383
+ query_nums=minicpmo_config["query_nums"],
384
+ slice_config=minicpmo_config["slice_config"],
385
+ llm_type=minicpmo_config["llm_type"],
386
+ patch_size=minicpmo_config["patch_size"],
387
+ batch_vision=minicpmo_config["batch_vision"],
388
+ max_length=max_prompt_length,
389
+ truncation=truncation,
390
+ apply_chat_template_kwargs=apply_chat_template_kwargs,
391
+ logger=logger,
392
+ )
393
+
394
+ raw_prompt = tokenizer.apply_chat_template(
395
+ messages, add_generation_prompt=True, tokenize=False, **(apply_chat_template_kwargs or {})
396
+ )
397
+ raw_prompt = raw_prompt.replace("<image>", "(<image>./</image>)")
398
+
399
+ return model_inputs, multi_modal_data, raw_prompt
400
+
401
+
402
+ class RLHFDataset(Dataset):
403
+ """
404
+ Load and preprocess RLHF data from Parquet files.
405
+
406
+ - Caches files locally.
407
+ - Reads into a HuggingFace Dataset and tokenizes prompts.
408
+ - Optionally handles images/videos via a ProcessorMixin.
409
+ - Filters prompts over a max length.
410
+ - Supports resuming from checkpoints.
411
+
412
+ Args:
413
+ data_files (str or list): Path(s) to Parquet file(s).
414
+ tokenizer (PreTrainedTokenizer): For the tokenization of text to token IDs.
415
+ config (DictConfig): Options like cache_dir, prompt_key, max_prompt_length, truncation, etc.
416
+ processor (ProcessorMixin, optional): Multimodal preprocessor for images/videos.
417
+ """
418
+
419
+ def __init__(
420
+ self,
421
+ data_files: str | list[str],
422
+ tokenizer: PreTrainedTokenizer,
423
+ config: DictConfig,
424
+ processor: Optional[ProcessorMixin] = None,
425
+ ):
426
+ if not isinstance(data_files, list | ListConfig):
427
+ data_files = [data_files]
428
+
429
+ self.data_files = copy.deepcopy(data_files)
430
+ self.original_data_files = copy.deepcopy(data_files) # use for resume
431
+ self.tokenizer = tokenizer
432
+ self.processor = processor
433
+ self.config = config
434
+
435
+ self.cache_dir = os.path.expanduser(config.get("cache_dir", "~/.cache/verl/rlhf"))
436
+ self.prompt_key = config.get("prompt_key", "prompt")
437
+ self.image_key = config.get("image_key", "images")
438
+ self.video_key = config.get("video_key", "videos")
439
+ self.max_prompt_length = config.get("max_prompt_length", 1024)
440
+ self.return_raw_chat = config.get("return_raw_chat", False)
441
+ self.return_full_prompt = config.get("return_full_prompt", False)
442
+ self.truncation = config.get("truncation", "error")
443
+ self.filter_overlong_prompts = config.get("filter_overlong_prompts", True)
444
+ self.apply_chat_template_kwargs = config.get("apply_chat_template_kwargs", {})
445
+
446
+ self.num_workers = config.get("filter_overlong_prompts_workers", max(1, os.cpu_count() // 4))
447
+ self.num_workers = min(self.num_workers, os.cpu_count())
448
+ self.use_shm = config.get("use_shm", False)
449
+ self.chat_template_func = config.get("chat_template_func", None)
450
+ self.need_tools_kwargs = config.get("need_tools_kwargs", False)
451
+ self.filter_prompts = config.get("filter_prompts", True)
452
+ self.serialize_dataset = False
453
+ self.minicpmo_config = init_minicpmo_config(self.processor, config)
454
+ self._download()
455
+ self._read_files_and_tokenize()
456
+
457
+ def _download(self, use_origin_parquet=False):
458
+ from verl.utils.fs import copy_to_local
459
+
460
+ data_files = self.data_files if not use_origin_parquet else self.original_data_files
461
+ for i, parquet_file in enumerate(data_files):
462
+ self.data_files[i] = copy_to_local(src=parquet_file, cache_dir=self.cache_dir, use_shm=self.use_shm)
463
+
464
+ def _read_files_and_tokenize(self):
465
+ dataframes = []
466
+ for parquet_file in self.data_files:
467
+ # read parquet files and cache
468
+ dataframe = datasets.load_dataset("parquet", data_files=parquet_file)["train"]
469
+ dataframes.append(dataframe)
470
+ self.dataframe: datasets.Dataset = datasets.concatenate_datasets(dataframes)
471
+
472
+ print(f"dataset len: {len(self.dataframe)}")
473
+
474
+ def resume_dataset_state(self):
475
+ self.serialize_dataset = not hasattr(self, "original_data_files")
476
+ # resume dataframe if not it's serialized in data.pt
477
+ if not self.serialize_dataset:
478
+ self._download(use_origin_parquet=True) # download and resume from original parquet files
479
+ self._read_files_and_tokenize()
480
+ else:
481
+ print(r"old dataloader ckpt file is used, please train from scratch for better ckpt performance")
482
+
483
+ def __len__(self):
484
+ return len(self.dataframe)
485
+
486
+ def _build_messages(self, example: dict):
487
+ return example.pop(self.prompt_key)
488
+
489
+ def __getitem__(self, item):
490
+ """
491
+ Note that we also return the raw_input_ids so that it can be combined with other chat template
492
+ """
493
+ row_dict: dict = self.dataframe[item]
494
+ messages = self._build_messages(row_dict)
495
+ model_inputs = {}
496
+
497
+ if self.processor is not None:
498
+ model_inputs, multi_modal_data, raw_prompt = process_minicpmo_data(
499
+ row_dict,
500
+ messages,
501
+ self.tokenizer,
502
+ self.minicpmo_config,
503
+ self.image_key,
504
+ self.max_prompt_length,
505
+ self.truncation,
506
+ self.apply_chat_template_kwargs,
507
+ logger,
508
+ )
509
+ input_ids = model_inputs.pop("input_ids")
510
+ attention_mask = model_inputs.pop("attention_mask")
511
+ position_ids = model_inputs.pop("position_ids")
512
+
513
+ # There's a trap here, multi_modal_inputs has to be a dict, not BatchFeature
514
+ row_dict["multi_modal_data"] = multi_modal_data
515
+ row_dict["multi_modal_inputs"] = dict(model_inputs)
516
+ else:
517
+ raw_prompt = self.tokenizer.apply_chat_template(
518
+ messages, add_generation_prompt=True, tokenize=False, **self.apply_chat_template_kwargs
519
+ )
520
+ model_inputs = self.tokenizer(raw_prompt, return_tensors="pt", add_special_tokens=False)
521
+ input_ids = model_inputs.pop("input_ids")
522
+ attention_mask = model_inputs.pop("attention_mask")
523
+ position_ids = compute_position_id_with_mask(attention_mask)
524
+
525
+ row_dict["input_ids"] = input_ids
526
+ row_dict["attention_mask"] = attention_mask
527
+ row_dict["position_ids"] = position_ids
528
+
529
+ raw_prompt_ids = self.tokenizer.encode(raw_prompt, add_special_tokens=False)
530
+ if len(raw_prompt_ids) > self.max_prompt_length:
531
+ if self.truncation == "left":
532
+ raw_prompt_ids = raw_prompt_ids[-self.max_prompt_length :]
533
+ elif self.truncation == "right":
534
+ raw_prompt_ids = raw_prompt_ids[: self.max_prompt_length]
535
+ elif self.truncation == "middle":
536
+ left_half = self.max_prompt_length // 2
537
+ right_half = self.max_prompt_length - left_half
538
+ raw_prompt_ids = raw_prompt_ids[:left_half] + raw_prompt_ids[-right_half:]
539
+ elif self.truncation == "error":
540
+ raise RuntimeError(f"Prompt length {len(raw_prompt_ids)} is longer than {self.max_prompt_length}.")
541
+
542
+ row_dict["raw_prompt_ids"] = raw_prompt_ids
543
+ # encode prompts without chat template
544
+ if self.return_raw_chat:
545
+ row_dict["raw_prompt"] = messages
546
+
547
+ # get prompts with chat template
548
+ if self.return_full_prompt:
549
+ row_dict["full_prompts"] = raw_prompt # array of strings
550
+
551
+ # add index for each prompt
552
+ index = row_dict.get("extra_info", {}).get("index", 0)
553
+ tools_kwargs = row_dict.get("extra_info", {}).get("tools_kwargs", {})
554
+ interaction_kwargs = row_dict.get("extra_info", {}).get("interaction_kwargs", {})
555
+ need_tools_kwargs = row_dict.get("extra_info", {}).get("need_tools_kwargs", self.need_tools_kwargs)
556
+ if need_tools_kwargs and not tools_kwargs:
557
+ logger.warning("tools_kwargs is empty for index {}, data source: {}", index, row_dict["data_source"])
558
+ row_dict["index"] = index
559
+ row_dict["tools_kwargs"] = tools_kwargs
560
+ row_dict["interaction_kwargs"] = interaction_kwargs
561
+ return row_dict
562
+
563
+ def __getstate__(self):
564
+ if not self.serialize_dataset:
565
+ state = self.__dict__.copy()
566
+
567
+ if "dataframe" in state:
568
+ del state["dataframe"]
569
+ return state
570
+
571
+ return self.__dict__.copy()
ICL/DAPO/verl-recipe/prime/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 PRIME team and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
ICL/DAPO/verl-recipe/prime/prime_core_algos.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 PRIME team and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+
17
+ import verl
18
+ import verl.utils.torch_functional as verl_F
19
+
20
+
+def compute_rloo_advantage_return(data: verl.DataProto, response_mask: torch.Tensor, n_samples, config):
+    # calculate rloo reward on different reward sources, and sum again
+    def masked_rloo(reward_tensor_original, mask_tensor):
+        reward_tensor = reward_tensor_original.clone()
+        reward_tensor[~mask_tensor] = 0
+        for start_pos in range(0, reward_tensor.shape[0], n_samples):
+            cur_rewards_mean = torch.cat(
+                [
+                    reward_tensor[pos : pos + 1][mask_tensor[pos : pos + 1]].mean(dim=0, keepdim=True)
+                    for pos in range(start_pos, start_pos + n_samples)
+                ],
+                dim=0,
+            )
+            cur_rewards_sum = cur_rewards_mean.sum()
+            cur_reward_baseline = cur_rewards_sum / (n_samples - 1)
+            reward_tensor[start_pos : start_pos + n_samples][mask_tensor[start_pos : start_pos + n_samples]] = (
+                reward_tensor[start_pos : start_pos + n_samples][mask_tensor[start_pos : start_pos + n_samples]]
+                * (n_samples / (n_samples - 1))
+                - cur_reward_baseline
+            )
+
+        return reward_tensor
+
+    reward_tensors = []
+
+    with torch.no_grad():
+        if "rm_scores" in data.batch.keys() and config.algorithm.reward_dpo_coef != 0.0:
+            reward_tensor = data.batch["rm_scores"]
+            reward_mask = response_mask.bool()
+
+            reward_tensors.append(masked_rloo(reward_tensor, reward_mask) * config.algorithm.reward_dpo_coef)
+
+        if "acc" in data.batch.keys() and config.algorithm.reward_gt_coef != 0.0:
+            reward_tensor = torch.zeros_like(response_mask, dtype=torch.float32)
+            reward_mask = torch.zeros_like(response_mask, dtype=torch.bool)
+
+            prompt_ids = data.batch["prompts"]
+            prompt_length = prompt_ids.shape[-1]
+            valid_response_length = data.batch["attention_mask"][:, prompt_length:].sum(-1)
+
+            reward_mask[
+                torch.arange(0, valid_response_length.shape[0], dtype=torch.long, device=valid_response_length.device),
+                valid_response_length - 1,
+            ] = True
+            reward_tensor[
+                torch.arange(0, valid_response_length.shape[0], dtype=torch.long, device=valid_response_length.device),
+                valid_response_length - 1,
+            ] = data.batch["acc"]
+
+            reward_tensors.append(masked_rloo(reward_tensor, reward_mask) * config.algorithm.reward_gt_coef)
+
+    final_reward_tensor = sum(reward_tensors)
+
+    returns = (final_reward_tensor * response_mask).flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
+
+    advantages = returns.clone()
+    advantages = verl_F.masked_whiten(advantages, response_mask)
+
+    return advantages, returns
+
+
+def compute_ce_dpo_loss_rm(token_level_scores, acc, response_mask, beta):
+    cur_scores = ((token_level_scores * response_mask).sum(dim=1) * beta).sigmoid()
+    cur_dpo_loss = torch.nn.functional.binary_cross_entropy(cur_scores, acc)
+    return cur_dpo_loss
+
+
+def compute_detach_dpo_loss_rm(token_level_scores, acc, Q_bc, acc_bc, response_mask, beta, bon_mode="none"):
+    # we always assume that the BoN size equals n_samples
+    # mode1: use acc as rm
+    # mode2: use Q as rm
+    cur_Q = (token_level_scores * response_mask).sum(dim=1) * beta
+    other_Q = torch.zeros_like(cur_Q)
+    for i in range(token_level_scores.shape[0]):
+        Q_chosen = Q_bc[i][acc_bc[i] < acc[i]] if acc[i] > 0 else Q_bc[i][acc_bc[i] > acc[i]]
+        if len(Q_chosen) > 0:
+            other_Q[i] = Q_chosen.mean() * beta
+        else:
+            other_Q[i] = 0
+    dpo_loss = -torch.log(torch.sigmoid((cur_Q - other_Q) * ((acc > 0).float() * 2 - 1)))
+    if bon_mode == "none":
+        dpo_loss = dpo_loss.mean()
+    else:
+        weight = torch.zeros_like(dpo_loss)
+        n_samples = acc_bc.shape[1]
+        if bon_mode == "bon_rm":
+            for i in range(token_level_scores.shape[0]):
+                weight[i] = n_samples * torch.pow((Q_bc[i] * beta <= cur_Q[i]).float().mean(), n_samples - 1)
+        elif bon_mode == "bon_acc":
+            for i in range(token_level_scores.shape[0]):
+                weight[i] = n_samples * torch.pow((acc_bc[i] <= acc[i]).float().mean(), n_samples - 1)
+        else:
+            raise NotImplementedError
+        dpo_loss = (dpo_loss * weight).sum()
+
+    return dpo_loss
+
+
+def compute_dpo_accuracy(token_level_scores, acc, response_mask, n_samples):
+    dpo_acc = []
+    for start_id in range(0, token_level_scores.shape[0], n_samples):
+        cur_scores = (
+            token_level_scores[start_id : start_id + n_samples] * response_mask[start_id : start_id + n_samples]
+        ).sum(dim=1)
+
+        def get_upper_triangle(tensor_x):
+            diff_matrix = tensor_x.unsqueeze(1) - tensor_x.unsqueeze(0)
+            upper_tri_indices = torch.triu(torch.ones_like(diff_matrix).bool(), diagonal=1)
+            return diff_matrix[upper_tri_indices]
+
+        cur_acc_diff = get_upper_triangle(acc[start_id : start_id + n_samples])  # in range [-1,1]
+        cur_score_diff = get_upper_triangle(cur_scores)  # in R
+        cur_score_prediction = (cur_score_diff > 0).float()  # in [0,1]
+        if cur_acc_diff.abs().sum() == 0:
+            cur_acc = torch.zeros_like(cur_score_prediction[0]) + 0.5
+        else:
+            cur_acc = (
+                ((cur_score_diff > 0) == (cur_acc_diff > 0)).float() * cur_acc_diff.abs()
+            ).sum() / cur_acc_diff.abs().sum()
+
+        dpo_acc.append(cur_acc.unsqueeze(0))
+
+    return torch.cat(dpo_acc, dim=0).mean()
+
+
+def compute_dpo_abs_accuracy(token_level_scores, acc, response_mask, n_samples):
+    return (torch.sign((token_level_scores * response_mask).sum(dim=-1)) == torch.sign(acc * 2 - 1)).float().mean()
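The leave-one-out baseline in `masked_rloo` above is easy to miss inside the masked tensor indexing. The following standalone sketch (with hypothetical toy rewards, not part of the recipe) checks that the closed form `r * n/(n-1) - sum(r)/(n-1)` equals "each reward minus the mean of the other n-1 rewards":

```python
import torch

n = 4
r = torch.tensor([1.0, 0.0, 0.0, 1.0])  # toy per-sample rewards for one prompt group

# direct leave-one-out advantage: r_i minus the mean of the other samples
loo = torch.stack([r[i] - torch.cat([r[:i], r[i + 1:]]).mean() for i in range(n)])

# closed form used in masked_rloo above
closed = r * (n / (n - 1)) - r.sum() / (n - 1)

assert torch.allclose(loo, closed)  # both give tensor([ 0.6667, -0.6667, -0.6667,  0.6667])
```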
ICL/DAPO/verl-recipe/prime/run_prime_qwen_code.sh ADDED
@@ -0,0 +1,61 @@
+set -x
+
+
+# download from https://huggingface.co/datasets/PRIME-RL/Eurus-2-RL-Data
+code_train_path=$HOME/data/code/train.parquet
+code_test_path=$HOME/data/code/test.parquet
+
+train_files="['$code_train_path']"
+test_files="['$code_test_path']"
+
+model_path=PRIME-RL/Eurus-2-7B-SFT
+# model_path=Qwen/Qwen2.5-0.5B-Instruct
+
+python3 -m recipe.prime.main_prime \
+    data.train_files="$train_files" \
+    data.val_files="$test_files" \
+    data.train_batch_size=64 \
+    data.val_batch_size=6312 \
+    data.max_prompt_length=1024 \
+    data.max_response_length=3072 \
+    data.filter_overlong_prompts=True \
+    data.filter_accuracy=True \
+    data.accuracy_lower_bound=0.2 \
+    data.accuracy_upper_bound=0.8 \
+    data.oversample_factor=4 \
+    actor_rollout_ref.model.path=$model_path \
+    actor_rollout_ref.actor.optim.lr=5e-7 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.fsdp_config.param_offload=True \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+    actor_rollout_ref.actor.use_kl_loss=False \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.n=4 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
+    algorithm.adv_estimator=rloo \
+    algorithm.use_kl_in_reward=True \
+    algorithm.kl_penalty=kl \
+    algorithm.kl_ctrl.kl_coef=0.001 \
+    reward_model.model.path=$model_path \
+    reward_model.micro_batch_size_per_gpu=1 \
+    reward_model.model.update=before \
+    reward_model.model.beta_train=0.05 \
+    reward_model.model.optim.lr=1e-6 \
+    reward_model.model.optim.grad_clip=10.0 \
+    reward_model.model.input_tokenizer=null \
+    reward_model.mini_batch_size=64 \
+    trainer.val_before_train=False \
+    trainer.logger='["console","wandb"]' \
+    trainer.project_name='prime_example' \
+    trainer.experiment_name='Eurus-2-7B-SFT-code' \
+    trainer.n_gpus_per_node=8 \
+    trainer.nnodes=1 \
+    trainer.save_freq=64 \
+    trainer.test_freq=64 \
+    trainer.total_epochs=15 $@
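The `data.filter_accuracy`, `data.accuracy_lower_bound`/`upper_bound`, and `data.oversample_factor` options above drive PRIME's online prompt filtering: prompts are oversampled, each prompt's group of rollouts is scored, and only prompts whose mean accuracy falls strictly between the bounds are kept. A minimal sketch of the idea (a hypothetical helper, not the recipe's actual implementation):

```python
import torch

def filter_by_accuracy(acc: torch.Tensor, n_samples: int, lower: float = 0.2, upper: float = 0.8) -> torch.Tensor:
    """Return a boolean mask over prompt groups whose mean accuracy lies in (lower, upper)."""
    group_acc = acc.view(-1, n_samples).float().mean(dim=1)  # one mean accuracy per prompt
    return (group_acc > lower) & (group_acc < upper)

acc = torch.tensor([1, 0, 0, 1, 1, 1, 1, 1])  # two prompts, n_samples=4
print(filter_by_accuracy(acc, n_samples=4))   # tensor([ True, False]): all-correct prompts are dropped
```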
ICL/DAPO/verl-recipe/r1/run_r1_distill_qwen.sh ADDED
@@ -0,0 +1,33 @@
+MODEL_PATH=Qwen/DeepSeek-R1-Distill-Qwen-1.5B
+DATA_PATH=/workspace/datasets/r1_bench
+
+# Eval Data Process
+python3 -m recipe.r1.data_process \
+    --local_dir $DATA_PATH \
+    --tasks all
+
+# Generation
+python3 -m verl.trainer.main_generation \
+    trainer.nnodes=1 \
+    trainer.n_gpus_per_node=8 \
+    data.path=$DATA_PATH/test.parquet \
+    data.prompt_key=prompt \
+    data.batch_size=1024 \
+    data.n_samples=8 \
+    data.output_path=$DATA_PATH/test-output-8.parquet \
+    model.path=$MODEL_PATH \
+    rollout.temperature=0.6 \
+    rollout.top_p=0.95 \
+    rollout.prompt_length=1024 \
+    rollout.response_length=32768 \
+    rollout.tensor_model_parallel_size=1 \
+    rollout.gpu_memory_utilization=0.9 \
+    rollout.max_num_batched_tokens=65536
+
+# Evaluation
+python3 -m recipe.r1.main_eval \
+    data.path=$DATA_PATH/test-output-8.parquet \
+    data.prompt_key=prompt \
+    data.response_key=responses \
+    custom_reward_function.path=recipe/r1/reward_score.py \
+    custom_reward_function.name=reward_func
ICL/DAPO/verl-recipe/r1_ascend/Dockerfile.vllm_ascend.mindspeed.deepseekV3 ADDED
@@ -0,0 +1,82 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM quay.io/ascend/cann:8.2.rc1-a3-openeuler22.03-py3.11
+
+ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
+ARG COMPILE_CUSTOM_KERNELS=1
+
+# Define environments
+ENV DEBIAN_FRONTEND=noninteractive
+ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
+
+RUN yum install -y patch
+
+WORKDIR /workspace
+
+RUN pip config set global.index-url ${PIP_INDEX_URL}
+
+# Install torch and torch-npu
+RUN python3 -m pip install torch==2.5.1 torch-npu==2.5.1.post1
+
+# Compile/Install apex
+RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    source /usr/local/Ascend/nnal/asdsip/set_env.sh && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    git clone -b master https://gitcode.com/ascend/apex.git && \
+    cd apex/ && bash scripts/build.sh --python=3.11 && \
+    cd apex/dist/ && \
+    python3 -m pip install --upgrade apex-0.1+ascend-*.whl
+
+# verl
+RUN git clone https://github.com/volcengine/verl.git
+
+# MindSpeed
+RUN git clone https://gitcode.com/Ascend/MindSpeed.git && \
+    cd MindSpeed && \
+    git checkout f6688 && \
+    pip install -r requirements.txt && \
+    cp -r mindspeed ../verl
+
+# Install vLLM
+RUN git clone https://github.com/vllm-project/vllm.git && \
+    cd vllm && \
+    git checkout v0.9.1 && \
+    cp -r vllm ../verl
+# On x86, triton is installed by vLLM. On Ascend, triton doesn't work correctly, so we uninstall it.
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /workspace/vllm/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip uninstall -y triton && \
+    python3 -m pip cache purge
+
+# Install vllm-ascend
+RUN git clone https://github.com/vllm-project/vllm-ascend.git && \
+    cd vllm-ascend && \
+    git checkout 8c7bc45 && \
+    cp -r vllm_ascend ../verl
+
+# Append the `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
+RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
+    python3 -m pip install -v -e /workspace/vllm-ascend/ --exists-action=i --extra-index-url https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip cache purge
+
+# Install modelscope (for fast download), ray (for multinode), Megatron-LM, and others
+RUN python3 -m pip install modelscope ray "transformers<4.54.0" mathruler cbor2 && \
+    pip install pybase64 fastapi zmq uvicorn openai msgspec blake3 py-cpuinfo gguf openai-harmony && \
+    pip install git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.1 && \
+    python3 -m pip cache purge
+
+CMD ["/bin/bash"]
ICL/DAPO/verl-recipe/r1_ascend/README.md ADDED
@@ -0,0 +1,119 @@
+# DeepSeek-R1-Zero on Ascend NPU
+This recipe provides a sample for fine-tuning the Deepseek-V3-Base model using Reinforcement Learning from Human Feedback (RLHF) on Ascend NPUs, specifically utilizing the GRPO algorithm with rule-based rewards on the deepscaler dataset.
+
+## Implementation Details
+To implement RL training for the DeepSeek model on Ascend NPUs, this example includes the following key code additions and modifications:
+- We implemented a simple rule-based reward function in `deepscaler.py`, referencing `verl/utils/reward_score/gsm8k.py`.
+- We provided a dataset file conversion script, `json_to_parquet.py`, which adds a template to the prompts to stimulate model thinking during the data file format conversion.
+- Due to potentially incomplete memory offloading during sleep operations for vLLM on NPUs, we added patches to manually handle the offloading and onloading of the rollout model and KV cache on NPUs. The related code is in `vllm_rollout_spmd.py` and `megatron_workers.py`.
+- To enable vLLM to utilize all ranks for expert parallelism, support for vLLM's data parallelism was necessary. For this purpose, we added patches to construct the correct data parallel communication group. The related code is in `vllm_parallel_state.py` and `vllm_rollout_spmd.py`. Additionally, the `VLLM_DP_SIZE` environment variable must be correctly set to `world_size / vllm_tp_size` (see the sketch after this list).
+- The MindSpeed training framework for NPUs invalidates torch.compile to avoid compilation failures during training, but this prevents its use for accelerating inference. To resolve this, we added patches that allow compilation during inference but not during training. The related code is in `megatron_workers.py`.
+- During RL training, multiple KV cache scheduling operations in vLLM on NPUs could lead to inconsistent memory allocation and memory corruption. The fix for this issue is patched in `engine_core.py`.
+
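To make the `VLLM_DP_SIZE` relationship concrete, here is a minimal bash sketch (the 256-rank figure matches the deployment described below; the variable names are illustrative):

```bash
# Hypothetical example: 256 total ranks with vLLM TP=2 gives DP=128
WORLD_SIZE=256
VLLM_TP_SIZE=2
export VLLM_DP_SIZE=$((WORLD_SIZE / VLLM_TP_SIZE))  # 128, as set in ray_start_grpo_npu.sh
```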
+By searching globally for `# NPU-ADAPTATION`, you can see the actual changes made by the patch code.
+
+For more technical details, please refer to [the Technical Report (in Chinese)](https://gitcode.com/cann/cann-recipes-train/blob/master/docs/deepseek/deepseek_rl_train_optimization.md).
+
+## Training Details
+### Hyperparameters
+This example fine-tunes the DeepSeek-671B Base model on the deepscaler dataset using a combination of simple format rewards and answer accuracy rewards. The key hyperparameters are as follows:
+
+| iteration | learning rate | global batchsize | n_samples | temperature | kl-coef | prompt_max_len | response_max_len | rule reward | reward model |
+|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
+| 70 | 1e-6 (constant) | 512 | 16 | 1.0 | 0.001 | 1024 | 2048 | format + acc | - |
+
+### Resource Allocation and Performance
+This recipe was trained on an Ascend Atlas 800T A3 hyper-node server, utilizing 128 A3 NPUs, which is equivalent to 256 accelerator ranks. The specific deployment strategy is as follows:
+
+| Rollout Deployment | Actor Deployment | Reference Deployment | Offload Strategy |
+|:----:|:----:|:----:|:----:|
+| TP2 EP256 | EP32 PP8 | Same as Actor | Full offload, optimizer utilizes the [Mindspeed Swap Optimizer feature](https://gitee.com/ascend/MindSpeed/blob/master/docs/features/swap-optimizer.md) |
+
+The performance metrics for one training step are shown below (throughput varies with the model's response length during training):
+
+| step | prompt_len_mean | response_len_mean | timing_step (s) | throughput (tps/A3) | timing_gen (s) | timing_reward (s) | timing_old_prob (s) | timing_ref_prob (s) | timing_update (s) |
+|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
+| 2 | 175.1 | 1385.0 | 1044.8 | 95.5 | 482.2 | 20.4 | 105.5 | 92.7 | 342.9 |
+
+### Training Metrics
+<div align="center">
+<img src="./figures/rewards.png" width="33%" />
+<img src="./figures/response_len.png" width="33%" />
+<img src="./figures/val_score.png" width="33%" />
+</div>
+
+## Quick Start
+
+### Environment Setup
+For setting up the Ascend NPU environment for verl, please refer to [ascend_quick_start.rst (in Chinese)](../../docs/ascend_tutorial/ascend_quick_start.rst).
+
+Alternatively, you can use the provided Dockerfile to build the project's runtime environment locally: `docker build -f Dockerfile.vllm_ascend.mindspeed.deepseekV3 -t REPOSITORY:TAG ./`
+
+Prepare the source code with the following steps:
+```bash
+# Clone verl
+git clone https://github.com/volcengine/verl.git
+
+# Clone and setup vLLM (v0.9.1)
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+git checkout v0.9.1
+cp -r vllm ../verl
+cd ..
+
+# Clone and setup vLLM-Ascend (commit 8c7bc45)
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+git checkout 8c7bc45
+cp -r vllm_ascend ../verl
+cd ..
+
+# Clone and setup MindSpeed (commit f6688)
+git clone https://gitcode.com/Ascend/MindSpeed.git
+cd MindSpeed
+git checkout f6688
+cp -r mindspeed ../verl
+cd ..
+
+# Install Megatron-LM.core and other dependencies
+pip install git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.1
+pip install mathruler
+```
+
+### Prepare the Training Dataset
+This example uses the deepscaler dataset. Prepare it as follows:
+- Download the dataset [JSON file](https://huggingface.co/datasets/agentica-org/DeepScaleR-Preview-Dataset/blob/main/deepscaler.json).
+- Generate the `train.parquet` and `test.parquet` files and place them in the `./data/deepscaler` directory:
+```bash
+# Execute from the verl project directory
+python recipe/r1_ascend/json_to_parquet.py --output_dir ./data/deepscaler --json_path path/to/deepscaler.json --train_data_ratio 0.9
+```
+
+The processed prompts used during training will include a specific template, for example: `A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. Put your final answer within \boxed{}. <|User|>{problem}<|Assistant|>`
+
+### Prepare Model Weights
+Prepare the DeepSeek-V3-Base model weights as follows:
+- Place the model configuration files (excluding the weights) into the `./DeepSeek-V3-hf` directory. The `config.json` file needs to be replaced to remove quantization and MTP configurations. Refer to [this link (in Chinese)](https://gitcode.com/cann/cann-recipes-train/blob/master/rl_train/deepseek/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87) for details.
+- Download the FP8 model weights from [HuggingFace](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base) or [ModelScope](https://www.modelscope.cn/models/deepseek-ai/DeepSeek-V3-Base). Ensure the target disk has over 650GB of free space.
+- Convert the FP8 weights to BF16 weights. Refer to [this link (in Chinese)](https://gitcode.com/cann/cann-recipes-train/blob/master/rl_train/deepseek/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87) for instructions. This step requires over 1300GB of free space on the target disk.
+
+This example uses pre-sharded distributed weights. Therefore, the following weight sharding step is also required:
+- The distributed weights will be stored in `ckpts/DeepseekV3-dist-ckpts`.
+- Use the script `verl/scripts/converter_hf_to_mcore.py` to shard the original BF16 weights into distributed weights. In practice, we found that 2TB of CPU RAM was insufficient for sharding the 671B model. Therefore, we adapted this script for expert parallelism and performed the weight sharding using a distributed strategy of EP8 PP8 across 64 NPUs.
+
+### Other Code Modifications
+In practice, to achieve the above results for on-policy RL training, we need to replace the `old_log_prob = data["old_log_probs"]` code in `verl/workers/actor/megatron_actor.py` with:
+
+```python
+on_policy = self.config.ppo_epochs == 1
+if on_policy:
+    old_log_prob = log_prob.detach()  # guarantee exact numerical equality
+else:
+    old_log_prob = data["old_log_probs"]
+```
+
+### Execute RL Fine-tuning
+```bash
+# Start the RL fine-tuning for DeepSeekV3 from the verl directory
+bash ./recipe/r1_ascend/ray_start_grpo_npu.sh
+```
ICL/DAPO/verl-recipe/r1_ascend/README_zh.md ADDED
@@ -0,0 +1,119 @@
+# DeepSeek-R1-Zero on Ascend NPU
+This recipe is a sample for RLHF post-training of the Deepseek-V3-Base model on NPUs, based on GRPO with rule-based rewards, using the deepscaler dataset.
+
+## Implementation Details
+To enable RL training of the DeepSeek model on Ascend NPUs, this sample adds the following code:
+- Referencing `verl/utils/reward_score/gsm8k.py`, we implemented a simple rule-based reward function in `deepscaler.py`.
+- We provide the dataset conversion script `json_to_parquet.py`, which adds a template that stimulates model thinking to the prompts while converting the data file format.
+- vLLM sleep on NPUs may not offload memory completely, so patches were added to manually offload and reload the rollout model and KV cache on NPUs. The related code is in `vllm_rollout_spmd.py` and `megatron_workers.py`.
+- For vLLM to use all cards for expert parallelism, vLLM data parallelism must be supported. Patches were added to build the correct DP communication group; the related code is in `vllm_parallel_state.py` and `vllm_rollout_spmd.py`. In addition, the `VLLM_DP_SIZE` environment variable must be set to `world_size / vllm_tp_size`.
+- The MindSpeed training framework on NPUs disables torch.compile to avoid compile failures on the training side, which also prevents using torch.compile to accelerate inference. To solve this, this sample adds patches so that compilation happens during inference but not during training. The related code is in `megatron_workers.py`.
+- During RL training, repeated vLLM KV cache scheduling on NPUs may cause inconsistent memory allocation and memory corruption; the fix is patched in `engine_core.py`.
+
+By searching globally for `# NPU-ADAPTATION`, you can see the actual changes made by the patch code.
+
+For more technical details, see the [technical report](https://gitcode.com/cann/cann-recipes-train/blob/master/docs/deepseek/deepseek_rl_train_optimization.md).
+
+## Training Details
+### Hyperparameters
+
+This sample trains the DeepSeek-671B Base model on the deepscaler dataset with a simple format reward plus an answer-accuracy reward. The hyperparameters are as follows:
+
+| iterations | learning rate | gbs | n_samples | temperature | kl-coef | prompt length | response length | rule reward | reward model |
+|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
+| 70 | 1e-6 (constant) | 512 | 16 | 1.0 | 0.001 | 1024 | 2048 | format + acc | - |
+
+### Resources and Performance
+This sample was trained on an Ascend Atlas 800T A3 hyper-node server using 128 A3 NPUs, equivalent to 256 accelerator cards. The deployment is as follows:
+
+| Rollout deployment | Actor deployment | Reference deployment | Offload strategy |
+|:----:|:----:|:----:|:----:|
+| TP2 EP256 | EP32 PP8 | Same as Actor | Full offload; the optimizer uses the [MindSpeed swap-optimizer feature](https://gitee.com/ascend/MindSpeed/blob/master/docs/features/swap-optimizer.md) |
+
+Per-step training performance is shown below (throughput changes as the model's response length evolves during training):
+
+| step | mean prompt length | mean response length | step time (s) | throughput (tps/A3) | gen time (s) | reward time (s) | old_prob time (s) | ref_prob time (s) | update time (s) |
+|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
+| 2 | 175.1 | 1385.0 | 1044.8 | 95.5 | 482.2 | 20.4 | 105.5 | 92.7 | 342.9 |
+
+### Training Curves
+<div align="center">
+<img src="./figures/rewards.png" width="33%" />
+<img src="./figures/response_len.png" width="33%" />
+<img src="./figures/val_score.png" width="33%" />
+</div>
+
+## Quick Start
+
+### Environment Setup
+To set up the NPU environment for verl, refer to [ascend_quick_start.rst](../../docs/ascend_tutorial/ascend_quick_start.rst).
+
+Alternatively, you can build the project's runtime environment locally with the provided Dockerfile: `docker build -f Dockerfile.vllm_ascend.mindspeed.deepseekV3 -t REPOSITORY:TAG ./`
+
+Prepare the source code for this sample as follows:
+```bash
+# verl
+git clone https://github.com/volcengine/verl.git
+
+# vLLM (v0.9.1)
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+git checkout v0.9.1
+cp -r vllm ../verl
+cd ..
+
+# vLLM-Ascend (v0.9.1-dev)
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+git checkout 8c7bc45
+cp -r vllm_ascend ../verl
+cd ..
+
+# MindSpeed (commit-id: f6688)
+git clone https://gitcode.com/Ascend/MindSpeed.git
+cd MindSpeed
+git checkout f6688
+cp -r mindspeed ../verl
+cd ..
+
+# Megatron-LM.core and others
+pip install git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.1
+pip install mathruler
+```
+
+### Prepare the Training Dataset
+This sample uses the deepscaler dataset. Prepare it as follows:
+- Download the dataset [JSON file](https://huggingface.co/datasets/agentica-org/DeepScaleR-Preview-Dataset/blob/main/deepscaler.json).
+- Generate the `train.parquet` and `test.parquet` files and place them under `./data/deepscaler`:
+
+```bash
+# Run from the verl project directory
+python recipe/r1_ascend/json_to_parquet.py --output_dir ./data/deepscaler --json_path path/to/deepscaler.json --train_data_ratio 0.9
+```
+
+The processed prompts used in training will contain a template, for example: `A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. Put your final answer within \boxed{}. <|User|>{problem}<|Assistant|>`
+
+### Prepare Model Weights
+Prepare the DeepSeek-V3-Base model weights as follows:
+- Place the model configuration files (without the weights) into the `./DeepSeek-V3-hf` directory, and replace `config.json` to remove the quantization and MTP configurations. See [this link](https://gitcode.com/cann/cann-recipes-train/blob/master/rl_train/deepseek/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87) for this step.
+- Download the FP8 model weights: [HuggingFace](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base), [ModelScope](https://www.modelscope.cn/models/deepseek-ai/DeepSeek-V3-Base). This step requires more than 650GB of free space on the target disk.
+- Convert the FP8 weights to BF16 weights; see [this link](https://gitcode.com/cann/cann-recipes-train/blob/master/rl_train/deepseek/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87). This step requires more than 1300GB of free space on the target disk.
+
+This sample uses pre-sharded distributed weights, so the following weight-sharding steps are also required:
+- Store the distributed weights under `ckpts/DeepseekV3-dist-ckpts`.
+- Use `verl/scripts/converter_hf_to_mcore.py` to shard the original BF16 weights into distributed weights. In practice we found that 2TB of CPU RAM was not enough to shard the 671B model, so we adapted the script for expert parallelism and sharded the weights on 64 NPUs with an EP8 PP8 distributed strategy.
+
+### Other Code Modifications
+In practice, to obtain the on-policy training results above, we replaced the code segment `old_log_prob = data["old_log_probs"]` in `verl/workers/actor/megatron_actor.py` with the following code:
+```python
+on_policy = self.config.ppo_epochs == 1
+if on_policy:
+    old_log_prob = log_prob.detach()  # guarantee exact numerical equality
+else:
+    old_log_prob = data["old_log_probs"]
+```
+
+### Run RL Post-Training
+```bash
+# Start DeepSeekV3 RL post-training from the verl directory
+bash ./recipe/r1_ascend/ray_start_grpo_npu.sh
+```
ICL/DAPO/verl-recipe/r1_ascend/ray_start_grpo_npu.sh ADDED
@@ -0,0 +1,82 @@
+ray stop --force
+
+export RAY_DEDUP_LOGS=0   # 0: disable Ray log deduplication; 1: enable it
+export HYDRA_FULL_ERROR=1 # display the full error stack
+
+ulimit -n 32768
+mkdir -p logs
+
+NNODES=16         # number of nodes
+NPUS_PER_NODE=16  # number of NPUs on each node
+export WORLD_SIZE=$(($NNODES*$NPUS_PER_NODE))
+
+RAY_START_PORT=6766
+RAY_DASHBOARD_PORT=8260
+
+MASTER_ADDR="IP FOR MASTER NODE"               # modify to the IP of the master node
+SOCKET_IFNAME="SOCKET IFNAME FOR CURRENT NODE" # modify to the communication NIC of the current node
+# obtain the current node IP
+CURRENT_IP=$(ifconfig $SOCKET_IFNAME | grep -Eo 'inet (addr:)?([0-9]{1,3}\.){3}[0-9]{1,3}' | awk '{print $NF}')
+export MASTER_PORT=29444
+export HCCL_IF_BASE_PORT=64247
+export TP_SOCKET_IFNAME=$SOCKET_IFNAME
+export HCCL_SOCKET_IFNAME=$SOCKET_IFNAME
+export GLOO_SOCKET_IFNAME=$SOCKET_IFNAME
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"
+export TASK_QUEUE_ENABLE=2 # enable level-2 optimization of the Ascend operator dispatch queue
+export HCCL_BUFFSIZE=300   # the buffer size of HCCL
+
+export HCCL_CONNECT_TIMEOUT=600
+export HCCL_EXEC_TIMEOUT=600
+
+export ASCEND_LAUNCH_BLOCKING=0 # for debugging; seriously hurts performance when enabled, but gives accurate error stacks
+
+export VLLM_USE_V1=1              # use the V1 engine of vLLM
+export VLLM_ENABLE_GRAPH_MODE=1   # enable vLLM graph mode
+export HCCL_OP_EXPANSION_MODE=AIV # enable the AIV communication mode
+export VLLM_ENABLE_MC2=1          # enable MC2 communication
+export VLLM_DP_SIZE=128           # the DP size of vLLM; tied to the number of vLLM instances
+
+# with the vLLM log level set to INFO, enabling this prints the prefill and decode times
+export VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE=0
+
+if [ "$MASTER_ADDR" = "$CURRENT_IP" ]; then
+    # start the master node
+    ray start --head --port=$RAY_START_PORT --dashboard-host=0.0.0.0 --node-ip-address=$CURRENT_IP --dashboard-port=$RAY_DASHBOARD_PORT --resources='{"NPU": '$NPUS_PER_NODE'}'
+
+    while true; do
+        ray_status_output=$(ray status)
+        npu_count=$(echo "$ray_status_output" | grep -oP '(?<=/)\d+\.\d+(?=\s*NPU)' | head -n 1)
+        npu_count_int=$(echo "$npu_count" | awk '{print int($1)}')
+        device_count=$((npu_count_int / $NPUS_PER_NODE))
+
+        # check whether device_count equals NNODES
+        if [ "$device_count" -eq "$NNODES" ]; then
+            echo "Ray cluster is ready with $device_count devices (from $npu_count NPU resources), starting Python script."
+            ray status
+            bash ./recipe/r1_ascend/run_deepseekv3_671b_grpo_megatron_npu.sh
+            break
+        else
+            echo "Waiting for Ray to allocate $NNODES devices. Current device count: $device_count"
+            sleep 5
+        fi
+    done
+else
+    # worker nodes keep trying to register with the master node until they succeed
+    while true; do
+        # try to connect to the Ray cluster
+        ray start --address="$MASTER_ADDR:$RAY_START_PORT" --resources='{"NPU": '$NPUS_PER_NODE'}' --node-ip-address=$CURRENT_IP
+
+        # check if the connection succeeded
+        ray status
+        if [ $? -eq 0 ]; then
+            echo "Successfully connected to the Ray cluster!"
+            break
+        else
+            echo "Failed to connect to the Ray cluster. Retrying in 5 seconds..."
+            sleep 5
+        fi
+    done
+fi
@@ -0,0 +1,347 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from
+# https://github.com/volcengine/verl/blob/main/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
+
+import logging
+import os
+from typing import Generator
+
+import torch
+import torch.distributed
+from omegaconf import ListConfig
+from torch.distributed.device_mesh import DeviceMesh
+from vllm import LLM, SamplingParams
+from vllm.config import CompilationConfig, CompilationLevel
+
+from verl.third_party.vllm import VLLM_SLEEP_LEVEL
+from verl.utils.device import get_device_name
+from verl.utils.memory_utils import aggressive_empty_cache
+from verl.workers.config import HFModelConfig, RolloutConfig
+from verl.workers.rollout.vllm_rollout import vLLMRollout as vLLMRolloutBase
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+
+class vLLMRollout(vLLMRolloutBase):
+    def __init__(
+        self,
+        config: RolloutConfig,
+        model_config: HFModelConfig,
+        device_mesh: DeviceMesh,
+    ):
+        self.config = config
+        self.model_config = model_config
+        self.device_mesh = device_mesh
+        # NPU-ADAPTATION: import vLLM-Ascend patch
+        from recipe.r1_ascend import engine_core  # noqa: F401
+        from vllm_ascend.patch import (
+            platform,  # noqa: F401
+            worker,  # noqa: F401
+        )
+        # NPU-ADAPTATION END
+
+        if config.layered_summon:
+            self.sleep_level = 1
+        else:
+            self.sleep_level = VLLM_SLEEP_LEVEL
+
+        model_path = model_config.local_path
+        tokenizer = model_config.tokenizer
+        model_hf_config = model_config.hf_config
+        trust_remote_code = model_config.trust_remote_code
+        self.lora_kwargs = (
+            {"enable_lora": True, "max_loras": 1, "max_lora_rank": model_config.lora_rank}
+            if model_config.lora_rank > 0
+            else {}
+        )
+
+        tensor_parallel_size = self.config.get("tensor_model_parallel_size", 1)
+        assert tensor_parallel_size <= torch.distributed.get_world_size(), (
+            "tensor parallel size should be less than or equal to the world size"
+        )
+        max_num_batched_tokens = self.config.get("max_num_batched_tokens", 8192)
+
+        # NPU-ADAPTATION: when VLLM_DP_SIZE is configured, the DP communication group needs to be
+        # explicitly initialized
+        if int(os.environ.get("VLLM_DP_SIZE", "1")) > 1:
+            from recipe.r1_ascend.vllm_parallel_state import init_parallel_state
+
+            init_parallel_state(tensor_parallel_size)
+        # NPU-ADAPTATION END
+
+        rope_scaling_config = getattr(model_hf_config, "rope_scaling", None)
+        if not rope_scaling_config:
+            max_position_embeddings = None
+            if hasattr(model_hf_config, "max_position_embeddings"):
+                max_position_embeddings = model_hf_config.max_position_embeddings
+            elif hasattr(model_hf_config, "llm_config") and hasattr(
+                model_hf_config.llm_config, "max_position_embeddings"
+            ):
+                max_position_embeddings = model_hf_config.llm_config.max_position_embeddings
+            elif hasattr(model_hf_config, "text_config") and hasattr(
+                model_hf_config.text_config, "max_position_embeddings"
+            ):
+                max_position_embeddings = model_hf_config.text_config.max_position_embeddings
+            if max_position_embeddings is None:
+                raise ValueError("max_position_embeddings not found in model_hf_config")
+            assert max_position_embeddings >= config.prompt_length + config.response_length, (
+                "model context length should be greater than total sequence length"
+            )
+        else:
+            # handle the case where there's a length extension factor
+            # see https://qwen.readthedocs.io/en/latest/deployment/vllm.html#extended-context-support
+            # for using yarn as an example
+            rope_scaling_factor = rope_scaling_config.get("factor", 1.0)
+
+            assert (
+                model_hf_config.max_position_embeddings * rope_scaling_factor
+                >= config.prompt_length + config.response_length
+            ), (
+                "model context length should be greater than total sequence length, "
+                + f"got rope_scaling_factor={rope_scaling_factor} and "
+                + f"max_position_embeddings={model_hf_config.max_position_embeddings}"
+            )
+
+        max_model_len = int(config.max_model_len or config.prompt_length + config.response_length)
+
+        load_format = "dummy" if config.load_format.startswith("dummy") else config.load_format
+
+        # copy it to avoid secretly modifying the engine config
+        engine_kwargs = config.get("engine_kwargs", {}).get("vllm", {}) or {}
+
+        # For each vLLM engine parameter,
+        # - `None` means not setting it, so we pop it, and leave it to the vLLM default value
+        #   (which can vary across different vLLM versions);
+        # - Otherwise it's the desired value we want to explicitly set.
+        engine_kwargs = {key: val for key, val in engine_kwargs.items() if val is not None}
+        if config.get("limit_images", None):  # support for multi-image data
+            engine_kwargs["limit_mm_per_prompt"] = {"image": config.get("limit_images")}
+
+        compilation_config = {}
+
+        cudagraph_capture_sizes = config.get("cudagraph_capture_sizes")
+        # enforce_eager must be False to use cudagraph
+        if not config.enforce_eager and cudagraph_capture_sizes:
+            if isinstance(cudagraph_capture_sizes, ListConfig):
+                compilation_config["compilation_config"] = CompilationConfig(
+                    level=CompilationLevel.PIECEWISE, cudagraph_capture_sizes=cudagraph_capture_sizes
+                )
+            else:
+                logger.warning(f"cudagraph_capture_sizes must be a list, but got {cudagraph_capture_sizes}")
+
+        VLLM_ENABLE_GRAPH_MODE = int(os.environ.get("VLLM_ENABLE_GRAPH_MODE", "0"))
+        self.inference_engine = LLM(
+            model=model_path,
+            # NPU-ADAPTATION: Enable inference EP and disable sleep mode.
+            enable_sleep_mode=False,
+            enable_expert_parallel=True,
+            # NPU-ADAPTATION END
+            tensor_parallel_size=tensor_parallel_size,
+            distributed_executor_backend="external_launcher",
+            dtype=config.dtype,
+            enforce_eager=config.enforce_eager,
+            gpu_memory_utilization=config.gpu_memory_utilization,
+            disable_custom_all_reduce=True,
+            skip_tokenizer_init=False,
+            max_model_len=max_model_len,
+            max_num_seqs=config.max_num_seqs,
+            load_format=load_format,
+            disable_log_stats=config.disable_log_stats,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=config.enable_chunked_prefill,
+            enable_prefix_caching=False,
+            trust_remote_code=trust_remote_code,
+            seed=config.get("seed", 0),
+            # NPU-ADAPTATION: Enable graph mode and configure the parameters.
+            additional_config={
+                "torchair_graph_config": {
+                    "enabled": VLLM_ENABLE_GRAPH_MODE,
+                    "use_cached_graph": False,
+                    "graph_batch_sizes_init": False,
+                    "graph_batch_sizes": [config.max_num_seqs],
+                    "enable_multistream_mla": False,
+                    "enable_multistream_moe": False,
+                    "enable_view_optimize": False,
+                    "enable_kv_nz": False,
+                    "enable_frozen_parameter": False,
+                },
+                "ascend_scheduler_config": {
+                    "enabled": True,
+                },
+                "refresh": True,
+            },
+            # NPU-ADAPTATION END
+            **compilation_config,
+            **self.lora_kwargs,
+            **engine_kwargs,
+        )
+        # NPU-ADAPTATION: Weight onload and offload, and initialization configurations such as kv_cache.
+        self.model = self.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.get_model()
+        self.kv_cache_configs = None
+        self.cpu_model = {}
+        self.gpu_buffers = None
+        for name, params in self.model.named_parameters():
+            self.cpu_model[name] = torch.empty_like(params, device="cpu")
+        # NPU-ADAPTATION END
+
+        kwargs = dict(
+            n=1,
+            logprobs=0,  # can be set to 0 and let the actor recompute
+            max_tokens=config.response_length,
+            repetition_penalty=config.get("repetition_penalty", 1.0),
+        )
+
+        kwargs["detokenize"] = False
+
+        # support adding any sampling params from the config file
+        for k in config.keys():
+            if hasattr(SamplingParams(), str(k)) and k != "seed":
+                kwargs[k] = config.get(k)
+        kwargs["n"] = 1  # already repeated in ray_trainer
+        logger.info(f"vllm sampling kwargs: {kwargs}")
+        self.sampling_params = SamplingParams(**kwargs)
+
+        self.pad_token_id = tokenizer.pad_token_id
+
+    # NPU-ADAPTATION: Weight onload and offload, kv_cache init and free functions
+    # NOTE: Due to potentially incomplete memory offloading during sleep operations for vLLM on NPUs, we add
+    # patches to manually handle the off/on loading of the rollout model and KV cache on NPUs.
+    def init_cache_engine(self):
+        if os.environ["VLLM_USE_V1"] == "1":
+            worker = self.inference_engine.llm_engine.model_executor.driver_worker.worker
+            if not worker.model_runner.kv_caches:
+                # v1 uses an explicit initialization method
+                self.inference_engine.llm_engine.engine_core.engine_core.model_executor.initialize_from_config(
+                    self.inference_engine.llm_engine.engine_core.engine_core.kv_cache_configs
+                )
+                self.inference_engine.llm_engine.reset_prefix_cache()
+        else:
+            if self.inference_engine.llm_engine.model_executor.driver_worker.worker.cache_engine is None:
+                self.inference_engine.llm_engine.model_executor.driver_worker.worker._init_cache_engine()
+
+    def onload_model_weights(self):
+        self.gpu_buffers = {}
+        for name, param in self.model.named_parameters():
+            self.gpu_buffers[name] = torch.empty_like(param, device=get_device_name())
+        for name, param in self.model.named_parameters():
+            param.data = self.gpu_buffers[name]
+
+    def offload_model_weights(self):
+        for name, params in self.model.named_parameters():
+            params.data = self.cpu_model[name]
+        if hasattr(self.model.model.layers[0].self_attn, "mla_attn"):
+            for i in range(self.model.model.start_layer, self.model.model.end_layer):
+                mla = self.model.model.layers[i].self_attn.mla_attn.impl
+                if hasattr(mla, "w_kc"):
+                    mla.w_kc = None
+                    mla.w_vc = None
+                if hasattr(mla, "W_UV"):
+                    mla.W_UV = None
+                    mla.W_UK_T = None
+
+        self.gpu_buffers = None
+        aggressive_empty_cache()
+
+    def free_cache_engine(self):
+        if os.environ["VLLM_USE_V1"] == "1":
+            worker = self.inference_engine.llm_engine.model_executor.driver_worker.worker
+            ctx = worker.model_runner.vllm_config.compilation_config.static_forward_context
+        else:
+            compilation_config = self.inference_engine.llm_engine.model_executor.driver_worker.worker.compilation_config
+            ctx = compilation_config.static_forward_context
+        from vllm.attention import AttentionType
+
+        layer_need_kv_cache = []
+        for layer_name in ctx:
+            if hasattr(ctx[layer_name], "attn_type") and ctx[layer_name].attn_type in (
+                AttentionType.DECODER,
+                AttentionType.ENCODER_DECODER,
+            ):
+                layer_need_kv_cache.append(layer_name)
+
+        pipeline_parallel_size = self.inference_engine.llm_engine.vllm_config.parallel_config.pipeline_parallel_size
+        for layer_name in layer_need_kv_cache:
+            kv_cache = []
+            for _ in range(pipeline_parallel_size):
+                kv_cache.append(torch.tensor([]))
+            ctx[layer_name].kv_cache = kv_cache
+        if os.environ["VLLM_USE_V1"] == "1":
+            worker = self.inference_engine.llm_engine.model_executor.driver_worker.worker
+
+            worker.model_runner.kv_caches = []
+        else:
+            self.inference_engine.llm_engine.model_executor.driver_worker.worker.cache_engine = None
+            self.inference_engine.llm_engine.model_executor.driver_worker.worker.gpu_cache = None
+
+        if hasattr(self.model.model.layers[0].self_attn, "attn"):
+            for i in range(self.model.model.start_layer, self.model.model.end_layer):
+                attn_impl = self.model.model.layers[i].self_attn.attn.impl
+                if hasattr(attn_impl, "key_cache"):
+                    attn_impl.key_cache = None
+                    attn_impl.value_cache = None
+
+        aggressive_empty_cache()
+
+    def _process_mla(self, load_weight=False):
+        for i in range(self.model.model.start_layer, self.model.model.end_layer):
+            mla = self.model.model.layers[i].self_attn.mla_attn.impl
+            if hasattr(mla, "w_kc"):
+                mla.w_kc = None
+                mla.w_vc = None
+            if hasattr(mla, "W_UV"):
+                mla.W_UV = None
+                mla.W_UK_T = None
+            if load_weight:
+                mla.process_weights_after_loading(None)
+
+    async def resume(self, tags: list[str]):
+        """Resume rollout weights or kv cache in NPU memory.
+
+        Args:
+            tags: weights or kv_cache.
+        """
+        if not self.config.free_cache_engine:
+            return
+
+        if "weights" in tags:
+            self.onload_model_weights()
+        elif "kv_cache" in tags:
+            self.init_cache_engine()
+
+    async def release(self):
+        """Release weights and kv cache in NPU memory."""
+        if not self.config.free_cache_engine:
+            return
+
+        self.free_cache_engine()
+        self.offload_model_weights()
+
+        if hasattr(self.model.model.layers[0].self_attn, "mla_attn"):
+            self._process_mla()
+
+    async def update_weights(self, weights: Generator[tuple[str, torch.Tensor], None, None], **kwargs):
+        """Update the weights of the rollout model.
+
+        Args:
+            weights: A generator that yields the name of the weight tensor and the tensor itself.
+        """
+        await super().update_weights(weights, **kwargs)
+
+        if hasattr(self.model.model.layers[0].self_attn, "mla_attn"):
+            self._process_mla(load_weight=True)
+
+    # NPU-ADAPTATION END
ICL/DAPO/verl-recipe/rep_exp/README.md ADDED
@@ -0,0 +1,71 @@
+<div align="center">
+
+# Representation-Based Exploration for Language Models: <br> From Test-Time to Post-Training
+
+[📄 arXiv](https://arxiv.org/abs/2510.11686) &nbsp; &nbsp; [🌐 Website](https://rep-exp.github.io) &nbsp; &nbsp; [🐦 Twitter / X ](https://x.com/JensTuyls/status/1978244454617128993)
+
+</div>
+
+## Installation 🔌
+
+Install the following commit of verl:
+```bash
+pip install verl@git+https://github.com/volcengine/verl.git@b9bd00efba253ea90072555c45692054cf703de2
+```
+
+The only other package to install is scikit-learn, which we'll use for applying a sparse projection.
+```bash
+pip install scikit-learn
+```
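For intuition, the sketch below shows what such a sparse projection does: it compresses a batch of hidden states down to the `$SPARSE_DIM` used in the training scripts. The batch size and hidden size here are made up for illustration:

```python
import numpy as np
from sklearn.random_projection import SparseRandomProjection

hidden_states = np.random.randn(256, 3584)  # hypothetical: one hidden vector per sampled response
projector = SparseRandomProjection(n_components=32, random_state=42)  # e.g., SPARSE_DIM=32 as in the MATH/GSM8K runs below
projected = projector.fit_transform(hidden_states)
print(projected.shape)  # (256, 32)
```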
+
+## Running the Experiments 🚀
+
+You can reproduce or extend our experiments by running the following commands:
+
+```bash
+# General format
+sh rep_exp/train_elliptical.sh $TASK $SPARSE_DIM $BETA $SEED
+
+# MATH
+sh rep_exp/train_elliptical.sh math 32 0.01 42
+
+# GSM8K
+sh rep_exp/train_elliptical.sh gsm8k 32 0.01 42
+
+# DAPO-WITH-AIME
+sh rep_exp/train_elliptical.sh dapo-with-aime24 128 0.01 42
+```
+where `$TASK` is the task name, `$SPARSE_DIM` is the sparse dimension, `$BETA` is the beta parameter, and `$SEED` is the seed.
+
+## Evaluation 📊
+Once training is done, you can evaluate the model on the test set in two steps.
+1. Merge the model checkpoint.
+
+   This is necessary because the model checkpoint is saved in multiple shards (depending on the number of GPUs), and we need to merge them into a single checkpoint.
+
+   ```bash
+   sh rep_exp/model_merge.sh /path/to/global_step_X/actor # where X is the global step of the checkpoint with the best pass@1 on dev
+   ```
+
+2. Evaluate the merged model.
+
+   ```bash
+   sh rep_exp/eval.sh $TASK /path/to/global_step_X/actor/hf # where X is the global step of the checkpoint with the best pass@1 on dev
+   ```
+
+The results should be in a folder named `eval` and saved as a JSON file.
+
+## Citation 📝
+
+```bibtex
+@article{tuyls2025representation,
+  title={Representation-Based Exploration for Language Models: From Test-Time to Post-Training},
+  author={Tuyls, Jens and Foster, Dylan J and Krishnamurthy, Akshay and Ash, Jordan T},
+  journal={arXiv preprint arXiv:2510.11686},
+  year={2025}
+}
+```
+
+## Contact 📬
+
+If you have any questions or suggestions, feel free to reach out at [jtuyls@princeton.edu](mailto:jtuyls@princeton.edu).
ICL/DAPO/verl-recipe/rep_exp/eval.sh ADDED
@@ -0,0 +1,83 @@
+TASK=${1} # math, gsm8k, dapo-with-aime24
+
+# Custom model path for evaluation after training
+MODEL_PATH=${2} # /path/to/global_step_X/actor/hf, where X is the global step of the checkpoint with the best pass@1 on dev
+
+# If you want to evaluate the base model before training
+# MODEL_PATH=Qwen/Qwen2.5-7B-Instruct
+
+train_path=$HOME/data/${TASK}/train.parquet
+train_files="['$train_path']"
+CHECKPOINT_SAVE_CONTENTS='["model"]'
+
+if [ ${TASK} == "dapo-with-aime24" ]; then
+    MAX_PROMPT_LENGTH=$((1024 * 2))
+    MAX_RESPONSE_LENGTH=$((1024 * 8))
+    MAX_NUM_BATCHED_TOKENS=$((MAX_PROMPT_LENGTH + MAX_RESPONSE_LENGTH))
+    test_path=$HOME/data/${TASK}/dev.parquet
+else
+    MAX_PROMPT_LENGTH=1024
+    MAX_RESPONSE_LENGTH=1024
+    MAX_NUM_BATCHED_TOKENS=8192
+    test_path=$HOME/data/${TASK}/test.parquet
+fi
+
+test_files="['$test_path']"
+
+# If you're on a cluster with no internet access, set OFFLINE=True
+OFFLINE=False
+
+PYTHONUNBUFFERED=1 WANDB_MODE=disabled TRANSFORMERS_OFFLINE=${OFFLINE} python3 -u -m rep_exp.main_rep_exp \
+    algorithm.adv_estimator=grpo \
+    data.train_files="$train_files" \
+    data.val_files="$test_files" \
+    data.train_batch_size=1024 \
+    data.max_prompt_length=$MAX_PROMPT_LENGTH \
+    data.max_response_length=$MAX_RESPONSE_LENGTH \
+    data.filter_overlong_prompts=True \
+    data.truncation='error' \
+    data.val_batch_size=128 \
+    actor_rollout_ref.model.path="$MODEL_PATH" \
+    actor_rollout_ref.actor.checkpoint.save_contents=$CHECKPOINT_SAVE_CONTENTS \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+    actor_rollout_ref.actor.kl_loss_coef=0.0 \
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.ppo_epochs=1 \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.fsdp_config.param_offload=False \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+    actor_rollout_ref.rollout.mode=sync \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+    actor_rollout_ref.rollout.max_num_batched_tokens=$MAX_NUM_BATCHED_TOKENS \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.45 \
+    actor_rollout_ref.rollout.val_kwargs.n=256 \
+    actor_rollout_ref.rollout.n=8 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=True \
+    reward_model.model.path="$MODEL_PATH" \
+    reward_model.model.use_remove_padding=False \
+    reward_model.model.fsdp_config.param_offload=True \
+    reward_model.micro_batch_size_per_gpu=32 \
+    reward_model.model.input_tokenizer=null \
+    actor_rollout_ref.actor.use_kl_loss=False \
+    algorithm.use_kl_in_reward=False \
+    trainer.critic_warmup=0 \
+    trainer.logger='["console","json_eval"]' \
+    trainer.project_name='rep-exp' \
+    trainer.experiment_name="${TASK}_eval" \
+    trainer.n_gpus_per_node=1 \
+    trainer.nnodes=1 \
+    trainer.save_freq=-1 \
+    trainer.test_freq=1 \
+    trainer.total_epochs=100 \
+    trainer.val_only=True \
+    trainer.resume_mode=disable \
+    trainer.resume_from_path=''
+
+exit 0
ICL/DAPO/verl-recipe/rep_exp/main_rep_exp.py ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Note that we don't combine the main with ray_trainer as ray_trainer is used by other main.
16
+ """
17
+
18
+ import os
19
+ import socket
20
+ import warnings
21
+
22
+ import hydra
23
+ import ray
24
+ from omegaconf import OmegaConf
25
+
26
+ from verl.experimental.dataset.sampler import AbstractSampler
27
+ from verl.trainer.constants_ppo import get_ppo_ray_runtime_env
28
+ from verl.trainer.ppo.reward import load_reward_manager
29
+ from verl.trainer.ppo.utils import need_critic, need_reference_policy
30
+ from verl.utils.config import validate_config
31
+ from verl.utils.device import is_cuda_available
32
+ from verl.utils.import_utils import load_extern_type
33
+
34
+ from .rep_exp_trainer import RayRepExpTrainer
35
+
36
+
37
+ @hydra.main(config_path="config", config_name="rep_exp_trainer", version_base=None)
38
+ def main(config):
39
+ """Main entry point for PPO training with Hydra configuration management.
40
+
41
+ Args:
42
+ config_dict: Hydra configuration dictionary containing training parameters.
43
+ """
44
+ run_ppo(config)
45
+
46
+
47
+ # Define a function to run the PPO-like training process
48
+ def run_ppo(config, task_runner_class=None) -> None:
49
+ """Initialize Ray cluster and run distributed PPO training process.
50
+
51
+ Args:
52
+ config: Training configuration object containing all necessary parameters
53
+ for distributed PPO training including Ray initialization settings,
54
+ model paths, and training hyperparameters.
55
+ task_runner_class: For recipe to change TaskRunner.
56
+ """
57
+ # Check if Ray is not initialized
58
+ if not ray.is_initialized():
59
+ # Initialize Ray with a local cluster configuration
60
+ # Set environment variables in the runtime environment to control tokenizer parallelism,
61
+ # NCCL debug level, VLLM logging level, and allow runtime LoRA updating
62
+ # `num_cpus` specifies the number of CPU cores Ray can use, obtained from the configuration
63
+ default_runtime_env = get_ppo_ray_runtime_env()
64
+ ray_init_kwargs = config.ray_kwargs.get("ray_init", {})
65
+ runtime_env_kwargs = ray_init_kwargs.get("runtime_env", {})
66
+
67
+ if config.transfer_queue.enable:
68
+ # Add runtime environment variables for transfer queue
69
+ runtime_env_vars = runtime_env_kwargs.get("env_vars", {})
70
+ runtime_env_vars["TRANSFER_QUEUE_ENABLE"] = "1"
71
+ runtime_env_kwargs["env_vars"] = runtime_env_vars
72
+
73
+ runtime_env = OmegaConf.merge(default_runtime_env, runtime_env_kwargs)
74
+ ray_init_kwargs = OmegaConf.create({**ray_init_kwargs, "runtime_env": runtime_env})
75
+ print(f"ray init kwargs: {ray_init_kwargs}")
76
+ ray.init(**OmegaConf.to_container(ray_init_kwargs))
77
+
78
+ if task_runner_class is None:
79
+ task_runner_class = ray.remote(num_cpus=1)(TaskRunner) # please make sure main_task is not scheduled on head
80
+
81
+ # Create a remote instance of the TaskRunner class, and
82
+ # Execute the `run` method of the TaskRunner instance remotely and wait for it to complete
83
+ if (
84
+ is_cuda_available
85
+ and config.global_profiler.tool == "nsys"
86
+ and config.global_profiler.get("steps") is not None
87
+ and len(config.global_profiler.get("steps", [])) > 0
88
+ ):
89
+ from verl.utils.import_utils import is_nvtx_available
90
+
91
+ assert is_nvtx_available(), "nvtx is not available in CUDA platform. Please 'pip3 install nvtx'"
92
+ nsight_options = OmegaConf.to_container(
93
+ config.global_profiler.global_tool_config.nsys.controller_nsight_options
94
+ )
95
+ runner = task_runner_class.options(runtime_env={"nsight": nsight_options}).remote()
96
+ else:
97
+ runner = task_runner_class.remote()
98
+ ray.get(runner.run.remote(config))
99
+
100
+ # [Optional] get the path of the timeline trace file from the configuration, default to None
101
+ # This file is used for performance analysis
102
+ timeline_json_file = config.ray_kwargs.get("timeline_json_file", None)
103
+ if timeline_json_file:
104
+ ray.timeline(filename=timeline_json_file)
105
+
106
+
107
+ class TaskRunner:
108
+ """Ray remote class for executing distributed PPO training tasks.
109
+
110
+ This class encapsulates the main training logic and runs as a Ray remote actor
111
+ to enable distributed execution across multiple nodes and GPUs.
112
+
113
+ Attributes:
114
+ role_worker_mapping: Dictionary mapping Role enums to Ray remote worker classes
115
+ mapping: Dictionary mapping Role enums to resource pool IDs for GPU allocation
116
+ """
117
+
118
+ def __init__(self):
119
+ self.role_worker_mapping = {}
120
+ self.mapping = {}
121
+
122
+ def add_actor_rollout_worker(self, config):
123
+ """Add actor rollout worker based on the actor strategy."""
124
+ from verl.single_controller.ray import RayWorkerGroup
125
+ from verl.trainer.ppo.ray_trainer import Role
126
+
127
+ use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto")
128
+
129
+ # use new model engine implementation
130
+ if use_legacy_worker_impl == "disable":
131
+ from verl.workers.engine_workers import ActorRolloutRefWorker
132
+
133
+ actor_rollout_cls = ActorRolloutRefWorker
134
+ ray_worker_group_cls = RayWorkerGroup
135
+ # NOTE: In the new model engine, ref policy and actor rollout live in the same ActorRolloutRefWorker,
136
+ # while in the legacy model engine, ref policy runs in a separate ActorRolloutRefWorker.
137
+ if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
138
+ role = Role.ActorRolloutRef
139
+ else:
140
+ role = Role.ActorRollout
141
+ self.role_worker_mapping[role] = ray.remote(actor_rollout_cls)
142
+ self.mapping[role] = "global_pool"
143
+ return actor_rollout_cls, ray_worker_group_cls
144
+
145
+ if config.actor_rollout_ref.rollout.mode == "sync":
146
+ warnings.warn("spmd rollout mode is deprecated and will be removed in v0.6.2", stacklevel=2)
147
+
148
+ if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}:
149
+ from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker
150
+
151
+ actor_rollout_cls = (
152
+ AsyncActorRolloutRefWorker
153
+ if config.actor_rollout_ref.rollout.mode == "async"
154
+ else ActorRolloutRefWorker
155
+ )
156
+ ray_worker_group_cls = RayWorkerGroup
157
+
158
+ elif config.actor_rollout_ref.actor.strategy == "megatron":
159
+ from verl.workers.megatron_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker
160
+
161
+ actor_rollout_cls = (
162
+ AsyncActorRolloutRefWorker
163
+ if config.actor_rollout_ref.rollout.mode == "async"
164
+ else ActorRolloutRefWorker
165
+ )
166
+ ray_worker_group_cls = RayWorkerGroup
167
+
168
+ else:
169
+ raise NotImplementedError
170
+
171
+ self.role_worker_mapping[Role.ActorRollout] = ray.remote(actor_rollout_cls)
172
+ self.mapping[Role.ActorRollout] = "global_pool"
173
+ return actor_rollout_cls, ray_worker_group_cls
174
+
175
+ def add_critic_worker(self, config):
176
+ """Add critic worker to role mapping."""
177
+ if config.critic.strategy in {"fsdp", "fsdp2"}:
178
+ use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto")
179
+ if use_legacy_worker_impl in ["auto", "enable"]:
180
+ from verl.workers.fsdp_workers import CriticWorker
181
+ elif use_legacy_worker_impl == "disable":
182
+ from verl.workers.roles import CriticWorker
183
+
184
+ print("Using new worker implementation")
185
+ else:
186
+ raise ValueError(f"Invalid use_legacy_worker_impl: {use_legacy_worker_impl}")
187
+
188
+ elif config.critic.strategy == "megatron":
189
+ from verl.workers.megatron_workers import CriticWorker
190
+
191
+ else:
192
+ raise NotImplementedError
193
+
194
+ from verl.trainer.ppo.ray_trainer import Role
195
+
196
+ self.role_worker_mapping[Role.Critic] = ray.remote(CriticWorker)
197
+ self.mapping[Role.Critic] = "global_pool"
198
+
199
+ def init_resource_pool_mgr(self, config):
200
+ """Initialize resource pool manager."""
201
+
202
+ global_pool_id = "global_pool"
203
+ resource_pool_spec = {
204
+ global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
205
+ }
206
+ # TODO: use the new registration method here to support dynamic registration of roles.
207
+ if config.reward_model.enable_resource_pool:
208
+ if config.reward_model.n_gpus_per_node <= 0:
209
+ raise ValueError("config.reward_model.n_gpus_per_node must be greater than 0")
210
+ if config.reward_model.nnodes <= 0:
211
+ raise ValueError("config.reward_model.nnodes must be greater than 0")
212
+
213
+ reward_pool = [config.reward_model.n_gpus_per_node] * config.reward_model.nnodes
214
+ resource_pool_spec["reward_pool"] = reward_pool
215
+
216
+ from verl.trainer.ppo.ray_trainer import ResourcePoolManager
217
+
218
+ resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=self.mapping)
219
+ return resource_pool_manager
220
+
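+ # Worked example of the spec built above: trainer.nnodes=2 with
+ # trainer.n_gpus_per_node=8 gives {"global_pool": [8, 8]}; enabling
+ # reward_model.enable_resource_pool with nnodes=1 and n_gpus_per_node=4
+ # additionally yields {"reward_pool": [4]}.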
221
+ def add_reward_model_worker(self, config):
222
+ """Add reward model worker if enabled."""
223
+ from verl.trainer.ppo.ray_trainer import Role
224
+
225
+ if config.reward_model.enable:
226
+ use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto")
227
+ if use_legacy_worker_impl in ["auto", "enable"]:
228
+ if config.reward_model.strategy in {"fsdp", "fsdp2"}:
229
+ if config.reward_model.elliptical:
230
+ from .workers.elliptical_reward_model_worker import (
231
+ EllipticalRewardModelWorker as RewardModelWorker,
232
+ )
233
+ else:
234
+ from verl.workers.fsdp_workers import RewardModelWorker
235
+ elif config.reward_model.strategy == "megatron":
236
+ from verl.workers.megatron_workers import RewardModelWorker
237
+ else:
238
+ raise NotImplementedError
239
+ elif use_legacy_worker_impl == "disable":
240
+ from verl.workers.roles import RewardModelWorker
241
+
242
+ print("Using new worker implementation")
243
+ else:
244
+ raise ValueError(f"Invalid use_legacy_worker_impl: {use_legacy_worker_impl}")
245
+
246
+ self.role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
247
+ if config.reward_model.enable_resource_pool:
248
+ self.mapping[Role.RewardModel] = "reward_pool"
249
+ else:
250
+ self.mapping[Role.RewardModel] = "global_pool"
251
+
252
+ def add_ref_policy_worker(self, config, ref_policy_cls):
253
+ """Add reference policy worker if KL loss or KL reward is used."""
254
+ from verl.trainer.ppo.ray_trainer import Role
255
+
256
+ # Ref policy has been fused into ActorRolloutRefWorker in the new model engine,
257
+ # so we don't need to add a separate ref policy worker group.
258
+ use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto")
259
+ if use_legacy_worker_impl == "disable":
260
+ return
261
+
262
+ if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
263
+ self.role_worker_mapping[Role.RefPolicy] = ray.remote(ref_policy_cls)
264
+ self.mapping[Role.RefPolicy] = "global_pool"
265
+
266
+ def run(self, config):
267
+ """Execute the main PPO training workflow.
268
+
269
+ This method sets up the distributed training environment, initializes
270
+ workers, datasets, and reward functions, then starts the training process.
271
+
272
+ Args:
273
+ config: Training configuration object containing all parameters needed
274
+ for setting up and running the PPO training process.
275
+ """
276
+ # Print the initial configuration. `resolve=True` will evaluate symbolic values.
277
+ from pprint import pprint
278
+
279
+ from omegaconf import OmegaConf
280
+
281
+ from verl.utils.fs import copy_to_local
282
+
283
+ print(f"TaskRunner hostname: {socket.gethostname()}, PID: {os.getpid()}")
284
+ pprint(OmegaConf.to_container(config, resolve=True))
285
+ OmegaConf.resolve(config)
286
+
287
+ actor_rollout_cls, ray_worker_group_cls = self.add_actor_rollout_worker(config)
288
+ self.add_critic_worker(config)
289
+
290
+ # We should adopt a multi-source reward function here:
291
+ # - for rule-based rm, we directly call a reward score
292
+ # - for model-based rm, we call a model
293
+ # - for code related prompt, we send to a sandbox if there are test cases
294
+ # finally, we combine all the rewards together
295
+ # The reward type depends on the tag of the data
296
+ self.add_reward_model_worker(config)
297
+
298
+ # Add a reference policy worker if KL loss or KL reward is used.
299
+ self.add_ref_policy_worker(config, actor_rollout_cls)
300
+
301
+ # validate config
302
+ validate_config(
303
+ config=config,
304
+ use_reference_policy=need_reference_policy(self.role_worker_mapping),
305
+ use_critic=need_critic(config),
306
+ )
307
+
308
+ # Download the checkpoint from HDFS to the local machine.
309
+ # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on
310
+ local_path = copy_to_local(
311
+ config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False)
312
+ )
313
+
314
+ # Instantiate the tokenizer and processor.
315
+ from verl.utils import hf_processor, hf_tokenizer
316
+
317
+ trust_remote_code = config.data.get("trust_remote_code", False)
318
+ tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
319
+ # Used for multimodal LLMs; may be None
320
+ processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
321
+
322
+ # Make sure the elliptical reward manager is registered
323
+ from .reward_manager.elliptical_reward_manager import EllipticalRewardManager # noqa: F401
324
+
325
+ # Load the reward manager for training and validation.
326
+ reward_manager_name = config.reward_model.get("reward_manager", "naive")
327
+ reward_fn = load_reward_manager(
328
+ config,
329
+ tokenizer,
330
+ num_examine=0,
331
+ **config.reward_model.get("reward_kwargs", {}).get(reward_manager_name, {}),
332
+ )
333
+ val_reward_fn = load_reward_manager(
334
+ config,
335
+ tokenizer,
336
+ num_examine=1,
337
+ **config.reward_model.get("reward_kwargs", {}).get(reward_manager_name, {}),
338
+ )
339
+
340
+ resource_pool_manager = self.init_resource_pool_mgr(config)
341
+
342
+ from verl.utils.dataset.rl_dataset import collate_fn
343
+
344
+ # Create training and validation datasets.
345
+ train_dataset = create_rl_dataset(
346
+ config.data.train_files,
347
+ config.data,
348
+ tokenizer,
349
+ processor,
350
+ is_train=True,
351
+ max_samples=config.data.get("train_max_samples", -1),
352
+ )
353
+ val_dataset = create_rl_dataset(
354
+ config.data.val_files,
355
+ config.data,
356
+ tokenizer,
357
+ processor,
358
+ is_train=False,
359
+ max_samples=config.data.get("val_max_samples", -1),
360
+ )
361
+ train_sampler = create_rl_sampler(config.data, train_dataset)
362
+
363
+ # Initialize the PPO trainer.
364
+ trainer = RayRepExpTrainer(
365
+ config=config,
366
+ tokenizer=tokenizer,
367
+ processor=processor,
368
+ role_worker_mapping=self.role_worker_mapping,
369
+ resource_pool_manager=resource_pool_manager,
370
+ ray_worker_group_cls=ray_worker_group_cls,
371
+ reward_fn=reward_fn,
372
+ val_reward_fn=val_reward_fn,
373
+ train_dataset=train_dataset,
374
+ val_dataset=val_dataset,
375
+ collate_fn=collate_fn,
376
+ train_sampler=train_sampler,
377
+ )
378
+ # Initialize the workers of the trainer.
379
+ trainer.init_workers()
380
+
381
+ # Start the training process.
382
+ trainer.fit()
383
+
384
+
385
+ def create_rl_dataset(data_paths, data_config, tokenizer, processor, is_train=True, max_samples: int = -1):
386
+ """Create a dataset.
387
+
388
+ Arguments:
389
+ data_paths: List of paths to data files.
390
+ data_config: The data config.
391
+ tokenizer (Tokenizer): The tokenizer.
392
+ processor (Processor): The processor.
+ is_train: Whether this dataset is the training split.
+ max_samples: Maximum number of samples to load (-1 loads all).
393
+
394
+ Returns:
395
+ dataset (Dataset): The dataset.
396
+ """
397
+ from torch.utils.data import Dataset
398
+
399
+ from verl.utils.dataset.rl_dataset import RLHFDataset
400
+
401
+ # Check if a custom dataset class is specified in the data configuration
402
+ # and if the path to the custom class is provided
403
+ if "custom_cls" in data_config and data_config.custom_cls.get("path", None) is not None:
404
+ # Dynamically load the custom dataset class
405
+ dataset_cls = load_extern_type(data_config.custom_cls.path, data_config.custom_cls.name)
406
+ # Verify that the custom dataset class inherits from torch.utils.data.Dataset
407
+ if not issubclass(dataset_cls, Dataset):
408
+ raise TypeError(
409
+ f"The custom dataset class '{data_config.custom_cls.name}' from "
410
+ f"'{data_config.custom_cls.path}' must inherit from torch.utils.data.Dataset"
411
+ )
412
+ elif "datagen" in data_config and data_config.datagen.get("path", None) is not None and is_train:
413
+ # If a data generation strategy is specified, use the DynamicGenDataset class
414
+ from verl.utils.dataset.dynamicgen_dataset import DynamicGenDataset
415
+
416
+ dataset_cls = DynamicGenDataset
417
+ print("Using DynamicGenDataset for data generation.")
418
+ else:
419
+ # Use the default RLHFDataset class if no custom class is specified
420
+ dataset_cls = RLHFDataset
421
+ print(f"Using dataset class: {dataset_cls.__name__}")
422
+
423
+ # Instantiate the dataset using the determined dataset class
424
+ dataset = dataset_cls(
425
+ data_files=data_paths,
426
+ tokenizer=tokenizer,
427
+ processor=processor,
428
+ config=data_config,
429
+ max_samples=max_samples,
430
+ )
431
+
432
+ return dataset
433
+
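+ # Illustrative data config fragment (YAML; path and class name are
+ # placeholders) that routes this factory to a custom dataset class:
+ #
+ #     data:
+ #       custom_cls:
+ #         path: recipe/rep_exp/my_dataset.py
+ #         name: MyRLHFDataset   # must subclass torch.utils.data.Dataset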
434
+
435
+ def create_rl_sampler(data_config, dataset):
436
+ """Create a sampler for the dataset.
437
+
438
+ Arguments:
439
+ data_config: The data config.
440
+ dataset (Dataset): The dataset.
441
+
442
+ Returns:
443
+ sampler (Sampler): The sampler.
444
+ """
445
+ import torch
446
+ from torch.utils.data import SequentialSampler
447
+
448
+ # torch.utils.data.RandomSampler could not recover properly
449
+ from torchdata.stateful_dataloader.sampler import RandomSampler
450
+
451
+ if data_config.sampler is not None and data_config.sampler.get("class_path", None) is not None:
452
+ curriculum_class = load_extern_type(
453
+ data_config.sampler.class_path,
454
+ data_config.sampler.class_name,
455
+ )
456
+ sampler = curriculum_class(
457
+ data_source=dataset,
458
+ data_config=data_config,
459
+ )
460
+ assert isinstance(sampler, AbstractSampler)
461
+ assert data_config.get("dataloader_num_workers", 8) == 0, (
462
+ "If using curriculum, num_workers must be 0 to prevent data caching. "
463
+ "If the dataloader caches data before the batch is done the "
464
+ "curriculum sampler won't have the opportunity to reorder it. "
465
+ )
466
+
467
+ # Use a sampler to facilitate checkpoint resumption.
468
+ # If shuffling is enabled in the data configuration, create a random sampler.
469
+ elif data_config.shuffle:
470
+ train_dataloader_generator = torch.Generator()
471
+ seed = data_config.get("seed")
472
+ if seed is not None:
473
+ train_dataloader_generator.manual_seed(seed)
474
+ sampler = RandomSampler(data_source=dataset, generator=train_dataloader_generator)
475
+ else:
476
+ # If shuffling is disabled, use a sequential sampler to iterate through the dataset in order.
477
+ sampler = SequentialSampler(data_source=dataset)
478
+
479
+ return sampler
480
+
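+ # Illustrative curriculum sampler config (placeholders): the class loaded from
+ # class_path/class_name must subclass AbstractSampler, and
+ # dataloader_num_workers must be 0, as asserted above:
+ #
+ #     data:
+ #       sampler:
+ #         class_path: recipe/rep_exp/my_sampler.py
+ #         class_name: MyCurriculumSampler
+ #       dataloader_num_workers: 0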
481
+
482
+ if __name__ == "__main__":
483
+ main()
ICL/DAPO/verl-recipe/rep_exp/metric_utils.py ADDED
@@ -0,0 +1,382 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Metrics related to the RepExp trainer.
16
+ """
17
+
18
+ from collections import defaultdict
19
+ from functools import partial
20
+ from typing import Any
21
+
22
+ import numpy as np
23
+ import torch
24
+
25
+ from verl import DataProto
26
+ from verl.trainer.ppo.metric_utils import _compute_response_info, bootstrap_metric, calc_maj_val
27
+
28
+
29
+ def _compute_three_case_stats(data: DataProto, extrinsic_reward_tensor: torch.Tensor) -> dict:
30
+ """
31
+ Compute the fraction of samples that have no rollouts correct, some rollouts correct, and all rollouts correct.
32
+
33
+ Args:
34
+ data (DataProto): The data proto containing the batch data.
35
+ extrinsic_reward_tensor (torch.Tensor): The extrinsic reward tensor.
36
+
37
+ Returns:
38
+ dict[str, float]: A dictionary containing the fraction of samples that have no rollouts correct,
39
+ some rollouts correct, and all rollouts correct.
40
+ """
41
+ no_rollouts_correct = 0
42
+ some_rollouts_correct = 0
43
+ all_rollouts_correct = 0
44
+
45
+ visited_uids = set()
46
+ for uid in data.non_tensor_batch["uid"]:
47
+ if uid in visited_uids:
48
+ continue
49
+
50
+ visited_uids.add(uid)
51
+ mask = torch.from_numpy(data.non_tensor_batch["uid"] == uid)
52
+
53
+ # Split into three cases
54
+ if extrinsic_reward_tensor[mask].sum() == 0:
55
+ no_rollouts_correct += 1
56
+ elif extrinsic_reward_tensor[mask].sum() == mask.sum():
57
+ all_rollouts_correct += 1
58
+ elif extrinsic_reward_tensor[mask].sum() > 0 and extrinsic_reward_tensor[mask].sum() < mask.sum():
59
+ some_rollouts_correct += 1
60
+ else:
61
+ raise ValueError(f"Invalid extrinsic reward tensor: {extrinsic_reward_tensor[mask].sum()}")
62
+
63
+ # Sanity checks
64
+ assert len(visited_uids) == no_rollouts_correct + some_rollouts_correct + all_rollouts_correct
65
+
66
+ return {
67
+ "no_rollouts_correct_frac": no_rollouts_correct / len(visited_uids),
68
+ "some_rollouts_correct_frac": some_rollouts_correct / len(visited_uids),
69
+ "all_rollouts_correct_frac": all_rollouts_correct / len(visited_uids),
70
+ }
71
+
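+ # Worked example: with 2 prompts and 4 rollouts each, extrinsic rewards
+ # [1, 1, 1, 1] for the first uid and [0, 1, 0, 0] for the second give
+ # {"no_rollouts_correct_frac": 0.0, "some_rollouts_correct_frac": 0.5,
+ # "all_rollouts_correct_frac": 0.5}.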
72
+
73
+ def compute_data_metrics(batch: DataProto, use_critic: bool = True, elliptical: bool = False) -> dict[str, Any]:
74
+ """
75
+ Computes various metrics from a batch of data for PPO training.
76
+
77
+ This function calculates metrics related to scores, rewards, advantages, returns, values,
78
+ and sequence lengths from a batch of data. It provides statistical information (mean, max, min)
79
+ for each metric category.
80
+
81
+ Args:
82
+ batch: A DataProto object containing batch data with token-level scores, rewards, advantages, etc.
83
+ use_critic: Whether to include critic-specific metrics. Defaults to True.
84
+ elliptical: Whether to include elliptical-specific metrics. Defaults to False.
85
+
86
+ Returns:
87
+ A dictionary of metrics including:
88
+ - critic/score/mean, max, min: Statistics about sequence scores
89
+ - critic/rewards/mean, max, min: Statistics about sequence rewards
90
+ - critic/advantages/mean, max, min: Statistics about advantages
91
+ - critic/returns/mean, max, min: Statistics about returns
92
+ - critic/values/mean, max, min: Statistics about critic values (if use_critic=True)
93
+ - critic/vf_explained_var: Explained variance of the value function (if use_critic=True)
94
+ - response_length/mean, max, min, clip_ratio: Statistics about response lengths
95
+ - prompt_length/mean, max, min, clip_ratio: Statistics about prompt lengths
96
+ - num_turns/mean, max, min: Statistics about the number of multi-turn conversations
97
+ """
98
+ sequence_score = batch.batch["token_level_scores"].sum(-1)
99
+ sequence_reward = batch.batch["token_level_rewards"].sum(-1)
100
+
101
+ if elliptical:
102
+ sequence_intrinsic_reward = batch.non_tensor_batch["intrinsic_reward"].sum(-1)
103
+ sequence_beta_scaled_intrinsic_reward = batch.non_tensor_batch["beta_scaled_intrinsic_reward"].sum(-1)
104
+ sequence_extrinsic_reward = batch.non_tensor_batch["extrinsic_reward"].sum(-1)
105
+ sequence_total_reward = batch.non_tensor_batch["total_reward"].sum(-1)
106
+ sequence_raw_bonuses = batch.non_tensor_batch["raw_bonuses"].sum(-1)
107
+
108
+ three_case_stats = _compute_three_case_stats(batch, batch.non_tensor_batch["extrinsic_reward"])
109
+
110
+ advantages = batch.batch["advantages"]
111
+ returns = batch.batch["returns"]
112
+
113
+ max_response_length = batch.batch["responses"].shape[-1]
114
+
115
+ prompt_mask = batch.batch["attention_mask"][:, :-max_response_length].bool()
116
+ response_mask = batch.batch["response_mask"].bool()
117
+
118
+ max_prompt_length = prompt_mask.size(-1)
119
+
120
+ response_info = _compute_response_info(batch)
121
+ prompt_length = response_info["prompt_length"]
122
+ response_length = response_info["response_length"]
123
+
124
+ aborted_mask = (response_length == 0).bool()
125
+ non_aborted_mask = ~aborted_mask
126
+
127
+ non_aborted_sequence_score = sequence_score[non_aborted_mask]
128
+ non_aborted_sequence_reward = sequence_reward[non_aborted_mask]
129
+
130
+ score_mean = torch.mean(non_aborted_sequence_score).detach().item()
131
+ score_max = torch.max(non_aborted_sequence_score).detach().item()
132
+ score_min = torch.min(non_aborted_sequence_score).detach().item()
133
+
134
+ reward_mean = torch.mean(non_aborted_sequence_reward).detach().item()
135
+ reward_max = torch.max(non_aborted_sequence_reward).detach().item()
136
+ reward_min = torch.min(non_aborted_sequence_reward).detach().item()
137
+
138
+ valid_adv = torch.masked_select(advantages, response_mask)
139
+ valid_returns = torch.masked_select(returns, response_mask)
140
+
141
+ if use_critic:
142
+ values = batch.batch["values"]
143
+ valid_values = torch.masked_select(values, response_mask)
144
+ return_diff_var = torch.var(valid_returns - valid_values)
145
+ return_var = torch.var(valid_returns)
146
+
147
+ # Aborted samples and non-aborted response length statistics
148
+ # response_length_non_aborted/*: statistics computed on non-aborted samples only
149
+ aborted_ratio = torch.mean(aborted_mask.float()).detach().item()
150
+
151
+ non_aborted_response_length = response_length[non_aborted_mask]
152
+ if non_aborted_response_length.numel() > 0:
153
+ non_aborted_response_length_mean = torch.mean(non_aborted_response_length).detach().item()
154
+ non_aborted_response_length_max = torch.max(non_aborted_response_length).detach().item()
155
+ non_aborted_response_length_min = torch.min(non_aborted_response_length).detach().item()
156
+ non_aborted_response_length_clip_ratio = (
157
+ torch.mean(torch.eq(non_aborted_response_length, max_response_length).float()).detach().item()
158
+ )
159
+ else:
160
+ raise ValueError("All samples are aborted, this should not happen.")
161
+
162
+ metrics = {
163
+ # score
164
+ "critic/score/mean": score_mean,
165
+ "critic/score/max": score_max,
166
+ "critic/score/min": score_min,
167
+ # reward
168
+ "critic/rewards/mean": reward_mean,
169
+ "critic/rewards/max": reward_max,
170
+ "critic/rewards/min": reward_min,
171
+ # adv
172
+ "critic/advantages/mean": torch.mean(valid_adv).detach().item(),
173
+ "critic/advantages/max": torch.max(valid_adv).detach().item(),
174
+ "critic/advantages/min": torch.min(valid_adv).detach().item(),
175
+ # returns
176
+ "critic/returns/mean": torch.mean(valid_returns).detach().item(),
177
+ "critic/returns/max": torch.max(valid_returns).detach().item(),
178
+ "critic/returns/min": torch.min(valid_returns).detach().item(),
179
+ **(
180
+ {
181
+ # values
182
+ "critic/values/mean": torch.mean(valid_values).detach().item(),
183
+ "critic/values/max": torch.max(valid_values).detach().item(),
184
+ "critic/values/min": torch.min(valid_values).detach().item(),
185
+ # vf explained var
186
+ "critic/vf_explained_var": (1.0 - return_diff_var / (return_var + 1e-5)).detach().item(),
187
+ }
188
+ if use_critic
189
+ else {}
190
+ ),
191
+ **(
192
+ {
193
+ # raw bonuses
194
+ "critic/raw_bonuses/mean": np.mean(sequence_raw_bonuses).item(),
195
+ "critic/raw_bonuses/max": np.max(sequence_raw_bonuses).item(),
196
+ "critic/raw_bonuses/min": np.min(sequence_raw_bonuses).item(),
197
+ "critic/raw_bonuses/std": np.std(sequence_raw_bonuses).item(),
198
+ # intrinsic_reward
199
+ "critic/intrinsic_reward/mean": np.mean(sequence_intrinsic_reward).item(),
200
+ "critic/intrinsic_reward/max": np.max(sequence_intrinsic_reward).item(),
201
+ "critic/intrinsic_reward/min": np.min(sequence_intrinsic_reward).item(),
202
+ "critic/intrinsic_reward/std": np.std(sequence_intrinsic_reward).item(),
203
+ # beta_scaled_intrinsic_reward
204
+ "critic/beta_scaled_intrinsic_reward/mean": np.mean(sequence_beta_scaled_intrinsic_reward).item(),
205
+ "critic/beta_scaled_intrinsic_reward/max": np.max(sequence_beta_scaled_intrinsic_reward).item(),
206
+ "critic/beta_scaled_intrinsic_reward/min": np.min(sequence_beta_scaled_intrinsic_reward).item(),
207
+ "critic/beta_scaled_intrinsic_reward/std": np.std(sequence_beta_scaled_intrinsic_reward).item(),
208
+ # extrinsic_reward
209
+ "critic/extrinsic_reward/mean": np.mean(sequence_extrinsic_reward).item(),
210
+ "critic/extrinsic_reward/max": np.max(sequence_extrinsic_reward).item(),
211
+ "critic/extrinsic_reward/min": np.min(sequence_extrinsic_reward).item(),
212
+ "critic/extrinsic_reward/std": np.std(sequence_extrinsic_reward).item(),
213
+ # three_case_stats
214
+ "critic/extrinsic_reward/no_rollouts_correct_frac": three_case_stats["no_rollouts_correct_frac"],
215
+ "critic/extrinsic_reward/some_rollouts_correct_frac": three_case_stats["some_rollouts_correct_frac"],
216
+ "critic/extrinsic_reward/all_rollouts_correct_frac": three_case_stats["all_rollouts_correct_frac"],
217
+ # total_reward
218
+ "critic/total_reward/mean": np.mean(sequence_total_reward).item(),
219
+ "critic/total_reward/max": np.max(sequence_total_reward).item(),
220
+ "critic/total_reward/min": np.min(sequence_total_reward).item(),
221
+ "critic/total_reward/std": np.std(sequence_total_reward).item(),
222
+ }
223
+ if elliptical
224
+ else {}
225
+ ),
226
+ # response length
227
+ "response_length/mean": torch.mean(response_length).detach().item(),
228
+ "response_length/max": torch.max(response_length).detach().item(),
229
+ "response_length/min": torch.min(response_length).detach().item(),
230
+ "response_length/clip_ratio": torch.mean(torch.eq(response_length, max_response_length).float())
231
+ .detach()
232
+ .item(),
233
+ # response length (non-aborted only)
234
+ # These statistics exclude aborted samples to avoid skew from zeros
235
+ "response_length_non_aborted/mean": non_aborted_response_length_mean,
236
+ "response_length_non_aborted/max": non_aborted_response_length_max,
237
+ "response_length_non_aborted/min": non_aborted_response_length_min,
238
+ "response_length_non_aborted/clip_ratio": non_aborted_response_length_clip_ratio,
239
+ # aborted ratio
240
+ # Fraction of samples whose response length is zero
241
+ "response/aborted_ratio": aborted_ratio,
242
+ # prompt length
243
+ "prompt_length/mean": torch.mean(prompt_length).detach().item(),
244
+ "prompt_length/max": torch.max(prompt_length).detach().item(),
245
+ "prompt_length/min": torch.min(prompt_length).detach().item(),
246
+ "prompt_length/clip_ratio": torch.mean(torch.eq(prompt_length, max_prompt_length).float()).detach().item(),
247
+ }
248
+
249
+ # multi-turn conversation
250
+ if "__num_turns__" in batch.non_tensor_batch:
251
+ num_turns = batch.non_tensor_batch["__num_turns__"]
252
+ metrics["num_turns/min"] = num_turns.min()
253
+ metrics["num_turns/max"] = num_turns.max()
254
+ metrics["num_turns/mean"] = num_turns.mean()
255
+
256
+ if "tool_call_counts" in batch.non_tensor_batch:
257
+ tool_call_counts = batch.non_tensor_batch["tool_call_counts"]
258
+ metrics["tool_call_counts/min"] = tool_call_counts.min()
259
+ metrics["tool_call_counts/max"] = tool_call_counts.max()
260
+ metrics["tool_call_counts/mean"] = tool_call_counts.mean()
261
+
262
+ return metrics
263
+
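+ # Note: the elliptical-only statistics above assume the non-tensor batch
+ # carries "raw_bonuses", "intrinsic_reward", "beta_scaled_intrinsic_reward",
+ # "extrinsic_reward", and "total_reward" arrays populated by the reward manager.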
264
+
265
+ def comb_estimator(n: int, c: int, k: int) -> float:
266
+ """Calculates 1 - comb(n - c, k) / comb(n, k)."""
267
+ if n - c < k:
268
+ return 1.0
269
+ return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
270
+
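+ # Worked example: n=4 rollouts with c=2 correct at k=2 gives
+ # 1 - comb(2, 2) / comb(4, 2) = 1 - 1/6 = 5/6; the product form above computes
+ # the same value as 1 - (1 - 2/3) * (1 - 2/4) without large binomials.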
271
+
272
+ def process_validation_metrics(
273
+ data_sources: list[str], sample_uids: list[str], infos_dict: dict[str, list[Any]], seed: int = 42
274
+ ) -> dict[str, dict[str, dict[str, float]]]:
275
+ """
276
+ Process validation metrics into a structured format with statistical analysis.
277
+
278
+ This function organizes validation metrics by data source and prompt, then computes
279
+ various statistical measures including means, standard deviations, best/worst values,
280
+ and majority voting results. It also performs bootstrap sampling to estimate statistics
281
+ for different sample sizes.
282
+
283
+ Args:
284
+ data_sources: List of data source identifiers for each sample.
285
+ sample_uids: List of sample uids corresponding to each sample.
286
+ infos_dict: Dictionary mapping variable names to lists of values for each sample.
287
+ seed: Random seed for bootstrap sampling. Defaults to 42.
288
+
289
+ Returns:
290
+ A nested dictionary with the structure:
291
+ {
292
+ data_source: {
293
+ variable_name: {
294
+ metric_name: value
295
+ }
296
+ }
297
+ }
298
+
299
+ Where metric_name includes:
300
+ - "mean@N": Mean value across N samples
301
+ - "std@N": Standard deviation across N samples
302
+ - "best@N/mean": Mean of the best values in bootstrap samples of size N
303
+ - "best@N/std": Standard deviation of the best values in bootstrap samples
304
+ - "worst@N/mean": Mean of the worst values in bootstrap samples
305
+ - "worst@N/std": Standard deviation of the worst values in bootstrap samples
306
+ - "maj@N/mean": Mean of majority voting results in bootstrap samples (if "pred" exists)
307
+ - "maj@N/std": Standard deviation of majority voting results (if "pred" exists)
308
+
309
+ Example:
310
+ >>> data_sources = ["source1", "source1", "source2"]
311
+ >>> sample_uids = ["uid1", "uid1", "uid2"]
312
+ >>> infos_dict = {"score": [0.8, 0.9, 0.7], "pred": ["A", "A", "B"]}
313
+ >>> result = process_validation_metrics(data_sources, sample_uids, infos_dict)
314
+ >>> # result will contain statistics for each data source and variable
315
+ """
316
+ # Group metrics by data source, prompt and variable
317
+ data_src2uid2var2vals = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
318
+ for sample_idx, data_source in enumerate(data_sources):
319
+ uid = sample_uids[sample_idx]
320
+ var2vals = data_src2uid2var2vals[data_source][uid]
321
+ for var_name, var_vals in infos_dict.items():
322
+ var2vals[var_name].append(var_vals[sample_idx])
323
+
324
+ # Calculate metrics for each group
325
+ data_src2uid2var2metric = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
326
+ for data_source, uid2var2vals in data_src2uid2var2vals.items():
327
+ for uid, var2vals in uid2var2vals.items():
328
+ for var_name, var_vals in var2vals.items():
329
+ if isinstance(var_vals[0], str):
330
+ continue
331
+
332
+ metric = {}
333
+ n_resps = len(var_vals)
334
+ metric[f"mean@{n_resps}"] = np.mean(var_vals)
335
+ metric["pass@1/mean"] = comb_estimator(n_resps, np.sum(var_vals), 1)
336
+
337
+ if n_resps > 1:
338
+ metric[f"std@{n_resps}"] = np.std(var_vals)
339
+
340
+ ns = []
341
+ n = 2
342
+ while n < n_resps:
343
+ ns.append(n)
344
+ n *= 2
345
+ ns.append(n_resps)
346
+
347
+ for n in ns:
348
+ # [(bon_mean, bon_std), (won_mean, won_std)] = bootstrap_metric(
349
+ # data=var_vals, subset_size=n, reduce_fns=[np.max, np.min], seed=seed
350
+ # )
351
+ # metric[f"best@{n}/mean"], metric[f"best@{n}/std"] = bon_mean, bon_std
352
+ # metric[f"worst@{n}/mean"], metric[f"worst@{n}/std"] = won_mean, won_std
353
+ metric[f"pass@{n}/mean"] = comb_estimator(n_resps, np.sum(var_vals), n)
354
+ if var2vals.get("pred", None) is not None:
355
+ vote_data = [
356
+ {"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"], strict=True)
357
+ ]
358
+ [(maj_n_mean, maj_n_std)] = bootstrap_metric(
359
+ data=vote_data,
360
+ subset_size=n,
361
+ reduce_fns=[partial(calc_maj_val, vote_key="pred", val_key="val")],
362
+ seed=seed,
363
+ )
364
+ metric[f"maj@{n}/mean"], metric[f"maj@{n}/std"] = maj_n_mean, maj_n_std
365
+
366
+ data_src2uid2var2metric[data_source][uid][var_name] = metric
367
+
368
+ # Aggregate metrics across uids
369
+ data_src2var2metric2uid_vals = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
370
+ for data_source, uid2var2metric in data_src2uid2var2metric.items():
371
+ for uid, var2metric in uid2var2metric.items():
372
+ for var_name, metric in var2metric.items():
373
+ for metric_name, metric_val in metric.items():
374
+ data_src2var2metric2uid_vals[data_source][var_name][metric_name].append(metric_val)
375
+
376
+ data_src2var2metric2val = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
377
+ for data_source, var2metric2uid_vals in data_src2var2metric2uid_vals.items():
378
+ for var_name, metric2uid_vals in var2metric2uid_vals.items():
379
+ for metric_name, uid_vals in metric2uid_vals.items():
380
+ data_src2var2metric2val[data_source][var_name][metric_name] = np.mean(uid_vals)
381
+
382
+ return data_src2var2metric2val
ICL/DAPO/verl-recipe/rep_exp/model_merge.sh ADDED
@@ -0,0 +1,6 @@
1
+ CHECKPOINT_PATH=${1} # /path/to/global_step_X/actor, where X is the global step of the checkpoint with the best pass@1 on dev
2
+
3
+ python3 -m verl.model_merger merge \
4
+ --backend fsdp \
5
+ --local_dir $CHECKPOINT_PATH \
6
+ --target_dir $CHECKPOINT_PATH/hf
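+
+ # Usage sketch (the path is a placeholder):
+ #   bash model_merge.sh checkpoints/rep_exp/global_step_120/actor
+ # The merged HuggingFace-format weights are written to $CHECKPOINT_PATH/hf.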
ICL/DAPO/verl-recipe/rep_exp/plot_pass_at_k.py ADDED
@@ -0,0 +1,241 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Code to plot the pass@k results for the RepExp RL training results.
16
+ """
17
+
18
+ import json
19
+ import os
20
+ from collections import defaultdict
21
+
22
+ import matplotlib.pyplot as plt
23
+ import numpy as np
24
+ import scipy.stats as stats
25
+ import seaborn as sns
26
+ from matplotlib.lines import Line2D
27
+
28
+ # Content configuration
29
+ EVAL_FOLDER = "./eval"
30
+ TASKS = ["math"] # ["math", "gsm8k", "dapo-with-aime24"]
31
+ SEEDS = [41, 42, 43]
32
+ ALGORITHMS = ["elliptical"] # ["grpo", "elliptical", "untrained", "unlikely"]
33
+ LOG_AXES = True
34
+
35
+ # Plot configuration
36
+ FACE_COLOR = "#F7F7FF"
37
+ MARKER = "o"
38
+ LINEWIDTH = 1.275
39
+ MARKERSIZE = 6
40
+ MARKEREDGEWIDTH = 0.9
41
+ LABEL_FONT_SIZE = 10
42
+ TITLE_FONT_SIZE = 11
43
+ TICK_LABEL_FONT_SIZE = 8
44
+ LEGEND_FONT_SIZE = 8
45
+
46
+ TASK_TO_NICE_NAME = {
47
+ "math": "MATH",
48
+ "gsm8k": "GSM8K",
49
+ "dapo-with-aime24": "AIME 2024",
50
+ "countdown-4": "Countdown",
51
+ }
52
+
53
+ ALGO_TO_COLOR = {
54
+ "grpo": sns.color_palette("deep")[-1],
55
+ "untrained": sns.color_palette("deep")[7],
56
+ "elliptical": sns.color_palette("colorblind")[2],
57
+ "unlikely": sns.color_palette("deep")[1],
58
+ }
59
+
60
+ ALGO_TO_NICE_NAME = {
61
+ "grpo": "GRPO",
62
+ "untrained": "Base Model",
63
+ "elliptical": r"RepExp (ours)",
64
+ "unlikely": "Unlikeliness",
65
+ }
66
+
67
+
68
+ def process_data(data: list[dict[str, float]], algorithm: str) -> tuple[dict[int, float], dict[int, float]]:
69
+ """
70
+ Process the pass@k data generated by a given algorithm.
71
+
72
+ Args:
73
+ data (List[Dict]): The data to process.
74
+ algorithm (str): Algorithm that generated the data.
75
+
76
+ Returns:
77
+ Tuple[Dict[int, float], Dict[int, float]]:
78
+ pass_at_k - The mean pass@k values.
79
+ pass_at_k_sem - The standard error of the pass@k values, or None for "untrained" (single seed).
80
+ """
81
+ pass_at_k = defaultdict(list)
82
+ for d in data:
83
+ for key, v in d.items():
84
+ for k in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
85
+ if key.endswith(f"reward/pass@{k}/mean"):
86
+ pass_at_k[k].append(v)
87
+
88
+ # NOTE: we only use a single seed for untrained since there is only one checkpoint for it
89
+ if algorithm != "untrained":
90
+ for k in pass_at_k.keys():
91
+ assert len(pass_at_k[k]) == len(SEEDS)
92
+
93
+ pass_at_k_sem = {k: stats.sem(v) for k, v in pass_at_k.items()} if algorithm != "untrained" else None
94
+ pass_at_k = {k: np.mean(v) for k, v in pass_at_k.items()}
95
+
96
+ return pass_at_k, pass_at_k_sem
97
+
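+ # Worked example: three seeds reporting pass@8 values [0.80, 0.82, 0.84]
+ # yield pass_at_k[8] == 0.82 and pass_at_k_sem[8] == stats.sem([0.80, 0.82, 0.84]),
+ # about 0.0115.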
98
+
99
+ def main():
100
+ # Get all top-level folders in EVAL_FOLDER
101
+ eval_folders = os.listdir(EVAL_FOLDER)
102
+
103
+ # Figure setup
104
+ sns.set_style("whitegrid")
105
+ fig, axs = plt.subplots(1, len(TASKS), figsize=(3 * len(TASKS), 3))
106
+
107
+ for i, task in enumerate(TASKS):
108
+ ax = axs[i] if len(TASKS) > 1 else axs
109
+ algo_to_xs = {}
110
+ algo_to_ys = {}
111
+
112
+ for algorithm in ALGORITHMS:
113
+ # Get all eval folders for the current task and algorithm
114
+ folders = [f for f in eval_folders if f.startswith(f"{task}_{algorithm}")]
115
+ if len(folders) == 0:
116
+ continue
117
+
118
+ data = []
119
+ for folder in folders:
120
+ if algorithm == "untrained":
121
+ with open(os.path.join(EVAL_FOLDER, folder, "eval.json")) as f:
122
+ data.append(json.load(f))
123
+ else:
124
+ # walk all files recursively in folder
125
+ for root, dirs, files in os.walk(os.path.join(EVAL_FOLDER, folder)):
126
+ for file in files:
127
+ if file.endswith("eval.json"):
128
+ with open(os.path.join(root, file)) as f:
129
+ data.append(json.load(f))
130
+ break
131
+
132
+ pass_at_k, pass_at_k_sem = process_data(data, algorithm)
133
+
134
+ xs = np.array(list(pass_at_k.keys()))
135
+ ys = np.array([pass_at_k[k] for k in xs])
136
+ algo_to_xs[algorithm] = xs
137
+ algo_to_ys[algorithm] = ys
138
+
139
+ # Plot the current task - algorithm data
140
+ ax.plot(
141
+ xs,
142
+ ys,
143
+ color=ALGO_TO_COLOR[algorithm],
144
+ label=algorithm,
145
+ markeredgecolor=FACE_COLOR,
146
+ marker=MARKER,
147
+ linewidth=LINEWIDTH,
148
+ markersize=MARKERSIZE,
149
+ markeredgewidth=MARKEREDGEWIDTH,
150
+ alpha=1.0 if algorithm != "untrained" else 0.8,
151
+ )
152
+
153
+ # Plot the standard error in shaded bands
154
+ if algorithm != "untrained":
155
+ sems = np.array([pass_at_k_sem[k] for k in xs])
156
+ ax.fill_between(xs, ys - sems, ys + sems, alpha=0.2, color=ALGO_TO_COLOR[algorithm])
157
+
158
+ # Set y-axis limits
159
+ if task == "math":
160
+ y_min = 0.7
161
+ ax.set_ylim(top=0.95, bottom=y_min)
162
+ elif task == "gsm8k":
163
+ y_min = 0.925
164
+ ax.set_ylim(top=0.995, bottom=y_min)
165
+ elif task == "dapo-with-aime24":
166
+ y_min = 0.1
167
+ ax.set_ylim(bottom=y_min, top=0.63)
168
+
169
+ # Set x-axis limits
170
+ if LOG_AXES:
171
+ ax.set_xlim(left=2 ** (-0.2), right=2 ** (8.2))
172
+ else:
173
+ ax.set_xlim(left=-10, right=266)
174
+
175
+ # Set x-axis scale and ticks
176
+ if LOG_AXES:
177
+ ax.set_xscale("log", base=2)
178
+ x_ticks = [2**i for i in range(int(np.log2(max(xs))) + 1)]
179
+ x_tick_labels = [f"$2^{{{i}}}$" for i in range(int(np.log2(max(xs))) + 1)]
180
+ else:
181
+ # set every 64
182
+ x_ticks = [1, 32, 64, 96, 128, 160, 192, 224, 256]
183
+ x_tick_labels = ["1", "32", "64", "96", "128", "160", "192", "224", "256"]
184
+ ax.set_xticks(x_ticks, x_tick_labels)
185
+
186
+ # Set axes labels
187
+ ax.set_xlabel("k", fontsize=LABEL_FONT_SIZE)
188
+ if i == 0:
189
+ ax.set_ylabel("Pass@k", fontsize=LABEL_FONT_SIZE)
190
+
191
+ # Set title
192
+ ax.set_title(f"{TASK_TO_NICE_NAME[task]}", fontsize=TITLE_FONT_SIZE)
193
+
194
+ # Set font size for tick labels
195
+ for _label in ax.get_xticklabels():
196
+ _label.set_fontsize(TICK_LABEL_FONT_SIZE)
197
+ for _label in ax.get_yticklabels():
198
+ _label.set_fontsize(TICK_LABEL_FONT_SIZE)
199
+
200
+ # Create legend handles
201
+ legend_handles = [
202
+ Line2D(
203
+ [0],
204
+ [0],
205
+ color=ALGO_TO_COLOR[algo],
206
+ marker=MARKER,
207
+ linestyle="-",
208
+ linewidth=LINEWIDTH,
209
+ markersize=MARKERSIZE,
210
+ markeredgewidth=MARKEREDGEWIDTH,
211
+ markeredgecolor=FACE_COLOR,
212
+ label=ALGO_TO_NICE_NAME[algo],
213
+ )
214
+ for algo in ALGORITHMS
215
+ ]
216
+
217
+ # Create legend
218
+ legend = fig.legend(
219
+ handles=legend_handles,
220
+ loc="lower center",
221
+ ncol=len(ALGORITHMS),
222
+ bbox_to_anchor=(0.5, -0.07),
223
+ fontsize=LEGEND_FONT_SIZE,
224
+ )
225
+
226
+ plt.tight_layout()
227
+
228
+ os.makedirs("figures", exist_ok=True)
229
+ # Save figure
230
+ plt.savefig(
231
+ os.path.join("figures", f"rl_pass_at_k_{TASKS}_{'' if LOG_AXES else '_linear_axes'}.pdf"),
232
+ bbox_extra_artists=(legend,),
233
+ bbox_inches="tight",
234
+ )
235
+
236
+ # Close figure
237
+ plt.close()
238
+
239
+
240
+ if __name__ == "__main__":
241
+ main()
ICL/DAPO/verl-recipe/rep_exp/rep_exp_trainer.py ADDED
@@ -0,0 +1,739 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023-2024 SGLang Team
3
+ # Copyright 2025 ModelBest Inc. and/or its affiliates
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ PPO Trainer with Ray-based single controller.
18
+ This trainer supports model-agnostic model initialization with HuggingFace.
19
+ """
20
+
21
+ import json
22
+ import os
23
+ import uuid
24
+ from collections import defaultdict
25
+ from copy import deepcopy
26
+ from pprint import pprint
27
+
28
+ import numpy as np
29
+ import ray
30
+ import torch
31
+ from omegaconf import OmegaConf
32
+ from tqdm import tqdm
33
+
34
+ from verl import DataProto
35
+ from verl.experimental.dataset.sampler import AbstractCurriculumSampler
36
+ from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
37
+ from verl.single_controller.ray import RayClassWithInitArgs
38
+ from verl.single_controller.ray.base import create_colocated_worker_cls
39
+ from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss
40
+ from verl.trainer.ppo.metric_utils import (
41
+ compute_throughout_metrics,
42
+ compute_timing_metrics,
43
+ )
44
+ from verl.trainer.ppo.ray_trainer import RayPPOTrainer, apply_kl_penalty, compute_advantage, compute_response_mask
45
+ from verl.trainer.ppo.reward import compute_reward, compute_reward_async
46
+ from verl.trainer.ppo.utils import Role
47
+ from verl.utils.checkpoint.checkpoint_manager import should_save_ckpt_esi
48
+ from verl.utils.config import omega_conf_to_dataclass
49
+ from verl.utils.debug import marked_timer
50
+ from verl.utils.metric import reduce_metrics
51
+ from verl.utils.rollout_skip import RolloutSkip
52
+
53
+ from .metric_utils import compute_data_metrics, process_validation_metrics
54
+
55
+
56
+ class RayRepExpTrainer(RayPPOTrainer):
57
+ """Distributed RepExp trainer using Ray for scalable reinforcement learning.
58
+
59
+ See RayPPOTrainer parent class for more details.
60
+ """
61
+
62
+ def _save_checkpoint(self):
63
+ super()._save_checkpoint()
64
+
65
+ # Write best metric to global steps
66
+ local_best_metric_to_global_step = os.path.join(
67
+ self.config.trainer.default_local_dir, "best_metric_to_global_step.json"
68
+ )
69
+ with open(local_best_metric_to_global_step, "w") as f:
70
+ json.dump(self.best_dev_pass_at_k_to_global_step, f)
71
+
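+ # The dumped JSON maps each tracked k to the global step with the best dev
+ # pass@k so far, e.g. {"1": 120}; that step identifies the checkpoint to pass
+ # to model_merge.sh for export.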
72
+ def _update_best_pass_at(self, val_metrics: dict[str, float], pass_at_k: int) -> bool:
73
+ """
74
+ Save checkpoint if the validation metrics are the best.
75
+
76
+ Args:
77
+ val_metrics: The validation metrics.
78
+ pass_at_k: The pass@k to use for determining whether to save the checkpoint.
79
+ """
80
+ for k in val_metrics.keys():
81
+ if k.endswith(f"reward/pass@{pass_at_k}/mean"):
82
+ if val_metrics[k] > self.best_dev_pass_at_k[pass_at_k]:
83
+ self.best_dev_pass_at_k[pass_at_k] = val_metrics[k]
84
+ self.best_dev_pass_at_k_to_global_step[pass_at_k] = self.global_steps
85
+ return True
86
+
87
+ return False
88
+
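+ # Example: a val_metrics key ending in "reward/pass@1/mean" with value 0.41,
+ # against a stored best of 0.38, updates both best-tracking dicts and returns
+ # True; any other case leaves them unchanged and returns False.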
89
+ def _validate(self):
90
+ data_source_lst = []
91
+ reward_extra_infos_dict: dict[str, list] = defaultdict(list)
92
+
93
+ # Lists to collect samples for the table
94
+ sample_inputs = []
95
+ sample_outputs = []
96
+ sample_gts = []
97
+ sample_scores = []
98
+ sample_turns = []
99
+ sample_uids = []
100
+
101
+ for test_data in tqdm(self.val_dataloader, desc="Validating ..."):
102
+ test_batch = DataProto.from_single_dict(test_data)
103
+
104
+ if "uid" not in test_batch.non_tensor_batch:
105
+ test_batch.non_tensor_batch["uid"] = np.array(
106
+ [str(uuid.uuid4()) for _ in range(len(test_batch.batch))], dtype=object
107
+ )
108
+
109
+ # repeat test batch
110
+ test_batch = test_batch.repeat(
111
+ repeat_times=self.config.actor_rollout_ref.rollout.val_kwargs.n, interleave=True
112
+ )
113
+
114
+ # we only do validation on rule-based rm
115
+ if self.config.reward_model.enable and test_batch[0].non_tensor_batch["reward_model"]["style"] == "model":
116
+ return {}
117
+
118
+ # Store original inputs
119
+ input_ids = test_batch.batch["input_ids"]
120
+ # TODO: Can we keep special tokens except for padding tokens?
121
+ input_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
122
+ sample_inputs.extend(input_texts)
123
+ sample_uids.extend(test_batch.non_tensor_batch["uid"])
124
+
125
+ ground_truths = [
126
+ item.non_tensor_batch.get("reward_model", {}).get("ground_truth", None) for item in test_batch
127
+ ]
128
+ sample_gts.extend(ground_truths)
129
+
130
+ test_gen_batch = self._get_gen_batch(test_batch)
131
+ test_gen_batch.meta_info = {
132
+ "eos_token_id": self.tokenizer.eos_token_id,
133
+ "pad_token_id": self.tokenizer.pad_token_id,
134
+ "recompute_log_prob": False,
135
+ "do_sample": self.config.actor_rollout_ref.rollout.val_kwargs.do_sample,
136
+ "validate": True,
137
+ "global_steps": self.global_steps,
138
+ }
139
+ print(f"test_gen_batch meta info: {test_gen_batch.meta_info}")
140
+
141
+ # pad to be divisible by dp_size
142
+ size_divisor = (
143
+ self.actor_rollout_wg.world_size
144
+ if not self.async_rollout_mode
145
+ else self.config.actor_rollout_ref.rollout.agent.num_workers
146
+ )
147
+ test_gen_batch_padded, pad_size = pad_dataproto_to_divisor(test_gen_batch, size_divisor)
148
+ if not self.async_rollout_mode:
149
+ test_output_gen_batch_padded = self.actor_rollout_wg.generate_sequences(test_gen_batch_padded)
150
+ else:
151
+ test_output_gen_batch_padded = self.async_rollout_manager.generate_sequences(test_gen_batch_padded)
152
+
153
+ # unpad
154
+ test_output_gen_batch = unpad_dataproto(test_output_gen_batch_padded, pad_size=pad_size)
155
+
156
+ print("validation generation end")
157
+
158
+ # Store generated outputs
159
+ output_ids = test_output_gen_batch.batch["responses"]
160
+ output_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
161
+ sample_outputs.extend(output_texts)
162
+
163
+ test_batch = test_batch.union(test_output_gen_batch)
164
+ test_batch.meta_info["validate"] = True
165
+
166
+ # evaluate using reward_function
167
+ if self.val_reward_fn is None:
168
+ raise ValueError("val_reward_fn must be provided for validation.")
169
+ result = self.val_reward_fn(test_batch, return_dict=True)
170
+ reward_tensor = result["reward_tensor"]
171
+ scores = reward_tensor.sum(-1).cpu().tolist()
172
+ sample_scores.extend(scores)
173
+
174
+ reward_extra_infos_dict["reward"].extend(scores)
175
+ if "reward_extra_info" in result:
176
+ for key, lst in result["reward_extra_info"].items():
177
+ reward_extra_infos_dict[key].extend(lst)
178
+
179
+ # collect num_turns of each prompt
180
+ if "__num_turns__" in test_batch.non_tensor_batch:
181
+ sample_turns.append(test_batch.non_tensor_batch["__num_turns__"])
182
+
183
+ data_source_lst.append(test_batch.non_tensor_batch.get("data_source", ["unknown"] * reward_tensor.shape[0]))
184
+
185
+ self._maybe_log_val_generations(inputs=sample_inputs, outputs=sample_outputs, scores=sample_scores)
186
+
187
+ # dump generations
188
+ val_data_dir = self.config.trainer.get("validation_data_dir", None)
189
+ if val_data_dir:
190
+ self._dump_generations(
191
+ inputs=sample_inputs,
192
+ outputs=sample_outputs,
193
+ gts=sample_gts,
194
+ scores=sample_scores,
195
+ reward_extra_infos_dict=reward_extra_infos_dict,
196
+ dump_path=val_data_dir,
197
+ )
198
+
199
+ for key_info, lst in reward_extra_infos_dict.items():
200
+ assert len(lst) == 0 or len(lst) == len(sample_scores), f"{key_info}: {len(lst)=}, {len(sample_scores)=}"
201
+
202
+ data_sources = np.concatenate(data_source_lst, axis=0)
203
+
204
+ data_src2var2metric2val = process_validation_metrics(data_sources, sample_uids, reward_extra_infos_dict)
205
+ metric_dict = {}
206
+ for data_source, var2metric2val in data_src2var2metric2val.items():
207
+ core_var = "acc" if "acc" in var2metric2val else "reward"
208
+ for var_name, metric2val in var2metric2val.items():
209
+ n_max = max([int(name.split("@")[-1].split("/")[0]) for name in metric2val.keys()])
210
+ for metric_name, metric_val in metric2val.items():
211
+ if (
212
+ (var_name == core_var)
213
+ and any(metric_name.startswith(pfx) for pfx in ["mean", "maj", "best"])
214
+ and (f"@{n_max}" in metric_name)
215
+ ):
216
+ metric_sec = "val-core"
217
+ else:
218
+ metric_sec = "val-aux"
219
+ pfx = f"{metric_sec}/{data_source}/{var_name}/{metric_name}"
220
+ metric_dict[pfx] = metric_val
221
+
222
+ if len(sample_turns) > 0:
223
+ sample_turns = np.concatenate(sample_turns)
224
+ metric_dict["val-aux/num_turns/min"] = sample_turns.min()
225
+ metric_dict["val-aux/num_turns/max"] = sample_turns.max()
226
+ metric_dict["val-aux/num_turns/mean"] = sample_turns.mean()
227
+
228
+ return metric_dict
229
+
230
+ def init_workers(self):
231
+ """Initialize distributed training workers using Ray backend.
232
+
233
+ Creates:
234
+ 1. Ray resource pools from configuration
235
+ 2. Worker groups for each role (actor, critic, etc.)
236
+ """
237
+ self.resource_pool_manager.create_resource_pool()
238
+
239
+ self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
240
+ val_only = self.config.trainer.get("val_only", False)
241
+
242
+ # create actor and rollout
243
+ actor_role = Role.ActorRolloutRef if Role.ActorRolloutRef in self.role_worker_mapping else Role.ActorRollout
244
+ if self.hybrid_engine:
245
+ resource_pool = self.resource_pool_manager.get_resource_pool(actor_role)
246
+ actor_rollout_cls = RayClassWithInitArgs(
247
+ cls=self.role_worker_mapping[actor_role],
248
+ config=self.config.actor_rollout_ref,
249
+ role=str(actor_role),
250
+ )
251
+ self.resource_pool_to_cls[resource_pool][str(actor_role)] = actor_rollout_cls
252
+ else:
253
+ raise NotImplementedError
254
+
255
+ # create critic
256
+ if self.use_critic and not val_only:
257
+ resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
258
+ critic_cfg = omega_conf_to_dataclass(self.config.critic)
259
+ critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=critic_cfg)
260
+ self.resource_pool_to_cls[resource_pool][str(Role.Critic)] = critic_cls
261
+
262
+ # create reference policy if needed
263
+ if self.use_reference_policy and not val_only:
264
+ resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
265
+ ref_policy_cls = RayClassWithInitArgs(
266
+ self.role_worker_mapping[Role.RefPolicy],
267
+ config=self.config.actor_rollout_ref,
268
+ role=str(Role.RefPolicy),
269
+ )
270
+ self.resource_pool_to_cls[resource_pool][str(Role.RefPolicy)] = ref_policy_cls
271
+
272
+ # create a reward model if reward_fn is None
273
+ if self.use_rm and not val_only:
274
+ # we create a RM here
275
+ resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
276
+ rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model)
277
+ self.resource_pool_to_cls[resource_pool][str(Role.RewardModel)] = rm_cls
278
+
279
+ # initialize WorkerGroup
280
+ # NOTE: if you want to use a different resource pool for each role, which can support different parallel sizes,
281
+ # you should not use `create_colocated_worker_cls`.
282
+ # Instead, directly pass different resource pool to different worker groups.
283
+ # See https://github.com/volcengine/verl/blob/master/examples/ray/tutorial.ipynb for more information.
284
+ all_wg = {}
285
+ wg_kwargs = {} # Setting up kwargs for RayWorkerGroup
286
+ if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
287
+ wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
288
+ if OmegaConf.select(self.config.global_profiler, "steps") is not None:
289
+ wg_kwargs["profile_steps"] = OmegaConf.select(self.config.global_profiler, "steps")
290
+ # Only require nsight worker options when tool is nsys
291
+ if OmegaConf.select(self.config.global_profiler, "tool") == "nsys":
292
+ assert (
293
+ OmegaConf.select(self.config.global_profiler.global_tool_config.nsys, "worker_nsight_options")
294
+ is not None
295
+ ), "worker_nsight_options must be set when using nsys with profile_steps"
296
+ wg_kwargs["worker_nsight_options"] = OmegaConf.to_container(
297
+ OmegaConf.select(self.config.global_profiler.global_tool_config.nsys, "worker_nsight_options")
298
+ )
299
+ wg_kwargs["device_name"] = self.device_name
300
+
301
+ for resource_pool, class_dict in self.resource_pool_to_cls.items():
302
+ worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
303
+ wg_dict = self.ray_worker_group_cls(
304
+ resource_pool=resource_pool,
305
+ ray_cls_with_init=worker_dict_cls,
306
+ **wg_kwargs,
307
+ )
308
+ spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
309
+ all_wg.update(spawn_wg)
310
+
311
+ if self.use_critic:
312
+ self.critic_wg = all_wg[str(Role.Critic)]
313
+ self.critic_wg.init_model()
314
+
315
+ if self.use_reference_policy and not self.ref_in_actor:
316
+ if str(Role.RefPolicy) in all_wg:
317
+ self.ref_policy_wg = all_wg[str(Role.RefPolicy)]
318
+ self.ref_policy_wg.init_model()
319
+ else:
320
+ # Model engine: ActorRolloutRefWorker
321
+ assert str(Role.ActorRolloutRef) in all_wg, f"{all_wg.keys()=}"
322
+ self.ref_policy_wg = all_wg[str(Role.ActorRolloutRef)]
323
+
324
+ self.rm_wg = None
325
+ # initialization of rm_wg will be deprecated in the future
326
+ if self.use_rm:
327
+ self.rm_wg = all_wg[str(Role.RewardModel)]
328
+ self.rm_wg.init_model()
329
+
330
+ # we should create rollout at the end so that vllm can have a better estimation of kv cache memory
331
+ self.actor_rollout_wg = all_wg[str(actor_role)]
332
+ self.actor_rollout_wg.init_model()
333
+
334
+ # create async rollout manager and request scheduler
335
+ self.async_rollout_mode = False
336
+ if self.config.actor_rollout_ref.rollout.mode == "async":
337
+ from verl.experimental.agent_loop import AgentLoopManager
338
+
339
+ self.async_rollout_mode = True
340
+ self.async_rollout_manager = AgentLoopManager(
341
+ config=self.config, worker_group=self.actor_rollout_wg, rm_wg=self.rm_wg
342
+ )
343
+
344
+ def fit(self):
345
+ """
346
+ The training loop of PPO.
347
+ The driver process only needs to call the compute functions of the worker group through RPC
348
+ to construct the PPO dataflow.
349
+ The lightweight advantage computation is done on the driver process.
350
+ """
351
+ from omegaconf import OmegaConf
352
+
353
+ from .utils.tracking import Tracking
354
+
355
+ logger = Tracking(
356
+ project_name=self.config.trainer.project_name,
357
+ experiment_name=self.config.trainer.experiment_name,
358
+ default_backend=self.config.trainer.logger,
359
+ config=OmegaConf.to_container(self.config, resolve=True),
360
+ )
361
+
362
+ # global vars to track during training
363
+ self.global_steps = 0
364
+
365
+ self.best_dev_pass_at_k = {
366
+ 1: 0,
367
+ }
368
+ self.best_dev_pass_at_k_to_global_step = {
369
+ 1: 0,
370
+ }
371
+
372
+ # load checkpoint before doing anything
373
+ self._load_checkpoint()
374
+
375
+ current_epoch = self.global_steps // len(self.train_dataloader)
376
+
377
+ # perform validation before training
378
+ # currently, we only support validation using the reward_function.
379
+ if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
380
+ val_metrics = self._validate()
381
+ assert val_metrics, f"{val_metrics=}"
382
+
383
+ # Initialize the best validation metrics for pass@k before training
384
+ self._update_best_pass_at(val_metrics, 1)
385
+ val_metrics["best/pass@1"] = self.best_dev_pass_at_k[1]
386
+
387
+ pprint(f"Initial validation metrics: {val_metrics}")
388
+ logger.log(data=val_metrics, step=self.global_steps)
389
+
390
+ if self.config.trainer.get("val_only", False):
391
+ return
392
+
393
+ if self.config.actor_rollout_ref.rollout.get("skip_rollout", False):
394
+ rollout_skip = RolloutSkip(self.config, self.actor_rollout_wg)
395
+ rollout_skip.wrap_generate_sequences()
396
+
397
+ # add tqdm
398
+ progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")
399
+
400
+ # we start from step 1
401
+ self.global_steps += 1
402
+ last_val_metrics = None
403
+ self.max_steps_duration = 0
404
+
405
+ prev_step_profile = False
406
+ curr_step_profile = (
407
+ self.global_steps in self.config.global_profiler.steps
408
+ if self.config.global_profiler.steps is not None
409
+ else False
410
+ )
411
+ next_step_profile = False
412
+
413
+ for epoch in range(current_epoch, self.config.trainer.total_epochs):
414
+ for batch_dict in self.train_dataloader:
415
+ metrics = {}
416
+ timing_raw = {}
417
+
418
+ with marked_timer("start_profile", timing_raw):
419
+ self._start_profiling(
420
+ not prev_step_profile and curr_step_profile
421
+ if self.config.global_profiler.profile_continuous_steps
422
+ else curr_step_profile
423
+ )
424
+ batch: DataProto = DataProto.from_single_dict(batch_dict)
425
+ batch.meta_info["temperature"] = self.config.actor_rollout_ref.rollout.temperature
426
+
427
+ # add uid to batch
428
+ batch.non_tensor_batch["uid"] = np.array(
429
+ [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object
430
+ )
431
+
432
+ gen_batch = self._get_gen_batch(batch)
433
+
434
+ # pass global_steps to trace
435
+ gen_batch.meta_info["global_steps"] = self.global_steps
436
+ gen_batch_output = gen_batch.repeat(
437
+ repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True
438
+ )
439
+
440
+ is_last_step = self.global_steps >= self.total_training_steps
441
+ with marked_timer("step", timing_raw):
442
+ # generate a batch
443
+ with marked_timer("gen", timing_raw, color="red"):
444
+ if not self.async_rollout_mode:
445
+ gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch_output)
446
+ else:
447
+ gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch_output)
448
+
449
+ timing_raw.update(gen_batch_output.meta_info["timing"])
450
+ gen_batch_output.meta_info.pop("timing", None)
451
+
452
+ if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
453
+ if self.reward_fn is None:
454
+ raise ValueError("A reward_fn is required for REMAX advantage estimation.")
455
+
456
+ with marked_timer("gen_max", timing_raw, color="purple"):
457
+ gen_baseline_batch = deepcopy(gen_batch)
458
+ gen_baseline_batch.meta_info["do_sample"] = False
459
+ if not self.async_rollout_mode:
460
+ gen_baseline_output = self.actor_rollout_wg.generate_sequences(gen_baseline_batch)
461
+ else:
462
+ gen_baseline_output = self.async_rollout_manager.generate_sequences(gen_baseline_batch)
463
+ batch = batch.union(gen_baseline_output)
464
+ # compute reward model score on batch
465
+ rm_scores = None
466
+ if self.use_rm and "rm_scores" not in batch.batch.keys():
467
+ rm_scores = self.rm_wg.compute_rm_score(batch)
468
+ batch = batch.union(rm_scores)
469
+ reward_baseline_tensor, _ = compute_reward(batch, self.reward_fn)
470
+ reward_baseline_tensor = reward_baseline_tensor.sum(dim=-1)
471
+
472
+ keys_to_pop = set(gen_baseline_output.batch.keys())
473
+ if rm_scores is not None:
474
+ keys_to_pop.update(rm_scores.batch.keys())
475
+ batch.pop(batch_keys=list(keys_to_pop))
476
+
477
+ batch.batch["reward_baselines"] = reward_baseline_tensor
478
+
479
+ del rm_scores, gen_baseline_batch, gen_baseline_output
480
+ # repeat to align with repeated responses in rollout
481
+ batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
482
+ batch = batch.union(gen_batch_output)
483
+
484
+ if "response_mask" not in batch.batch.keys():
485
+ batch.batch["response_mask"] = compute_response_mask(batch)
486
+ # Balance the number of valid tokens across DP ranks.
487
+ # NOTE: This usually changes the order of data in the `batch`,
488
+ # which won't affect the advantage calculation (since it's based on uid),
489
+ # but might affect the loss calculation (due to the change of mini-batching).
490
+ if self.config.trainer.balance_batch:
491
+ self._balance_batch(batch, metrics=metrics)
492
+
493
+ # compute global_valid tokens
494
+ batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
495
+
496
+ with marked_timer("reward", timing_raw, color="yellow"):
497
+ # compute reward model score
498
+ if self.use_rm and "rm_scores" not in batch.batch.keys():
499
+ if self.config.reward_model.elliptical.enable:
500
+ hidden_states = self.rm_wg.compute_hidden_states(batch)
501
+ batch = batch.union(hidden_states)
502
+ reward_tensor = self.rm_wg.compute_rm_score(batch)
503
+ else:
504
+ reward_tensor = self.rm_wg.compute_rm_score(batch)
505
+ batch = batch.union(reward_tensor)
506
+
507
+ if self.config.reward_model.launch_reward_fn_async:
508
+ future_reward = compute_reward_async.remote(
509
+ data=batch, config=self.config, tokenizer=self.tokenizer
510
+ )
511
+ else:
512
+ reward_tensor, reward_extra_infos_dict = compute_reward(batch, self.reward_fn)
513
+
514
+ # Operating Mode Selection:
515
+ # - Bypass mode: Sets old_log_probs = rollout_log_probs (2 policies: π_rollout, π_θ)
516
+ # - Decoupled mode: Recomputes old_log_probs as proximal anchor (3 policies: π_rollout, π_old, π_θ)
517
+ # Note: π_old computed once per data batch, serves as stable reference during mini-batch updates
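+ # (Sketch, not part of the recipe: in bypass mode the PPO ratio is formed
+ # directly against the rollout policy, exp(logp_theta - rollout_log_probs);
+ # in decoupled mode it is exp(logp_theta - old_log_probs), with the rollout
+ # policy entering only through the correction weights computed later.)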
518
+ rollout_corr_config = self.config.algorithm.get("rollout_correction", None)
519
+ bypass_recomputing_logprobs = rollout_corr_config and rollout_corr_config.get("bypass_mode", False)
520
+ if bypass_recomputing_logprobs: # Use `rollout_log_probs`
521
+ from verl.trainer.ppo.rollout_corr_helper import apply_rollout_correction
522
+
523
+ apply_rollout_correction(
524
+ batch=batch,
525
+ rollout_corr_config=rollout_corr_config,
526
+ policy_loss_config=self.config.actor_rollout_ref.actor.policy_loss,
527
+ )
528
+ else: # Recompute old_log_probs
529
+ with marked_timer("old_log_prob", timing_raw, color="blue"):
530
+ old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
531
+ entropys = old_log_prob.batch["entropys"]
532
+ response_masks = batch.batch["response_mask"]
533
+ loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
534
+ entropy_agg = agg_loss(
535
+ loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode
536
+ )
537
+ old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
538
+ metrics.update(old_log_prob_metrics)
539
+ old_log_prob.batch.pop("entropys")
540
+ batch = batch.union(old_log_prob)
541
+ if "rollout_log_probs" in batch.batch.keys():
542
+ # TODO: we may want to add diff of probs too.
543
+ from verl.utils.debug.metrics import calculate_debug_metrics
544
+
545
+ metrics.update(calculate_debug_metrics(batch))
546
+
547
+ assert "old_log_probs" in batch.batch, f'"old_log_prob" not in {batch.batch.keys()=}'
548
+
549
+ if self.use_reference_policy:
550
+ # compute reference log_prob
551
+ with marked_timer(str(Role.RefPolicy), timing_raw, color="olive"):
552
+ if not self.ref_in_actor:
553
+ ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
554
+ else:
555
+ ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch)
556
+ batch = batch.union(ref_log_prob)
557
+
558
+ # compute values
559
+ if self.use_critic:
560
+ with marked_timer("values", timing_raw, color="cyan"):
561
+ values = self.critic_wg.compute_values(batch)
562
+ batch = batch.union(values)
563
+
564
+ with marked_timer("adv", timing_raw, color="brown"):
565
+ # we combine with rule-based rm
566
+ reward_extra_infos_dict: dict[str, list]
567
+ if self.config.reward_model.launch_reward_fn_async:
568
+ reward_tensor, reward_extra_infos_dict = ray.get(future_reward)
569
+ batch.batch["token_level_scores"] = reward_tensor
570
+
571
+ if reward_extra_infos_dict:
572
+ batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})
573
+
574
+ # compute rewards. apply_kl_penalty if available
575
+ if self.config.algorithm.use_kl_in_reward:
576
+ batch, kl_metrics = apply_kl_penalty(
577
+ batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
578
+ )
579
+ metrics.update(kl_metrics)
580
+ else:
581
+ batch.batch["token_level_rewards"] = batch.batch["token_level_scores"]
582
+
583
+ # Compute rollout correction: IS weights, rejection sampling, and metrics
584
+ # Only runs in decoupled mode (computes once per batch using stable π_old)
585
+ # In bypass mode, this is skipped - actor computes metrics from evolving π_θ vs π_rollout
586
+ if (
587
+ rollout_corr_config is not None
588
+ and "rollout_log_probs" in batch.batch
589
+ and not bypass_recomputing_logprobs # Only in decoupled mode
590
+ ):
591
+ from verl.trainer.ppo.rollout_corr_helper import compute_rollout_correction_and_add_to_batch
592
+
593
+ # Compute IS weights, apply rejection sampling, compute metrics
594
+ batch, is_metrics = compute_rollout_correction_and_add_to_batch(batch, rollout_corr_config)
595
+ # IS and off-policy metrics already have rollout_corr/ prefix
596
+ metrics.update(is_metrics)
597
+
598
+ # compute advantages, executed on the driver process
599
+ norm_adv_by_std_in_grpo = self.config.algorithm.get(
600
+ "norm_adv_by_std_in_grpo", True
601
+ ) # GRPO adv normalization factor
602
+
603
+ batch = compute_advantage(
604
+ batch,
605
+ adv_estimator=self.config.algorithm.adv_estimator,
606
+ gamma=self.config.algorithm.gamma,
607
+ lam=self.config.algorithm.lam,
608
+ num_repeat=self.config.actor_rollout_ref.rollout.n,
609
+ norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
610
+ config=self.config.algorithm,
611
+ )
612
+
613
+ # update critic
614
+ if self.use_critic:
615
+ with marked_timer("update_critic", timing_raw, color="pink"):
616
+ critic_output = self.critic_wg.update_critic(batch)
617
+ critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
618
+ metrics.update(critic_output_metrics)
619
+
620
+ # implement critic warmup
621
+ if self.config.trainer.critic_warmup <= self.global_steps:
622
+ # update actor
623
+ with marked_timer("update_actor", timing_raw, color="red"):
624
+ rollout_config = self.config.actor_rollout_ref.rollout
625
+ batch.meta_info["multi_turn"] = rollout_config.multi_turn.enable
626
+ # TODO: Make "temperature" single source of truth from generation.
627
+ batch.meta_info["temperature"] = rollout_config.temperature
628
+ actor_output = self.actor_rollout_wg.update_actor(batch)
629
+ actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
630
+ metrics.update(actor_output_metrics)
631
+
632
+ # Log rollout generations if enabled
633
+ rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
634
+ if rollout_data_dir:
635
+ self._log_rollout_data(batch, reward_extra_infos_dict, timing_raw, rollout_data_dir)
636
+
637
+ # validate
638
+ if (
639
+ self.val_reward_fn is not None
640
+ and self.config.trainer.test_freq > 0
641
+ and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
642
+ ):
643
+ with marked_timer("testing", timing_raw, color="green"):
644
+ val_metrics: dict = self._validate()
645
+
646
+ # Update the best validation metrics for pass@k during training
647
+ self._update_best_pass_at(val_metrics, 1)
648
+ val_metrics["best/pass@1"] = self.best_dev_pass_at_k[1]
649
+
650
+ if is_last_step:
651
+ last_val_metrics = val_metrics
652
+ metrics.update(val_metrics)
653
+
654
+ # Check if the ESI (Elastic Server Instance)/training plan is close to expiration.
655
+ esi_close_to_expiration = should_save_ckpt_esi(
656
+ max_steps_duration=self.max_steps_duration,
657
+ redundant_time=self.config.trainer.esi_redundant_time,
658
+ )
659
+ # Check if the conditions for saving a checkpoint are met.
660
+ # The conditions include a mandatory condition (1) and
661
+ # one of the following optional conditions (2/3/4):
662
+ # 1. The save frequency is set to a positive value.
663
+ # 2. It's the last training step.
664
+ # 3. The current step number is a multiple of the save frequency.
665
+ # 4. The ESI (Elastic Server Instance)/training plan is close to expiration.
666
+ if self.config.trainer.save_freq > 0 and (
667
+ is_last_step or self.global_steps % self.config.trainer.save_freq == 0 or esi_close_to_expiration
668
+ ):
669
+ if esi_close_to_expiration:
670
+ print("Force saving checkpoint: ESI instance expiration approaching.")
671
+ with marked_timer("save_checkpoint", timing_raw, color="green"):
672
+ self._save_checkpoint()
673
+
674
+ with marked_timer("stop_profile", timing_raw):
675
+ next_step_profile = (
676
+ self.global_steps + 1 in self.config.global_profiler.steps
677
+ if self.config.global_profiler.steps is not None
678
+ else False
679
+ )
680
+ self._stop_profiling(
681
+ curr_step_profile and not next_step_profile
682
+ if self.config.global_profiler.profile_continuous_steps
683
+ else curr_step_profile
684
+ )
685
+ prev_step_profile = curr_step_profile
686
+ curr_step_profile = next_step_profile
687
+
688
+ steps_duration = timing_raw["step"]
689
+ self.max_steps_duration = max(self.max_steps_duration, steps_duration)
690
+
691
+ # training metrics
692
+ metrics.update(
693
+ {
694
+ "training/global_step": self.global_steps,
695
+ "training/epoch": epoch,
696
+ }
697
+ )
698
+ # collect metrics
699
+ metrics.update(
700
+ compute_data_metrics(
701
+ batch=batch,
702
+ use_critic=self.use_critic,
703
+ elliptical=self.config.reward_model.elliptical.enable,
704
+ )
705
+ )
706
+ metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
707
+ # TODO: implement actual tflops and theoretical tflops
708
+ n_gpus = self.resource_pool_manager.get_n_gpus()
709
+ metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
710
+ # Note: mismatch metrics (KL, PPL, etc.) are collected at line 1179 after advantage computation
711
+
712
+ # this is experimental and may be changed/removed in the future in favor of a general-purpose one
713
+ if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler):
714
+ self.train_dataloader.sampler.update(batch=batch)
715
+
716
+ # TODO: make a canonical logger that supports various backend
717
+ logger.log(data=metrics, step=self.global_steps)
718
+
719
+ progress_bar.update(1)
720
+ self.global_steps += 1
721
+
722
+ if (
723
+ hasattr(self.config.actor_rollout_ref.actor, "profiler")
724
+ and self.config.actor_rollout_ref.actor.profiler.tool == "torch_memory"
725
+ ):
726
+ self.actor_rollout_wg.dump_memory_snapshot(
727
+ tag=f"post_update_step{self.global_steps}", sub_dir=f"step{self.global_steps}"
728
+ )
729
+
730
+ if is_last_step:
731
+ pprint(f"Final validation metrics: {last_val_metrics}")
732
+ progress_bar.close()
733
+ return
734
+
735
+ # this is experimental and may be changed/removed in the future
736
+ # in favor of a general-purpose data buffer pool
737
+ if hasattr(self.train_dataset, "on_batch_end"):
738
+ # The dataset may be changed after each training batch
739
+ self.train_dataset.on_batch_end(batch=batch)
ICL/DAPO/verl-recipe/spin/core_algos.py ADDED
@@ -0,0 +1,206 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023-2024 SGLang Team
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import numpy as np
18
+ import torch
19
+
20
+
21
+ class AdaptiveKLController:
22
+ """
23
+ Adaptive KL controller described in the paper:
24
+ https://arxiv.org/pdf/1909.08593.pdf
25
+ """
26
+
27
+ def __init__(self, init_kl_coef, target_kl, horizon):
28
+ self.value = init_kl_coef
29
+ self.target = target_kl
30
+ self.horizon = horizon
31
+
32
+ def update(self, current_kl, n_steps):
33
+ target = self.target
34
+ proportional_error = np.clip(current_kl / target - 1, -0.2, 0.2)
35
+ mult = 1 + proportional_error * n_steps / self.horizon
36
+ self.value *= mult
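+ # Worked example (illustrative numbers, not from any shipped config): with
+ # target_kl=6.0 and horizon=10000, observing current_kl=9.0 over n_steps=256
+ # gives proportional_error = clip(9/6 - 1, -0.2, 0.2) = 0.2, so the KL
+ # coefficient is scaled by 1 + 0.2 * 256 / 10000 ≈ 1.0051.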
37
+
38
+
39
+ class FixedKLController:
40
+ """Fixed KL controller."""
41
+
42
+ def __init__(self, kl_coef):
43
+ self.value = kl_coef
44
+
45
+ def update(self, current_kl, n_steps):
46
+ pass
47
+
48
+
49
+ def get_kl_controller(kl_ctrl):
50
+ if kl_ctrl.type == "fixed":
51
+ return FixedKLController(kl_coef=kl_ctrl.kl_coef)
52
+ elif kl_ctrl.type == "adaptive":
53
+ assert kl_ctrl.horizon > 0, f"horizon must be larger than 0. Got {kl_ctrl.horizon}"
54
+ return AdaptiveKLController(init_kl_coef=kl_ctrl.kl_coef, target_kl=kl_ctrl.target_kl, horizon=kl_ctrl.horizon)
55
+ else:
56
+ raise NotImplementedError
57
+
58
+
59
+ def compute_onlinedpo_pref(
60
+ token_level_rewards: torch.Tensor,
61
+ response_mask: torch.Tensor,
62
+ ) -> torch.Tensor:
63
+ """
64
+ Computes preferences between pairs of sequences based on summed rewards
65
+ and returns a mask aligned with the interleaved batch.
66
+
67
+ Assumes inputs are interleaved: [Resp1_Prompt0, Resp2_Prompt0, Resp1_Prompt1, Resp2_Prompt1, ...]
68
+
69
+ Args:
70
+ token_level_rewards: Tensor of shape [batch_size * 2, seq_len]
71
+ response_mask: Tensor of shape [batch_size * 2, seq_len]
72
+
73
+ Returns:
74
+ torch.Tensor: A boolean mask of shape [batch_size * 2], where True indicates
75
+ the corresponding entry is the chosen response for its pair.
76
+ Example: [True, False, False, True, ...] means for prompt 0,
77
+ response 1 was chosen; for prompt 1, response 2 was chosen.
78
+ """
79
+ # print(f"---- [DEBUG] Inside compute_onlinedpo_pref ----")
80
+ if token_level_rewards.shape[0] % 2 != 0 or response_mask.shape[0] % 2 != 0:
81
+ raise ValueError(
82
+ f"Input tensor batch dimension must be even for pair comparison, got shapes: "
83
+ f"{token_level_rewards.shape}, {response_mask.shape}"
84
+ )
85
+ if token_level_rewards.shape != response_mask.shape:
86
+ raise ValueError(f"Shape mismatch between rewards {token_level_rewards.shape} and mask {response_mask.shape}")
87
+
88
+ # 1. Calculate Sequence Scores
89
+ scores = (token_level_rewards * response_mask).sum(dim=-1)
90
+ # print(f" Calculated sequence scores shape: {scores.shape}") # [batch_size * 2]
91
+
92
+ # 2. Reshape scores to group pairs: [batch_size, 2]
93
+ try:
94
+ score_pairs = scores.view(-1, 2)
95
+ except RuntimeError as e:
96
+ print(f"ERROR reshaping scores (shape {scores.shape}) into pairs: {e}")
97
+ raise e
98
+ print(f" Reshaped score pairs shape: {score_pairs.shape}") # [batch_size, 2]
99
+
100
+ # 3. Compare scores to find which index (0 or 1) is the winner within each pair
101
+ # winner_indices[i] = 0 if score_pairs[i, 0] >= score_pairs[i, 1] else 1
102
+ winner_indices = torch.argmax(score_pairs, dim=1) # 0 if first is max, 1 if second is max
103
+ # Handle ties explicitly if argmax behavior isn't guaranteed (usually picks first max)
104
+ # Alternatively: winner_mask_original = score_pairs[:, 0] >= score_pairs[:, 1]
105
+ # print(f" Winner indices shape: {winner_indices.shape}") # [batch_size]
106
+ # print(f" Number where Response 2 (index 1) is preferred: {winner_indices.sum().item()}") # Counts number of 1s
107
+
108
+ # 4. Create the final [batch_size * 2] mask
109
+ num_pairs = score_pairs.shape[0]
110
+ full_batch_size = num_pairs * 2
111
+ # Create indices for the full batch [0, 1, 2, 3, ..., N*2-1]
112
+ # full_indices = torch.arange(full_batch_size, device=scores.device)
113
+ # Create indices corresponding to the winner within each pair's original index
114
+ # E.g., if winner_indices is [0, 1, 0], pair_indices is [0, 1, 2]
115
+ # winner_global_indices = (pair_indices * 2) + winner_indices -> [ (0*2)+0, (1*2)+1, (2*2)+0 ] -> [0, 3, 4]
116
+ pair_indices = torch.arange(num_pairs, device=scores.device)
117
+ winner_global_indices = (pair_indices * 2) + winner_indices
118
+
119
+ # Create boolean mask - True at the winner's position
120
+ output_preference_mask = torch.zeros(full_batch_size, dtype=torch.bool, device=scores.device)
121
+ output_preference_mask[winner_global_indices] = True
122
+
123
+ # print(f" Output preference mask shape: {output_preference_mask.shape}") # Should be [batch_size * 2]
124
+ # print(f" Output mask True count (Chosen): {output_preference_mask.sum().item()}") # Should be batch_size
125
+ # print(f" Output mask False count (Rejected): {(~output_preference_mask).sum().item()}") # Should be batch_size
126
+ # print(f"---- [DEBUG] Exiting compute_onlinedpo_pref ----")
127
+
128
+ return output_preference_mask
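+ # Minimal usage sketch (made-up tensors; 2 prompts x 2 interleaved responses):
+ # rewards = torch.tensor([[1.0, 1.0], [1.0, 0.0], [0.0, 0.0], [2.0, 1.0]])
+ # mask = torch.ones_like(rewards)
+ # compute_onlinedpo_pref(rewards, mask)
+ # -> tensor([True, False, False, True]): response 1 wins for prompt 0
+ # (score 2 vs 1) and response 2 wins for prompt 1 (score 0 vs 3).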
129
+
130
+
131
+ def compute_online_dpo_loss(
132
+ policy_chosen_logps: torch.Tensor,
133
+ policy_rejected_logps: torch.Tensor,
134
+ reference_chosen_logps: torch.Tensor,
135
+ reference_rejected_logps: torch.Tensor,
136
+ beta: float,
137
+ label_smoothing: float = 0.0,
138
+ loss_type: str = "sigmoid",
139
+ reference_free: bool = False,
140
+ ) -> torch.Tensor:
141
+ import torch.nn.functional as F
142
+
143
+ pi_logratios = policy_chosen_logps - policy_rejected_logps
144
+ ref_logratios = reference_chosen_logps - reference_rejected_logps
145
+
146
+ if reference_free:
147
+ ref_logratios = torch.zeros_like(pi_logratios)
148
+
149
+ logits = pi_logratios - ref_logratios
150
+
151
+ if loss_type == "sigmoid":
152
+ losses = -F.logsigmoid(beta * logits) * (1 - label_smoothing) - F.logsigmoid(-beta * logits) * label_smoothing
153
+ elif loss_type == "ipo":
154
+ losses = (logits - 1 / (2 * beta)) ** 2
155
+ else:
156
+ raise ValueError(f"Unsupported loss_type: {loss_type}. Choose 'sigmoid', 'ipo', or 'hinge'.")
157
+
158
+ return losses.mean()
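+ # Numeric sketch (made-up log-probs): with beta=0.1,
+ # policy_chosen_logps=-10, policy_rejected_logps=-12,
+ # reference_chosen_logps=-11, reference_rejected_logps=-11,
+ # logits = (-10 + 12) - (-11 + 11) = 2.0, and the "sigmoid" loss is
+ # -logsigmoid(0.1 * 2.0) ≈ 0.598 (with label_smoothing=0).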
159
+
160
+
161
+ def get_batch_logps(
162
+ logits: torch.FloatTensor, labels: torch.LongTensor, average_log_prob: bool = False
163
+ ) -> torch.FloatTensor:
164
+ """
165
+ Compute the log probabilities of the given labels under the given logits.
166
+
167
+ Args:
168
+ logits: Logits of the model (e.g., huggingface CausalLMOutputs `logits`).
169
+ Shape: (batch_size, sequence_length, vocab_size)
170
+ labels: Labels for computing the sequence log probabilities. Shape: (batch_size, sequence_length)
171
+ average_log_prob: If True, return the average log probability per sequence. Otherwise, return the sum.
172
+
173
+ Returns:
174
+ A tensor of shape (batch_size,) containing the average/sum log probabilities of the given sequences.
175
+ """
176
+ if logits.shape[:-1] != labels.shape:
177
+ raise ValueError("Logits and labels must have the same shape[:-1]")
178
+
179
+ # Ensure labels are contiguous and on the same device as logits
180
+ labels = labels.contiguous().to(logits.device)
181
+ # Shift so that tokens < n predict n
182
+ shift_logits = logits[..., :-1, :].contiguous()
183
+ shift_labels = labels[..., 1:].contiguous()
184
+
185
+ # Calculate per token log probability
186
+ loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction="none")
187
+ per_token_logps = -loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
188
+ per_token_logps = per_token_logps.view(
189
+ shift_logits.size(0), shift_logits.size(1)
190
+ ) # Reshape back to (batch_size, seq_len-1)
191
+
192
+ # Create a mask for the labels that are not -100
193
+ loss_mask = shift_labels != -100
194
+
195
+ # Apply the mask to the per token log probabilities
196
+ masked_logps = per_token_logps * loss_mask
197
+
198
+ # Calculate the sum or average log probability per sequence
199
+ sequence_logps = masked_logps.sum(dim=-1)
200
+
201
+ if average_log_prob:
202
+ # Avoid division by zero for sequences with no valid tokens
203
+ num_valid_tokens = loss_mask.sum(dim=-1)
204
+ return sequence_logps / torch.clamp(num_valid_tokens, min=1)
205
+ else:
206
+ return sequence_logps
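+ # Shape note (illustrative): logits [B, T, V] from a causal LM together with
+ # labels [B, T] (prompt/padding positions set to -100) yield a [B] tensor of
+ # summed response log-probs, i.e. the chosen/rejected inputs expected by
+ # compute_online_dpo_loss above.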
ICL/DAPO/verl-recipe/spin/main_spin.py ADDED
@@ -0,0 +1,168 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023-2024 SGLang Team
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+
18
+ import hydra
19
+ import ray
20
+ from recipe.spin.spin_trainer import RaySPINTrainer
21
+ from recipe.spin.utils import validate_config
22
+
23
+ from verl.trainer.ppo.reward import get_custom_reward_fn
24
+ from verl.trainer.ppo.utils import need_reference_policy
25
+
26
+
27
+ @hydra.main(config_path="config", config_name="spin_trainer", version_base=None)
28
+ def main(config):
29
+ run_ppo(config)
30
+
31
+
32
+ def run_ppo(config) -> None:
33
+ # TODO(linjunrong.ocss884): this env var is a workaround for the SGLang conflict with Ray device
34
+ # isolation; will be resolved in the future
35
+ os.environ["ENSURE_CUDA_VISIBLE_DEVICES"] = os.environ.get("CUDA_VISIBLE_DEVICES", "")
36
+ if not ray.is_initialized():
37
+ # this is for local ray cluster
38
+ ray.init(
39
+ runtime_env={
40
+ "env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN", "VLLM_LOGGING_LEVEL": "WARN"}
41
+ }
42
+ )
43
+
44
+ runner = TaskRunner.remote()
45
+ ray.get(runner.run.remote(config))
46
+
47
+
48
+ @ray.remote(num_cpus=1) # please make sure main_task is not scheduled on head
49
+ class TaskRunner:
50
+ def run(self, config):
51
+ # print initial config
52
+ from pprint import pprint
53
+
54
+ from omegaconf import OmegaConf
55
+
56
+ from verl.utils.fs import copy_to_local
57
+
58
+ pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values
59
+ OmegaConf.resolve(config)
60
+
61
+ # define worker classes
62
+ if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}:
63
+ assert config.critic.strategy in {"fsdp", "fsdp2"}
64
+ # from recipe.spin.fsdp_workers import ActorRolloutRefWorker
65
+ from recipe.spin.fsdp_workers import SPINRolloutRefWorker
66
+
67
+ from verl.single_controller.ray import RayWorkerGroup
68
+
69
+ ray_worker_group_cls = RayWorkerGroup
70
+
71
+ elif config.actor_rollout_ref.actor.strategy == "megatron":
72
+ assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
73
+ from verl.single_controller.ray import RayWorkerGroup
74
+
75
+ ray_worker_group_cls = RayWorkerGroup
76
+
77
+ else:
78
+ raise NotImplementedError
79
+
80
+ from recipe.spin.spin_trainer import ResourcePoolManager, Role
81
+
82
+ role_worker_mapping = {
83
+ # Role.ActorRollout: ray.remote(ActorRolloutRefWorker),
84
+ Role.ActorRollout: ray.remote(SPINRolloutRefWorker),
85
+ # Role.Critic: ray.remote(CriticWorker),
86
+ }
87
+
88
+ global_pool_id = "global_pool"
89
+ resource_pool_spec = {
90
+ global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
91
+ }
92
+ mapping = {
93
+ Role.ActorRollout: global_pool_id,
94
+ # Role.Critic: global_pool_id,
95
+ }
96
+
97
+ if config.reward_model.enable:
98
+ if config.reward_model.strategy in {"fsdp", "fsdp2"}:
99
+ from recipe.spin.fsdp_workers import RewardModelWorker
100
+ elif config.reward_model.strategy == "megatron":
101
+ from verl.workers.megatron_workers import RewardModelWorker
102
+ else:
103
+ raise NotImplementedError
104
+ role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
105
+ mapping[Role.RewardModel] = global_pool_id
106
+
107
+ # use reference model
108
+ # if config.algorithm.use_kl_in_reward or config.actor_rollout_ref.actor.use_kl_loss:
109
+ # role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
110
+ role_worker_mapping[Role.RefPolicy] = ray.remote(SPINRolloutRefWorker)
111
+ mapping[Role.RefPolicy] = global_pool_id
112
+
113
+ # validate config
114
+ validate_config(
115
+ config=config,
116
+ use_reference_policy=need_reference_policy(role_worker_mapping),
117
+ use_critic=False,
118
+ )
119
+
120
+ # download the checkpoint from hdfs
121
+ local_path = copy_to_local(config.actor_rollout_ref.model.path)
122
+
123
+ # instantiate tokenizer
124
+ from verl.utils import hf_processor, hf_tokenizer
125
+
126
+ trust_remote_code = config.data.get("trust_remote_code", False)
127
+ tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
128
+ processor = hf_processor(local_path, use_fast=True) # used for multimodal LLMs; may be None
129
+
130
+ from verl.workers.reward_manager import get_reward_manager_cls
131
+
132
+ # Note(haibin.lin): please make sure custom reward managers are imported and
133
+ # registered via `verl.workers.reward_manager.register`
134
+ reward_manager_name = config.reward_model.get("reward_manager", "naive")
135
+ reward_manager_cls = get_reward_manager_cls(reward_manager_name)
136
+
137
+ compute_score = get_custom_reward_fn(config)
138
+ reward_kwargs = dict(config.reward_model.get("reward_kwargs", {}))
139
+ reward_fn = reward_manager_cls(
140
+ tokenizer=tokenizer,
141
+ num_examine=0,
142
+ compute_score=compute_score,
143
+ reward_fn_key=config.data.reward_fn_key,
144
+ **reward_kwargs,
145
+ )
146
+
147
+ # Note that we always use function-based RM for validation
148
+ val_reward_fn = reward_manager_cls(
149
+ tokenizer=tokenizer, num_examine=1, compute_score=compute_score, reward_fn_key=config.data.reward_fn_key
150
+ )
151
+ resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
152
+
153
+ trainer = RaySPINTrainer(
154
+ config=config,
155
+ tokenizer=tokenizer,
156
+ processor=processor,
157
+ role_worker_mapping=role_worker_mapping,
158
+ resource_pool_manager=resource_pool_manager,
159
+ ray_worker_group_cls=ray_worker_group_cls,
160
+ reward_fn=reward_fn,
161
+ val_reward_fn=val_reward_fn,
162
+ )
163
+ trainer.init_workers()
164
+ trainer.fit_dpo()
165
+
166
+
167
+ if __name__ == "__main__":
168
+ main()
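+ # Launch sketch (hypothetical paths and overrides; standard Hydra-style CLI
+ # for the @hydra.main entry point above):
+ #   python -m recipe.spin.main_spin \
+ #     data.train_files=/path/to/train.parquet \
+ #     data.val_files=/path/to/val.parquet \
+ #     actor_rollout_ref.model.path=/path/to/model \
+ #     trainer.n_gpus_per_node=8 trainer.nnodes=1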
ICL/DAPO/verl-recipe/spin/spin_trainer.py ADDED
@@ -0,0 +1,1312 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023-2024 SGLang Team
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ import traceback
18
+ import uuid
19
+ from collections import defaultdict
20
+ from contextlib import contextmanager
21
+ from dataclasses import dataclass, field
22
+ from pprint import pprint
23
+ from typing import Any, Optional
24
+
25
+ import numpy as np
26
+ import ray
27
+ import torch
28
+ from codetiming import Timer
29
+ from omegaconf import OmegaConf, open_dict
30
+ from recipe.spin import core_algos
31
+ from torch.utils.data import Dataset, Sampler
32
+ from torchdata.stateful_dataloader import StatefulDataLoader
33
+ from tqdm import tqdm
34
+
35
+ from verl import DataProto
36
+ from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
37
+ from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
38
+ from verl.single_controller.ray.base import create_colocated_worker_cls
39
+ from verl.trainer.ppo.metric_utils import compute_throughout_metrics, compute_timing_metrics, process_validation_metrics
40
+ from verl.trainer.ppo.utils import Role, WorkerType, need_reference_policy, need_reward_model
41
+ from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path
42
+ from verl.utils.metric import reduce_metrics
43
+ from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance
44
+ from verl.utils.torch_functional import masked_mean
45
+ from verl.utils.tracking import ValidationGenerationsLogger
46
+
47
+
48
+ @dataclass
49
+ class ResourcePoolManager:
50
+ """
51
+ Define a resource pool specification. Resource pool will be initialized first.
52
+ Mapping
53
+ """
54
+
55
+ resource_pool_spec: dict[str, list[int]]
56
+ mapping: dict[Role, str]
57
+ resource_pool_dict: dict[str, RayResourcePool] = field(default_factory=dict)
58
+
59
+ def create_resource_pool(self):
60
+ for resource_pool_name, process_on_nodes in self.resource_pool_spec.items():
61
+ # max_colocate_count means the number of WorkerGroups (i.e. processes) in each RayResourcePool
62
+ # For the FSDP backend, we recommend max_colocate_count=1, which merges all WorkerGroups into one.
63
+ # For the Megatron backend, we recommend max_colocate_count>1, which can use a different
64
+ # WorkerGroup for each model.
65
+ resource_pool = RayResourcePool(
66
+ process_on_nodes=process_on_nodes, use_gpu=True, max_colocate_count=1, name_prefix=resource_pool_name
67
+ )
68
+ self.resource_pool_dict[resource_pool_name] = resource_pool
69
+
70
+ self._check_resource_available()
71
+
72
+ def get_resource_pool(self, role: Role) -> RayResourcePool:
73
+ """Get the resource pool of the worker_cls"""
74
+ return self.resource_pool_dict[self.mapping[role]]
75
+
76
+ def get_n_gpus(self) -> int:
77
+ """Get the number of gpus in this cluster."""
78
+ return sum([n_gpus for process_on_nodes in self.resource_pool_spec.values() for n_gpus in process_on_nodes])
79
+
80
+ def _check_resource_available(self):
81
+ """Check if the resource pool can be satisfied in this ray cluster."""
82
+ node_available_resources = ray._private.state.available_resources_per_node()
83
+ node_available_gpus = {node: node_info.get("GPU", 0) for node, node_info in node_available_resources.items()}
84
+
85
+ # check total required gpus can be satisfied
86
+ total_available_gpus = sum(node_available_gpus.values())
87
+ total_required_gpus = sum(
88
+ [n_gpus for process_on_nodes in self.resource_pool_spec.values() for n_gpus in process_on_nodes]
89
+ )
90
+ if total_available_gpus < total_required_gpus:
91
+ raise ValueError(
92
+ f"Total available GPUs {total_available_gpus} is less than total desired GPUs {total_required_gpus}"
93
+ )
94
+
95
+ # check each resource pool can be satisfied, O(#resource_pools * #nodes)
96
+ for resource_pool_name, process_on_nodes in self.resource_pool_spec.items():
97
+ num_gpus, num_nodes = process_on_nodes[0], len(process_on_nodes)
98
+ for node, available_gpus in node_available_gpus.items():
99
+ if available_gpus >= num_gpus:
100
+ node_available_gpus[node] -= num_gpus
101
+ num_nodes -= 1
102
+ if num_nodes == 0:
103
+ break
104
+ if num_nodes > 0:
105
+ raise ValueError(
106
+ f"Resource pool {resource_pool_name}: {num_gpus}*{num_nodes} cannot be satisfied in this "
107
+ f"ray cluster"
108
+ )
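+ # Usage sketch (illustrative names, not the recipe's actual config): one pool
+ # of two 8-GPU nodes shared by all roles:
+ # spec = {"global_pool": [8, 8]}
+ # mapping = {Role.ActorRollout: "global_pool", Role.RefPolicy: "global_pool"}
+ # mgr = ResourcePoolManager(resource_pool_spec=spec, mapping=mapping)
+ # mgr.create_resource_pool()
+ # pool = mgr.get_resource_pool(Role.ActorRollout)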
109
+
110
+
111
+ def _compute_response_info(batch: DataProto) -> dict[str, Any]:
112
+ """Placeholder: Computes prompt and response lengths."""
113
+ try:
114
+ # Assuming 'prompts' and 'responses' keys exist after generation/union
115
+ prompt_len = batch.batch["prompts"].shape[1]
116
+ resp_len = batch.batch["responses"].shape[1]
117
+ # This is simplified - real implementation might use attention masks
118
+ # to get actual lengths per sample.
119
+ batch_size = batch.batch.batch_size[0]
120
+ prompt_lengths_tensor = torch.full((batch_size,), prompt_len, dtype=torch.float32, device=batch.batch.device)
121
+ response_lengths_tensor = torch.full((batch_size,), resp_len, dtype=torch.float32, device=batch.batch.device)
122
+
123
+ # Try getting actual lengths from attention mask if possible (more accurate)
124
+ if "response_mask" in batch.batch:
125
+ response_lengths_tensor = batch.batch["response_mask"].sum(dim=1).float()
126
+ # if "attention_mask" in batch.batch and "response_mask" in batch.batch:
127
+ # full_mask = batch.batch["attention_mask"]
128
+ # resp_mask = batch.batch["response_mask"]
129
+ # Infer prompt mask length based on where response mask starts or total length
130
+ # This logic depends heavily on how your masks are constructed.
131
+ # Example: prompt_lengths_tensor = full_mask.sum(dim=1).float() - response_lengths_tensor
132
+ # Fallback to using prompt shape if mask logic is complex:
133
+ prompt_lengths_tensor = torch.tensor(
134
+ [batch.batch["prompts"].shape[1]] * batch_size, dtype=torch.float32, device=batch.batch.device
135
+ )
136
+
137
+ return {
138
+ "prompt_length": prompt_lengths_tensor,
139
+ "response_length": response_lengths_tensor,
140
+ "max_response_length": resp_len,
141
+ "max_prompt_length": prompt_len, # Or from config if fixed padding
142
+ }
143
+ except KeyError as e:
144
+ print(f"Warning: Missing key in _compute_response_info: {e}. Returning defaults.")
145
+ # Return default/dummy values if keys are missing
146
+ b_size = batch.batch.batch_size[0] if batch.batch.batch_size else 1
147
+ max_resp = batch.batch.get("responses").shape[1] if batch.batch.get("responses") is not None else 0
148
+ max_prompt = batch.batch.get("prompts").shape[1] if batch.batch.get("prompts") is not None else 0
149
+ return {
150
+ "prompt_length": torch.zeros(b_size),
151
+ "response_length": torch.zeros(b_size),
152
+ "max_response_length": max_resp,
153
+ "max_prompt_length": max_prompt,
154
+ }
155
+
156
+
157
+ # --- Modified Metric Function ---
158
+ def compute_dpo_data_metrics(batch: DataProto) -> dict[str, Any]:
159
+ """
160
+ Computes and returns metrics relevant for the DPO-like process.
161
+ Assumes 'batch' contains results after generation and preference marking,
162
+ potentially including 'dpo_logits', 'preferences', 'chosen_logps', etc.
163
+ Removes PPO-specific advantage/return/critic metrics.
164
+ """
165
+ print("---- [DEBUG] Computing DPO Data Metrics ----")
166
+ metrics = {}
167
+ try:
168
+ # --- Scores and Rewards (from reward_fn) ---
169
+ if "token_level_scores" in batch.batch and batch.batch["token_level_scores"] is not None:
170
+ sequence_score = batch.batch["token_level_scores"].sum(-1)
171
+ metrics.update(
172
+ {
173
+ "reward/score/mean": torch.mean(sequence_score).item(),
174
+ "reward/score/max": torch.max(sequence_score).item(),
175
+ "reward/score/min": torch.min(sequence_score).item(),
176
+ }
177
+ )
178
+ else:
179
+ print("DEBUG compute_dpo_data_metrics: 'token_level_scores' not found.")
180
+
181
+ if "token_level_rewards" in batch.batch and batch.batch["token_level_rewards"] is not None:
182
+ sequence_reward = batch.batch["token_level_rewards"].sum(-1)
183
+ metrics.update(
184
+ {
185
+ "reward/rewards/mean": torch.mean(sequence_reward).item(),
186
+ "reward/rewards/max": torch.max(sequence_reward).item(),
187
+ "reward/rewards/min": torch.min(sequence_reward).item(),
188
+ }
189
+ )
190
+ else:
191
+ print("DEBUG compute_dpo_data_metrics: 'token_level_rewards' not found.")
192
+
193
+ # --- DPO Specific Metrics (if stored previously) ---
194
+ if "dpo_logits" in batch.batch and batch.batch["dpo_logits"] is not None:
195
+ metrics["actor/dpo_logits"] = batch.batch["dpo_logits"].mean().item()
196
+ else:
197
+ print("DEBUG compute_dpo_data_metrics: 'dpo_logits' not found.")
198
+
199
+ if "chosen_logps" in batch.batch and batch.batch["chosen_logps"] is not None:
200
+ metrics["actor/chosen_logps"] = batch.batch["chosen_logps"].mean().item()
201
+ else:
202
+ print("DEBUG compute_dpo_data_metrics: 'chosen_logps' not found.")
203
+
204
+ if "rejected_logps" in batch.batch and batch.batch["rejected_logps"] is not None:
205
+ metrics["actor/rejected_logps"] = batch.batch["rejected_logps"].mean().item()
206
+ else:
207
+ print("DEBUG compute_dpo_data_metrics: 'rejected_logps' not found.")
208
+
209
+ # Add metrics based on the 'preferences' mask if available
210
+ # if "preferences" in batch.batch and batch.batch["preferences"] is not None:
211
+ # prefs_mask = batch.batch["preferences"] # Shape [batch_size * n]
212
+ # Calculate accuracy based on RM scores (assuming higher score -> True in mask)
213
+ # Requires chosen/rejected scores to be available or recalculated
214
+ # This is complex here, better calculated in the main loop or update function
215
+
216
+ # --- Length Metrics ---
217
+ response_info = _compute_response_info(batch)
218
+ prompt_length = response_info["prompt_length"]
219
+ response_length = response_info["response_length"]
220
+ max_response_length = response_info["max_response_length"]
221
+ max_prompt_length = response_info["max_prompt_length"] # Use calculated or from config
222
+
223
+ metrics.update(
224
+ {
225
+ "response_length/mean": torch.mean(response_length).item(),
226
+ "response_length/max": torch.max(response_length).item(),
227
+ "response_length/min": torch.min(response_length).item(),
228
+ "response_length/clip_ratio": torch.mean(torch.eq(response_length, max_response_length).float()).item(),
229
+ "prompt_length/mean": torch.mean(prompt_length).item(),
230
+ "prompt_length/max": torch.max(prompt_length).item(),
231
+ "prompt_length/min": torch.min(prompt_length).item(),
232
+ # Prompt clip ratio might need adjustment based on how max_prompt_length is defined
233
+ "prompt_length/clip_ratio": torch.mean(torch.eq(prompt_length, max_prompt_length).float()).item(),
234
+ }
235
+ )
236
+
237
+ except KeyError as e:
238
+ print(f"ERROR in compute_dpo_data_metrics: Missing key {e}")
239
+ except Exception as e:
240
+ print(f"ERROR in compute_dpo_data_metrics: {e}")
241
+ traceback.print_exc()
242
+
243
+ print(f"---- [DEBUG] Calculated DPO Data Metrics: {list(metrics.keys())} ----")
244
+ return metrics
245
+
246
+
247
+ def apply_kl_penalty(data: DataProto, kl_ctrl: core_algos.AdaptiveKLController, kl_penalty="kl"):
248
+ responses = data.batch["responses"]
249
+ response_length = responses.size(1)
250
+ token_level_scores = data.batch["token_level_scores"]
251
+ batch_size = data.batch.batch_size[0]
252
+ attention_mask = data.batch["attention_mask"]
253
+ response_mask = attention_mask[:, -response_length:]
254
+
255
+ # compute kl between ref_policy and current policy
256
+ # When apply_kl_penalty, algorithm.use_kl_in_reward=True, so the reference model has been enabled.
257
+ kld = core_algos.kl_penalty(
258
+ data.batch["old_log_probs"], data.batch["ref_log_prob"], kl_penalty=kl_penalty
259
+ ) # (batch_size, response_length)
260
+ kld = kld * response_mask
261
+ beta = kl_ctrl.value
262
+
263
+ token_level_rewards = token_level_scores - beta * kld
264
+
265
+ current_kl = masked_mean(kld, mask=response_mask, axis=-1) # average over sequence
266
+ current_kl = torch.mean(current_kl, dim=0).item()
267
+
268
+ # according to https://github.com/huggingface/trl/blob/951ca1841f29114b969b57b26c7d3e80a39f75a0/trl/trainer/ppo_trainer.py#L837
269
+ kl_ctrl.update(current_kl=current_kl, n_steps=batch_size)
270
+ data.batch["token_level_rewards"] = token_level_rewards
271
+
272
+ metrics = {"actor/reward_kl_penalty": current_kl, "actor/reward_kl_penalty_coeff": beta}
273
+
274
+ return data, metrics
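+ # Math note: per response token t, reward_t = score_t - beta * KL_t, with
+ # KL_t masked to response positions; e.g. beta=0.05 and KL_t=0.4 subtracts
+ # 0.02 from that token's score before preference computation.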
275
+
276
+
277
+ def compute_response_mask(data: DataProto):
278
+ responses = data.batch["responses"]
279
+ response_length = responses.size(1)
280
+ attention_mask = data.batch["attention_mask"]
281
+ return attention_mask[:, -response_length:]
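+ # e.g. with attention_mask of shape [B, prompt_len + response_len], this
+ # returns the last response_len columns, which mark the valid (non-padding)
+ # response tokens.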
282
+
283
+
284
+ def compute_onlineDPO_pref(data: DataProto):
285
+ """
286
+ Wrapper to compute DPO preference and add it to the DataProto batch.
287
+ Includes debugging prints.
288
+ """
289
+ # print(f"\n---- [DEBUG] Entering compute_onlineDPO_pref ----")
290
+ # print(f" Input batch keys: {list(data.batch.keys())}")
291
+
292
+ # Check inputs
293
+ rewards_tensor = data.batch.get("token_level_rewards")
294
+ mask_tensor = data.batch.get("response_mask")
295
+
296
+ if rewards_tensor is None or mask_tensor is None:
297
+ print(" ERROR: Missing 'token_level_rewards' or 'response_mask' in input data!")
298
+ # Handle error case - maybe return original data or raise?
299
+ # Returning original data for now to potentially allow skipping
300
+ return data
301
+
302
+ try:
303
+ preferences = core_algos.compute_onlinedpo_pref(token_level_rewards=rewards_tensor, response_mask=mask_tensor)
304
+ # Store the result
305
+ data.batch["preferences"] = preferences
306
+
307
+ except AttributeError:
308
+ print("ERROR: Function 'compute_online_dpo_preference' not found in core_algos.py!")
309
+ # Assign dummy value or raise error
310
+ data.batch["preferences"] = None # Indicate failure
311
+ except Exception as e_pref:
312
+ print(f"ERROR during core_algos.compute_online_dpo_preference: {e_pref}")
313
+ import traceback
314
+
315
+ traceback.print_exc()
316
+ data.batch["preferences"] = None # Indicate failure
317
+
318
+ # print(f"---- [DEBUG] Exiting compute_onlineDPO_pref ----")
319
+ return data
320
+
321
+
322
+ @contextmanager
323
+ def _timer(name: str, timing_raw: dict[str, float]):
324
+ with Timer(name=name, logger=None) as timer:
325
+ yield
326
+ timing_raw[name] = timer.last
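+ # Usage sketch:
+ # timing_raw = {}
+ # with _timer("gen", timing_raw):
+ #     ...  # timed region
+ # timing_raw["gen"] now holds the elapsed seconds (codetiming Timer.last).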
327
+
328
+
329
+ class RaySPINTrainer:
330
+ """
331
+ Note that this trainer runs on the driver process on a single CPU/GPU node.
332
+ """
333
+
334
+ # TODO: support each role have individual ray_worker_group_cls,
335
+ # i.e., support different backend of different role
336
+ def __init__(
337
+ self,
338
+ config,
339
+ tokenizer,
340
+ role_worker_mapping: dict[Role, WorkerType],
341
+ resource_pool_manager: ResourcePoolManager,
342
+ ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
343
+ processor=None,
344
+ reward_fn=None,
345
+ val_reward_fn=None,
346
+ train_dataset: Optional[Dataset] = None,
347
+ val_dataset: Optional[Dataset] = None,
348
+ collate_fn=None,
349
+ train_sampler: Optional[Sampler] = None,
350
+ device_name=None,
351
+ ):
352
+ # assert get_torch_device().is_available(), 'cuda must be available on driver'
353
+
354
+ self.tokenizer = tokenizer
355
+ self.processor = processor
356
+ self.config = config
357
+ self.reward_fn = reward_fn
358
+ self.val_reward_fn = val_reward_fn
359
+
360
+ self.hybrid_engine = config.actor_rollout_ref.hybrid_engine
361
+ assert self.hybrid_engine, "Currently, only the hybrid engine is supported"
362
+
363
+ if self.hybrid_engine:
364
+ assert Role.ActorRollout in role_worker_mapping, f"{role_worker_mapping.keys()=}"
365
+
366
+ self.role_worker_mapping = role_worker_mapping
367
+ self.resource_pool_manager = resource_pool_manager
368
+ self.use_reference_policy = need_reference_policy(role_worker_mapping)
369
+ self.use_rm = need_reward_model(role_worker_mapping)
370
+ self.use_critic = False
371
+ self.ray_worker_group_cls = ray_worker_group_cls
372
+ self.validation_generations_logger = ValidationGenerationsLogger()
373
+ self.async_rollout_mode = False
374
+ self.device_name = device_name if device_name else self.config.trainer.device
375
+
376
+ # define in-reward KL control
377
+ # KL loss control is currently not supported
378
+ if config.algorithm.use_kl_in_reward:
379
+ self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl)
380
+
381
+ self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler)
382
+
383
+ def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler):
384
+ """
385
+ Creates the train and validation dataloaders.
386
+ """
387
+ # TODO: we have to make sure the batch size is divisible by the dp size
388
+ from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler
389
+
390
+ if train_dataset is None:
391
+ train_dataset = create_rl_dataset(
392
+ self.config.data.train_files,
393
+ self.config.data,
394
+ self.tokenizer,
395
+ self.processor,
396
+ max_samples=self.config.data.get("train_max_samples", -1),
397
+ )
398
+ if val_dataset is None:
399
+ val_dataset = create_rl_dataset(
400
+ self.config.data.val_files,
401
+ self.config.data,
402
+ self.tokenizer,
403
+ self.processor,
404
+ max_samples=self.config.data.get("val_max_samples", -1),
405
+ )
406
+ self.train_dataset, self.val_dataset = train_dataset, val_dataset
407
+
408
+ if train_sampler is None:
409
+ train_sampler = create_rl_sampler(self.config.data, self.train_dataset)
410
+ if collate_fn is None:
411
+ from verl.utils.dataset.rl_dataset import collate_fn as default_collate_fn
412
+
413
+ collate_fn = default_collate_fn
414
+
415
+ self.train_dataloader = StatefulDataLoader(
416
+ dataset=self.train_dataset,
417
+ batch_size=self.config.data.get("gen_batch_size", self.config.data.train_batch_size),
418
+ num_workers=self.config.data.get("dataloader_num_workers", 8),
419
+ drop_last=True,
420
+ collate_fn=collate_fn,
421
+ sampler=train_sampler,
422
+ )
423
+
424
+ val_batch_size = self.config.data.val_batch_size # Prefer config value if set
425
+ if val_batch_size is None:
426
+ val_batch_size = len(self.val_dataset)
427
+
428
+ self.val_dataloader = StatefulDataLoader(
429
+ dataset=self.val_dataset,
430
+ batch_size=val_batch_size,
431
+ num_workers=self.config.data.get("dataloader_num_workers", 8),
432
+ shuffle=False,
433
+ drop_last=False,
434
+ collate_fn=collate_fn,
435
+ )
436
+
437
+ assert len(self.train_dataloader) >= 1, "Train dataloader is empty!"
438
+ assert len(self.val_dataloader) >= 1, "Validation dataloader is empty!"
439
+
440
+ print(
441
+ f"Size of train dataloader: {len(self.train_dataloader)}, "
442
+ f"Size of val dataloader: {len(self.val_dataloader)}"
443
+ )
444
+
445
+ total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
446
+
447
+ if self.config.trainer.total_training_steps is not None:
448
+ total_training_steps = self.config.trainer.total_training_steps
449
+
450
+ self.total_training_steps = total_training_steps
451
+ print(f"Total training steps: {self.total_training_steps}")
452
+
453
+ try:
454
+ OmegaConf.set_struct(self.config, True)
455
+ with open_dict(self.config):
456
+ if OmegaConf.select(self.config, "actor_rollout_ref.actor.optim"):
457
+ self.config.actor_rollout_ref.actor.optim.total_training_steps = total_training_steps
458
+ if OmegaConf.select(self.config, "critic.optim"):
459
+ self.config.critic.optim.total_training_steps = total_training_steps
460
+ except Exception as e:
461
+ print(f"Warning: Could not set total_training_steps in config. Structure missing? Error: {e}")
462
+
463
+ def _maybe_log_val_generations(self, inputs, outputs, scores):
464
+ """Log a table of validation samples to the configured logger (wandb or swanlab)"""
465
+
466
+ generations_to_log = self.config.trainer.log_val_generations
467
+
468
+ if generations_to_log == 0:
469
+ return
470
+
471
+ import numpy as np
472
+
473
+ # Create tuples of (input, output, score) and sort by input text
474
+ samples = list(zip(inputs, outputs, scores, strict=True))
475
+ samples.sort(key=lambda x: x[0]) # Sort by input text
476
+
477
+ # Use fixed random seed for deterministic shuffling
478
+ rng = np.random.RandomState(42)
479
+ rng.shuffle(samples)
480
+
481
+ # Take first N samples after shuffling
482
+ samples = samples[:generations_to_log]
483
+
484
+ # Log to each configured logger
485
+ self.validation_generations_logger.log(self.config.trainer.logger, samples, self.global_steps)
486
+
487
+ def _validate(self):
488
+ data_source_lst = []
489
+ reward_extra_infos_dict: dict[str, list] = defaultdict(list)
490
+
491
+ # Lists to collect samples for the table
492
+ sample_inputs = []
493
+ sample_outputs = []
494
+ sample_scores = []
495
+
496
+ for test_data in self.val_dataloader:
497
+ test_batch = DataProto.from_single_dict(test_data)
498
+
499
+ # repeat test batch
500
+ test_batch = test_batch.repeat(
501
+ repeat_times=self.config.actor_rollout_ref.rollout.val_kwargs.n, interleave=True
502
+ )
503
+
504
+ # we only do validation on rule-based rm
505
+ if self.config.reward_model.enable and test_batch[0].non_tensor_batch["reward_model"]["style"] == "model":
506
+ return {}
507
+
508
+ # Store original inputs
509
+ input_ids = test_batch.batch["input_ids"]
510
+ # TODO: Can we keep special tokens except for padding tokens?
511
+ input_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
512
+ sample_inputs.extend(input_texts)
513
+
514
+ batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
515
+ non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
516
+ if "multi_modal_inputs" in test_batch.non_tensor_batch:
517
+ non_tensor_batch_keys_to_pop.extend(["multi_modal_data", "multi_modal_inputs"])
518
+ if "raw_prompt" in test_batch.non_tensor_batch:
519
+ non_tensor_batch_keys_to_pop.append("raw_prompt")
520
+ if "tools_kwargs" in test_batch.non_tensor_batch:
521
+ non_tensor_batch_keys_to_pop.append("tools_kwargs")
522
+ test_gen_batch = test_batch.pop(
523
+ batch_keys=batch_keys_to_pop,
524
+ non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
525
+ )
526
+
527
+ test_gen_batch.meta_info = {
528
+ "eos_token_id": self.tokenizer.eos_token_id,
529
+ "pad_token_id": self.tokenizer.pad_token_id,
530
+ "recompute_log_prob": False,
531
+ "do_sample": self.config.actor_rollout_ref.rollout.val_kwargs.do_sample,
532
+ "validate": True,
533
+ }
534
+ print(f"test_gen_batch meta info: {test_gen_batch.meta_info}")
535
+
536
+ # pad to be divisible by dp_size
537
+ test_gen_batch_padded, pad_size = pad_dataproto_to_divisor(test_gen_batch, self.actor_rollout_wg.world_size)
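+ # e.g. with world_size=8 and 30 prompts, 2 padding rows are appended (pad_size=2)
+ # so every dp rank receives an equal slice; unpad_dataproto strips them again below.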
538
+ if not self.async_rollout_mode:
539
+ test_output_gen_batch_padded = self.actor_rollout_wg.generate_sequences(test_gen_batch_padded)
540
+ else:
541
+ test_output_gen_batch_padded = self.async_rollout_manager.generate_sequences(test_gen_batch_padded)
542
+
543
+ # unpad
544
+ test_output_gen_batch = unpad_dataproto(test_output_gen_batch_padded, pad_size=pad_size)
545
+ print("validation generation end")
546
+
547
+ # Store generated outputs
548
+ output_ids = test_output_gen_batch.batch["responses"]
549
+ output_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
550
+ sample_outputs.extend(output_texts)
551
+
552
+ test_batch = test_batch.union(test_output_gen_batch)
553
+
554
+ # evaluate using reward_function
555
+ result = self.val_reward_fn(test_batch, return_dict=True)
556
+ reward_tensor = result["reward_tensor"]
557
+ scores = reward_tensor.sum(-1).cpu().tolist()
558
+ sample_scores.extend(scores)
559
+
560
+ reward_extra_infos_dict["reward"].extend(scores)
561
+ if "reward_extra_info" in result:
562
+ for key, lst in result["reward_extra_info"].items():
563
+ reward_extra_infos_dict[key].extend(lst)
564
+
565
+ data_source_lst.append(test_batch.non_tensor_batch.get("data_source", ["unknown"] * reward_tensor.shape[0]))
566
+
567
+ self._maybe_log_val_generations(inputs=sample_inputs, outputs=sample_outputs, scores=sample_scores)
568
+
569
+ # dump generations
570
+ val_data_dir = self.config.trainer.get("validation_data_dir", None)
571
+ if val_data_dir:
572
+ sample_gts = [
573
+ item.non_tensor_batch.get("reward_model", {}).get("ground_truth", None) for item in test_batch
574
+ ]
575
+ self._dump_generations(
576
+ inputs=sample_inputs,
577
+ outputs=sample_outputs,
578
+ gts=sample_gts,
579
+ scores=sample_scores,
580
+ reward_extra_infos_dict=reward_extra_infos_dict,
581
+ dump_path=val_data_dir,
582
+ )
583
+
584
+ for key_info, lst in reward_extra_infos_dict.items():
585
+ assert len(lst) == 0 or len(lst) == len(sample_scores), f"{key_info}: {len(lst)=}, {len(sample_scores)=}"
586
+
587
+ data_sources = np.concatenate(data_source_lst, axis=0)
588
+ print(f"DEBUG: Data sources shape: {data_sources.shape}") # Added Print
589
+ print(f"DEBUG: reward_extra_infos_dict keys before processing: {reward_extra_infos_dict.keys()}") # Added Print
590
+
591
+ data_src2var2metric2val = process_validation_metrics(data_sources, sample_inputs, reward_extra_infos_dict)
592
+ print(
593
+ f"DEBUG: Output of process_validation_metrics (data_src2var2metric2val): {data_src2var2metric2val}"
594
+ )
595
+ metric_dict = {}
596
+ for data_source, var2metric2val in data_src2var2metric2val.items():
597
+ core_var = "acc" if "acc" in var2metric2val else "reward"
598
+ for var_name, metric2val in var2metric2val.items():
599
+ n_max = max([int(name.split("@")[-1].split("/")[0]) for name in metric2val.keys()])
600
+ for metric_name, metric_val in metric2val.items():
601
+ if (
602
+ (var_name == core_var)
603
+ and any(metric_name.startswith(pfx) for pfx in ["mean", "maj", "best"])
604
+ and (f"@{n_max}" in metric_name)
605
+ ):
606
+ metric_sec = "val-core"
607
+ else:
608
+ metric_sec = "val-aux"
609
+ pfx = f"{metric_sec}/{data_source}/{var_name}/{metric_name}"
610
+ metric_dict[pfx] = metric_val
611
+
612
+ return metric_dict
613
+
614
+ def init_workers(self):
615
+ """Init resource pool and worker group"""
616
+ self.resource_pool_manager.create_resource_pool()
617
+
618
+ self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
619
+
620
+ # create actor and rollout
621
+ if self.hybrid_engine:
622
+ resource_pool = self.resource_pool_manager.get_resource_pool(Role.ActorRollout)
623
+ actor_rollout_cls = RayClassWithInitArgs(
624
+ cls=self.role_worker_mapping[Role.ActorRollout],
625
+ config=self.config.actor_rollout_ref,
626
+ role="actor_rollout",
627
+ )
628
+ self.resource_pool_to_cls[resource_pool]["actor_rollout"] = actor_rollout_cls
629
+ else:
630
+ raise NotImplementedError
631
+
632
+ # create critic
633
+ if self.use_critic:
634
+ resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
635
+ critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic)
636
+ self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls
637
+
638
+ # create reference policy if needed
639
+ if self.use_reference_policy:
640
+ resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
641
+ ref_policy_cls = RayClassWithInitArgs(
642
+ self.role_worker_mapping[Role.RefPolicy], config=self.config.actor_rollout_ref, role="ref"
643
+ )
644
+ self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls
645
+
646
+ # create a reward model if reward_fn is None
647
+ if self.use_rm:
648
+ # we create a RM here
649
+ resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
650
+ rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model)
651
+ self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls
652
+
653
+ # initialize WorkerGroup
654
+ # NOTE: if you want to use a different resource pool for each role, which can support different
655
+ # parallel size,
656
+ # you should not use `create_colocated_worker_cls`. Instead, directly pass different resource pool to
657
+ # different worker groups.
658
+ # See https://github.com/volcengine/verl/blob/master/examples/ray/tutorial.ipynb for more information.
659
+ all_wg = {}
660
+ self.wg_dicts = []
661
+ wg_kwargs = {} # Setting up kwargs for RayWorkerGroup
662
+ if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
663
+ wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
664
+ wg_kwargs["device_name"] = self.device_name
665
+
666
+ for resource_pool, class_dict in self.resource_pool_to_cls.items():
667
+ worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
668
+ wg_dict = self.ray_worker_group_cls(
669
+ resource_pool=resource_pool,
670
+ ray_cls_with_init=worker_dict_cls,
671
+ **wg_kwargs,
672
+ )
673
+ spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
674
+ all_wg.update(spawn_wg)
675
+ # keep the reference of WorkerDict to support ray >= 2.31. Ref: https://github.com/ray-project/ray/pull/45699
676
+ self.wg_dicts.append(wg_dict)
677
+
678
+ if self.use_critic:
679
+ self.critic_wg = all_wg["critic"]
680
+ self.critic_wg.init_model()
681
+
682
+ if self.use_reference_policy:
683
+ self.ref_policy_wg = all_wg["ref"]
684
+ self.ref_policy_wg.init_model()
685
+
686
+ if self.use_rm:
687
+ self.rm_wg = all_wg["rm"]
688
+ self.rm_wg.init_model()
689
+
690
+ # we should create rollout at the end so that vllm can have a better estimation of kv cache memory
691
+ self.actor_rollout_wg = all_wg["actor_rollout"]
692
+ self.actor_rollout_wg.init_model()
693
+
694
+ def _save_checkpoint(self):
695
+ # path: given_path + `/global_step_{global_steps}` + `/actor`
696
+ local_global_step_folder = os.path.join(
697
+ self.config.trainer.default_local_dir, f"global_step_{self.global_steps}"
698
+ )
699
+
700
+ print(f"local_global_step_folder: {local_global_step_folder}")
701
+ actor_local_path = os.path.join(local_global_step_folder, "actor")
702
+
703
+ actor_remote_path = (
704
+ None
705
+ if self.config.trainer.default_hdfs_dir is None
706
+ else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "actor")
707
+ )
708
+
709
+ remove_previous_ckpt_in_save = self.config.trainer.get("remove_previous_ckpt_in_save", False)
710
+ if remove_previous_ckpt_in_save:
711
+ print(
712
+ "Warning: remove_previous_ckpt_in_save is deprecated, set max_actor_ckpt_to_keep=1 and "
713
+ "max_critic_ckpt_to_keep=1 instead"
714
+ )
715
+ max_actor_ckpt_to_keep = (
716
+ self.config.trainer.get("max_actor_ckpt_to_keep", None) if not remove_previous_ckpt_in_save else 1
717
+ )
718
+ max_critic_ckpt_to_keep = (
719
+ self.config.trainer.get("max_critic_ckpt_to_keep", None) if not remove_previous_ckpt_in_save else 1
720
+ )
721
+
722
+ self.actor_rollout_wg.save_checkpoint(
723
+ actor_local_path, actor_remote_path, self.global_steps, max_ckpt_to_keep=max_actor_ckpt_to_keep
724
+ )
725
+
726
+ if self.use_critic:
727
+ critic_local_path = os.path.join(local_global_step_folder, "critic")
728
+ critic_remote_path = (
729
+ None
730
+ if self.config.trainer.default_hdfs_dir is None
731
+ else os.path.join(self.config.trainer.default_hdfs_dir, f"global_step_{self.global_steps}", "critic")
732
+ )
733
+ self.critic_wg.save_checkpoint(
734
+ critic_local_path, critic_remote_path, self.global_steps, max_ckpt_to_keep=max_critic_ckpt_to_keep
735
+ )
736
+
737
+ # save dataloader
738
+ dataloader_local_path = os.path.join(local_global_step_folder, "data.pt")
739
+ dataloader_state_dict = self.train_dataloader.state_dict()
740
+ torch.save(dataloader_state_dict, dataloader_local_path)
741
+
742
+ # latest checkpointed iteration tracker (for atomic usage)
743
+ local_latest_checkpointed_iteration = os.path.join(
744
+ self.config.trainer.default_local_dir, "latest_checkpointed_iteration.txt"
745
+ )
746
+ with open(local_latest_checkpointed_iteration, "w") as f:
747
+ f.write(str(self.global_steps))
748
+
749
+ def _load_checkpoint(self):
750
+ if self.config.trainer.resume_mode == "disable":
751
+ return 0
752
+
753
+ # load from hdfs
754
+ if self.config.trainer.default_hdfs_dir is not None:
755
+ raise NotImplementedError("load from hdfs is not implemented yet")
756
+ else:
757
+ checkpoint_folder = self.config.trainer.default_local_dir # TODO: check path
758
+ if not os.path.isabs(checkpoint_folder):
759
+ working_dir = os.getcwd()
760
+ checkpoint_folder = os.path.join(working_dir, checkpoint_folder)
761
+ global_step_folder = find_latest_ckpt_path(checkpoint_folder) # None if no latest
762
+
763
+ # find global_step_folder
764
+ if self.config.trainer.resume_mode == "auto":
765
+ if global_step_folder is None:
766
+ print("Training from scratch")
767
+ return 0
768
+ else:
769
+ if self.config.trainer.resume_mode == "resume_path":
770
+ assert isinstance(self.config.trainer.resume_from_path, str), "resume ckpt must be str type"
771
+ assert "global_step_" in self.config.trainer.resume_from_path, (
772
+ "resume ckpt must specify the global_steps"
773
+ )
774
+ global_step_folder = self.config.trainer.resume_from_path
775
+ if not os.path.isabs(global_step_folder):
776
+ working_dir = os.getcwd()
777
+ global_step_folder = os.path.join(working_dir, global_step_folder)
778
+ print(f"Load from checkpoint folder: {global_step_folder}")
779
+ # set global step
780
+ self.global_steps = int(global_step_folder.split("global_step_")[-1])
781
+
782
+ print(f"Setting global step to {self.global_steps}")
783
+ print(f"Resuming from {global_step_folder}")
784
+
785
+ actor_path = os.path.join(global_step_folder, "actor")
786
+ critic_path = os.path.join(global_step_folder, "critic")
787
+ # load actor
788
+ self.actor_rollout_wg.load_checkpoint(
789
+ actor_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load
790
+ )
791
+ # load critic
792
+ if self.use_critic:
793
+ self.critic_wg.load_checkpoint(
794
+ critic_path, del_local_after_load=self.config.trainer.del_local_ckpt_after_load
795
+ )
796
+
797
+ # load dataloader,
798
+ # TODO: from remote not implemented yet
799
+ dataloader_local_path = os.path.join(global_step_folder, "data.pt")
800
+ if os.path.exists(dataloader_local_path):
801
+ dataloader_state_dict = torch.load(dataloader_local_path, weights_only=False)
802
+ self.train_dataloader.load_state_dict(dataloader_state_dict)
803
+ else:
804
+ print(f"Warning: No dataloader state found at {dataloader_local_path}, will start from scratch")
805
+
806
+ def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqlen"):
807
+ """Reorder the data on single controller such that each dp rank gets similar total tokens"""
808
+ attention_mask = batch.batch["attention_mask"]
809
+ batch_size = attention_mask.shape[0]
810
+ global_seqlen_lst = batch.batch["attention_mask"].view(batch_size, -1).sum(-1).tolist() # (train_batch_size,)
811
+ world_size = self.actor_rollout_wg.world_size
812
+ global_partition_lst = get_seqlen_balanced_partitions(
813
+ global_seqlen_lst, k_partitions=world_size, equal_size=True
814
+ )
815
+ # reorder based on index. The data will be automatically equally partitioned by dispatch function
816
+ global_idx = torch.tensor([j for partition in global_partition_lst for j in partition])
817
+ batch.reorder(global_idx)
818
+ global_balance_stats = log_seqlen_unbalance(
819
+ seqlen_list=global_seqlen_lst, partitions=global_partition_lst, prefix=logging_prefix
820
+ )
821
+ metrics.update(global_balance_stats)
822
+
823
+ def fit_dpo(self): # Online DPO counterpart of the standard PPO fit loop
824
+ """
825
+ The training loop of Online DPO using a periodically updated reference model.
826
+ The driver process calls worker groups for computation.
827
+ Advantage computation is replaced by DPO logic.
828
+ """
829
+ import traceback # Ensure traceback is imported
830
+
831
+ from omegaconf import OmegaConf
832
+
833
+ from verl.utils.tracking import Tracking
834
+
835
+ # Initialize logger
836
+ logger = None
837
+ try:
838
+ logger = Tracking(
839
+ project_name=self.config.trainer.project_name,
840
+ experiment_name=self.config.trainer.experiment_name,
841
+ default_backend=self.config.trainer.logger,
842
+ config=OmegaConf.to_container(self.config, resolve=True, throw_on_missing=False),
843
+ )
844
+ except Exception as e:
845
+ print(f"Warning: Failed to initialize logger: {e}")
846
+
847
+ self.global_steps = 0
848
+ # Load checkpoint before doing anything
849
+ loaded_step = self._load_checkpoint()
850
+ self.global_steps = loaded_step + 1 if loaded_step is not None and loaded_step > 0 else 1
851
+ print(
852
+ f"Starting Online DPO training from global step {self.global_steps}. "
853
+ f"Total steps: {self.total_training_steps}"
854
+ )
855
+ print(f"Reference model update frequency: {self.config.trainer.get('ref_update_freq', 'Not Set')}")
856
+
857
+ # Check if reference policy is configured correctly for this mode
858
+ if not self.use_reference_policy:
859
+ print(
860
+ "WARNING: 'use_reference_policy' is False. Periodic reference model update requires a "
861
+ "reference policy worker. DPO updates might fail or use incorrect logic."
862
+ )
863
+ # Consider raising an error if strict adherence is required:
864
+ # raise ValueError("Periodic reference model update requires 'use_reference_policy' to be True "
865
+ # "and a configured reference worker.")
866
+
867
+ # Perform validation before training
868
+ if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
869
+ print("Running validation before Online DPO training...")
870
+ val_metrics = self._validate()
871
+ pprint(f"Initial validation metrics: {val_metrics}")
872
+ if logger and val_metrics:
873
+ logger.log(data=val_metrics, step=max(0, self.global_steps - 1))
874
+ if self.config.trainer.get("val_only", False):
875
+ print("Validation only mode enabled. Exiting training.")
876
+ if logger and hasattr(logger, "finish"):
877
+ logger.finish()
878
+ return
879
+
880
+ # Add tqdm progress bar
881
+ progress_bar = tqdm(
882
+ total=self.total_training_steps,
883
+ initial=self.global_steps,
884
+ desc="Online DPO Training Progress",
885
+ position=0,
886
+ leave=True,
887
+ )
888
+
889
+ last_val_metrics = None
890
+ should_stop = False
891
+
892
+ for epoch in range(self.config.trainer.total_epochs):
893
+ if should_stop:
894
+ break
895
+ print(f"--- Starting Online DPO Epoch {epoch} ---")
896
+ try:
897
+ train_iterator = iter(self.train_dataloader)
898
+ except TypeError:
899
+ print("Warning: Dataloader is not iterable.")
900
+ train_iterator = self.train_dataloader # Fallback attempt
901
+
902
+ for batch_idx, batch_dict in enumerate(train_iterator):
903
+ if self.global_steps > self.total_training_steps:
904
+ should_stop = True
905
+ break
906
+
907
+ metrics = {}
908
+ timing_raw = {}
909
+ step_timer = Timer(logger=None)
910
+ ref_log_prob_computed = False # Flag to track if ref log probs were computed
911
+
912
+ try: # Outer try-except for the whole step
913
+ step_timer.start()
914
+ with _timer("step", timing_raw):
915
+ batch: DataProto = DataProto.from_single_dict(batch_dict)
916
+ current_batch_size = batch.batch.batch_size[0]
917
+ print(
918
+ f"\n[Step {self.global_steps}, Batch {batch_idx}] Processing batch size: "
919
+ f"{current_batch_size}"
920
+ )
921
+
922
+ # --- Reference Model Update ---
923
+ ref_update_freq = self.config.trainer.get("ref_update_freq", -1)
924
+ if (
925
+ self.use_reference_policy
926
+ and ref_update_freq > 0
927
+ and self.global_steps % ref_update_freq == 0
928
+ ):
929
+ print(f"\n[Step {self.global_steps}] Updating Reference Model Weights from Actor...")
930
+ try:
931
+ # --- This requires careful implementation with FSDP ---
932
+ # 1. Save actor state dict (potentially to CPU memory or disk)
933
+ # This needs to be done collectively across actor worker ranks.
934
+ # The checkpoint_manager might be adaptable, or use FSDP APIs directly.
935
+ # Example placeholder using a conceptual save/load mechanism:
936
+ actor_state_path = "/tmp/actor_state_mid" # Temporary path
937
+ self.actor_rollout_wg.save_checkpoint(actor_state_path) # Adapt save logic
938
+
939
+ # 2. Load the state dict onto the reference model worker group
940
+ # This also needs collective loading on the ref worker ranks.
941
+ self.ref_policy_wg.load_checkpoint(actor_state_path, None, True) # Adapt load logic
942
+
943
+ print(f"[Step {self.global_steps}] Reference Model Weights Updated.")
944
+ # Optionally remove the temporary state file
945
+ # os.remove(actor_state_path) # Needs rank-aware removal or shared storage
946
+
947
+ except Exception as sync_e:
948
+ print(f"ERROR during reference model sync at step {self.global_steps}: {sync_e}")
949
+ traceback.print_exc()
950
+
951
+ # Pop keys for generation
952
+ pop_batch_keys = ["input_ids", "attention_mask"]
953
+ if "position_ids" in batch.batch:
954
+ pop_batch_keys.append("position_ids")
955
+ pop_non_tensor_keys = ["raw_prompt_ids"] if "raw_prompt_ids" in batch.non_tensor_batch else []
956
+ if "multi_modal_inputs" in batch.non_tensor_batch.keys():
957
+ pop_non_tensor_keys.extend(["multi_modal_data", "multi_modal_inputs"])
958
+ original_non_tensor_data = batch.non_tensor_batch
959
+ gen_batch = batch.pop(
960
+ batch_keys=pop_batch_keys,
961
+ non_tensor_batch_keys=pop_non_tensor_keys,
962
+ )
963
+ gen_batch = gen_batch.repeat(
964
+ repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True
965
+ )
966
+ # (Add Debug prints for gen_batch if needed)
967
+
968
+ # Generate sequences (chosen/rejected pairs)
969
+ with _timer("gen", timing_raw):
970
+ try:
971
+ gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
972
+ # (Add Debug prints for gen_batch_output if needed)
973
+ except Exception as gen_e:
974
+ print(f"\n!!!!!!!! ERROR DURING GENERATION (Step {self.global_steps}) !!!!!!!!")
975
+ print(gen_e)
976
+ traceback.print_exc()
977
+ print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
978
+ step_timer.stop()
979
+ continue
980
+
981
+ # Combine original prompts with generated sequences
982
+ batch.non_tensor_batch = original_non_tensor_data # Restore non-tensor data
983
+ batch.non_tensor_batch["uid"] = np.array(
984
+ [str(uuid.uuid4()) for _ in range(current_batch_size)], dtype=object
985
+ )
986
+ batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
987
+ batch = batch.union(gen_batch_output)
988
+ # (Add Debug prints after union if needed)
989
+
990
+ # Compute response mask (needed for ref logprob calc and DPO prep)
991
+ batch.batch["response_mask"] = compute_response_mask(batch)
992
+
993
+ if self.config.trainer.balance_batch:
994
+ self._balance_batch(batch, metrics=metrics)
995
+
996
+ batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
997
+
998
+ # --- Compute Log Probs for the CURRENT policy (used for KL if enabled, or ActorAsRef
999
+ # fallback) ---
1000
+ # Note: For pure DPO with external ref, this 'old_log_probs' might not be strictly needed
1001
+ # unless used for other metrics or a fallback. Keep it for now.
1002
+ with _timer("policy_log_prob", timing_raw):
1003
+ policy_log_prob_output = self.actor_rollout_wg.compute_log_prob(batch)
1004
+ batch = batch.union(policy_log_prob_output) # Adds 'old_log_probs'
1005
+ # (Debug prints for old_log_probs)
1006
+
1007
+ # --- Compute Log Probs using the EXTERNAL Reference Model ---
1008
+ if self.use_reference_policy:
1009
+ with _timer("ref_log_prob_dpo", timing_raw):
1010
+ # print(f"---- [Step {self.global_steps}] DEBUG DPO: Calling compute_ref_log_prob ----")
1011
+ try:
1012
+ # 'batch' contains interleaved chosen/rejected sequences
1013
+ ref_log_prob_output = self.ref_policy_wg.compute_ref_log_prob(
1014
+ batch
1015
+ ) # Returns DataProto with 'ref_log_prob'
1016
+ batch = batch.union(
1017
+ ref_log_prob_output
1018
+ ) # Adds 'ref_log_prob' key [batch_size * n, seq_len]
1019
+ ref_log_prob_computed = True # Mark success
1020
+ # print(f"---- [Step {self.global_steps}] DEBUG DPO: ref_log_prob tensor shape: "
1021
+ # f"{batch.batch['ref_log_prob'].shape} ----")
1022
+ except Exception as ref_e:
1023
+ print(f"ERROR computing reference log probs at step {self.global_steps}: {ref_e}")
1024
+ traceback.print_exc()
1025
+ batch.batch["ref_log_prob"] = None # Mark as failed
1026
+ ref_log_prob_computed = False
1027
+ else:
1028
+ print(
1029
+ "Warning: Skipping external reference log prob calculation as use_reference_policy "
1030
+ "is False."
1031
+ )
1032
+ # DPO update will likely fail unless ActorAsRef logic is re-enabled in dp_actor
1033
+
1034
+ # --- Compute Rewards/Scores (used to determine preference) ---
1035
+ with _timer("reward_calc", timing_raw):
1036
+ # (Reward calculation logic using RM or reward_fn as before)
1037
+ # ... Ensure this calculates 'token_level_rewards' or similar ...
1038
+ if self.use_rm:
1039
+ reward_tensor_rm = self.rm_wg.compute_rm_score(batch)
1040
+ batch = batch.union(reward_tensor_rm) # Adds 'rm_scores'
1041
+
1042
+ reward_extra_infos_dict = {}
1043
+ try:
1044
+ if self.reward_fn is None:
1045
+ # print(f"---- [DEBUG Step {self.global_steps}] ERROR: self.reward_fn is None! "
1046
+ # f"Using dummy rewards. ----")
1047
+ # Use rm_scores if available, otherwise zeros
1048
+ reward_tensor = batch.batch.get(
1049
+ "rm_scores", torch.zeros_like(batch.batch["response_mask"], dtype=torch.float32)
1050
+ )
1051
+ else:
1052
+ reward_result = self.reward_fn(batch, return_dict=True)
1053
+ reward_tensor = reward_result["reward_tensor"] # Final combined reward
1054
+ reward_extra_infos_dict = reward_result.get("reward_extra_info", {})
1055
+
1056
+ except Exception:
1057
+ # print(f'---- [DEBUG Step {self.global_steps}] Error in reward_fn call: {e}. '
1058
+ # f'Using dummy rewards. ----')
1059
+ traceback.print_exc()
1060
+ reward_tensor = torch.zeros_like(batch.batch["response_mask"], dtype=torch.float32)
1061
+ reward_extra_infos_dict = {}
1062
+
1063
+ # Use 'token_level_rewards' as the key for preference calculation
1064
+ batch.batch["token_level_rewards"] = reward_tensor
1065
+ if reward_extra_infos_dict:
1066
+ batch.non_tensor_batch.update(
1067
+ {k: np.array(v) for k, v in reward_extra_infos_dict.items()}
1068
+ )
1069
+
1070
+ # --- Determine Preferences ---
1071
+ # Uses 'token_level_rewards' to determine chosen/rejected based on score
1072
+ batch = compute_onlineDPO_pref(batch) # Adds 'preferences' key
1073
+
1074
+ # --- Prepare DPO Batch ---
1075
+ dpo_update_batch_proto = None # Initialize
1076
+ with _timer("prepare_dpo_batch", timing_raw):
1077
+ try:
1078
+ if "preferences" not in batch.batch or batch.batch["preferences"] is None:
1079
+ raise ValueError("'preferences' key missing or None after compute_onlineDPO_pref.")
1080
+
1081
+ # Check if reference log probs were computed successfully (if needed)
1082
+ if self.use_reference_policy and not ref_log_prob_computed:
1083
+ raise ValueError("Reference log probs required but failed to compute.")
1084
+
1085
+ # Check required base keys
1086
+ required_keys = ["input_ids", "attention_mask", "response_mask"]
1087
+ for rk in required_keys:
1088
+ if rk not in batch.batch or batch.batch[rk] is None:
1089
+ raise KeyError(f"Required key '{rk}' missing from batch for DPO prep.")
1090
+
1091
+ preferences_mask = batch.batch["preferences"] # Shape [batch_size * n]
1092
+ not_preferences_mask = ~preferences_mask
1093
+
1094
+ # Gather Chosen/Rejected Base Tensors
1095
+ chosen_input_ids = batch.batch["input_ids"][preferences_mask]
1096
+ chosen_attention_mask = batch.batch["attention_mask"][preferences_mask]
1097
+ rejected_input_ids = batch.batch["input_ids"][not_preferences_mask]
1098
+ rejected_attention_mask = batch.batch["attention_mask"][not_preferences_mask]
1099
+ chosen_position_ids = (
1100
+ batch.batch.get("position_ids")[preferences_mask]
1101
+ if "position_ids" in batch.batch
1102
+ else None
1103
+ )
1104
+ rejected_position_ids = (
1105
+ batch.batch.get("position_ids")[not_preferences_mask]
1106
+ if "position_ids" in batch.batch
1107
+ else None
1108
+ )
1109
+
1110
+ # Create Labels
1111
+ print("WARNING: Creating DPO labels using configured max_prompt_length...")
1112
+ prompt_len = self.config.data.max_prompt_length
1113
+ chosen_labels = chosen_input_ids.clone()
1114
+ chosen_labels[:, :prompt_len] = -100
1115
+ rejected_labels = rejected_input_ids.clone()
1116
+ rejected_labels[:, :prompt_len] = -100
1117
+
1118
+ # Calculate and Gather Reference Log Probs (Sequence Level)
1119
+ if self.use_reference_policy:
1120
+ ref_log_prob_tensor = batch.batch["ref_log_prob"] # Token level [bsz * n, seq_len]
1121
+ response_mask_full = batch.batch[
1122
+ "response_mask"
1123
+ ] # Response mask [bsz * n, seq_len]
1124
+ ref_sequence_logps = (ref_log_prob_tensor * response_mask_full).sum(
1125
+ dim=-1
1126
+ ) # Sequence level [bsz * n]
1127
+ reference_chosen_logps = ref_sequence_logps[preferences_mask]
1128
+ reference_rejected_logps = ref_sequence_logps[not_preferences_mask]
1129
+ else:
1130
+ # If not using external ref, DPO needs ActorAsRef logic in dp_actor
1131
+ # We won't add the keys here, dp_actor will handle it (or fail if not modified)
1132
+ print(
1133
+ "Info: Not adding explicit reference logps to DPO batch "
1134
+ "(use_reference_policy=False)."
1135
+ )
1136
+ reference_chosen_logps = None # Explicitly None
1137
+ reference_rejected_logps = None
1138
+
1139
+ # Package Tensors
1140
+ dpo_tensors = {
1141
+ "chosen_input_ids": chosen_input_ids,
1142
+ "chosen_attention_mask": chosen_attention_mask,
1143
+ "chosen_labels": chosen_labels,
1144
+ "rejected_input_ids": rejected_input_ids,
1145
+ "rejected_attention_mask": rejected_attention_mask,
1146
+ "rejected_labels": rejected_labels,
1147
+ }
1148
+ # Conditionally add reference logps if computed
1149
+ if reference_chosen_logps is not None:
1150
+ dpo_tensors["reference_chosen_logps"] = reference_chosen_logps
1151
+ if reference_rejected_logps is not None:
1152
+ dpo_tensors["reference_rejected_logps"] = reference_rejected_logps
1153
+ # Add position ids if they exist
1154
+ if chosen_position_ids is not None:
1155
+ dpo_tensors["chosen_position_ids"] = chosen_position_ids
1156
+ if rejected_position_ids is not None:
1157
+ dpo_tensors["rejected_position_ids"] = rejected_position_ids
1158
+
1159
+ # Prepare Meta Info
1160
+ dpo_meta = {
1161
+ "dpo_beta": OmegaConf.select(self.config.algorithm, "dpo_beta", default=0.1),
1162
+ "dpo_loss_type": OmegaConf.select(
1163
+ self.config.algorithm, "dpo_loss_type", default="sigmoid"
1164
+ ),
1165
+ "dpo_label_smoothing": OmegaConf.select(
1166
+ self.config.algorithm, "dpo_label_smoothing", default=0.0
1167
+ ),
1168
+ "use_reference_policy": self.use_reference_policy,
1169
+ "reference_free": not self.use_reference_policy, # False if using external ref
1170
+ "global_step": self.global_steps,
1171
+ }
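+ # For reference: with dpo_loss_type="sigmoid", the worker-side update minimizes
+ # the standard DPO objective
+ #   L = -log sigmoid(beta * ((pi_c - pi_r) - (ref_c - ref_r)))
+ # where pi_*/ref_* are sequence-level log-probs of the chosen/rejected
+ # responses under the policy and reference model; the exact form (including
+ # label smoothing) lives in update_actor_dpo.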
1172
+
1173
+ dpo_update_batch_proto = DataProto.from_dict(tensors=dpo_tensors, meta_info=dpo_meta)
1174
+ # print(f"---- [Step {self.global_steps}] DEBUG DPO: Prepared DPO Update Batch ----")
1175
+ # print(f" Keys: {list(dpo_update_batch_proto.batch.keys())}")
1176
+ # print(f" Meta Info: {dpo_meta}")
1177
+
1178
+ except Exception as e_prep:
1179
+ print(f"ERROR preparing DPO batch at step {self.global_steps}: {e_prep}")
1180
+ traceback.print_exc()
1181
+ dpo_update_batch_proto = None # Skip update on error
1182
+
1183
+ # --- Actor Update Step ---
1184
+ actor_output = None
1185
+ if self.config.trainer.critic_warmup <= self.global_steps and dpo_update_batch_proto:
1186
+ with _timer("update_actor", timing_raw):
1187
+ # Pass the batch containing reference log probs (if computed)
1188
+ # The modified update_actor_dpo expects them if reference_free=False
1189
+ actor_output = self.actor_rollout_wg.update_actor_dpo(dpo_update_batch_proto)
1190
+ if actor_output and "metrics" in actor_output.meta_info:
1191
+ metrics.update(reduce_metrics(actor_output.meta_info["metrics"]))
1192
+ elif dpo_update_batch_proto is None:
1193
+ print(
1194
+ f"Skipping actor update at step {self.global_steps} due to DPO batch preparation error."
1195
+ )
1196
+
1197
+ # --- Validation and Saving ---
1198
+ test_freq = OmegaConf.select(self.config.trainer, "test_freq", default=-1)
1199
+ is_last_step = self.global_steps >= self.total_training_steps
1200
+ if (
1201
+ self.val_reward_fn is not None
1202
+ and test_freq > 0
1203
+ and (is_last_step or self.global_steps % test_freq == 0)
1204
+ ):
1205
+ print(f"\nRunning DPO validation at step {self.global_steps}...")
1206
+ val_timing_raw = {}
1207
+ with _timer("testing", val_timing_raw):
1208
+ val_metrics: dict = self._validate()
1209
+ if is_last_step:
1210
+ last_val_metrics = val_metrics
1211
+ if val_metrics:
1212
+ metrics["time/validation_run"] = val_timing_raw.get("testing", 0)
1213
+ metrics.update(val_metrics)
1214
+ else:
1215
+ print("Validation skipped or returned no metrics.")
1216
+
1217
+ save_freq = OmegaConf.select(self.config.trainer, "save_freq", default=-1)
1218
+ if save_freq > 0 and (is_last_step or self.global_steps % save_freq == 0):
1219
+ print(f"\nSaving DPO checkpoint at step {self.global_steps}...")
1220
+ with _timer("save_checkpoint", timing_raw):
1221
+ self._save_checkpoint() # Saves actor (and potentially critic if used elsewhere)
1222
+ metrics["time/save_checkpoint"] = timing_raw.get("save_checkpoint", 0)
1223
+
1224
+ # --- End main step timer context ---
1225
+
1226
+ # --- Metrics calculation AFTER the 'step' timer block ---
1227
+ metrics.update(compute_dpo_data_metrics(batch=batch)) # Use DPO-specific metrics
1228
+ metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
1229
+ n_gpus = self.resource_pool_manager.get_n_gpus()
1230
+ if "step" in timing_raw:
1231
+ metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
1232
+ else:
1233
+ print(
1234
+ f"Warning: 'step' key missing from timing_raw at step {self.global_steps}. "
1235
+ f"Skipping throughput."
1236
+ )
1237
+
1238
+ step_timer.stop()
1239
+ metrics["time/step"] = step_timer.last
1240
+
1241
+ # Log metrics
1242
+ log_freq = OmegaConf.select(self.config.trainer, "log_freq", default=1)
1243
+ if logger and self.global_steps % log_freq == 0:
1244
+ log_payload = metrics.copy()
1245
+ # Add learning rate to log payload
1246
+ if actor_output and "actor/lr" in metrics:
1247
+ log_payload["actor/lr"] = metrics["actor/lr"]
1248
+
1249
+ print(f"[Step {self.global_steps} DPO] Logging Step Payload Keys: {list(log_payload.keys())}")
1250
+ try:
1251
+ logger.log(data=log_payload, step=self.global_steps)
1252
+ except Exception as e:
1253
+ print(f"Logging failed at step {self.global_steps}: {e}")
1254
+
1255
+ # Update progress bar
1256
+ postfix_metrics = {
1257
+ k: f"{v:.3f}" if isinstance(v, float) else v
1258
+ for k, v in metrics.items()
1259
+ if isinstance(v, int | float)
1260
+ }
1261
+ progress_bar.set_postfix(postfix_metrics)
1262
+
1263
+ except Exception as step_e:
1264
+ print(f"\n!!!!!!!! ERROR DURING DPO Step {self.global_steps} !!!!!!!!")
1265
+ print(f"Caught Exception: {step_e}")
1266
+ traceback.print_exc()
1267
+ print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
1268
+ step_timer.stop()
1269
+ should_stop = True
1270
+ break
1271
+
1272
+ if is_last_step or should_stop:
1273
+ print(f"Stopping DPO training at step {self.global_steps}.")
1274
+ break
1275
+
1276
+ self.global_steps += 1
1277
+ progress_bar.update(1)
1278
+
1279
+ # End of epoch handling
1280
+ if hasattr(self.train_dataloader, "reset"):
1281
+ try:
1282
+ self.train_dataloader.reset()
1283
+ except Exception as e:
1284
+ print(f"Warning: Failed to reset train dataloader state: {e}")
1285
+ if should_stop:
1286
+ break
1287
+
1288
+ # --- Final cleanup and logging ---
1289
+ progress_bar.close()
1290
+ final_step = max(0, self.global_steps - 1)
1291
+ print(f"Online DPO Training finished at step {final_step}.")
1292
+ # Save final checkpoint
1293
+ save_freq = OmegaConf.select(self.config.trainer, "save_freq", default=-1)
1294
+ if not self.config.trainer.get("val_only", False) and (save_freq <= 0 or final_step % save_freq != 0):
1295
+ print(f"Saving final DPO checkpoint at step {final_step}...")
1296
+ self._save_checkpoint()
1297
+
1298
+ # Final validation run
1299
+ if self.val_reward_fn and last_val_metrics is None and not self.config.trainer.get("val_only", False):
1300
+ print("Running final validation...")
1301
+ last_val_metrics = self._validate()
1302
+ if last_val_metrics and logger:
1303
+ last_val_metrics["final_validation"] = True
1304
+ try:
1305
+ logger.log(data=last_val_metrics, step=final_step)
1306
+ except Exception as e:
1307
+ print(f"[Final Val Metrics Log Error]: {e}")
1308
+
1309
+ pprint(f"Final validation metrics: {last_val_metrics}")
1310
+ if logger and hasattr(logger, "finish"):
1311
+ logger.finish()
1312
+ print("Online DPO Training Run Complete.")
ICL/LV/code/README.md ADDED
@@ -0,0 +1,66 @@
+ Unified Multi-Model VQA Codebase
+
+ Purpose
+ - This codebase is the model-agnostic layer for evaluation, data handling, and prompt construction; models plug in exclusively through "adapters".
+ - The common input is a flat OpenAI-style content sequence (image→text; demonstrations use [REQUEST]/[RESPONSE]; the query's [RESPONSE] is left empty).
+
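+ For illustration, a query with one retrieved shot renders to a flat sequence roughly like the following (content-part keys follow the OpenAI convention; the values are hypothetical):
+
+   [
+     {"type": "image", "image": "shot1.png"},
+     {"type": "text", "text": "[REQUEST] What is shown? [RESPONSE] A red bus."},
+     {"type": "image", "image": "query.png"},
+     {"type": "text", "text": "[REQUEST] What color is the car? [RESPONSE]"}
+   ]
+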
+ Layout
+ - core/
+   - prompting/openai_segments.py   flat-sequence construction and on-disk serialization
+   - datasets/m3it_reader.py        unified M3IT reading & base64 image caching
+   - metrics/metrics.py             Token-F1, BERTScore-F1, etc.
+ - eval/   model-agnostic evaluation scripts (they call the adapters)
+   - zero_shot_vqa.py / random_k_shot_vqa.py
+   - eval_textual_retriever_vqa.py / eval_visual_retriever_vqa.py / eval_multimodal_retriever_vqa.py
+   - order evaluation (shared cache + standalone metric scripts):
+     - order_eval_core.py (called internally) / _modal_order.py (called internally)
+     - eval_order_caption_bertscore.py / eval_order_caption_cider.py
+     - eval_order_classification_accuracy.py / eval_order_classification_f1.py
+     - eval_order_reasoning_accuracy.py / eval_order_reasoning_ras.py
+     - eval_order_vqa_bertscore.py / eval_order_vqa_tokenf1.py
+ - adapters/
+   - idefics2_adapter.py
+   - qwen_vl_adapter.py
+   - qwen3vl_adapter.py
+   - gemma3_adapter.py
+
+ Usage
+ - Example: zero-shot (Idefics2)
+   python3 -m core.eval.zero_shot_vqa \
+     --adapter idefics2 \
+     --model-path /path/to/idefics2-8b \
+     --dataset-root /path/to/M3IT \
+     --split test --total-samples 500 \
+     --instruction-image "C:\\Users\\you\\instruction.png" --dump-first 2
+
+ - Example: random few-shot (Qwen-VL)
+   python3 -m core.eval.random_k_shot_vqa \
+     --adapter qwen-vl \
+     --model-path /path/to/Qwen-VL \
+     --dataset-root /path/to/M3IT \
+     --split test --k-shots 3 --total-samples 500 \
+     --use-paper-instruction --instruction-image "C:\\Users\\you\\instruction.png"
+
+ - Example: modality-order evaluation (VQA Token-F1 shown here)
+   python3 -m core.eval.eval_order_vqa_tokenf1 \
+     --adapter idefics2 \
+     --model-path /path/to/idefics2-8b \
+     --dataset-root /path/to/M3IT \
+     --retriever-model-path /path/to/BridgeTower-or-CLIP \
+     --orders image-text,text-image,text-image-text \
+     --k-shots 3 --total-samples 500 --split val
+
+ Conventions
+ - The adapter interface is defined in adapters/*.py:
+   - create(model_path: str) -> Adapter
+   - Adapter.generate_from_segments(segs: List[dict], temperature: float, top_p: float, max_new_tokens: int) -> str
+   - Optional: Adapter.generate_single(image_path: str, prompt: str, ...)
+
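+ A minimal adapter sketch following this interface (model loading is elided and the generation is a placeholder, not a real backend):
+
+   # adapters/example_adapter.py -- minimal sketch of the adapter contract
+   from typing import List
+
+   class Adapter:
+       def __init__(self, model_path: str):
+           self.model_path = model_path  # load the real model/processor here
+
+       def generate_from_segments(self, segs: List[dict], temperature: float,
+                                  top_p: float, max_new_tokens: int) -> str:
+           # segs is the flat image→text sequence described above; a real
+           # adapter would feed images and text to the model and decode.
+           texts = [s["text"] for s in segs if s.get("type") == "text"]
+           return " ".join(texts)[:max_new_tokens]  # placeholder generation
+
+   def create(model_path: str) -> Adapter:
+       return Adapter(model_path)
+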
+ Notes
+ - Adapters are fully decoupled from the shared code; to support a new model, only adapters/xxx_adapter.py needs to be swapped in.
+ - Windows paths, base64 payloads, and data: URLs for images are handled automatically by prompting/openai_segments.py.
ICL/LV/code/SFT/__pycache__/dataset.cpython-310.pyc ADDED
Binary file (8.2 kB).
 
ICL/LV/code/SFT/build_icl_eval_sharegpt.py ADDED
@@ -0,0 +1,437 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Build a prompt-only ShareGPT-style eval set for deciding <RET> vs <ANS>.
4
+
5
+ Prompt format is aligned with build_icl_dataset.py:
6
+ instruction + <image> + "Question: ...\\nAction:"
7
+
8
+ But for evaluation we keep ONLY the initial human turn in `conversations` to avoid leaking labels.
9
+ Gold labels are stored outside the prompt:
10
+ - expected_first_tag: "<RET>" or "<ANS>" (NOT included in conversations)
11
+ - answer: used for offline checking (NOT included in conversations)
12
+ - shots: for RET samples only, used for the follow-up step after model outputs <RET>
13
+
14
+ Important:
15
+ - Never use train split for eval; recommend val/test/dev.
16
+ - Optionally excludes any uid already present in an existing training jsonl to avoid overlap.
17
+ """
18
+
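+ # Shape of one emitted record (hypothetical values, shown for orientation;
+ # the category/subdir/instruction/query fields are omitted here):
+ # {
+ #   "id": "vqa/okvqa:00000042",
+ #   "images": ["data/vqa/okvqa/img_42.png"],
+ #   "conversations": [{"from": "human", "value": "...<image>\nQuestion: ...\nAction:"}],
+ #   "expected_first_tag": "<RET>",
+ #   "answer": "a red bus",
+ #   "k_shot": 2,
+ #   "shots": [{"image": "...", "description": "..."}]
+ # }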
19
+ import argparse
20
+ import json
21
+ import random
22
+ import sys
23
+ from dataclasses import dataclass
24
+ from pathlib import Path
25
+ from typing import Dict, List, Optional, Set
26
+
27
+
28
+ # Add code root to PYTHONPATH for core/ imports
29
+ CODE_ROOT = Path(__file__).resolve().parents[1]
30
+ if str(CODE_ROOT) not in sys.path:
31
+ sys.path.insert(0, str(CODE_ROOT))
32
+
33
+ from core.datasets.m3it_reader import iter_m3it_samples, load_instructions # noqa: E402
34
+
35
+
36
+ @dataclass(frozen=True)
37
+ class PoolItem:
38
+ image_path: str
39
+ description: str
40
+ subdir: str
41
+
42
+
43
+ @dataclass(frozen=True)
44
+ class QueryItem:
45
+ image_path: str
46
+ question: str
47
+ answer: str
48
+ subdir: str
49
+ uid: str
50
+
51
+
52
+ def _extract_uid(raw: Dict, fallback: str) -> str:
53
+ if isinstance(raw, dict):
54
+ for k in ("id", "image_id"):
55
+ v = raw.get(k)
56
+ if isinstance(v, (str, int)):
57
+ return str(v)
58
+ meta = raw.get("meta") if isinstance(raw.get("meta"), dict) else {}
59
+ for k in ("img_id", "id", "image_id"):
60
+ v = meta.get(k)
61
+ if isinstance(v, (str, int)):
62
+ return str(v)
63
+ return fallback
64
+
65
+
66
+ def discover_subdirs(dataset_root: Path, category: str) -> List[str]:
67
+ base = dataset_root / "data" / category
68
+ if not base.exists():
69
+ return []
70
+ out: List[str] = []
71
+ for p in sorted(base.iterdir()):
72
+ if p.is_dir():
73
+ out.append(f"{category}/{p.name}")
74
+ return out
75
+
76
+
77
+ def pick_instruction(insts: List[str], rng: random.Random) -> str:
78
+ if insts:
79
+ s = rng.choice(insts)
80
+ if isinstance(s, str) and s.strip():
81
+ return s.strip()
82
+ return "Please answer the question based on the image."
83
+
84
+
85
+ def load_exclude_uids(path: Optional[str]) -> Set[str]:
86
+ if not path:
87
+ return set()
88
+ p = Path(path)
89
+ if not p.exists():
90
+ return set()
91
+
92
+ out: Set[str] = set()
93
+ with p.open("r", encoding="utf-8") as f:
94
+ for line in f:
95
+ line = line.strip()
96
+ if not line:
97
+ continue
98
+ try:
99
+ obj = json.loads(line)
100
+ except Exception:
101
+ continue
102
+ if not isinstance(obj, dict):
103
+ continue
104
+ uid = obj.get("uid")
105
+ if isinstance(uid, (str, int)):
106
+ out.add(str(uid))
107
+ continue
108
+ sid = obj.get("id")
109
+ if isinstance(sid, (str, int)):
110
+ out.add(str(sid))
111
+ return out
112
+
113
+
114
+ def to_rel(path: str, root: Path) -> str:
115
+ try:
116
+ return str(Path(path).relative_to(root))
117
+ except Exception:
118
+ return path
119
+
120
+
121
+ def build_pool_for_subdir(
122
+ *,
123
+ dataset_root: Path,
124
+ subdir: str,
125
+ split: str,
126
+ cache_dir: Path,
127
+ target_n: int,
128
+ max_samples_scan: int,
129
+ ) -> List[PoolItem]:
130
+ items: List[PoolItem] = []
131
+ try:
132
+ iterable = iter_m3it_samples(
133
+ str(dataset_root),
134
+ subdir,
135
+ split=split,
136
+ cache_dir=str(cache_dir),
137
+ max_samples=None,
138
+ )
139
+ except FileNotFoundError:
140
+ return []
141
+
142
+ for idx, smp in enumerate(iterable):
143
+ if max_samples_scan > 0 and idx >= max_samples_scan:
144
+ break
145
+ if not smp.answers:
146
+ continue
147
+ desc = (smp.answers[0] or "").strip()
148
+ if not desc:
149
+ continue
150
+ items.append(PoolItem(smp.image_path, desc, subdir))
151
+ if target_n > 0 and len(items) >= target_n:
152
+ break
153
+ return items
154
+
155
+
156
+ def collect_query_pool(
157
+ *,
158
+ dataset_root: Path,
159
+ subdirs: List[str],
160
+ split: str,
161
+ cache_dir: Path,
162
+ exclude_uids: Set[str],
163
+ target_n: int,
164
+ seed: int,
165
+ max_samples_per_subdir: int,
166
+ ) -> List[QueryItem]:
167
+ rng = random.Random(seed)
168
+ subdirs = list(subdirs)
169
+ rng.shuffle(subdirs)
170
+
171
+ seen: Set[str] = set()
172
+ out: List[QueryItem] = []
173
+
174
+ for subdir in subdirs:
175
+ taken = 0
176
+ try:
177
+ iterable = iter_m3it_samples(
178
+ str(dataset_root),
179
+ subdir,
180
+ split=split,
181
+ cache_dir=str(cache_dir),
182
+ max_samples=None,
183
+ )
184
+ except FileNotFoundError:
185
+ continue
186
+
187
+ for i, smp in enumerate(iterable):
188
+ if max_samples_per_subdir > 0 and taken >= max_samples_per_subdir:
189
+ break
190
+ q = (smp.text or "").strip()
191
+ if not q:
192
+ continue
193
+ if not smp.answers:
194
+ continue
195
+ ans = (smp.answers[0] or "").strip()
196
+ if not ans:
197
+ continue
198
+ uid = _extract_uid(smp.raw, f"{subdir}:{i:08d}")
199
+ if uid in exclude_uids:
200
+ continue
201
+ if uid in seen:
202
+ continue
203
+ seen.add(uid)
204
+ out.append(QueryItem(smp.image_path, q, ans, subdir, uid))
205
+ taken += 1
206
+ if target_n > 0 and len(out) >= target_n:
207
+ return out
208
+ return out
209
+
210
+
211
+ def select_shots(
212
+ pool: List[PoolItem],
213
+ k: int,
214
+ rng: random.Random,
215
+ exclude_image: Optional[str] = None,
216
+ ) -> List[PoolItem]:
217
+ if not pool or k <= 0:
218
+ return []
219
+ cand = [p for p in pool if p.image_path != exclude_image]
220
+ if not cand:
221
+ cand = pool
222
+ if len(cand) >= k:
223
+ return rng.sample(cand, k=k)
224
+ return [rng.choice(cand) for _ in range(k)]
225
+
226
+
227
+ def write_jsonl(path: Path, records: List[Dict]) -> None:
228
+ path.parent.mkdir(parents=True, exist_ok=True)
229
+ with path.open("w", encoding="utf-8") as f:
230
+ for r in records:
231
+ f.write(json.dumps(r, ensure_ascii=False) + "\n")
232
+
233
+
234
+ def build_prompt_only_record(
235
+ *,
236
+ uid: str,
237
+ instruction: str,
238
+ image_rel: str,
239
+ question: str,
240
+ expected_first_tag: str,
241
+ answer: str,
242
+ category: str,
243
+ subdir: str,
244
+ shots: List[Dict],
245
+ k_shot: int,
246
+ ) -> Dict:
247
+ human = []
248
+ if instruction:
249
+ human.append(instruction.strip())
250
+ human.append("<image>")
251
+ human.append(f"Question: {question}\nAction:")
252
+ human_value = "\n".join([x for x in human if x]).strip()
253
+
254
+ return {
255
+ "id": uid,
256
+ "images": [image_rel],
257
+ "conversations": [
258
+ {"from": "human", "value": human_value},
259
+ ],
260
+ "expected_first_tag": expected_first_tag,
261
+ "answer": answer,
262
+ "k_shot": k_shot,
263
+ "shots": shots,
264
+ "category": category,
265
+ "subdir": subdir,
266
+ "instruction": instruction,
267
+ "query": {"image": image_rel, "question": question},
268
+ }
269
+
270
+
271
+ def main() -> int:
272
+ ap = argparse.ArgumentParser(description="Build prompt-only eval set (ShareGPT jsonl) for <RET>/<ANS> decision.")
273
+ ap.add_argument("--dataset-root", default="/workspace/M3IT")
274
+ ap.add_argument("--output-dir", default="/workspace/M3IT_new/ICL_eval")
275
+ ap.add_argument("--category", default="vqa")
276
+ ap.add_argument("--split", default="val", help="Never use train; recommend val/test/dev.")
277
+ ap.add_argument("--pool-split", default="val", help="Never use train; recommend val/test/dev.")
278
+ ap.add_argument("--seed", type=int, default=42)
279
+
280
+ ap.add_argument("--total", type=int, default=100)
281
+ ap.add_argument("--ret-ratio", type=float, default=0.5)
282
+ ap.add_argument("--query-pool-size", type=int, default=1000, help="How many queries to collect before sampling.")
283
+ ap.add_argument("--max-samples-per-subdir", type=int, default=2000, help="Scan cap per subdir when collecting queries.")
284
+ ap.add_argument("--pool-size-per-subdir", type=int, default=2000, help="Max pool size to build per subdir (for shots).")
285
+ ap.add_argument("--pool-scan-per-subdir", type=int, default=4000, help="Scan cap per subdir when building pools.")
286
+ ap.add_argument("--shot-k-min", type=int, default=1)
287
+ ap.add_argument("--shot-k-max", type=int, default=3)
288
+
289
+ ap.add_argument(
290
+ "--exclude-uids-from",
291
+ default="/workspace/M3IT_new/ICL/vqa/merged_shuffled_sharegpt.jsonl",
292
+ help="Optional jsonl to exclude uids/ids (to avoid overlap with training).",
293
+ )
294
+ ap.add_argument("--overwrite", action="store_true")
295
+ ap.add_argument("--output", default=None, help="Default: {output_dir}/{category}/eval_sharegpt_{total}.jsonl")
296
+ args = ap.parse_args()
297
+
298
+ if args.split.strip().lower() == "train" or args.pool_split.strip().lower() == "train":
299
+ raise ValueError("split/pool-split=train is not allowed for eval set")
300
+ if args.total <= 0:
301
+ raise ValueError("total must be > 0")
302
+ if not (0.0 <= args.ret_ratio <= 1.0):
303
+ raise ValueError("ret-ratio must be in [0, 1]")
304
+ if args.shot_k_min <= 0 or args.shot_k_max < args.shot_k_min:
305
+ raise ValueError("invalid shot-k range")
306
+
307
+ dataset_root = Path(args.dataset_root)
308
+ output_dir = Path(args.output_dir)
309
+ cache_dir = output_dir / "_image_cache"
310
+ cache_dir.mkdir(parents=True, exist_ok=True)
311
+
312
+ out_path = Path(
313
+ args.output
314
+ if args.output
315
+ else str(output_dir / args.category / f"eval_sharegpt_{args.total}.jsonl")
316
+ )
317
+ if out_path.exists() and not args.overwrite:
318
+ raise FileExistsError(f"Output exists: {out_path} (use --overwrite to replace)")
319
+
320
+ subdirs = discover_subdirs(dataset_root, args.category)
321
+ if not subdirs:
322
+ raise FileNotFoundError(f"No subdirs found under {dataset_root}/data/{args.category}")
323
+
324
+ exclude_uids = load_exclude_uids(args.exclude_uids_from)
325
+
326
+ # Load instructions once per subdir.
327
+ inst_map: Dict[str, List[str]] = {sd: load_instructions(dataset_root, sd) for sd in subdirs}
328
+
329
+ query_pool_target = max(args.total, args.query_pool_size)
330
+ queries = collect_query_pool(
331
+ dataset_root=dataset_root,
332
+ subdirs=subdirs,
333
+ split=args.split,
334
+ cache_dir=cache_dir,
335
+ exclude_uids=exclude_uids,
336
+ target_n=query_pool_target,
337
+ seed=args.seed,
338
+ max_samples_per_subdir=args.max_samples_per_subdir,
339
+ )
340
+ if len(queries) < args.total:
341
+ raise RuntimeError(f"Not enough queries collected: got {len(queries)}/{args.total}. "
342
+ f"Try increasing --max-samples-per-subdir or changing --split.")
343
+
344
+ rng = random.Random(args.seed)
345
+ ret_n = int(round(args.total * args.ret_ratio))
346
+ ret_n = max(0, min(args.total, ret_n))
347
+ ans_n = args.total - ret_n
348
+
349
+ labels = ["RET"] * ret_n + ["ANS"] * ans_n
350
+ rng.shuffle(labels)
351
+
352
+ # Pools are built lazily per subdir for RET samples only.
353
+ pool_map: Dict[str, List[PoolItem]] = {}
354
+
355
+ records: List[Dict] = []
356
+ used_uids: Set[str] = set()
357
+
358
+ for label in labels:
359
+ for _try in range(2000):
360
+ q = rng.choice(queries)
361
+ if q.uid in used_uids:
362
+ continue
363
+
364
+ inst = pick_instruction(inst_map.get(q.subdir, []), rng)
365
+ image_rel = to_rel(q.image_path, output_dir)
366
+
367
+ if label == "ANS":
368
+ used_uids.add(q.uid)
369
+ records.append(
370
+ build_prompt_only_record(
371
+ uid=q.uid,
372
+ instruction=inst,
373
+ image_rel=image_rel,
374
+ question=q.question,
375
+ expected_first_tag="<ANS>",
376
+ answer=q.answer,
377
+ category=args.category,
378
+ subdir=q.subdir,
379
+ shots=[],
380
+ k_shot=0,
381
+ )
382
+ )
383
+ break
384
+
385
+ # RET case: attach hidden shots for the follow-up step.
386
+ if q.subdir not in pool_map:
387
+ pool_map[q.subdir] = build_pool_for_subdir(
388
+ dataset_root=dataset_root,
389
+ subdir=q.subdir,
390
+ split=args.pool_split,
391
+ cache_dir=cache_dir,
392
+ target_n=args.pool_size_per_subdir,
393
+ max_samples_scan=args.pool_scan_per_subdir,
394
+ )
395
+ pool = pool_map.get(q.subdir, [])
396
+ if not pool:
397
+ continue
398
+
399
+ k = rng.randint(args.shot_k_min, args.shot_k_max)
400
+ shots_items = select_shots(pool, k, rng, exclude_image=q.image_path)
401
+ shots = [
402
+ {"image": to_rel(s.image_path, output_dir), "description": s.description}
403
+ for s in shots_items
404
+ ]
405
+ used_uids.add(q.uid)
406
+ records.append(
407
+ build_prompt_only_record(
408
+ uid=q.uid,
409
+ instruction=inst,
410
+ image_rel=image_rel,
411
+ question=q.question,
412
+ expected_first_tag="<RET>",
413
+ answer=q.answer,
414
+ category=args.category,
415
+ subdir=q.subdir,
416
+ shots=shots,
417
+ k_shot=k,
418
+ )
419
+ )
420
+ break
421
+ else:
422
+ raise RuntimeError(f"Failed to sample enough records for label={label}. "
423
+ f"Try increasing --query-pool-size or relaxing --exclude-uids-from.")
424
+
425
+ rng.shuffle(records)
426
+ write_jsonl(out_path, records)
427
+
428
+ # Lightweight summary to stdout.
429
+ ret_cnt = sum(1 for r in records if r.get("expected_first_tag") == "<RET>")
430
+ ans_cnt = len(records) - ret_cnt
431
+ print(f"[OK] wrote={len(records)} ret={ret_cnt} ans={ans_cnt} -> {out_path}")
432
+ print(f"[INFO] image_root (for eval): {output_dir}")
433
+ return 0
434
+
435
+
436
+ if __name__ == "__main__":
437
+ raise SystemExit(main())
ICL/LV/code/SFT/check_kshot_ret_ans.py ADDED
@@ -0,0 +1,319 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Check whether the model outputs <RET> or <ANS> under different shot settings.
5
+
6
+ 0-shot: only query image + question.
7
+ K-shot (K>=1): after the model outputs <RET>, append 1 shot (image+description) and
8
+ ask again. If it still outputs <RET>, append another shot, and so on.
9
+ We record whether it outputs <RET>/<ANS> at each step.
10
+ """
11
+
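+ # Illustrative trace of the K-shot protocol (hypothetical dialogue):
+ #   step 0: [query image] "Question: ...\nAction:"  -> model: "<RET>"
+ #   step 1: + [shot image] "Description: ..."       -> model: "<RET>"
+ #   step 2: + [shot image] "Description: ..."       -> model: "<ANS> a red bus"
+ # The recorded tag sequence for this sample would be ["<RET>", "<RET>", "<ANS>"].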
12
+ import argparse
13
+ import json
14
+ import os
15
+ import random
16
+ import re
17
+ from typing import Any, Dict, List, Optional, Tuple
18
+
19
+ import torch
20
+ from PIL import Image
21
+ from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
22
+
23
+
24
+ TAG_RE = re.compile(r"(<ANS>|<RET>)")
25
+
26
+
27
+ def _extract_tag(text: str) -> Optional[str]:
28
+ match = TAG_RE.search(text)
29
+ return match.group(1) if match else None
30
+
31
+
32
+ def _resolve_path(root: str, maybe_rel: str) -> str:
33
+ if os.path.isabs(maybe_rel):
34
+ return maybe_rel
35
+ return os.path.normpath(os.path.join(root, maybe_rel))
36
+
37
+
38
+ def _split_user_text_with_images(
39
+ text: str, image_paths: List[str]
40
+ ) -> Tuple[List[Dict[str, str]], List[str]]:
41
+ parts = text.split("<image>")
42
+ content: List[Dict[str, str]] = []
43
+ used: List[str] = []
44
+ for i, part in enumerate(parts):
45
+ part = part.strip()
46
+ if part:
47
+ content.append({"type": "text", "text": part})
48
+ if i < len(parts) - 1:
49
+ if not image_paths:
50
+ raise ValueError("用户文本里 <image> 数量 > images 列表长度")
51
+ img_path = image_paths.pop(0)
52
+ used.append(img_path)
53
+ content.append({"type": "image", "image": img_path})
54
+ return content, used
55
+
56
+
57
+ def _append_human_turn(
58
+ *,
59
+ messages: List[Dict[str, Any]],
60
+ pil_images: List[Image.Image],
61
+ image_root: str,
62
+ text: str,
63
+ images_all: List[str],
64
+ image_cursor: int,
65
+ ) -> int:
66
+ n_placeholders = text.count("<image>")
67
+ img_paths = images_all[image_cursor : image_cursor + n_placeholders]
68
+ if len(img_paths) != n_placeholders:
69
+ raise ValueError("images 列表长度 < <image> 占位符数量")
70
+
71
+ user_content, used_paths = _split_user_text_with_images(text, img_paths.copy())
72
+ for p in used_paths:
73
+ p = _resolve_path(image_root, p) if not os.path.isabs(p) else p
74
+ if not os.path.exists(p):
75
+ raise FileNotFoundError(p)
76
+ with Image.open(p) as img:
77
+ pil_images.append(img.convert("RGB"))
78
+
79
+ messages.append({"role": "user", "content": user_content})
80
+ return image_cursor + n_placeholders
81
+
82
+
+ def _append_shot_turn(
+     *,
+     messages: List[Dict[str, Any]],
+     pil_images: List[Image.Image],
+     image_root: str,
+     image_path: str,
+     description: str,
+ ) -> None:
+     img_path = _resolve_path(image_root, image_path)
+     if not os.path.exists(img_path):
+         raise FileNotFoundError(img_path)
+     with Image.open(img_path) as im:
+         pil_images.append(im.convert("RGB"))
+     messages.append(
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": img_path},
+                 {"type": "text", "text": f"Description: {description}"},
+             ],
+         }
+     )
+
+
+ def _build_base_messages(obj: Dict[str, Any], image_root: str) -> Tuple[List[Dict[str, Any]], List[Image.Image]]:
+     conversations = obj.get("conversations")
+     if not isinstance(conversations, list) or not conversations:
+         raise ValueError("sample is missing conversations")
+     images_rel = obj.get("images") or []
+     if not isinstance(images_rel, list):
+         raise ValueError("images field is not a list")
+
+     messages: List[Dict[str, Any]] = []
+     pil_images: List[Image.Image] = []
+     image_cursor = 0
+
+     # Use the first human turn as query prompt.
+     human = None
+     for t in conversations:
+         if t.get("from") == "human":
+             human = t
+             break
+     if human is None:
+         raise ValueError("no human turn found")
+
+     image_cursor = _append_human_turn(
+         messages=messages,
+         pil_images=pil_images,
+         image_root=image_root,
+         text=str(human.get("value", "")),
+         images_all=images_rel,
+         image_cursor=image_cursor,
+     )
+     return messages, pil_images
+
+
+ def _pick_shots_from_pool(
+     pool: List[Dict[str, str]],
+     k: int,
+     rng: random.Random,
+     exclude_image: Optional[str],
+ ) -> List[Dict[str, str]]:
+     if k <= 0 or not pool:
+         return []
+     cand = [p for p in pool if p.get("image") != exclude_image]
+     if not cand:
+         cand = pool
+     if len(cand) >= k:
+         return rng.sample(cand, k=k)
+     return [rng.choice(cand) for _ in range(k)]
+
+
+ def main() -> int:
+     ap = argparse.ArgumentParser(description="Check <RET>/<ANS> outputs under 0/1/2/3-shot settings.")
+     ap.add_argument("--model", required=True, help="HF model dir")
+     ap.add_argument(
+         "--data",
+         default="/workspace/M3IT_new/ICL_eval/vqa/eval_sharegpt_100.jsonl",
+         help="Prompt-only eval jsonl (has conversations/images/shots).",
+     )
+     ap.add_argument("--image-root", default="/workspace/M3IT_new/ICL_eval")
+     ap.add_argument("--num-samples", type=int, default=20)
+     ap.add_argument("--seed", type=int, default=42)
+     ap.add_argument("--k-list", default="0,1,2,3", help="Comma-separated shot counts to report.")
+     ap.add_argument("--max-new-tokens", type=int, default=128)
+     ap.add_argument("--device", default="cuda:0")
+     ap.add_argument("--dtype", choices=["bf16", "fp16"], default="bf16")
+     ap.add_argument("--print-samples", action="store_true", help="Print each sample input/output.")
+     args = ap.parse_args()
+
+     rng = random.Random(args.seed)
+     k_list = [int(x.strip()) for x in args.k_list.split(",") if x.strip()]
+     if not k_list:
+         raise ValueError("k-list is empty")
+     max_k = max(k_list)
+
+     # Load dataset
+     data: List[Dict[str, Any]] = []
+     with open(args.data, "r", encoding="utf-8") as f:
+         for line in f:
+             line = line.strip()
+             if not line:
+                 continue
+             data.append(json.loads(line))
+     if not data:
+         raise ValueError("empty data")
+
+     # Build global shot pool
+     pool: List[Dict[str, str]] = []
+     for obj in data:
+         shots = obj.get("shots") or []
+         if isinstance(shots, list):
+             for s in shots:
+                 if not isinstance(s, dict):
+                     continue
+                 img = s.get("image")
+                 desc = s.get("description")
+                 if isinstance(img, str) and isinstance(desc, str) and img and desc:
+                     pool.append({"image": img, "description": desc})
+     if not pool:
+         print("[WARN] shot pool is empty, k-shot tests may be skipped")
+
+     # Sample records
+     samples = rng.sample(data, k=min(args.num_samples, len(data)))
+
+     dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float16
+     device = torch.device(args.device)
+     processor = AutoProcessor.from_pretrained(args.model, trust_remote_code=True)
+     model = Qwen3VLForConditionalGeneration.from_pretrained(
+         args.model, dtype=dtype, trust_remote_code=True
+     ).to(device)
+     model.eval()
+
+     summary: Dict[int, Dict[str, int]] = {
+         k: {"RET": 0, "ANS": 0, "NONE": 0, "REACHED": 0} for k in k_list
+     }
+
+     for obj in samples:
+         uid = obj.get("id") or obj.get("uid") or "unknown"
+         query = obj.get("query") or {}
+         query_image = query.get("image")
+
+         messages, pil_images = _build_base_messages(obj, args.image_root)
+
+         # Pre-sample shots for this sample (use first N as we go).
+         shots_all = _pick_shots_from_pool(pool, max_k, rng, exclude_image=query_image)
+         if max_k > 0 and len(shots_all) < max_k:
+             continue
+
+         step = 0  # step 0 = query only
+         while True:
+             prompt = processor.apply_chat_template(
+                 messages, tokenize=False, add_generation_prompt=True
+             )
+             inputs = processor(text=prompt, images=pil_images, padding=True, return_tensors="pt")
+             inputs = {k2: v.to(device) for k2, v in inputs.items()}
+
+             with torch.inference_mode():
+                 out_ids = model.generate(
+                     **inputs, do_sample=False, max_new_tokens=args.max_new_tokens
+                 )
+             in_len = int(inputs["input_ids"].shape[1])
+             pred = processor.batch_decode(out_ids[:, in_len:], skip_special_tokens=True)[0].strip()
+             tag = _extract_tag(pred)
+
+             # Append model output to the conversation
+             messages.append({"role": "assistant", "content": [{"type": "text", "text": pred}]})
+
+             if step in summary:
+                 summary[step]["REACHED"] += 1
+                 if tag == "<RET>":
+                     summary[step]["RET"] += 1
+                 elif tag == "<ANS>":
+                     summary[step]["ANS"] += 1
+                 else:
+                     summary[step]["NONE"] += 1
+
+             if args.print_samples:
+                 print("=" * 80)
+                 print(f"uid={uid} | step={step} | pred_tag={tag}")
+                 for m in messages:
+                     role = m.get("role")
+                     if role == "user":
+                         parts = []
+                         for c in m.get("content", []):
+                             if c.get("type") == "text":
+                                 parts.append(c.get("text", ""))
+                             elif c.get("type") == "image":
+                                 parts.append(f"<image> {c.get('image','')}")
+                         print("[input]")
+                         print("\n".join([p for p in parts if p]).strip())
+                     elif role == "assistant":
+                         parts = []
+                         for c in m.get("content", []):
+                             if c.get("type") == "text":
+                                 parts.append(c.get("text", ""))
+                         print("[input (assistant)]")
+                         print("\n".join([p for p in parts if p]).strip())
+                 print("[output]")
+                 print(pred)
+
+             # Stop if not <RET> or reached max_k shots
+             if tag != "<RET>":
+                 break
+             if step >= max_k:
+                 break
+
+             # Append next shot and ask again.
+             shot = shots_all[step] if step < len(shots_all) else None
+             if not shot:
+                 break
+             _append_shot_turn(
+                 messages=messages,
+                 pil_images=pil_images,
+                 image_root=args.image_root,
+                 image_path=shot["image"],
+                 description=shot["description"],
+             )
+             # Ask for decision again after each shot.
+             messages.append({"role": "user", "content": [{"type": "text", "text": "Action:"}]})
+             step += 1
+
+     print("=== summary ===")
+     for k in k_list:
+         s = summary[k]
+         reached = s["REACHED"]
+         if reached == 0:
+             print(f"k={k}: no samples")
+             continue
+         print(
+             f"k={k} | RET={s['RET']} | ANS={s['ANS']} | NONE={s['NONE']} | reached={reached}"
+         )
+     return 0
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
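
To make the checker's turn structure concrete, here is the conversation state it builds after one `<RET>` -> shot -> "Action:" cycle. Paths and texts are illustrative placeholders, not dataset contents; only the message shapes come from `_build_base_messages`, `_append_shot_turn`, and the loop above.

```python
# Illustrative `messages` state after one <RET> step (placeholder data).
messages = [
    {"role": "user", "content": [                          # query turn
        {"type": "image", "image": "images/query.jpg"},    # placeholder path
        {"type": "text", "text": "What is shown here?"},   # placeholder question
    ]},
    {"role": "assistant", "content": [{"type": "text", "text": "<RET>"}]},
    {"role": "user", "content": [                          # appended shot
        {"type": "image", "image": "images/shot_01.jpg"},
        {"type": "text", "text": "Description: a red bus on a street"},
    ]},
    {"role": "user", "content": [{"type": "text", "text": "Action:"}]},
]
# The next generate() call is expected to emit <ANS> ... or another <RET>.
```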
ICL/LV/code/SFT/cuda-keyring_1.1-1_all.deb ADDED
Binary file (4.33 kB).
 
ICL/LV/code/SFT/prepare_dataset.py ADDED
@@ -0,0 +1,56 @@
+ #!/usr/bin/env python3
+ """
+ Pre-generate the dataset cache.
+ Run this once; training then loads the cache directly and avoids timeouts.
+ """
+
+ import os
+ import sys
+ import pickle
+ from pathlib import Path
+
+ # Add the current directory to the import path
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+ from config import get_config
+ from dataset import SFTDataset
+
+
+ def main():
+     config = get_config()
+
+     cache_path = Path(config.training.output_dir) / ".dataset_cache.pkl"
+     ready_flag = Path(config.training.output_dir) / ".dataset_ready"
+
+     # Create the output directory
+     os.makedirs(config.training.output_dir, exist_ok=True)
+
+     print("=" * 60)
+     print("Pre-generating the SFT dataset cache")
+     print("=" * 60)
+     print(f"Data dir:   {config.data.sft_data_dir}")
+     print(f"Cache path: {cache_path}")
+     print("=" * 60)
+
+     # Load the dataset
+     print("\nLoading dataset...")
+     dataset = SFTDataset(config.data, split="train")
+
+     print(f"\nDataset loaded: {len(dataset)} samples in total")
+
+     # Save the cache
+     print(f"\nSaving cache to: {cache_path}")
+     with open(cache_path, "wb") as f:
+         pickle.dump(dataset.samples, f)
+
+     # Create the ready flag
+     ready_flag.touch()
+
+     print(f"Ready flag: {ready_flag}")
+     print("\n" + "=" * 60)
+     print("Cache generation complete! You can now run: bash run_train.sh")
+     print("=" * 60)
+
+
+ if __name__ == "__main__":
+     main()
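
The training side is expected to pick this cache up instead of re-parsing the raw data. Below is a minimal sketch of that loading path, under the assumption that `SFTDataset` only needs its `samples` list restored (which is exactly what gets pickled above); the `__new__` shortcut is an illustration, not the project's actual loader.

```python
# Hedged sketch: restore the pre-built sample cache at training time.
import pickle
from pathlib import Path

from config import get_config
from dataset import SFTDataset

config = get_config()
cache_path = Path(config.training.output_dir) / ".dataset_cache.pkl"
ready_flag = Path(config.training.output_dir) / ".dataset_ready"

if ready_flag.exists() and cache_path.exists():
    # Assumption: a cached SFTDataset is usable once `samples` is restored.
    dataset = SFTDataset.__new__(SFTDataset)  # skip the slow scan in __init__
    with open(cache_path, "rb") as f:
        dataset.samples = pickle.load(f)
else:
    dataset = SFTDataset(config.data, split="train")  # slow path
```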
ICL/LV/code/adapters/gemma3_adapter.py ADDED
@@ -0,0 +1,27 @@
+ from __future__ import annotations
+
+ from typing import List, Dict
+
+ try:
+     from adapters._runners.gemma3_infer import Gemma3Runner
+ except Exception:
+     Gemma3Runner = None  # type: ignore
+
+
+ class Adapter:
+     def __init__(self, model_path: str):
+         if Gemma3Runner is None:
+             raise RuntimeError('Gemma3Runner unavailable. Ensure gemma3-code is on PYTHONPATH or install its runner.')
+         self.runner = Gemma3Runner(model_path)
+
+     def generate_from_segments(self, segs: List[Dict[str, str]], *,
+                                temperature: float, top_p: float, max_new_tokens: int) -> str:
+         gen = getattr(self.runner, 'generate_from_qwen_segs', None)
+         if gen is None:
+             raise RuntimeError('Gemma3Runner missing generate_from_qwen_segs')
+         return gen(segs, temperature=temperature, top_p=top_p, max_new_tokens=max_new_tokens)
+
+
+ def create(model_path: str) -> Adapter:
+     return Adapter(model_path)
+
+
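This adapter and the qwen3vl one below expose the same surface: `create(model_path)` returning an object with `generate_from_segments(segs, *, temperature, top_p, max_new_tokens)`. A hedged usage sketch follows; the segment dict schema is inferred from the `List[Dict[str, str]]` hint, and the paths are placeholders.

```python
# Illustrative adapter use; the seg dict keys are an assumption based on
# the type hints, and the model/image paths are placeholders.
from adapters import gemma3_adapter

adapter = gemma3_adapter.create("/path/to/gemma3-checkpoint")
segs = [
    {"type": "text", "value": "Describe the image."},  # hypothetical schema
    {"type": "image", "value": "/path/to/query.jpg"},  # hypothetical schema
]
print(adapter.generate_from_segments(
    segs, temperature=0.2, top_p=0.9, max_new_tokens=128
))
```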
ICL/LV/code/adapters/qwen3vl_adapter.py ADDED
@@ -0,0 +1,27 @@
+ from __future__ import annotations
+
+ from typing import List, Dict
+
+ try:
+     from adapters._runners.qwen3_vl_infer import Qwen3VLRunner
+ except Exception:
+     Qwen3VLRunner = None  # type: ignore
+
+
+ class Adapter:
+     def __init__(self, model_path: str):
+         if Qwen3VLRunner is None:
+             raise RuntimeError('Qwen3VLRunner unavailable. Ensure QWEN3VL-code is on PYTHONPATH or install its runner.')
+         self.runner = Qwen3VLRunner(model_path)
+
+     def generate_from_segments(self, segs: List[Dict[str, str]], *,
+                                temperature: float, top_p: float, max_new_tokens: int) -> str:
+         gen = getattr(self.runner, 'generate_from_segments', None)
+         if gen is None:
+             raise RuntimeError('Qwen3VLRunner missing generate_from_segments')
+         return gen(segs, temperature=temperature, top_p=top_p, max_new_tokens=max_new_tokens)
+
+
+ def create(model_path: str) -> Adapter:
+     return Adapter(model_path)
+
ICL/LV/code/attn map/attn map/attn map/__pycache__/token_attention_utils.cpython-313.pyc ADDED
Binary file (2.57 kB).