Lekr0 committed
Commit 5de3d77 · verified · 1 Parent(s): e9585fc

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. ICL/RL/trl_source/.github/workflows/tests-experimental.yml +70 -0
  2. ICL/RL/trl_source/.github/workflows/tests_transformers_branch.yml +121 -0
  3. ICL/RL/trl_source/examples/scripts/evals/judge_tldr.py +108 -0
  4. ICL/RL/trl_source/examples/scripts/nemo_gym/deepspeed_zero3.yaml +22 -0
  5. ICL/RL/trl_source/examples/scripts/nemo_gym/submit.sh +112 -0
  6. ICL/RL/trl_source/examples/scripts/online_dpo.py +159 -0
  7. ICL/RL/trl_source/examples/scripts/openenv/browsergym_llm.py +506 -0
  8. ICL/RL/trl_source/examples/scripts/openenv/echo.py +248 -0
  9. ICL/RL/trl_source/examples/scripts/openenv/wordle.py +607 -0
  10. ICL/RL/trl_source/examples/scripts/openenv/wordle_prompt.txt +105 -0
  11. ICL/RL/trl_source/examples/scripts/ppo/ppo.py +180 -0
  12. ICL/RL/trl_source/examples/scripts/reward_modeling.py +136 -0
  13. ICL/RL/trl_source/examples/scripts/sft_vlm_gemma3.py +194 -0
  14. ICL/RL/trl_source/trl/__pycache__/__init__.cpython-313.pyc +0 -0
  15. ICL/RL/trl_source/trl/__pycache__/_compat.cpython-313.pyc +0 -0
  16. ICL/RL/trl_source/trl/__pycache__/chat_template_utils.cpython-313.pyc +0 -0
  17. ICL/RL/trl_source/trl/__pycache__/data_utils.cpython-313.pyc +0 -0
  18. ICL/RL/trl_source/trl/__pycache__/import_utils.cpython-313.pyc +0 -0
  19. ICL/RL/trl_source/trl/accelerate_configs/fsdp1.yaml +28 -0
  20. ICL/RL/trl_source/trl/accelerate_configs/fsdp2.yaml +25 -0
  21. ICL/RL/trl_source/trl/accelerate_configs/multi_gpu.yaml +16 -0
  22. ICL/RL/trl_source/trl/accelerate_configs/single_gpu.yaml +16 -0
  23. ICL/RL/trl_source/trl/accelerate_configs/zero1.yaml +20 -0
  24. ICL/RL/trl_source/trl/accelerate_configs/zero2.yaml +21 -0
  25. ICL/RL/trl_source/trl/accelerate_configs/zero3.yaml +22 -0
  26. ICL/RL/trl_source/trl/experimental/__init__.py +36 -0
  27. ICL/RL/trl_source/trl/experimental/bco/__init__.py +16 -0
  28. ICL/RL/trl_source/trl/experimental/bema_for_ref_model/__init__.py +16 -0
  29. ICL/RL/trl_source/trl/experimental/bema_for_ref_model/dpo_trainer.py +30 -0
  30. ICL/RL/trl_source/trl/experimental/cpo/__init__.py +19 -0
  31. ICL/RL/trl_source/trl/experimental/cpo/cpo_config.py +207 -0
  32. ICL/RL/trl_source/trl/experimental/cpo/cpo_trainer.py +1057 -0
  33. ICL/RL/trl_source/trl/experimental/gfpo/gfpo_config.py +35 -0
  34. ICL/RL/trl_source/trl/experimental/gkd/__init__.py +19 -0
  35. ICL/RL/trl_source/trl/experimental/gkd/gkd_config.py +112 -0
  36. ICL/RL/trl_source/trl/experimental/gold/__init__.py +19 -0
  37. ICL/RL/trl_source/trl/experimental/gold/gold.py +155 -0
  38. ICL/RL/trl_source/trl/experimental/gold/gold_config.py +419 -0
  39. ICL/RL/trl_source/trl/experimental/grpo_with_replay_buffer/__init__.py +16 -0
  40. ICL/RL/trl_source/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_config.py +34 -0
  41. ICL/RL/trl_source/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_trainer.py +731 -0
  42. ICL/RL/trl_source/trl/experimental/gspo_token/__init__.py +15 -0
  43. ICL/RL/trl_source/trl/experimental/gspo_token/grpo_trainer.py +157 -0
  44. ICL/RL/trl_source/trl/experimental/judges/__init__.py +36 -0
  45. ICL/RL/trl_source/trl/experimental/judges/judges.py +482 -0
  46. ICL/RL/trl_source/trl/experimental/kto/__init__.py +19 -0
  47. ICL/RL/trl_source/trl/experimental/kto/kto_config.py +171 -0
  48. ICL/RL/trl_source/trl/experimental/kto/kto_trainer.py +1511 -0
  49. ICL/RL/trl_source/trl/experimental/merge_model_callback.py +352 -0
  50. ICL/RL/trl_source/trl/experimental/minillm/__init__.py +19 -0
ICL/RL/trl_source/.github/workflows/tests-experimental.yml ADDED
@@ -0,0 +1,70 @@
name: Tests (experimental)

on:
  pull_request:
    paths:
      # Run only when relevant files are modified
      - "trl/experimental/**"
      - "tests/experimental/**"

env:
  TQDM_DISABLE: 1
  PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
  TRL_EXPERIMENTAL_SILENCE: 1

jobs:
  check_code_quality:
    name: Check code quality
    runs-on: ubuntu-latest
    if: github.event.pull_request.draft == false
    steps:
      - uses: actions/checkout@v6
      - name: Set up Python 3.13
        uses: actions/setup-python@v6
        with:
          python-version: 3.13
      - uses: pre-commit/action@v3.0.1
        with:
          extra_args: --all-files

  tests:
    name: Tests (experimental)
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
      options: --gpus all
    defaults:
      run:
        shell: bash
    steps:
      - name: Git checkout
        uses: actions/checkout@v6

      - name: Set up Python 3.13
        uses: actions/setup-python@v6
        with:
          python-version: 3.13

      - name: Install Make and Git
        run: |
          apt-get update && apt-get install -y make git curl

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh

      - name: Create Python virtual environment
        run: |
          uv venv
          uv pip install --upgrade setuptools wheel

      - name: Install dependencies
        run: |
          source .venv/bin/activate
          uv pip install ".[dev]"

      - name: Test with pytest
        run: |
          source .venv/bin/activate
          make test_experimental
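The last step above defers to the repository's Makefile. Assuming the same `test_experimental` target exists in a local clone, a rough local reproduction of this CI job would be:

```sh
# Sketch: local equivalent of the workflow's test steps (Makefile target assumed).
uv venv && source .venv/bin/activate
uv pip install ".[dev]"
make test_experimental
```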
ICL/RL/trl_source/.github/workflows/tests_transformers_branch.yml ADDED
@@ -0,0 +1,121 @@
name: Tests against Transformers branch

on:
  workflow_dispatch:
    inputs:
      transformers_ref:
        description: "Transformers git ref (branch, tag, or commit SHA)"
        required: true
        default: "main"

env:
  TQDM_DISABLE: 1
  CI_SLACK_CHANNEL: ${{ secrets.CI_PUSH_MAIN_CHANNEL }}
  PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"

jobs:
  tests_transformers_branch:
    name: Tests with Transformers ${{ inputs.transformers_ref }}
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
      options: --gpus all
    defaults:
      run:
        shell: bash
    steps:
      - name: Git checkout
        uses: actions/checkout@v6

      - name: Set up Python 3.12
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'

      - name: Install Make and Git
        run: |
          apt-get update && apt-get install -y make git curl

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh

      - name: Create Python virtual environment
        run: |
          uv venv
          uv pip install --upgrade setuptools wheel

      - name: Install dependencies
        run: |
          source .venv/bin/activate
          uv pip install ".[dev]"
          uv pip install -U git+https://github.com/huggingface/transformers.git@${{ inputs.transformers_ref }}

      - name: Test with pytest
        run: |
          source .venv/bin/activate
          make test

      - name: Post to Slack
        if: github.ref == 'refs/heads/main' && always()
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
          title: Results with Transformers ${{ inputs.transformers_ref }}
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

  distributed_smoke:
    name: Distributed smoke tests with Transformers ${{ inputs.transformers_ref }}
    runs-on:
      group: aws-g5-12xlarge-cache
    container:
      image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
      options: --gpus all
    defaults:
      run:
        shell: bash
    env:
      CUDA_VISIBLE_DEVICES: "0,1"
    steps:
      - name: Git checkout
        uses: actions/checkout@v6

      - name: Set up Python 3.12
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'

      - name: Install Make and Git
        run: |
          apt-get update && apt-get install -y make git curl

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh

      - name: Create Python virtual environment
        run: |
          uv venv
          uv pip install --upgrade setuptools wheel

      - name: Install dependencies
        run: |
          source .venv/bin/activate
          uv pip install ".[dev]"
          uv pip install -U git+https://github.com/huggingface/transformers.git@${{ inputs.transformers_ref }}

      - name: Run distributed smoke tests
        run: |
          source .venv/bin/activate
          pytest -v tests/distributed/test_distributed.py

      - name: Post to Slack
        if: github.ref == 'refs/heads/main' && always()
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
          title: Results of distributed smoke tests with Transformers ${{ inputs.transformers_ref }}
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
ICL/RL/trl_source/examples/scripts/evals/judge_tldr.py ADDED
@@ -0,0 +1,108 @@
# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# /// script
# dependencies = [
#     "trl[vllm]",
# ]
# ///

from dataclasses import dataclass, field

from datasets import load_dataset
from transformers import HfArgumentParser
from vllm import LLM, SamplingParams

from trl.experimental.judges import HfPairwiseJudge, OpenAIPairwiseJudge


"""
Examples:

python examples/scripts/evals/judge_tldr.py --model_name_or_path vwxyzjn/rloo_tldr --num_examples 1000
Model win rate: 31.40%

python examples/scripts/evals/judge_tldr.py --model_name_or_path vwxyzjn/rloo_tldr --judge_model gpt-3.5-turbo-0125 --num_examples 1000
Model win rate: 51.60%

python examples/scripts/evals/judge_tldr.py --model_name_or_path vwxyzjn/rloo_tldr --judge_model gpt-4o-mini --num_examples 1000
Model win rate: 51.20%

python examples/scripts/evals/judge_tldr.py --model_name_or_path vwxyzjn/ppo_tldr --num_examples 1000
Model win rate: 46.30%

python examples/scripts/evals/judge_tldr.py --model_name_or_path vwxyzjn/ppo_tldr --judge_model gpt-3.5-turbo-0125 --num_examples 1000
Model win rate: 52.50%

python examples/scripts/evals/judge_tldr.py --model_name_or_path vwxyzjn/ppo_tldr --judge_model gpt-4o-mini --num_examples 1000
Model win rate: 63.00%
"""


@dataclass
class ScriptArguments:
    r"""
    Arguments for the script.

    Args:
        model_name_or_path (`str`):
            Model name or path to the model to evaluate.
        judge_model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3-70B-Instruct"`):
            Model name or path to the model to use as a judge. E.g., 'gpt-3.5-turbo-0125' or
            'meta-llama/Meta-Llama-3-70B-Instruct'.
        num_examples (`int`, *optional*):
            Number of examples to evaluate.
    """

    model_name_or_path: str = field(metadata={"help": "Model name or path to the model to evaluate."})
    judge_model: str = field(
        default="meta-llama/Meta-Llama-3-70B-Instruct",
        metadata={
            "help": "Model name or path to the model to use as a judge. E.g., 'gpt-3.5-turbo-0125' or "
            "'meta-llama/Meta-Llama-3-70B-Instruct'."
        },
    )
    num_examples: int | None = field(default=None, metadata={"help": "Number of examples to evaluate."})


if __name__ == "__main__":
    # Parse the arguments
    parser = HfArgumentParser(ScriptArguments)
    script_args = parser.parse_args_into_dataclasses()[0]

    # Load the dataset
    dataset = load_dataset("trl-lib/tldr", split="validation")
    if script_args.num_examples is not None:
        dataset = dataset.select(range(script_args.num_examples))

    # Extract the prompts and reference completions
    prompts = dataset["prompt"]
    reference_completions = dataset["completion"]

    # Generate the model completions
    sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=200)  # very generous max token length
    llm = LLM(model=script_args.model_name_or_path, tensor_parallel_size=1)
    outputs = llm.generate(prompts, sampling_params)
    model_completions = [output.outputs[0].text.strip() for output in outputs]

    # Judge the outputs
    if "gpt" in script_args.judge_model:
        judge = OpenAIPairwiseJudge(script_args.judge_model)
    else:
        judge = HfPairwiseJudge(script_args.judge_model)

    completions = [[c0, c1] for c0, c1 in zip(reference_completions, model_completions, strict=True)]
    best_idxs = judge.judge(prompts, completions)
    model_win_rate = best_idxs.count(1) / len(best_idxs)
    print(f"Model win rate: {model_win_rate * 100:.2f}%")
ICL/RL/trl_source/examples/scripts/nemo_gym/deepspeed_zero3.yaml ADDED
@@ -0,0 +1,22 @@
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_multinode_launcher: standard
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 4
num_processes: 32
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
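This Accelerate config is consumed via `--config_file`, exactly as the submit.sh script below does; a minimal standalone sketch (the training script name here is a placeholder):

```sh
# Sketch: launching a trainer with the ZeRO-3 config above (train.py is hypothetical).
accelerate launch --config_file deepspeed_zero3.yaml train.py
```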
ICL/RL/trl_source/examples/scripts/nemo_gym/submit.sh ADDED
@@ -0,0 +1,112 @@
#!/bin/bash
#SBATCH -A account
#SBATCH -p partition
#SBATCH -N 5
#SBATCH --gres gpu:8
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=16
#SBATCH --time=4:00:00
#SBATCH --job-name=trl_nemo_gym
#SBATCH --output=logs/%j/slurm.out
#SBATCH --error=logs/%j/slurm.err

CONTAINER_IMAGE="nvcr.io/nvidia/pytorch:25.12-py3"
MOUNTS="/path/to/mounts:/path/to/mounts"

NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST))

TRAIN_NODE_0="${NODELIST[0]}"
TRAIN_NODE_1="${NODELIST[1]}"
TRAIN_NODE_2="${NODELIST[2]}"
TRAIN_NODE_3="${NODELIST[3]}"
VLLM_NODE="${NODELIST[4]}"

echo "Training Nodes: $TRAIN_NODE_0, $TRAIN_NODE_1, $TRAIN_NODE_2, $TRAIN_NODE_3"
echo "vLLM Node: $VLLM_NODE"
echo "Main process IP: $TRAIN_NODE_0"

LOG_DIR="logs/${SLURM_JOB_ID}"
mkdir -p ${LOG_DIR}

echo "Starting ng_run and vLLM on ${VLLM_NODE}..."
echo "Logs will be saved to: ${LOG_DIR}"

# NOTE: If you have already set up your TRL venv, you can remove all of the pip installs and uv venv related commands below!

srun --nodes=1 --ntasks=1 --nodelist="${VLLM_NODE}" \
    --container-image="${CONTAINER_IMAGE}" \
    --container-mounts="${MOUNTS}" \
    --container-mount-home \
    bash -c "
    LOG_DIR=/path/to/logs
    mkdir -p \${LOG_DIR}

    # Install uv if not already installed
    curl -LsSf https://astral.sh/uv/install.sh | sh
    source \$HOME/.local/bin/env

    # Start nemo gym servers
    (set -x && \
     export HOME=/path/to/user && \
     export PATH=\$HOME/.local/bin:\$PATH && \
     cd /path/to/user/Gym && \
     uv venv --python 3.12 && \
     source .venv/bin/activate && \
     uv sync && \
     ray stop --force && \
     ng_run +config_paths=[responses_api_models/vllm_model/configs/vllm_model.yaml,resources_servers/workplace_assistant/configs/workplace_assistant.yaml] +head_server.host=0.0.0.0 +head_server.port=11000) > \${LOG_DIR}/ng_run.log 2>&1 &

    sleep 10

    # Start trl vllm server
    (set -x && \
     export HOME=/path/to/user && \
     export HF_HOME=/path/to/user/hf_home && \
     cd /path/to/user/trl && \
     rm -rf .venv && uv venv && source .venv/bin/activate && uv sync && uv pip install -e .[vllm] && uv pip install fastapi uvicorn && \
     python -m trl.scripts.vllm_serve \
        --model Qwen/Qwen3-4B-Instruct-2507 \
        --host 0.0.0.0 \
        --tensor-parallel-size 8 \
        --data-parallel-size 1 \
        --max-model-len 16384 \
        --gpu-memory-utilization 0.7 \
        --port 8000) > \${LOG_DIR}/vllm_serve.log 2>&1 &

    wait
    " &

echo "Waiting for nemo gym and vllm to start..."
sleep 120

echo "Launching training on 4 nodes..."

TRAIN_NODES_LIST="${TRAIN_NODE_0},${TRAIN_NODE_1},${TRAIN_NODE_2},${TRAIN_NODE_3}"

srun --nodes=4 --ntasks=4 --nodelist="${TRAIN_NODES_LIST}" \
    --container-image="${CONTAINER_IMAGE}" \
    --container-mounts="${MOUNTS}" \
    --container-mount-home \
    bash -c "
    set -x && \
    export HOME=/path/to/user && \
    export HF_HOME=/path/to/user/hf_home && \
    cd /path/to/user/trl && \
    source .venv/bin/activate && uv pip install accelerate deepspeed wandb omegaconf && \
    cd examples/scripts/nemo_gym && \
    export WANDB_API_KEY=<your wandb api key> && \
    accelerate launch \
        --config_file deepspeed_zero3.yaml \
        --num_processes 32 \
        --num_machines 4 \
        --machine_rank \$SLURM_PROCID \
        --main_process_ip ${TRAIN_NODE_0} \
        --main_process_port 29500 \
        --rdzv_backend c10d \
        train_multi_environment.py \
        --config config.yaml \
        --vllm_server_host ${VLLM_NODE} \
        --head_server_host ${VLLM_NODE}" &

wait
ICL/RL/trl_source/examples/scripts/online_dpo.py ADDED
@@ -0,0 +1,159 @@
# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# /// script
# dependencies = [
#     "trl",
#     "peft",
#     "trackio",
#     "kernels",
# ]
# ///

"""
Usage:

python examples/scripts/online_dpo.py \
    --model_name_or_path trl-lib/pythia-1b-deduped-tldr-sft \
    --reward_model_path trl-lib/pythia-1b-deduped-tldr-rm \
    --dataset_name trl-lib/tldr \
    --learning_rate 5.0e-7 \
    --output_dir pythia-1b-tldr-online-dpo \
    --per_device_train_batch_size 8 \
    --gradient_accumulation_steps 16 \
    --warmup_steps 0.1 \
    --missing_eos_penalty 1.0

With LoRA:
python examples/scripts/online_dpo.py \
    --model_name_or_path trl-lib/pythia-1b-deduped-tldr-sft \
    --reward_model_path trl-lib/pythia-1b-deduped-tldr-rm \
    --dataset_name trl-lib/tldr \
    --learning_rate 5.0e-6 \
    --output_dir pythia-1b-tldr-online-dpo \
    --per_device_train_batch_size 16 \
    --gradient_accumulation_steps 8 \
    --warmup_steps 0.1 \
    --missing_eos_penalty 1.0 \
    --use_peft
"""

import os

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, GenerationConfig

from trl import (
    LogCompletionsCallback,
    ModelConfig,
    ScriptArguments,
    TrlParser,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
)
from trl.experimental.judges import HfPairwiseJudge, OpenAIPairwiseJudge, PairRMJudge
from trl.experimental.online_dpo import OnlineDPOConfig, OnlineDPOTrainer


# Enable logging in a Hugging Face Space
os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")


JUDGES = {"pair_rm": PairRMJudge, "openai": OpenAIPairwiseJudge, "hf": HfPairwiseJudge}

if __name__ == "__main__":
    parser = TrlParser((ScriptArguments, OnlineDPOConfig, ModelConfig))
    script_args, training_args, model_args = parser.parse_args_and_config()
    training_args.gradient_checkpointing_kwargs = {"use_reentrant": True}

    dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)
    model_kwargs = dict(
        revision=model_args.model_revision,
        attn_implementation=model_args.attn_implementation,
        dtype=dtype,
        use_cache=False if training_args.gradient_checkpointing else True,
    )
    quantization_config = get_quantization_config(model_args)
    if quantization_config is not None:
        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
        model_kwargs["device_map"] = get_kbit_device_map()
        model_kwargs["quantization_config"] = quantization_config

    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code, **model_kwargs
    )

    if training_args.reward_model_path is not None:
        reward_model = AutoModelForSequenceClassification.from_pretrained(
            training_args.reward_model_path,
            num_labels=1,
            trust_remote_code=model_args.trust_remote_code,
            **model_kwargs,
        )
        reward_tokenizer = AutoTokenizer.from_pretrained(
            training_args.reward_model_path,
            trust_remote_code=model_args.trust_remote_code,
            truncation=True,
            truncation_side="left",  # since we judge the completion, truncating left is more appropriate
        )
        if reward_tokenizer.pad_token_id is None:
            reward_tokenizer.pad_token = reward_tokenizer.eos_token
    else:
        reward_model = None
        reward_tokenizer = None

    if training_args.judge is not None:
        judge_cls = JUDGES[training_args.judge]
        judge = judge_cls()
    else:
        judge = None

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        padding_side="left",
        trust_remote_code=model_args.trust_remote_code,
        **model_kwargs,
    )
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)

    trainer = OnlineDPOTrainer(
        model=model,
        reward_funcs=reward_model,
        judge=judge,
        args=training_args,
        train_dataset=dataset[script_args.dataset_train_split],
        eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
        processing_class=tokenizer,
        reward_processing_classes=reward_tokenizer,
        peft_config=get_peft_config(model_args),
    )

    if training_args.eval_strategy != "no":
        generation_config = GenerationConfig(
            max_new_tokens=training_args.max_new_tokens, do_sample=True, temperature=training_args.temperature
        )
        completions_callback = LogCompletionsCallback(trainer, generation_config, num_prompts=8)
        trainer.add_callback(completions_callback)

    trainer.train()

    # Save and push to hub
    trainer.save_model(training_args.output_dir)
    if training_args.push_to_hub:
        trainer.push_to_hub(dataset_name=script_args.dataset_name)
ICL/RL/trl_source/examples/scripts/openenv/browsergym_llm.py ADDED
@@ -0,0 +1,506 @@
# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# /// script
# dependencies = [
#     "trl[vllm]",
#     "peft",
#     "trackio",
#     "kernels",
#     "openenv-browsergym @ git+https://huggingface.co/spaces/openenv/browsergym_env",
# ]
# ///

"""
Simple script to run GRPO training with OpenEnv's BrowserGym environment and vLLM for LLMs.

This script is optimized for text-only Language Models (LLMs). It uses the accessibility
tree text from BrowserGym, making it memory-efficient.

The environment runs on a Hugging Face Space by default.

Setup (Option A - Install from HF Space, recommended):

```sh
uv pip install git+https://huggingface.co/spaces/openenv/browsergym_env
```

Setup (Option B - Clone OpenEnv repo, for development):

```sh
git clone https://github.com/meta-pytorch/OpenEnv.git
cd OpenEnv/envs/browsergym_env
uv pip install -e .
```

# Option 1: HF Spaces + Colocated vLLM (1 GPU required)
```sh
python examples/scripts/openenv/browsergym_llm.py --vllm-mode colocate
```

# Option 2: HF Spaces + Separate vLLM server (2 GPUs required)

# Spin up vLLM server (Terminal 1)
```sh
CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8001
```

# Run training (Terminal 2)
```sh
CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/browsergym_llm.py --vllm-mode server --vllm-server-url http://localhost:8001
```
"""

from __future__ import annotations

import argparse
from datetime import datetime
from pathlib import Path

from browsergym_env import BrowserGymAction, BrowserGymEnv
from datasets import Dataset
from transformers import AutoTokenizer

from trl import GRPOConfig, GRPOTrainer
from trl.experimental.openenv import generate_rollout_completions


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Run GRPO training for BrowserGym MiniWoB using OpenEnv environment.")
    parser.add_argument(
        "--model-id",
        default="Qwen/Qwen3-0.6B",
        help="Model identifier passed to GRPOTrainer for fine-tuning.",
    )
    parser.add_argument(
        "--space-url",
        type=str,
        default="https://openenv-browsergym-env.hf.space",
        help="URL for the Hugging Face Space running the BrowserGym environment.",
    )
    parser.add_argument(
        "--benchmark",
        default="miniwob",
        help="BrowserGym benchmark to use (miniwob, webarena, etc.).",
    )
    parser.add_argument(
        "--task-name",
        default="click-test",
        help="Specific task within the benchmark (e.g., click-test, click-button).",
    )
    parser.add_argument(
        "--dataset-prompt",
        default="Complete the web task successfully.",
        help="Prompt text used to seed the training dataset.",
    )
    parser.add_argument(
        "--dataset-size",
        type=int,
        default=1000,
        help="Number of entries to include in the synthetic training dataset.",
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=10,
        help="Maximum number of steps per episode.",
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=32,
        help="Maximum number of new tokens to request from vLLM for each action.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.7,
        help="Sampling temperature used during rollout generation.",
    )
    parser.add_argument(
        "--top-k",
        type=int,
        default=50,
        help="Top-k sampling parameter forwarded to vLLM.",
    )
    parser.add_argument(
        "--top-p",
        type=float,
        default=None,
        help="Optional top-p sampling parameter forwarded to vLLM.",
    )
    parser.add_argument(
        "--learning-rate",
        type=float,
        default=5e-6,
        help="Learning rate for GRPO training.",
    )
    parser.add_argument(
        "--weight-decay",
        type=float,
        default=0.0,
        help="Weight decay applied during optimization.",
    )
    parser.add_argument(
        "--gradient-accumulation-steps",
        type=int,
        default=32,
        help="Gradient accumulation steps for GRPO training.",
    )
    parser.add_argument(
        "--warmup-steps",
        type=int,
        default=10,
        help="Warmup steps for the scheduler.",
    )
    parser.add_argument(
        "--per-device-batch-size",
        type=int,
        default=1,
        help="Per-device train batch size.",
    )
    parser.add_argument(
        "--num-generations",
        type=int,
        default=4,
        help="Number of rollout generations per dataset prompt.",
    )
    parser.add_argument(
        "--num-epochs",
        type=int,
        default=1,
        help="Number of training epochs.",
    )
    parser.add_argument(
        "--save-interval",
        type=int,
        default=50,
        help="Interval (in steps) between checkpoint saves.",
    )
    parser.add_argument(
        "--save-total-limit",
        type=int,
        default=None,
        help="Maximum number of checkpoints to keep.",
    )
    parser.add_argument(
        "--output-dir",
        default=None,
        help="Directory where training outputs and checkpoints are stored.",
    )
    parser.add_argument(
        "--run-name",
        default=None,
        help="Optional run name for logging systems.",
    )
    parser.add_argument(
        "--project",
        default=None,
        help="Optional project identifier for logging systems.",
    )
    parser.add_argument(
        "--vllm-mode",
        choices=("colocate", "server"),
        default="colocate",
        help="vLLM execution mode: 'colocate' or 'server'.",
    )
    parser.add_argument(
        "--vllm-server-url",
        type=str,
        default="http://localhost:8001",
        help="URL for the vLLM server (only used when --vllm-mode=server).",
    )
    parser.add_argument(
        "--logging-steps",
        type=int,
        default=1,
        help="Frequency of logging steps for GRPO training.",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        default=False,
        help="Enable verbose debugging output during rollouts.",
    )
    return parser.parse_args()


def sanitize_name(name: str) -> str:
    return name.replace("/", "-")


# ---------------------------------------------------------------------------
# System Prompt
# ---------------------------------------------------------------------------

SYSTEM_PROMPT = """You control a web browser through BrowserGym actions.
You must complete the given web task by interacting with the page.

Available actions:
- noop() - Do nothing
- click(bid) - Click element with BrowserGym ID (the number in brackets)
- fill(bid, text) - Fill input field with text
- send_keys(text) - Send keyboard input
- scroll(direction) - Scroll up/down

The page structure shows elements as: [bid] element_type 'element_text'
For example: [13] button 'Click Me!' means bid='13'

Reply with exactly ONE action on a single line, e.g.:
click('13')
fill('42', 'hello world')
noop()

Do not include explanations or multiple actions."""


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def make_user_prompt(goal: str, step_num: int, axtree: str, error: str = "") -> str:
    """Create user prompt from observation."""
    prompt_parts = [f"Step {step_num + 1}"]

    if goal:
        prompt_parts.append(f"Goal: {goal}")

    if error:
        prompt_parts.append(f"Previous action error: {error}")

    # Include accessibility tree (truncated for context)
    if axtree:
        max_len = 2000
        axtree_truncated = axtree[:max_len] + "..." if len(axtree) > max_len else axtree
        prompt_parts.append(f"Page structure:\n{axtree_truncated}")

    prompt_parts.append("What action do you take?")

    return "\n\n".join(prompt_parts)


def parse_action(response_text: str) -> str:
    """Parse BrowserGym action from model response."""
    # Extract first line that looks like an action
    for line in response_text.strip().split("\n"):
        line = line.strip()
        if "(" in line and ")" in line:
            return line

    # Fallback to noop if no valid action found
    return "noop()"


def rollout_once(
    trainer: GRPOTrainer,
    env: BrowserGymEnv,
    tokenizer: AutoTokenizer,
    dataset_prompt: str,
    max_steps: int,
    debug: bool = False,
) -> dict[str, list]:
    """Run one episode and collect training data (text-only, no screenshots)."""
    result = env.reset()
    observation = result.observation

    prompt_ids: list[int] = []
    completion_ids: list[int] = []
    logprobs: list[float] = []
    step_rewards: list[float] = []
    completion_rewards: list[float] = []

    for step_num in range(max_steps):
        if result.done:
            break

        # Create prompt from observation (text-only using accessibility tree)
        goal = observation.goal or dataset_prompt
        axtree = observation.axtree_txt or ""
        error = observation.error if observation.last_action_error else ""

        user_prompt = make_user_prompt(goal, step_num, axtree, error)
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ]
        prompt_text = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
        )

        # Generate action with vLLM
        rollout_outputs = generate_rollout_completions(trainer, [prompt_text])[0]
        prompt_ids.extend(rollout_outputs["prompt_ids"])
        completion_ids.extend(rollout_outputs["completion_ids"])
        logprobs.extend(rollout_outputs["logprobs"])

        completion_text = rollout_outputs.get("text") or tokenizer.decode(
            rollout_outputs["completion_ids"], skip_special_tokens=True
        )

        # Parse and execute action
        action_str = parse_action(completion_text)

        if debug:
            print(f"Step {step_num + 1}: {action_str}")

        # Take action in environment
        result = env.step(BrowserGymAction(action_str=action_str))
        observation = result.observation

        # Track rewards
        step_reward = float(result.reward or 0.0)
        step_rewards.append(step_reward)

        # Reward shaping: success is most important
        if result.done and step_reward > 0:
            completion_rewards.append(1.0)  # Task completed successfully
        elif result.done and step_reward == 0:
            completion_rewards.append(0.0)  # Task failed
        else:
            completion_rewards.append(step_reward)  # Intermediate reward

    # Final reward is based on task completion
    final_reward = completion_rewards[-1] if completion_rewards else 0.0

    return {
        "prompt_ids": prompt_ids,
        "completion_ids": completion_ids,
        "logprobs": logprobs,
        "step_rewards": step_rewards,
        "completion_reward": final_reward,
    }


# ---------------------------------------------------------------------------
# Rewards
# ---------------------------------------------------------------------------


def reward_completion(completions: list[str], **kwargs) -> list[float]:
    """Reward for task completion."""
    rewards = kwargs.get("completion_reward") if kwargs else None
    if rewards is None:
        return [0.0 for _ in completions]
    return [float(r) for r in rewards]


# ---------------------------------------------------------------------------
# Main entrypoint
# ---------------------------------------------------------------------------


def main() -> None:
    args = parse_args()

    # Connect to BrowserGym environment via Hugging Face Space
    client = BrowserGymEnv(base_url=args.space_url)
    print(f"🌍 Using Hugging Face Space environment at: {args.space_url}")

    dataset = Dataset.from_dict({"prompt": [args.dataset_prompt] * args.dataset_size})

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    default_output_dir = Path("outputs") / f"browsergym-grpo-{sanitize_name(args.model_id)}-{timestamp}"
    output_dir = Path(args.output_dir or default_output_dir)

    grpo_config = GRPOConfig(
        use_vllm=True,
        vllm_mode=args.vllm_mode,
        vllm_server_base_url=args.vllm_server_url if args.vllm_mode == "server" else None,
        vllm_gpu_memory_utilization=0.4,
        output_dir=str(output_dir),
        num_train_epochs=args.num_epochs,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        per_device_train_batch_size=args.per_device_batch_size,
        warmup_steps=args.warmup_steps,
        num_generations=args.num_generations,
        generation_batch_size=args.num_generations,  # Must be divisible by num_generations
        max_completion_length=args.max_new_tokens,
        logging_steps=args.logging_steps,
        report_to="trackio",
        trackio_space_id=f"browsergym-grpo-{sanitize_name(args.model_id)}-{timestamp}",
        save_strategy="steps",
        save_steps=args.save_interval,
        save_total_limit=args.save_total_limit,
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
    )

    grpo_config.run_name = args.run_name or f"run-{timestamp}"
    grpo_config.project = args.project or f"group-{sanitize_name(args.model_id)}"

    def rollout_func(prompts: list[str], trainer: GRPOTrainer) -> dict[str, list]:
        episode_prompt_ids: list[list[int]] = []
        episode_completion_ids: list[list[int]] = []
        episode_logprobs: list[list[float]] = []
        completion_rewards: list[float] = []

        if args.debug:
            print(f"\n[DEBUG] rollout_func called with {len(prompts)} prompts (LLM mode, text-only)")

        for i, prompt_text in enumerate(prompts):
            if args.debug:
                print(f"[DEBUG] Processing prompt {i + 1}/{len(prompts)}")
            episode = rollout_once(
                trainer=trainer,
                env=client,
                tokenizer=trainer.processing_class,
                dataset_prompt=prompt_text,
                max_steps=args.max_steps,
                debug=args.debug,
            )
            episode_prompt_ids.append(episode["prompt_ids"])
            episode_completion_ids.append(episode["completion_ids"])
            episode_logprobs.append(episode["logprobs"])
            completion_rewards.append(episode["completion_reward"])

        return {
            "prompt_ids": episode_prompt_ids,
            "completion_ids": episode_completion_ids,
            "logprobs": episode_logprobs,
            "completion_reward": completion_rewards,
        }

    trainer = GRPOTrainer(
        model=args.model_id,
        reward_funcs=[reward_completion],
        train_dataset=dataset,
        args=grpo_config,
        rollout_func=rollout_func,
    )

    print("=" * 80)
    print("Starting GRPO training with BrowserGym environment (LLM mode)")
    print(f"Benchmark: {args.benchmark}")
    print(f"Task: {args.task_name}")
    print(f"Model: {args.model_id}")
    print("Mode: LLM (text-only, using accessibility tree)")
    print(f"Using {args.num_generations} rollouts per dataset prompt")
    print(f"Output directory: {output_dir}")
    print("=" * 80)

    try:
        trainer.train()
        print("\nTraining completed successfully!")
    finally:
        client.close()


if __name__ == "__main__":
    main()
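To make the action-parsing contract concrete, here is how the `parse_action` helper defined above behaves on typical model replies (self-contained copy for illustration):

```python
# Mirrors parse_action from the script above: the first line containing "(" and ")" wins.
def parse_action(response_text: str) -> str:
    for line in response_text.strip().split("\n"):
        line = line.strip()
        if "(" in line and ")" in line:
            return line
    return "noop()"  # fallback when no action-like line is found

print(parse_action("I will click the button.\nclick('13')"))  # -> click('13')
print(parse_action("not sure what to do"))                    # -> noop()
```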
ICL/RL/trl_source/examples/scripts/openenv/echo.py ADDED
@@ -0,0 +1,248 @@
# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# /// script
# dependencies = [
#     "trl[vllm]",
#     "peft",
#     "trackio",
#     "kernels",
#     "openenv-echo-env @ git+https://huggingface.co/spaces/openenv/echo_env",
# ]
# ///


"""
Simple script to run GRPO training with OpenEnv's Echo environment and vLLM. The reward function encourages
longer completions.

Setup (Option A - Install from HF Space, recommended):

```sh
uv pip install git+https://huggingface.co/spaces/openenv/echo_env
```

Setup (Option B - Clone OpenEnv repo, for development):

```sh
git clone https://github.com/meta-pytorch/OpenEnv.git
cd OpenEnv/envs/echo_env
uv pip install -e .
```

# Option 1: HF Spaces + Colocated vLLM (1 GPU required)
```sh
python examples/scripts/openenv/echo.py --env-mode space --env-host https://openenv-echo-env.hf.space --vllm-mode colocate
```

# Option 2: HF Spaces + Separate vLLM server (2 GPUs required)

# Spin up vLLM server (Terminal 1)
```sh
CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-0.5B-Instruct --host 0.0.0.0 --port 8000
```

# Run training (Terminal 2)
```sh
CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/echo.py --env-mode space --env-host https://openenv-echo-env.hf.space --vllm-mode server --vllm-server-url http://localhost:8000
```

# Option 3: Local + Colocated vLLM (1 GPU required)

# Start the environment only if using --env-mode docker-local
```sh
docker run -d -p 8001:8001 registry.hf.space/openenv-echo-env:latest
```

```sh
python examples/scripts/openenv/echo.py --env-mode docker-local --vllm-mode colocate
```
"""

# ruff: noqa: T201
import argparse
import os
import subprocess
import sys
import time
from pathlib import Path

import requests
from datasets import load_dataset
from echo_env import EchoEnv
from echo_env.models import EchoAction

from trl import GRPOConfig, GRPOTrainer, RichProgressCallback
from trl.experimental.openenv import generate_rollout_completions


def parse_args():
    parser = argparse.ArgumentParser(description="Run GRPO training with Echo environment and vLLM.")

    parser.add_argument("--env-host", type=str, default="0.0.0.0", help="Host for the Echo environment.")
    parser.add_argument("--env-port", type=int, default=8001, help="Port for the Echo environment.")
    parser.add_argument(
        "--env-mode",
        choices=["local", "docker-local", "docker-image", "docker-hub", "space"],
        default="docker-image",
        help="Where to run the Echo environment: 'local' to launch it, 'docker-local' if already running locally, 'docker-image' to run from a Docker image, 'docker-hub' to run from Docker Hub, or 'space' to use a remote Space URL.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="Qwen/Qwen2.5-0.5B-Instruct",
        help="Model to use for training.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="trl-lib/ultrafeedback-prompt",
        help="Dataset to use for training.",
    )
    parser.add_argument(
        "--env-image", type=str, default="echo-env:latest", help="Docker image for the Echo environment."
    )
    parser.add_argument(
        "--vllm-mode",
        choices=["colocate", "server"],
        default="colocate",
        help="vLLM execution mode: 'colocate' or 'server'.",
    )
    parser.add_argument(
        "--vllm-server-url",
        type=str,
        default="http://localhost:8000",
        help="URL for the vLLM server (only used when --vllm-mode=server).",
    )

    return parser.parse_args()


def start_env_server(env_host: str, env_port: int):
    """Launch the Echo environment server locally."""
    env_url = f"http://{env_host}:{env_port}"
    print(f"⚡ Starting FastAPI server for Echo Environment on {env_url}...")

    work_dir = str(Path.cwd().parent.absolute())
    process = subprocess.Popen(
        [sys.executable, "-m", "uvicorn", "echo_env.server.app:app", "--host", env_host, "--port", str(env_port)],
        env={**os.environ, "PYTHONPATH": f"{work_dir}/src"},
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        cwd=work_dir,
    )

    print("⏳ Waiting for server to start...")
    time.sleep(5)

    try:
        requests.get(f"{env_url}/health", timeout=2)
        print("\n✅ Echo Environment server is running!")
    except Exception as e:
        print(f"\n❌ Server failed to start: {e}")
        if process.stderr:
            print(process.stderr.read())
        raise

    return process


def reward_from_env(completions, **kwargs):
    """Extract environment rewards for training."""
    env_rewards = kwargs.get("env_reward", [])
    return [float(r) for r in env_rewards] if env_rewards else [0.0] * len(completions)


def main():
    args = parse_args()

    # Select environment mode
    if args.env_mode == "local":
        env_url = f"http://{args.env_host}:{args.env_port}"
        server_process = start_env_server(args.env_host, args.env_port)
    elif args.env_mode == "docker-local":
        env_url = f"http://{args.env_host}:{args.env_port}"
        server_process = None
        print(f"🌍 Using existing Echo Environment (Docker) at: {env_url}")
    elif args.env_mode == "docker-image":
        client = EchoEnv.from_docker_image(args.env_image)
        server_process = None
        print("🌍 Using Echo Environment (Docker) from local Image")
    elif args.env_mode == "docker-hub":
        client = EchoEnv.from_hub(args.env_image)
        server_process = None
        print("🌍 Using existing Echo Environment (Docker) from Hub Image")
    elif args.env_mode == "space":
        env_url = args.env_host
        server_process = None
        print(f"🌍 Using Hugging Face Space environment at: {env_url}")
    else:
        raise ValueError(f"Unknown environment mode: {args.env_mode}")

    if args.env_mode != "docker-hub" and args.env_mode != "docker-image":
        client = EchoEnv(base_url=env_url)
    dataset = load_dataset(args.dataset, split="train[:1000]")

    training_args = GRPOConfig(
        output_dir=f"{args.model.split('/')[-1]}-GRPO-Rollout",
        use_vllm=True,
        vllm_mode=args.vllm_mode,
        vllm_server_base_url=args.vllm_server_url if args.vllm_mode == "server" else None,
        logging_steps=1,
        report_to="trackio",
        trackio_space_id=f"{args.model.split('/')[-1]}-GRPO-Rollout",
        num_train_epochs=1,
        max_completion_length=2048,
        gradient_accumulation_steps=4,
    )

    def rollout_func(prompts: list[str], trainer: GRPOTrainer) -> dict[str, list]:
        outputs = generate_rollout_completions(trainer, prompts)
        tokenizer = trainer.processing_class

        completions_text = [tokenizer.decode(output["completion_ids"], skip_special_tokens=True) for output in outputs]

        env_result = client.reset()
        env_rewards: list[float] = []
        for message in completions_text:
            env_result = client.step(EchoAction(message=message))
            env_rewards.append(env_result.reward)

        return {
            "prompt_ids": [output["prompt_ids"] for output in outputs],
            "completion_ids": [output["completion_ids"] for output in outputs],
            "logprobs": [output["logprobs"] for output in outputs],
            "env_reward": env_rewards,
        }

    trainer = GRPOTrainer(
        model=args.model,
        reward_funcs=reward_from_env,
        args=training_args,
        train_dataset=dataset,
        rollout_func=rollout_func,
        callbacks=[RichProgressCallback()],
    )

    trainer.train()
    time.sleep(5)

    if server_process:
        print("🛑 Terminating Echo Environment server...")
        server_process.terminate()


if __name__ == "__main__":
    main()
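Both OpenEnv scripts rely on the same rollout contract: `rollout_func` returns per-prompt token id lists plus any extra columns (here `env_reward`), which GRPOTrainer forwards to reward functions through `**kwargs`. A schematic of the return value, with made-up token ids, assuming only the keys used above:

```python
# Schematic rollout_func result (values are illustrative, keys match the scripts above).
rollout_result = {
    "prompt_ids": [[101, 102], [101, 103]],          # token ids of each prompt
    "completion_ids": [[7, 8, 9], [7, 10]],          # token ids of each completion
    "logprobs": [[-0.1, -0.2, -0.3], [-0.4, -0.5]],  # per-token logprobs
    "env_reward": [1.0, 0.0],                        # extra column -> reward funcs via **kwargs
}
```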
ICL/RL/trl_source/examples/scripts/openenv/wordle.py ADDED
@@ -0,0 +1,607 @@
# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# /// script
# dependencies = [
#     "trl[vllm]",
#     "peft",
#     "trackio",
#     "kernels",
#     "openenv-textarena @ git+https://huggingface.co/spaces/sergiopaniego/wordle",
# ]
# ///


"""
Simple script to run GRPO training with OpenEnv's Wordle environment and vLLM.

Setup (Option A - Install from HF Space, recommended):

```sh
uv pip install git+https://huggingface.co/spaces/sergiopaniego/wordle
```

# Option 1: HF Spaces + Colocated vLLM (1 GPU required)
```sh
python examples/scripts/openenv/wordle.py --vllm-mode colocate
```

# Option 2: HF Spaces + Separate vLLM server (2 GPUs required)

# Spin up vLLM server (Terminal 1)
```sh
CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen3-1.7B --host 0.0.0.0 --port 8000
```

# Run training (Terminal 2)
```sh
CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/wordle.py --vllm-mode server --vllm-server-url http://localhost:8000
```

# Option 3: Local Environment + Colocated vLLM (1 GPU required)

To run the Wordle environment locally, you have several options:

## Option 3a: Using Docker Image (Recommended)

First, build the Docker image from the textarena_env directory:
```sh
cd 3rd_party/OpenEnv/envs/textarena_env
docker build -t textarena-env:latest -f server/Dockerfile .
```

Then run the environment server:
```sh
docker run -d -p 8001:8001 textarena-env:latest
```

Finally, run training pointing to local server:
```sh
python examples/scripts/openenv/wordle.py --vllm-mode colocate --env-url http://localhost:8001
```

## Option 3b: Running Server Directly

From the textarena_env directory:
```sh
cd 3rd_party/OpenEnv/envs/textarena_env
uv venv && source .venv/bin/activate
uv pip install -e .
python -m uvicorn server.app:app --reload --port 8001
```

Then in another terminal, run training:
```sh
python examples/scripts/openenv/wordle.py --vllm-mode colocate --env-url http://localhost:8001
```

## Option 3c: Using Pre-built HF Space Image

```sh
docker run -d -p 8001:8001 registry.hf.space/burtenshaw-wordle:latest
python examples/scripts/openenv/wordle.py --vllm-mode colocate --env-url http://localhost:8001
```
"""

import argparse
import re
import sys
from collections.abc import Iterable
from datetime import datetime
from pathlib import Path

from datasets import Dataset
from transformers import AutoTokenizer

from trl import GRPOConfig, GRPOTrainer
from trl.experimental.openenv import generate_rollout_completions


# Ensure src/ is on the path
sys.path.insert(0, str(Path(__file__).parent / "src"))

from textarena_env import TextArenaAction, TextArenaEnv
from textarena_env.models import TextArenaMessage
from textarena_env.rewards import extract_feedback_counts, extract_guess, extract_wordle_feedback


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Run GRPO training for Wordle using the TextArena OpenEnv environment."
    )
    parser.add_argument(
        "--tokenizer-id",
        default="Qwen/Qwen3-1.7B",
        help="Model identifier used to load the tokenizer.",
    )
    parser.add_argument(
        "--model-id",
        default="Qwen/Qwen3-1.7B",
        help="Model identifier passed to GRPOTrainer for fine-tuning.",
    )
    parser.add_argument(
        "--env-url", type=str, default="https://sergiopaniego-wordle.hf.space", help="URL for the environment server."
    )
    parser.add_argument(
        "--system-prompt-path",
        default="wordle_prompt.txt",
        help="Path to the file containing the system prompt.",
    )
    parser.add_argument(
        "--dataset-prompt",
        default="Play Wordle like an expert.",
        help="Prompt text used to seed the training dataset.",
    )
    parser.add_argument(
        "--dataset-size",
        type=int,
        default=3000,
        help="Number of entries to include in the synthetic training dataset.",
    )
    parser.add_argument(
        "--max-turns",
        type=int,
        default=6,
        help="Maximum number of turns to play in the Wordle environment per episode.",
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=8,
        help="Maximum number of new tokens to request from vLLM for each guess.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.8,
        help="Sampling temperature used during rollout generation.",
    )
    parser.add_argument(
        "--top-k",
        type=int,
        default=10,
        help="Top-k sampling parameter forwarded to vLLM.",
    )
    parser.add_argument(
        "--top-p",
        type=float,
        default=None,
        help="Optional top-p sampling parameter forwarded to vLLM.",
    )
    parser.add_argument(
        "--learning-rate",
        type=float,
        default=1e-6,
        help="Learning rate for GRPO training.",
    )
    parser.add_argument(
        "--weight-decay",
        type=float,
        default=0.0,
        help="Weight decay applied during optimization.",
    )
    parser.add_argument(
        "--gradient-accumulation-steps",
        type=int,
        default=64,
        help="Gradient accumulation steps for GRPO training.",
    )
    parser.add_argument(
        "--warmup-steps",
        type=int,
        default=10,
        help="Warmup steps for the scheduler.",
    )
    parser.add_argument(
        "--per-device-batch-size",
        type=int,
        default=1,
        help="Per-device train batch size.",
    )
    parser.add_argument(
        "--num-generations",
        type=int,
        default=4,
        help="Number of rollout generations per dataset prompt.",
    )
    parser.add_argument(
        "--num-epochs",
        type=int,
221
+ default=1,
222
+ help="Number of training epochs.",
223
+ )
224
+ parser.add_argument(
225
+ "--save-interval",
226
+ type=int,
227
+ default=10,
228
+ help="Interval (in steps) between checkpoint saves.",
229
+ )
230
+ parser.add_argument(
231
+ "--save-total-limit",
232
+ type=int,
233
+ default=None,
234
+ help="Maximum number of checkpoints to keep.",
235
+ )
236
+ parser.add_argument(
237
+ "--output-dir",
238
+ default=None,
239
+ help="Directory where training outputs and checkpoints are stored.",
240
+ )
241
+ parser.add_argument(
242
+ "--run-name",
243
+ default=None,
244
+ help="Optional run name for logging systems.",
245
+ )
246
+ parser.add_argument(
247
+ "--project",
248
+ default=None,
249
+ help="Optional project identifier for logging systems.",
250
+ )
251
+ parser.add_argument(
252
+ "--trackio-space-id",
253
+ default="Wordle-GRPO",
254
+ help="TrackIO space identifier.",
255
+ )
256
+ parser.add_argument(
257
+ "--vllm-mode",
258
+ choices=("colocate", "server"),
259
+ default="colocate",
260
+ help="vLLM execution mode: 'colocate' or 'server'.",
261
+ )
262
+ parser.add_argument(
263
+ "--vllm-server-url",
264
+ type=str,
265
+ default="http://localhost:8000",
266
+ help="URL for the vLLM server (only used when --vllm-mode=server).",
267
+ )
268
+ parser.add_argument(
269
+ "--logging-steps",
270
+ type=int,
271
+ default=1,
272
+ help="Frequency of logging steps for GRPO training.",
273
+ )
274
+ return parser.parse_args()
275
+
276
+
277
+ def resolve_system_prompt(path: str) -> str:
278
+ prompt_path = Path(path)
279
+ if not prompt_path.is_file():
280
+ prompt_path = Path(__file__).parent / path
281
+ return prompt_path.read_text()
282
+
283
+
284
+ def sanitize_name(name: str) -> str:
285
+ return name.replace("/", "-")
286
+
287
+
288
+ # ---------------------------------------------------------------------------
289
+ # Helpers
290
+ # ---------------------------------------------------------------------------
291
+
292
+
293
+ def format_history(messages: Iterable[TextArenaMessage]) -> str:
294
+ lines: list[str] = []
295
+ for message in messages:
296
+ tag = message.category or "MESSAGE"
297
+ content = message.content.strip()
298
+ if not content:
299
+ continue
300
+ lines.append(f"[{tag}] {content}")
301
+ return "\n".join(lines)
302
+
303
+
304
+ def make_user_prompt(prompt_text: str, messages: Iterable[TextArenaMessage]) -> str:
305
+ history = format_history(messages)
306
+ # Only use messages for conversation history - the prompt is already included as the first message
307
+ history_section = history if history else "[PROMPT] Awaiting first feedback."
308
+ return f"Conversation so far:\n{history_section}\n\nReply with your next guess enclosed in square brackets."
309
+
310
+
311
+ def rollout_once(
312
+ trainer: GRPOTrainer,
313
+ env: TextArenaEnv,
314
+ tokenizer: AutoTokenizer,
315
+ dataset_prompt: str,
316
+ system_prompt: str,
317
+ max_turns: int,
318
+ max_new_tokens: int = 16,
319
+ ) -> dict[str, list]:
320
+ result = env.reset()
321
+ observation = result.observation
322
+
323
+ prompt_ids: list[int] = []
324
+ completion_ids: list[int] = []
325
+ logprobs: list[float] = []
326
+ env_mask: list[int] = [] # 1 for model-generated tokens, 0 for environment tokens
327
+ model_outputs: list[str] = []
328
+ raw_rewards: list[float] = []
329
+ position_scores: list[float] = []
330
+ correct_scores: list[float] = []
331
+ prev_env_output_len: int = 0 # Track length to only add NEW portion each turn
332
+
333
+ accumulated_messages: list[dict[str, str]] = [{"role": "system", "content": system_prompt}]
334
+ # Build initial prompt (only once, at the start)
335
+ # The initial env messages are included in the prompt, not completion
336
+ base_prompt = observation.prompt or dataset_prompt
337
+ initial_user_prompt = make_user_prompt(base_prompt, observation.messages)
338
+ # Track initial env output length so we don't add it again
339
+ initial_env_output = format_history(observation.messages) if observation.messages else ""
340
+ prev_env_output_len = len(initial_env_output)
341
+ initial_messages = accumulated_messages + [{"role": "user", "content": initial_user_prompt}]
342
+ initial_prompt_text = tokenizer.apply_chat_template(
343
+ initial_messages,
344
+ add_generation_prompt=True,
345
+ tokenize=False,
346
+ enable_thinking=False,
347
+ )
348
+ # Tokenize initial prompt once - this is the base prompt for the entire episode.
349
+ # GRPO expects one prompt-completion pair per episode, where:
350
+ # - prompt_ids = the initial/base prompt (what the model sees at episode start)
351
+ # - completion_ids = all model responses + env feedback from all turns concatenated
352
+ # Note: The actual prompts used for generation in each turn are longer (include conversation history),
353
+ # but we only count the initial prompt tokens here.
354
+ initial_prompt_ids = tokenizer.encode(initial_prompt_text, add_special_tokens=False)
355
+ prompt_ids.extend(initial_prompt_ids)
356
+
357
+ for _turn in range(max_turns):
358
+ if result.done:
359
+ break
360
+
361
+ base_prompt = observation.prompt or dataset_prompt
362
+ user_prompt = make_user_prompt(base_prompt, observation.messages)
363
+ messages = accumulated_messages + [{"role": "user", "content": user_prompt}]
364
+ prompt_text = tokenizer.apply_chat_template(
365
+ messages,
366
+ add_generation_prompt=True,
367
+ tokenize=False,
368
+ enable_thinking=False,
369
+ )
370
+
371
+ rollout_outputs = generate_rollout_completions(
372
+ trainer, [prompt_text], generation_overrides={"max_tokens": max_new_tokens}
373
+ )[0]
374
+ # Add model-generated completion tokens and logprobs with newlines for readability
375
+ newline_tokens = tokenizer.encode("\n", add_special_tokens=False)
376
+ completion_ids.extend(newline_tokens) # newline before guess
377
+ logprobs.extend([0.0] * len(newline_tokens))
378
+ env_mask.extend([1] * len(newline_tokens)) # newlines are part of model output format
379
+
380
+ completion_ids.extend(rollout_outputs["completion_ids"])
381
+ logprobs.extend(rollout_outputs["logprobs"])
382
+ env_mask.extend([1] * len(rollout_outputs["completion_ids"])) # model-generated tokens
383
+
384
+ completion_ids.extend(newline_tokens) # newline after guess
385
+ logprobs.extend([0.0] * len(newline_tokens))
386
+ env_mask.extend([1] * len(newline_tokens)) # newlines are part of model output format
387
+ completion_text = rollout_outputs.get("text") or tokenizer.decode(
388
+ rollout_outputs["completion_ids"], skip_special_tokens=True
389
+ )
390
+ guess = extract_guess(completion_text)
391
+ model_outputs.append(completion_text.strip()) # Store raw model output for format reward
392
+
393
+ result = env.step(TextArenaAction(message=guess))
394
+
395
+ raw_rewards.append(float(result.reward or 0.0))
396
+ observation = result.observation
397
+ correct_score = float(result.reward or 0.0)
398
+ feedback = extract_wordle_feedback(observation)
399
+
400
+ full_env_output = format_history(observation.messages) if observation.messages else ""
401
+ new_env_output = full_env_output[prev_env_output_len:].lstrip("\n")
402
+ prev_env_output_len = len(full_env_output)
403
+
404
+ if new_env_output:
405
+ env_output_tokens = tokenizer.encode(new_env_output, add_special_tokens=False)
406
+ completion_ids.extend(env_output_tokens) # Add to completion_ids
407
+ logprobs.extend([0.0] * len(env_output_tokens)) # Placeholder (ignored via env_mask=0)
408
+ env_mask.extend([0] * len(env_output_tokens)) # Environment tokens - mask out from loss
409
+ completion_with_env = completion_text + "\n" + new_env_output
410
+ else:
411
+ completion_with_env = completion_text
412
+
413
+ accumulated_messages.append({"role": "user", "content": user_prompt})
414
+ accumulated_messages.append({"role": "assistant", "content": completion_with_env})
415
+
416
+ if not feedback:
417
+ position_score = 0.0
418
+ else:
419
+ green_count, yellow_count = extract_feedback_counts(feedback)
420
+ position_score = (green_count + 0.5 * yellow_count) / 5.0
421
+
422
+ position_scores.append(position_score)
423
+ correct_scores.append(correct_score)
424
+
425
+ # Use the final correct reward (win/lose is binary at end)
426
+ correct_reward_value = correct_scores[-1] if correct_scores else (raw_rewards[-1] if raw_rewards else 0.0)
427
+
428
+ # Position reward as shaping signal:
429
+ # - If model WINS: position_reward = 1.0 (no penalty for winning fast)
430
+ # - If model LOSES: position_reward = last attempt (where it ended up)
431
+ if correct_reward_value >= 1.0:
432
+ final_position_reward = 1.0
433
+ else:
434
+ final_position_reward = position_scores[-1] if position_scores else 0.0
435
+
436
+ return {
437
+ "prompt_ids": prompt_ids,
438
+ "completion_ids": completion_ids,
439
+ "logprobs": logprobs,
440
+ "env_mask": env_mask,
441
+ "raw_rewards": raw_rewards,
442
+ "correct_reward": correct_reward_value,
443
+ "position_reward": final_position_reward,
444
+ "model_outputs": model_outputs,
445
+ }
446
+
447
+
448
+ # ---------------------------------------------------------------------------
449
+ # Rewards
450
+ # ---------------------------------------------------------------------------
451
+
452
+
453
+ def reward_correct(completions: list[str], **kwargs) -> list[float]:
454
+ """Reward from environment (correct answer)."""
455
+ rewards = kwargs.get("correct_reward") if kwargs else None
456
+ if rewards is None:
457
+ return [0.0 for _ in completions]
458
+ return [float(r) for r in rewards]
459
+
460
+
461
+ def reward_position(completions: list[str], **kwargs) -> list[float]:
462
+ """Position reward: green worth 1.0, yellow worth 0.5, normalized by 5."""
463
+ rewards = kwargs.get("position_reward") if kwargs else None
464
+ if rewards is None:
465
+ return [0.0 for _ in completions]
466
+ return [float(r) for r in rewards]
467
+
468
+
469
+ def compute_format_reward(model_outputs: list[str]) -> float:
470
+ """Compute format reward from a list of model outputs (one per turn).
471
+
472
+ Each output should be exactly [5 letters] with optional whitespace.
473
+ Returns proportion of correctly formatted outputs.
474
+ """
475
+ if not model_outputs:
476
+ return 0.0
477
+
478
+ exact_pattern = re.compile(r"^\s*\[[A-Za-z]{5}\]\s*$")
479
+ correct_count = sum(1 for output in model_outputs if exact_pattern.match(output))
480
+
481
+ return correct_count / len(model_outputs)
482
+
483
+
484
+ def reward_format_strict(completions: list[str], **kwargs) -> list[float]:
485
+ """Format reward - pre-computed in rollout_func."""
486
+ rewards = kwargs.get("format_reward") if kwargs else None
487
+ if rewards is None:
488
+ return [0.0 for _ in completions]
489
+ return [float(r) for r in rewards]
490
+
491
+
492
+ # ---------------------------------------------------------------------------
493
+ # Main entrypoint
494
+ # ---------------------------------------------------------------------------
495
+
496
+
497
+ def main() -> None:
498
+ args = parse_args()
499
+
500
+ tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_id)
501
+ tokenizer.pad_token = tokenizer.eos_token
502
+
503
+ client = TextArenaEnv(base_url=args.env_url)
504
+
505
+ system_prompt = resolve_system_prompt(args.system_prompt_path)
506
+
507
+ dataset = Dataset.from_dict({"prompt": [args.dataset_prompt] * args.dataset_size})
508
+
509
+ timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
510
+ default_output_dir = Path("outputs") / f"wordle-grpo-{sanitize_name(args.model_id)}-{timestamp}"
511
+ output_dir = Path(args.output_dir or default_output_dir)
512
+
513
+ grpo_config = GRPOConfig(
514
+ use_vllm=True,
515
+ vllm_mode=args.vllm_mode,
516
+ vllm_server_base_url=args.vllm_server_url if args.vllm_mode == "server" else None,
517
+ output_dir=str(output_dir),
518
+ num_train_epochs=args.num_epochs,
519
+ learning_rate=args.learning_rate,
520
+ weight_decay=args.weight_decay,
521
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
522
+ per_device_train_batch_size=args.per_device_batch_size,
523
+ warmup_steps=args.warmup_steps,
524
+ num_generations=args.num_generations,
525
+ max_completion_length=1024, # Full episode length, not per-turn
526
+ logging_steps=args.logging_steps,
527
+ log_completions=True,
528
+ report_to="trackio",
529
+ trackio_space_id=f"wordle-grpo-{sanitize_name(args.model_id)}-{timestamp}",
530
+ save_strategy="steps",
531
+ save_steps=args.save_interval,
532
+ save_total_limit=args.save_total_limit,
533
+ temperature=args.temperature,
534
+ top_k=args.top_k,
535
+ top_p=args.top_p,
536
+ vllm_gpu_memory_utilization=0.25,
537
+ vllm_max_model_length=8192,
538
+ vllm_importance_sampling_mode="token_truncate", # Less aggressive than default sequence_mask
539
+ optim="adamw_torch",
540
+ max_grad_norm=1.0, # Clip gradients to prevent explosion
541
+ )
542
+
543
+ grpo_config.run_name = args.run_name or f"run-{timestamp}"
544
+ grpo_config.project = args.project or f"wordle-grpo-{sanitize_name(args.model_id)}-{timestamp}"
545
+ grpo_config.trackio_space_id = args.trackio_space_id
546
+
547
+ def rollout_func(prompts: list[str], trainer: GRPOTrainer) -> dict[str, list]:
548
+ episode_prompt_ids: list[list[int]] = []
549
+ episode_completion_ids: list[list[int]] = []
550
+ episode_logprobs: list[list[float]] = []
551
+ episode_env_masks: list[list[int]] = []
552
+ correctness_rewards: list[float] = []
553
+ position_rewards: list[float] = []
554
+ format_rewards: list[float] = []
555
+
556
+ for prompt_text in prompts:
557
+ episode = rollout_once(
558
+ trainer=trainer,
559
+ env=client,
560
+ tokenizer=tokenizer,
561
+ dataset_prompt=prompt_text,
562
+ system_prompt=system_prompt,
563
+ max_turns=args.max_turns,
564
+ max_new_tokens=args.max_new_tokens,
565
+ )
566
+ episode_prompt_ids.append(episode["prompt_ids"])
567
+ episode_completion_ids.append(episode["completion_ids"])
568
+ episode_logprobs.append(episode["logprobs"])
569
+ episode_env_masks.append(episode["env_mask"])
570
+ correctness_rewards.append(episode["correct_reward"])
571
+ position_rewards.append(episode["position_reward"])
572
+ format_rewards.append(compute_format_reward(episode["model_outputs"]))
573
+
574
+ return {
575
+ "prompt_ids": episode_prompt_ids,
576
+ "completion_ids": episode_completion_ids,
577
+ "logprobs": episode_logprobs,
578
+ "env_mask": episode_env_masks,
579
+ "correct_reward": correctness_rewards,
580
+ "position_reward": position_rewards,
581
+ "format_reward": format_rewards,
582
+ }
583
+
584
+ trainer = GRPOTrainer(
585
+ model=args.model_id,
586
+ processing_class=tokenizer,
587
+ reward_funcs=[
588
+ reward_correct,
589
+ reward_position,
590
+ reward_format_strict,
591
+ ],
592
+ train_dataset=dataset,
593
+ args=grpo_config,
594
+ rollout_func=rollout_func,
595
+ )
596
+
597
+ print("Starting GRPO training with Wordle environment...")
598
+ print(f"Using {args.num_generations} rollouts per dataset prompt")
599
+
600
+ try:
601
+ trainer.train()
602
+ finally:
603
+ client.close()
604
+
605
+
606
+ if __name__ == "__main__":
607
+ main()
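A minimal sketch (with invented token ids, not output from a real tokenizer) of the episode-level bookkeeping that `rollout_once` produces: model-generated tokens get `env_mask=1` and real logprobs, while environment feedback tokens are appended with `env_mask=0` and placeholder logprobs so they are excluded from the GRPO loss.

```python
# Illustrative only: these token ids are made up.
completion_ids = [11, 12, 13, 901, 902, 903, 14, 15]          # guess, env feedback, next guess
logprobs = [-0.3, -0.9, -0.1, 0.0, 0.0, 0.0, -0.5, -0.2]      # 0.0 placeholders for env tokens
env_mask = [1, 1, 1, 0, 0, 0, 1, 1]                           # 0 = environment token, masked out

assert len(completion_ids) == len(logprobs) == len(env_mask)
trainable = [t for t, m in zip(completion_ids, env_mask) if m == 1]
print(trainable)  # [11, 12, 13, 14, 15] -- only these positions contribute to the loss
```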
ICL/RL/trl_source/examples/scripts/openenv/wordle_prompt.txt ADDED
@@ -0,0 +1,105 @@
1
+ You are an expert Wordle solver with deep knowledge of English vocabulary, letter frequency patterns, and optimal guessing strategies.
2
+
3
+ ## GAME RULES
4
+
5
+ 1. The target is a 5-letter English word
6
+ 2. You have 6 attempts to guess the correct word
7
+ 3. After each guess, you receive color-coded feedback:
8
+ - GREEN: Letter is correct and in the correct position
9
+ - YELLOW: Letter is in the word but in the wrong position
10
+ - GRAY: Letter is not in the word at all
11
+ 4. All guesses must be valid 5-letter English words
12
+ 5. You cannot reuse a word you've already guessed
13
+
14
+ ## RESPONSE FORMAT
15
+
16
+ Only respond with your next guess in square brackets, e.g., [crane].
17
+
18
+ Format:
19
+ ```
20
+ [guess]
21
+ ```
22
+
23
+
24
+ ## STRATEGIC APPROACH
25
+
26
+ Do not repeat the same guess twice.
27
+
28
+ ### Opening Strategy
29
+ - Start with words rich in common vowels (A, E, I, O, U) and consonants (R, S, T, L, N)
30
+ - Optimal starters: CRANE, SLATE, STARE, AROSE, IRATE
31
+ - Prioritize words that test the most common letters in different positions
32
+
33
+ ### Mid-Game Strategy
34
+ - Use confirmed GREEN letters in their correct positions
35
+ - Place YELLOW letters in different positions than where they appeared
36
+ - Eliminate GRAY letters entirely from consideration
37
+ - If multiple letters are unknown, prioritize common letter combinations (TH, CH, ST, ER, etc.)
38
+ - Consider letter frequency: E is most common, followed by A, R, I, O, T, N, S
39
+
40
+ ### Vowel Placement
41
+ - Most 5-letter words have 2 vowels
42
+ - Common patterns: consonant-consonant-vowel-consonant-vowel (like CRANE) or consonant-vowel-vowel-consonant-vowel (like HOUSE)
43
+ - If you have 1-2 vowels confirmed, consider where the others might be
44
+
45
+ ### Advanced Tactics
46
+ - Use "sacrificial" guesses to test multiple new letters if you have attempts to spare
47
+ - Avoid repeating letter patterns unless you're certain (e.g., SPEED has two E's)
48
+ - Think about word endings: -ER, -LY, -ED, -ING are common but may not fit the 5-letter constraint
49
+ - Consider less common letters (Q, X, Z, J) only when you've eliminated most common options
50
+
51
+ ### Common Pitfalls to Avoid
52
+ - Don't reuse eliminated GRAY letters
53
+ - Don't place YELLOW letters in the same position where they already appeared
54
+ - Don't ignore confirmed GREEN letters
55
+ - Don't guess words that contradict known information
56
+
57
+ ## EXAMPLES
58
+
59
+ ### Example 1: Opening Guess
60
+ "Starting with a word that tests common vowels and consonants in varied positions."
61
+ [crane]
62
+
63
+ ### Example 2: After Receiving Feedback
64
+ Previous guess: CRANE
65
+ Feedback: C=gray, R=yellow, A=green, N=gray, E=yellow
66
+
67
+ "A is confirmed in position 2. R and E are in the word but need different positions. C and N are eliminated. I'll try a word with A in position 2, and test R and E in new positions along with common letters like S and T."
68
+ [spare]
69
+
70
+ ### Example 3: Narrowing Down
71
+ Previous guess: SPARE (S=gray, P=gray, A=green, R=green, E=green)
72
+ Feedback summary: __ARE with A in position 3, R in position 4, E in position 5
73
+
74
+ "I have __ARE confirmed. Positions 1 and 2 are unknown, and S and P are eliminated. Common letters to try: F, L, G, B, N. Testing with FLARE."
75
+ [flare]
76
+
77
+ ### Example 4: Final Deduction
78
+ Previous feedback shows: _ARED with position 1 unknown and all common consonants tested
79
+
80
+ "Only position 1 remains. I've eliminated S, P, C, N. Common starting consonants left are B, F, G, H. BARED is a common word."
81
+ [bared]
82
+
83
+ ## LETTER FREQUENCY REFERENCE
84
+
85
+ Most common letters in 5-letter words (in order):
86
+ S, E, A, O, R, I, L, T, N, U, D, Y, C, P, M, H, G, B, K, F
87
+
88
+ Most common starting letters:
89
+ S, C, B, T, P, A, F, G, D, M
90
+
91
+ Most common ending letters:
92
+ E, Y, T, S, R, L, N, D
93
+
94
+ ## IMPORTANT CONSTRAINTS
95
+
96
+ - Use lowercase only
97
+ - One guess per response
98
+ - Must be exactly 5 letters
99
+ - Must be a real English word from standard dictionaries
100
+ - Never repeat a previous guess
101
+ - Always end your response with your guess in square brackets
102
+
103
+ ## YOUR GOAL
104
+
105
+ Solve the Wordle in as few guesses as possible by strategically using feedback to eliminate impossible words and narrow down the solution space efficiently.
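The bracketed-guess protocol described above is what the training script's strict format reward checks. A standalone sketch of that check (the regex mirrors `compute_format_reward` in wordle.py):

```python
import re

# Accepts exactly one bracketed 5-letter guess per turn, nothing else.
exact_pattern = re.compile(r"^\s*\[[A-Za-z]{5}\]\s*$")

outputs = ["[crane]", "  [SLATE] ", "my guess: [crane]", "[cat]"]
matches = [bool(exact_pattern.match(o)) for o in outputs]
print(matches)                      # [True, True, False, False]
print(sum(matches) / len(outputs))  # 0.5 -> per-episode format reward
```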
ICL/RL/trl_source/examples/scripts/ppo/ppo.py ADDED
@@ -0,0 +1,180 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # /// script
16
+ # dependencies = [
17
+ # "trl",
18
+ # "peft",
19
+ # "trackio",
20
+ # "kernels",
21
+ # ]
22
+ # ///
23
+
24
+ import os
25
+ import shutil
26
+
27
+ import torch
28
+ from accelerate import PartialState
29
+ from datasets import load_dataset
30
+ from transformers import (
31
+ AutoModelForCausalLM,
32
+ AutoModelForSequenceClassification,
33
+ AutoTokenizer,
34
+ HfArgumentParser,
35
+ )
36
+
37
+ from trl import ModelConfig, ScriptArguments, get_kbit_device_map, get_peft_config, get_quantization_config
38
+ from trl.experimental.ppo import PPOConfig, PPOTrainer
39
+
40
+
41
+ # Enable logging in a Hugging Face Space
42
+ os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")
43
+
44
+
45
+ """
46
+ python -i examples/scripts/ppo/ppo.py \
47
+ --dataset_name trl-internal-testing/descriptiveness-sentiment-trl-style \
48
+ --dataset_train_split descriptiveness \
49
+ --learning_rate 3e-6 \
50
+ --output_dir pythia-1b-deduped-descriptiveness-sentiment-trl-style-ppo \
51
+ --per_device_train_batch_size 64 \
52
+ --gradient_accumulation_steps 1 \
53
+ --total_episodes 10000 \
54
+ --model_name_or_path EleutherAI/pythia-1b-deduped \
55
+ --missing_eos_penalty 1.0
56
+
57
+ accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml \
58
+ examples/scripts/ppo/ppo.py \
59
+ --dataset_name trl-internal-testing/descriptiveness-sentiment-trl-style \
60
+ --dataset_train_split descriptiveness \
61
+ --output_dir pythia-1b-deduped-descriptiveness-sentiment-trl-style-ppo \
62
+ --num_ppo_epochs 1 \
63
+ --num_mini_batches 1 \
64
+ --learning_rate 3e-6 \
65
+ --per_device_train_batch_size 1 \
66
+ --gradient_accumulation_steps 16 \
67
+ --total_episodes 10000 \
68
+ --model_name_or_path EleutherAI/pythia-1b-deduped \
69
+ --sft_model_path EleutherAI/pythia-1b-deduped \
70
+ --reward_model_path EleutherAI/pythia-1b-deduped \
71
+ --local_rollout_forward_batch_size 1 \
72
+ --missing_eos_penalty 1.0
73
+ """
74
+
75
+
76
+ if __name__ == "__main__":
77
+ parser = HfArgumentParser((ScriptArguments, PPOConfig, ModelConfig))
78
+ script_args, training_args, model_args = parser.parse_args_into_dataclasses()
79
+ # remove output_dir if exists
80
+ shutil.rmtree(training_args.output_dir, ignore_errors=True)
81
+
82
+ ################
83
+ # Model & Tokenizer
84
+ ################
85
+ dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)
86
+ model_kwargs = dict(
87
+ revision=model_args.model_revision,
88
+ attn_implementation=model_args.attn_implementation,
89
+ dtype=dtype,
90
+ )
91
+ quantization_config = get_quantization_config(model_args)
92
+ if quantization_config is not None:
93
+ # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
94
+ model_kwargs["device_map"] = get_kbit_device_map()
95
+ model_kwargs["quantization_config"] = quantization_config
96
+
97
+ tokenizer = AutoTokenizer.from_pretrained(
98
+ model_args.model_name_or_path, padding_side="left", trust_remote_code=model_args.trust_remote_code
99
+ )
100
+ tokenizer.add_special_tokens({"pad_token": "[PAD]"})
101
+ value_model = AutoModelForSequenceClassification.from_pretrained(
102
+ training_args.reward_model_path,
103
+ trust_remote_code=model_args.trust_remote_code,
104
+ num_labels=1,
105
+ **model_kwargs,
106
+ )
107
+ reward_model = AutoModelForSequenceClassification.from_pretrained(
108
+ training_args.reward_model_path,
109
+ trust_remote_code=model_args.trust_remote_code,
110
+ num_labels=1,
111
+ **model_kwargs,
112
+ )
113
+ policy = AutoModelForCausalLM.from_pretrained(
114
+ training_args.sft_model_path, trust_remote_code=model_args.trust_remote_code, **model_kwargs
115
+ )
116
+
117
+ peft_config = get_peft_config(model_args)
118
+ if peft_config is None:
119
+ ref_policy = AutoModelForCausalLM.from_pretrained(
120
+ training_args.sft_model_path, trust_remote_code=model_args.trust_remote_code, **model_kwargs
121
+ )
122
+ else:
123
+ ref_policy = None
124
+
125
+ ################
126
+ # Dataset
127
+ ################
128
+ dataset = load_dataset(
129
+ script_args.dataset_name, name=script_args.dataset_config, split=script_args.dataset_train_split
130
+ )
131
+ eval_samples = 100
132
+ train_dataset = dataset.select(range(len(dataset) - eval_samples))
133
+ eval_dataset = dataset.select(range(len(dataset) - eval_samples, len(dataset)))
134
+ dataset_text_field = "prompt"
135
+
136
+ def prepare_dataset(dataset, tokenizer):
137
+ """pre-tokenize the dataset before training; only collate during training"""
138
+
139
+ def tokenize(element):
140
+ outputs = tokenizer(
141
+ element[dataset_text_field],
142
+ padding=False,
143
+ )
144
+ return {"input_ids": outputs["input_ids"]}
145
+
146
+ return dataset.map(
147
+ tokenize,
148
+ batched=True,
149
+ remove_columns=dataset.column_names,
150
+ num_proc=training_args.dataset_num_proc,
151
+ )
152
+
153
+ # Compute that only on the main process for faster data processing.
154
+ # see: https://github.com/huggingface/trl/pull/1255
155
+ with PartialState().local_main_process_first():
156
+ train_dataset = prepare_dataset(train_dataset, tokenizer)
157
+ eval_dataset = prepare_dataset(eval_dataset, tokenizer)
158
+
159
+ ################
160
+ # Training
161
+ ################
162
+ trainer = PPOTrainer(
163
+ args=training_args,
164
+ processing_class=tokenizer,
165
+ model=policy,
166
+ ref_model=ref_policy,
167
+ reward_model=reward_model,
168
+ value_model=value_model,
169
+ train_dataset=train_dataset,
170
+ eval_dataset=eval_dataset,
171
+ peft_config=peft_config,
172
+ )
173
+ trainer.train()
174
+
175
+ # Save and push to hub
176
+ trainer.save_model(training_args.output_dir)
177
+ if training_args.push_to_hub:
178
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
179
+
180
+ trainer.generate_completions()
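A standalone sketch of the pre-tokenization contract above: `PPOTrainer` consumes only `input_ids`, produced once via `Dataset.map`. The tiny `EleutherAI/pythia-14m` checkpoint is just an illustrative stand-in here.

```python
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-14m", padding_side="left")
ds = Dataset.from_dict({"prompt": ["The movie was", "I felt that the plot"]})

def tokenize(element):
    # Mirrors prepare_dataset above: no padding here; collation pads later.
    return {"input_ids": tokenizer(element["prompt"], padding=False)["input_ids"]}

ds = ds.map(tokenize, batched=True, remove_columns=ds.column_names)
print(ds[0])  # {'input_ids': [...]}
```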
ICL/RL/trl_source/examples/scripts/reward_modeling.py ADDED
@@ -0,0 +1,136 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # /// script
16
+ # dependencies = [
17
+ # "trl",
18
+ # "trackio",
19
+ # "kernels",
20
+ # ]
21
+ # ///
22
+
23
+ """
24
+ Full training:
25
+ python examples/scripts/reward_modeling.py \
26
+ --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
27
+ --dataset_name trl-lib/ultrafeedback_binarized \
28
+ --output_dir Qwen2-0.5B-Reward \
29
+ --per_device_train_batch_size 8 \
30
+ --num_train_epochs 1 \
31
+ --learning_rate 1.0e-5 \
32
+ --eval_strategy steps \
33
+ --eval_steps 50 \
34
+ --max_length 2048
35
+
36
+ LoRA:
37
+ python examples/scripts/reward_modeling.py \
38
+ --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
39
+ --dataset_name trl-lib/ultrafeedback_binarized \
40
+ --output_dir Qwen2-0.5B-Reward-LoRA \
41
+ --per_device_train_batch_size 8 \
42
+ --num_train_epochs 1 \
43
+ --learning_rate 1.0e-4 \
44
+ --eval_strategy steps \
45
+ --eval_steps 50 \
46
+ --max_length 2048 \
47
+ --use_peft \
48
+ --lora_task_type SEQ_CLS \
49
+ --lora_r 32 \
50
+ --lora_alpha 16
51
+ """
52
+
53
+ import os
54
+
55
+ import torch
56
+ from accelerate import logging
57
+ from datasets import load_dataset
58
+ from transformers import AutoModelForSequenceClassification, HfArgumentParser
59
+
60
+ from trl import (
61
+ ModelConfig,
62
+ RewardConfig,
63
+ RewardTrainer,
64
+ ScriptArguments,
65
+ get_kbit_device_map,
66
+ get_peft_config,
67
+ get_quantization_config,
68
+ )
69
+
70
+
71
+ logger = logging.get_logger(__name__)
72
+
73
+ # Enable logging in a Hugging Face Space
74
+ os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")
75
+
76
+
77
+ if __name__ == "__main__":
78
+ parser = HfArgumentParser((ScriptArguments, RewardConfig, ModelConfig))
79
+ script_args, training_args, model_args = parser.parse_args_into_dataclasses()
80
+
81
+ ################
82
+ # Model & Tokenizer
83
+ ################
84
+ dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)
85
+ model_kwargs = dict(
86
+ revision=model_args.model_revision,
87
+ use_cache=False if training_args.gradient_checkpointing else True,
88
+ dtype=dtype,
89
+ )
90
+ quantization_config = get_quantization_config(model_args)
91
+ if quantization_config is not None:
92
+ # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
93
+ model_kwargs["device_map"] = get_kbit_device_map()
94
+ model_kwargs["quantization_config"] = quantization_config
95
+
96
+ model = AutoModelForSequenceClassification.from_pretrained(
97
+ model_args.model_name_or_path, num_labels=1, trust_remote_code=model_args.trust_remote_code, **model_kwargs
98
+ )
99
+
100
+ if model_args.use_peft and model_args.lora_task_type != "SEQ_CLS":
101
+ logger.warning(
102
+ "You are using a `task_type` that is different than `SEQ_CLS` for PEFT. This will lead to silent bugs"
103
+ " Make sure to pass --lora_task_type SEQ_CLS when using this script with PEFT.",
104
+ )
105
+
106
+ ##############
107
+ # Load dataset
108
+ ##############
109
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
110
+
111
+ ##########
112
+ # Training
113
+ ##########
114
+ trainer = RewardTrainer(
115
+ model=model,
116
+ args=training_args,
117
+ train_dataset=dataset[script_args.dataset_train_split],
118
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
119
+ peft_config=get_peft_config(model_args),
120
+ )
121
+ trainer.train()
122
+
123
+ ############################
124
+ # Save model and push to Hub
125
+ ############################
126
+ trainer.save_model(training_args.output_dir)
127
+
128
+ if training_args.eval_strategy != "no":
129
+ metrics = trainer.evaluate()
130
+ trainer.log_metrics("eval", metrics)
131
+ trainer.save_metrics("eval", metrics)
132
+
133
+ # Save and push to hub
134
+ trainer.save_model(training_args.output_dir)
135
+ if training_args.push_to_hub:
136
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
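For reference, a minimal sketch of the implicit-prompt preference format this script trains on; `trl-lib/ultrafeedback_binarized` follows this `chosen`/`rejected` conversational schema.

```python
from datasets import Dataset

pair = {
    "chosen": [
        {"role": "user", "content": "What color is the sky?"},
        {"role": "assistant", "content": "It is blue."},
    ],
    "rejected": [
        {"role": "user", "content": "What color is the sky?"},
        {"role": "assistant", "content": "It is green."},
    ],
}
dataset = Dataset.from_list([pair])
print(dataset.column_names)  # ['chosen', 'rejected']
```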
ICL/RL/trl_source/examples/scripts/sft_vlm_gemma3.py ADDED
@@ -0,0 +1,194 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # /// script
16
+ # dependencies = [
17
+ # "trl",
18
+ # "Pillow>=9.4.0",
19
+ # "peft",
20
+ # "trackio",
21
+ # "kernels",
22
+ # ]
23
+ # ///
24
+
25
+ """
26
+ Train Gemma 3 on the HuggingFaceH4/llava-instruct-mix-vsft dataset (single-image).
27
+
28
+ accelerate launch \
29
+ --config_file examples/accelerate_configs/deepspeed_zero3.yaml \
30
+ examples/scripts/sft_vlm_gemma3.py \
31
+ --dataset_name HuggingFaceH4/llava-instruct-mix-vsft \
32
+ --model_name_or_path google/gemma-3-4b-it \
33
+ --per_device_train_batch_size 1 \
34
+ --output_dir Gemma-3-4B-SFT-MMIU \
35
+ --dtype bfloat16 \
36
+ --use_peft \
37
+ --lora_target_modules all-linear \
38
+ --attn_implementation eager
39
+
40
+ Train Gemma 3 on the FanqingM/MMIU-Benchmark dataset (multi-image).
41
+
42
+ accelerate launch \
43
+ --config_file examples/accelerate_configs/deepspeed_zero3.yaml \
44
+ examples/scripts/sft_vlm_gemma3.py \
45
+ --dataset_name FanqingM/MMIU-Benchmark \
46
+ --dataset_train_split test \
47
+ --model_name_or_path google/gemma-3-4b-it \
48
+ --per_device_train_batch_size 1 \
49
+ --output_dir Gemma-3-4B-SFT-MMIU \
50
+ --dtype bfloat16 \
51
+ --use_peft \
52
+ --lora_target_modules all-linear \
53
+ --attn_implementation eager
54
+ """
55
+
56
+ import io
57
+ import os
58
+ import zipfile
59
+
60
+ import torch
61
+ from datasets import DatasetDict, load_dataset
62
+ from huggingface_hub import hf_hub_download, list_repo_files
63
+ from PIL import Image
64
+ from transformers import AutoModelForImageTextToText
65
+
66
+ from trl import (
67
+ ModelConfig,
68
+ ScriptArguments,
69
+ SFTConfig,
70
+ SFTTrainer,
71
+ TrlParser,
72
+ get_kbit_device_map,
73
+ get_peft_config,
74
+ get_quantization_config,
75
+ )
76
+
77
+
78
+ # Enable logging in a Hugging Face Space
79
+ os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")
80
+
81
+
82
+ # For multi-image example
83
+ def process_vision_info(messages: list[dict]) -> list[Image.Image]:
84
+ image_inputs = []
85
+ for msg in messages:
86
+ content = msg.get("content", [])
87
+ if not isinstance(content, list):
88
+ content = [content]
89
+
90
+ for element in content:
91
+ if isinstance(element, dict) and ("image" in element or element.get("type") == "image"):
92
+ if "image" in element:
93
+ image = element["image"]
94
+ else:
95
+ image = element
96
+ if image is not None:
97
+ image = Image.open(io.BytesIO(image["bytes"]))
98
+ image_inputs.append(image.convert("RGB"))
99
+ return image_inputs
100
+
101
+
102
+ def format_data(samples: dict[str, any]) -> dict[str, list]:
103
+ formatted_samples = {"messages": []}
104
+ for cont in range(len(samples["question"])):
105
+ images = []
106
+ for img_path in samples["input_image_path"][cont]:
107
+ try:
108
+ with open(img_path, "rb") as f:
109
+ img_bytes = f.read()
110
+ image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
111
+ images.append({"type": "image", "image": image})
112
+ except Exception as e:
113
+ print(f"Error processing image {img_path}: {e}")
114
+ continue
115
+
116
+ formatted_samples["messages"].append(
117
+ [
118
+ {"role": "system", "content": [{"type": "text", "text": samples["context"][cont]}]},
119
+ {"role": "user", "content": images + [{"type": "text", "text": samples["question"][cont]}]},
120
+ {"role": "assistant", "content": [{"type": "text", "text": samples["output"][cont]}]},
121
+ ]
122
+ )
123
+ return formatted_samples
124
+
125
+
126
+ # For multi-image example
127
+ def prepare_dataset(dataset: DatasetDict, dataset_name: str) -> DatasetDict:
128
+ all_files = list_repo_files(dataset_name, repo_type="dataset")
129
+ zip_files = [f for f in all_files if f.endswith(".zip")]
130
+
131
+ for zip_filename in zip_files:
132
+ zip_path = hf_hub_download(repo_id=dataset_name, filename=zip_filename, repo_type="dataset")
133
+ extract_folder = zip_filename.replace(".zip", "")
134
+ os.makedirs(extract_folder, exist_ok=True)
135
+
136
+ with zipfile.ZipFile(zip_path, "r") as zip_ref:
137
+ zip_ref.extractall(extract_folder)
138
+
139
+ dataset = dataset.map(format_data, batched=True, batch_size=4, num_proc=16)
140
+ return dataset
141
+
142
+
143
+ def main():
144
+ parser = TrlParser((ScriptArguments, SFTConfig, ModelConfig))
145
+ script_args, training_args, model_args = parser.parse_args_and_config()
146
+ training_args.max_length = None
147
+
148
+ ################
149
+ # Model
150
+ ################
151
+ dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)
152
+ model_kwargs = dict(
153
+ revision=model_args.model_revision,
154
+ attn_implementation=model_args.attn_implementation,
155
+ dtype=dtype,
156
+ )
157
+ quantization_config = get_quantization_config(model_args)
158
+ if quantization_config is not None:
159
+ # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
160
+ model_kwargs["device_map"] = get_kbit_device_map()
161
+ model_kwargs["quantization_config"] = quantization_config
162
+
163
+ model = AutoModelForImageTextToText.from_pretrained(
164
+ model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code, **model_kwargs
165
+ )
166
+
167
+ ################
168
+ # Dataset
169
+ ################
170
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
171
+ if script_args.dataset_name == "FanqingM/MMIU-Benchmark":
172
+ dataset = prepare_dataset(dataset, script_args.dataset_name)
173
+
174
+ ################
175
+ # Training
176
+ ################
177
+ trainer = SFTTrainer(
178
+ model=model,
179
+ args=training_args,
180
+ train_dataset=dataset[script_args.dataset_train_split],
181
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
182
+ peft_config=get_peft_config(model_args),
183
+ )
184
+
185
+ trainer.train()
186
+
187
+ # Save and push to hub
188
+ trainer.save_model(training_args.output_dir)
189
+ if training_args.push_to_hub:
190
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
191
+
192
+
193
+ if __name__ == "__main__":
194
+ main()
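A hypothetical usage sketch for `process_vision_info` above: each image entry must carry raw bytes under the `"bytes"` key, since the function rebuilds the image with `Image.open(io.BytesIO(...))`.

```python
import io

from PIL import Image

# Build an in-memory PNG to stand in for real image bytes.
buf = io.BytesIO()
Image.new("RGB", (8, 8), "red").save(buf, format="PNG")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": {"bytes": buf.getvalue()}},
            {"type": "text", "text": "Describe the image."},
        ],
    },
]
# process_vision_info(messages) would return [<8x8 RGB PIL image>] here.
```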
ICL/RL/trl_source/trl/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (2.3 kB)
ICL/RL/trl_source/trl/__pycache__/_compat.cpython-313.pyc ADDED
Binary file (11.1 kB)
ICL/RL/trl_source/trl/__pycache__/chat_template_utils.cpython-313.pyc ADDED
Binary file (22.3 kB)
ICL/RL/trl_source/trl/__pycache__/data_utils.cpython-313.pyc ADDED
Binary file (43.1 kB)
ICL/RL/trl_source/trl/__pycache__/import_utils.cpython-313.pyc ADDED
Binary file (8.1 kB)
ICL/RL/trl_source/trl/accelerate_configs/fsdp1.yaml ADDED
@@ -0,0 +1,28 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: FSDP
4
+ downcast_bf16: 'no'
5
+ enable_cpu_affinity: false
6
+ fsdp_config:
7
+ fsdp_activation_checkpointing: false
8
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
9
+ fsdp_backward_prefetch: BACKWARD_PRE
10
+ fsdp_cpu_ram_efficient_loading: true
11
+ fsdp_forward_prefetch: true
12
+ fsdp_offload_params: false
13
+ fsdp_reshard_after_forward: FULL_SHARD
14
+ fsdp_state_dict_type: FULL_STATE_DICT
15
+ fsdp_sync_module_states: true
16
+ fsdp_use_orig_params: true
17
+ fsdp_version: 1
18
+ machine_rank: 0
19
+ main_training_function: main
20
+ mixed_precision: bf16
21
+ num_machines: 1
22
+ num_processes: 8
23
+ rdzv_backend: static
24
+ same_network: true
25
+ tpu_env: []
26
+ tpu_use_cluster: false
27
+ tpu_use_sudo: false
28
+ use_cpu: false
ICL/RL/trl_source/trl/accelerate_configs/fsdp2.yaml ADDED
@@ -0,0 +1,25 @@
1
+ # Requires accelerate 1.7.0 or higher
2
+ compute_environment: LOCAL_MACHINE
3
+ debug: false
4
+ distributed_type: FSDP
5
+ downcast_bf16: 'no'
6
+ enable_cpu_affinity: false
7
+ fsdp_config:
8
+ fsdp_activation_checkpointing: false
9
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
10
+ fsdp_cpu_ram_efficient_loading: true
11
+ fsdp_offload_params: false
12
+ fsdp_reshard_after_forward: true
13
+ fsdp_state_dict_type: FULL_STATE_DICT
14
+ fsdp_version: 2
15
+ machine_rank: 0
16
+ main_training_function: main
17
+ mixed_precision: bf16
18
+ num_machines: 1
19
+ num_processes: 8
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
ICL/RL/trl_source/trl/accelerate_configs/multi_gpu.yaml ADDED
@@ -0,0 +1,16 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: all
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'bf16'
9
+ num_machines: 1
10
+ num_processes: 8
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
ICL/RL/trl_source/trl/accelerate_configs/single_gpu.yaml ADDED
@@ -0,0 +1,16 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: "NO"
4
+ downcast_bf16: 'no'
5
+ gpu_ids: all
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'bf16'
9
+ num_machines: 1
10
+ num_processes: 8
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
ICL/RL/trl_source/trl/accelerate_configs/zero1.yaml ADDED
@@ -0,0 +1,20 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ deepspeed_config:
4
+ deepspeed_multinode_launcher: standard
5
+ gradient_accumulation_steps: 1
6
+ zero3_init_flag: false
7
+ zero_stage: 1
8
+ distributed_type: DEEPSPEED
9
+ downcast_bf16: 'no'
10
+ machine_rank: 0
11
+ main_training_function: main
12
+ mixed_precision: 'bf16'
13
+ num_machines: 1
14
+ num_processes: 8
15
+ rdzv_backend: static
16
+ same_network: true
17
+ tpu_env: []
18
+ tpu_use_cluster: false
19
+ tpu_use_sudo: false
20
+ use_cpu: false
ICL/RL/trl_source/trl/accelerate_configs/zero2.yaml ADDED
@@ -0,0 +1,21 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ deepspeed_config:
4
+ deepspeed_multinode_launcher: standard
5
+ offload_optimizer_device: none
6
+ offload_param_device: none
7
+ zero3_init_flag: false
8
+ zero_stage: 2
9
+ distributed_type: DEEPSPEED
10
+ downcast_bf16: 'no'
11
+ machine_rank: 0
12
+ main_training_function: main
13
+ mixed_precision: 'bf16'
14
+ num_machines: 1
15
+ num_processes: 8
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
ICL/RL/trl_source/trl/accelerate_configs/zero3.yaml ADDED
@@ -0,0 +1,22 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ deepspeed_config:
4
+ deepspeed_multinode_launcher: standard
5
+ offload_optimizer_device: none
6
+ offload_param_device: none
7
+ zero3_init_flag: true
8
+ zero3_save_16bit_model: true
9
+ zero_stage: 3
10
+ distributed_type: DEEPSPEED
11
+ downcast_bf16: 'no'
12
+ machine_rank: 0
13
+ main_training_function: main
14
+ mixed_precision: bf16
15
+ num_machines: 1
16
+ num_processes: 8
17
+ rdzv_backend: static
18
+ same_network: true
19
+ tpu_env: []
20
+ tpu_use_cluster: false
21
+ tpu_use_sudo: false
22
+ use_cpu: false
ICL/RL/trl_source/trl/experimental/__init__.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Experimental submodule for TRL.
17
+
18
+ This submodule contains unstable or incubating features. Anything here may change (or be removed) in any release
19
+ without deprecation. Use at your own risk.
20
+
21
+ To silence this notice, set the environment variable TRL_EXPERIMENTAL_SILENCE=1.
22
+ """
23
+
24
+ import os
25
+ import warnings
26
+
27
+ from ..import_utils import TRLExperimentalWarning
28
+
29
+
30
+ if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
31
+ warnings.warn(
32
+ "You are importing from 'trl.experimental'. APIs here are unstable and may change or be removed without "
33
+ "notice. Silence this warning by setting environment variable TRL_EXPERIMENTAL_SILENCE=1.",
34
+ TRLExperimentalWarning,
35
+ stacklevel=2,
36
+ )
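The guard above means the warning can be silenced by setting the variable before the first import, for example:

```python
import os

os.environ["TRL_EXPERIMENTAL_SILENCE"] = "1"

import trl.experimental  # no TRLExperimentalWarning is emitted
```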
ICL/RL/trl_source/trl/experimental/bco/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .bco_config import BCOConfig
16
+ from .bco_trainer import BCOTrainer
ICL/RL/trl_source/trl/experimental/bema_for_ref_model/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .callback import BEMACallback
16
+ from .dpo_trainer import DPOTrainer
ICL/RL/trl_source/trl/experimental/bema_for_ref_model/dpo_trainer.py ADDED
@@ -0,0 +1,30 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from ...trainer.dpo_trainer import DPOTrainer as _DPOTrainer
16
+ from .callback import CallbackHandlerWithRefModel
17
+
18
+
19
+ class DPOTrainer(_DPOTrainer):
20
+ def __init__(self, *args, **kwargs):
21
+ super().__init__(*args, **kwargs)
22
+ # Replace with a new one that calls the events with the reference model
23
+ self.callback_handler = CallbackHandlerWithRefModel(
24
+ self.callback_handler.callbacks,
25
+ self.model,
26
+ self.ref_model,
27
+ self.processing_class,
28
+ self.optimizer,
29
+ self.lr_scheduler,
30
+ )
ICL/RL/trl_source/trl/experimental/cpo/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .cpo_config import CPOConfig
16
+ from .cpo_trainer import CPOTrainer
17
+
18
+
19
+ __all__ = ["CPOConfig", "CPOTrainer"]
ICL/RL/trl_source/trl/experimental/cpo/cpo_config.py ADDED
@@ -0,0 +1,207 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass, field
16
+ from typing import Any
17
+
18
+ from transformers import TrainingArguments
19
+
20
+
21
+ @dataclass
22
+ class CPOConfig(TrainingArguments):
23
+ r"""
24
+ Configuration class for the [`experimental.cpo.CPOTrainer`].
25
+
26
+ This class includes only the parameters that are specific to CPO training. For a full list of training arguments,
27
+ please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
28
+ differ from those in [`~transformers.TrainingArguments`].
29
+
30
+ Using [`~transformers.HfArgumentParser`] we can turn this class into
31
+ [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
32
+ command line.
33
+
34
+ Parameters:
35
+ max_length (`int` or `None`, *optional*, defaults to `1024`):
36
+ Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
37
+ to use the default data collator.
38
+ max_completion_length (`int`, *optional*):
39
+ Maximum length of the completion. This argument is required if you want to use the default data collator
40
+ and your model is an encoder-decoder.
41
+ beta (`float`, *optional*, defaults to `0.1`):
42
+ Parameter controlling the deviation from the reference model. Higher β means less deviation from the
43
+ reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in
44
+ the [paper](https://huggingface.co/papers/2310.12036).
45
+ label_smoothing (`float`, *optional*, defaults to `0.0`):
46
+ Label smoothing factor. This argument is required if you want to use the default data collator.
47
+ loss_type (`str`, *optional*, defaults to `"sigmoid"`):
48
+ Type of loss to use. Possible values are:
49
+
50
+ - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
51
+ - `"hinge"`: hinge loss on the normalized likelihood from the
52
+ [SLiC](https://huggingface.co/papers/2305.10425) paper.
53
+ - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.
54
+ - `"simpo"`: SimPO loss from the [SimPO](https://huggingface.co/papers/2405.14734) paper.
55
+ - `"alphapo"`: AlphaPO loss from the [AlphaPO](https://huggingface.co/papers/2501.03884) paper. This
56
+ automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`.
57
+
58
+ disable_dropout (`bool`, *optional*, defaults to `True`):
59
+ Whether to disable dropout in the model.
60
+ cpo_alpha (`float`, *optional*, defaults to `1.0`):
61
+ Weight of the BC regularizer in CPO training.
62
+ simpo_gamma (`float`, *optional*, defaults to `0.5`):
63
+ Target reward margin for the SimPO loss, used only when the `loss_type="simpo"`.
64
+ alpha (`float`, *optional*, defaults to `0.0`):
65
+ Alpha parameter that controls reward function shape across all loss types. When alpha=0 (default), uses
66
+ standard log probability rewards. When `alpha != 0`, applies AlphaPO transformation: `r = (1 - p^(-alpha))
67
+ / alpha` from the [AlphaPO paper](https://huggingface.co/papers/2501.03884). This parameter works with all
68
+ loss types.
69
+ truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
70
+ Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
71
+ This argument is required if you want to use the default data collator.
72
+ generate_during_eval (`bool`, *optional*, defaults to `False`):
73
+ If `True`, generates and logs completions from the model to W&B or Comet during evaluation.
74
+ is_encoder_decoder (`bool`, *optional*):
75
+ When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
76
+ you need to specify if the model returned by the callable is an encoder-decoder model.
77
+ model_init_kwargs (`dict[str, Any]`, *optional*):
78
+ Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
79
+ string.
80
+ dataset_num_proc (`int`, *optional*):
81
+ Number of processes to use for processing the dataset.
82
+ """
83
+
84
+ _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
85
+
86
+ # Parameters whose default values are overridden from TrainingArguments
87
+ learning_rate: float = field(
88
+ default=1e-6,
89
+ metadata={"help": "The initial learning rate for AdamW."},
90
+ )
91
+ logging_steps: float = field(
92
+ default=10,
93
+ metadata={
94
+ "help": "Log every X update steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
95
+ "will be interpreted as a ratio of total training steps."
96
+ },
97
+ )
98
+ gradient_checkpointing: bool = field(
99
+ default=True,
100
+ metadata={
101
+ "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
102
+ },
103
+ )
104
+ bf16: bool | None = field(
105
+ default=None,
106
+ metadata={
107
+ "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires an Ampere or newer NVIDIA "
108
+ "GPU, an Intel XPU, an Ascend NPU, or CPU training (`use_cpu`). If not set, it defaults to `True` if "
109
+ "`fp16` is not set."
110
+ },
111
+ )
112
+ # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
113
+ # was fixed in https://github.com/huggingface/transformers/pull/41322 and released in 4.57.5. We add a temporary
114
+ # workaround here, which can be removed once we drop support for versions older than 4.57.5.
115
+ lr_scheduler_kwargs: dict | str | None = field(
116
+ default=None,
117
+ metadata={
118
+ "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
119
+ "restarts."
120
+ },
121
+ )
122
+
123
+ max_length: int | None = field(
124
+ default=1024,
125
+ metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
126
+ )
127
+ max_completion_length: int | None = field(
128
+ default=None,
129
+ metadata={
130
+ "help": "Maximum length of the completion. This argument is required if you want to use the default data "
131
+ "collator and your model is an encoder-decoder."
132
+ },
133
+ )
134
+ beta: float = field(
135
+ default=0.1,
136
+ metadata={
137
+ "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
138
+ "the reference model."
139
+ },
140
+ )
141
+ label_smoothing: float = field(
142
+ default=0.0,
143
+ metadata={"help": "Label smoothing factor."},
144
+ )
145
+ loss_type: str = field(
146
+ default="sigmoid",
147
+ metadata={
148
+ "help": "Type of loss to use.",
149
+ "choices": ["sigmoid", "hinge", "ipo", "simpo", "alphapo"],
150
+ },
151
+ )
152
+ disable_dropout: bool = field(
153
+ default=True,
154
+ metadata={"help": "Whether to disable dropout in the model."},
155
+ )
156
+ cpo_alpha: float = field(
157
+ default=1.0,
158
+ metadata={"help": "Weight of the BC regularizer in CPO training."},
159
+ )
160
+ simpo_gamma: float = field(
161
+ default=0.5,
162
+ metadata={"help": "Target reward margin for the SimPO loss, used only when the `loss_type='simpo'`."},
163
+ )
164
+ alpha: float = field(
165
+ default=0.0,
166
+ metadata={
167
+ "help": "Alpha parameter that controls reward function shape across all loss types. When alpha=0 "
168
+ "(default), uses standard log probability rewards. When `alpha != 0`, applies AlphaPO transformation: "
169
+ "`r = (1 - p^(-alpha)) / alpha` from the AlphaPO paper. This parameter works with all loss types."
170
+ },
171
+ )
172
+ truncation_mode: str = field(
173
+ default="keep_end",
174
+ metadata={
175
+ "help": "Truncation mode to use when the prompt is too long.",
176
+ "choices": ["keep_end", "keep_start"],
177
+ },
178
+ )
179
+ generate_during_eval: bool = field(
180
+ default=False,
181
+ metadata={"help": "If `True`, generates and logs completions from the model to W&B during evaluation."},
182
+ )
183
+ is_encoder_decoder: bool | None = field(
184
+ default=None,
185
+ metadata={"help": "Whether the model is an encoder-decoder model."},
186
+ )
187
+ model_init_kwargs: dict[str, Any] | None = field(
188
+ default=None,
189
+ metadata={
190
+ "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
191
+ "from a string."
192
+ },
193
+ )
194
+ dataset_num_proc: int | None = field(
195
+ default=None,
196
+ metadata={"help": "Number of processes to use for processing the dataset."},
197
+ )
198
+
199
+ def __post_init__(self):
200
+ self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
201
+
202
+ # Syntactic sugar for AlphaPO: set loss_type to "simpo" and cpo_alpha to 0.0
203
+ if self.loss_type == "alphapo":
204
+ self.loss_type = "simpo"
205
+ self.cpo_alpha = 0.0
206
+
207
+ super().__post_init__()
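A minimal usage sketch for the config above (illustrative values only; `output_dir` is any writable path, and `bf16=False` is passed explicitly because `__post_init__` would otherwise enable bf16 on hardware that may not support it):

    from trl.experimental.cpo import CPOConfig

    # "alphapo" is syntactic sugar: __post_init__ rewrites it to loss_type="simpo"
    # with cpo_alpha=0.0, leaving `alpha` to reshape the reward.
    config = CPOConfig(output_dir="cpo-model", loss_type="alphapo", alpha=0.1, bf16=False)
    assert config.loss_type == "simpo" and config.cpo_alpha == 0.0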
ICL/RL/trl_source/trl/experimental/cpo/cpo_trainer.py ADDED
@@ -0,0 +1,1057 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import random
17
+ import textwrap
18
+ from collections import defaultdict
19
+ from collections.abc import Callable
20
+ from contextlib import nullcontext
21
+ from pathlib import Path
22
+ from typing import Any, Literal
23
+
24
+ import numpy as np
25
+ import pandas as pd
26
+ import torch
27
+ import torch.nn as nn
28
+ import torch.nn.functional as F
29
+ import transformers
30
+ from accelerate import PartialState, logging
31
+ from datasets import Dataset
32
+ from packaging.version import Version
33
+ from torch import autocast
34
+ from torch.utils.data import DataLoader
35
+ from transformers import (
36
+ AutoModelForCausalLM,
37
+ BaseImageProcessor,
38
+ DataCollator,
39
+ FeatureExtractionMixin,
40
+ PreTrainedModel,
41
+ PreTrainedTokenizerBase,
42
+ ProcessorMixin,
43
+ TrainerCallback,
44
+ is_comet_available,
45
+ is_wandb_available,
46
+ )
47
+ from transformers.trainer_utils import EvalLoopOutput
48
+ from transformers.utils import is_peft_available, is_torch_fx_proxy
49
+
50
+ from ...data_utils import maybe_apply_chat_template, maybe_extract_prompt
51
+ from ...models.utils import peft_module_casting_to_bf16
52
+ from ...trainer.base_trainer import BaseTrainer
53
+ from ...trainer.utils import (
54
+ disable_dropout_in_model,
55
+ log_table_to_comet_experiment,
56
+ pad_to_length,
57
+ selective_log_softmax,
58
+ )
59
+ from ..utils import DPODataCollatorWithPadding, add_bos_token_if_needed, add_eos_token_if_needed
60
+ from .cpo_config import CPOConfig
61
+
62
+
63
+ if is_peft_available():
64
+ from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training
65
+
66
+
67
+ if is_wandb_available():
68
+ import wandb
69
+
70
+
71
+ logger = logging.get_logger(__name__)
72
+
73
+
74
+ class CPOTrainer(BaseTrainer):
75
+ r"""
76
+ Initialize CPOTrainer.
77
+
78
+ Args:
79
+ model ([`~transformers.PreTrainedModel`]):
80
+ The model to train, preferably an [`~transformers.AutoModelForCausalLM`].
81
+ args ([`experimental.cpo.CPOConfig`]):
82
+ The CPO config arguments to use for training.
83
+ data_collator ([`~transformers.DataCollator`]):
84
+ The data collator to use for training. If None is specified, the default data collator
85
+ ([`experimental.utils.DPODataCollatorWithPadding`]) will be used which will pad the sequences to the
86
+ maximum length of the sequences in the batch, given a dataset of paired sequences.
87
+ train_dataset ([`~datasets.Dataset`]):
88
+ The dataset to use for training.
89
+ eval_dataset ([`~datasets.Dataset`]):
90
+ The dataset to use for evaluation.
91
+ processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
92
+ Processing class used to process the data. If provided, will be used to automatically process the inputs
93
+ for the model, and it will be saved along with the model to make it easier to rerun an interrupted training or
94
+ reuse the fine-tuned model.
95
+ model_init (`Callable[[], transformers.PreTrainedModel]`):
96
+ The model initializer to use for training. If None is specified, the default model initializer will be
97
+ used.
98
+ callbacks (`list[transformers.TrainerCallback]`):
99
+ The callbacks to use for training.
100
+ optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
101
+ The optimizer and scheduler to use for training.
102
+ preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
103
+ The function to use to preprocess the logits before computing the metrics.
104
+ peft_config (`dict`, defaults to `None`):
105
+ The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
106
+ a PEFT model.
107
+ compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
108
+ The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping
109
+ strings to metric values.
110
+ """
111
+
112
+ _tag_names = ["trl", "cpo"]
113
+ _name = "CPO"
114
+ _paper = {
115
+ "title": "Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation",
116
+ "id": "2401.08417",
117
+ # docstyle-ignore
118
+ "citation": textwrap.dedent("""\
119
+ @inproceedings{xu2024contrastive,
120
+ title = {{Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation}},
121
+ author = {Haoran Xu and Amr Sharaf and Yunmo Chen and Weiting Tan and Lingfeng Shen and Benjamin Van Durme and Kenton Murray and Young Jin Kim},
122
+ year = 2024,
123
+ booktitle = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024},
124
+ publisher = {OpenReview.net},
125
+ url = {https://openreview.net/forum?id=51iwkioZpn}
126
+ }"""),
127
+ }
128
+
129
+ def __init__(
130
+ self,
131
+ model: PreTrainedModel | nn.Module | str | None = None,
132
+ args: CPOConfig | None = None,
133
+ data_collator: DataCollator | None = None,
134
+ train_dataset: Dataset | None = None,
135
+ eval_dataset: Dataset | dict[str, Dataset] | None = None,
136
+ processing_class: PreTrainedTokenizerBase
137
+ | BaseImageProcessor
138
+ | FeatureExtractionMixin
139
+ | ProcessorMixin
140
+ | None = None,
141
+ model_init: Callable[[], PreTrainedModel] | None = None,
142
+ callbacks: list[TrainerCallback] | None = None,
143
+ optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
144
+ preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
145
+ peft_config: dict | None = None,
146
+ compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
147
+ ):
148
+ if args.model_init_kwargs is None:
149
+ model_init_kwargs = {}
150
+ elif not isinstance(model, str):
151
+ raise ValueError("You passed `model_init_kwargs` to the CPOConfig, but your model is already instantiated.")
152
+ else:
153
+ model_init_kwargs = args.model_init_kwargs
154
+ dtype = model_init_kwargs.get("dtype", "auto")
155
+ if dtype is not None:
156
+ # Convert to `torch.dtype` if an str is passed
157
+ if isinstance(dtype, str) and dtype != "auto":
158
+ dtype = getattr(torch, dtype)
159
+ if dtype != "auto" and not isinstance(dtype, torch.dtype):
160
+ raise ValueError(
161
+ f"Invalid `dtype` passed to the CPOConfig. Expected a `torch.dtype`, a string representation of one, or 'auto', but got {dtype}."
162
+ )
163
+ model_init_kwargs["dtype"] = dtype
164
+ model_init_kwargs["device_map"] = model_init_kwargs.get("device_map", "auto")
165
+
166
+ if isinstance(model, str):
167
+ model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
168
+
169
+ # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
170
+ # has been called in order to properly call autocast if needed.
171
+ self._peft_has_been_casted_to_bf16 = False
172
+
173
+ if not is_peft_available() and peft_config is not None:
174
+ raise ValueError(
175
+ "PEFT is not installed but you passed a `peft_config` to the trainer. Please install PEFT to use PEFT models."
176
+ )
177
+ elif is_peft_available() and peft_config is not None:
178
+ if isinstance(model, PeftModel):
179
+ raise ValueError(
180
+ "You passed a `PeftModel` instance together with a `peft_config` to the trainer. Please first "
181
+ "merge and unload the existing adapter, save the resulting base model, and then pass that base "
182
+ "model along with the new `peft_config` to the trainer."
183
+ )
184
+
185
+ if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False):
186
+ _support_gc_kwargs = hasattr(
187
+ args, "gradient_checkpointing_kwargs"
188
+ ) and "gradient_checkpointing_kwargs" in list(
189
+ inspect.signature(prepare_model_for_kbit_training).parameters
190
+ )
191
+
192
+ prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing}
193
+
194
+ if _support_gc_kwargs:
195
+ prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs
196
+
197
+ model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
198
+ elif args.gradient_checkpointing:
199
+ # For backward compatibility with older versions of transformers
200
+ if hasattr(model, "enable_input_require_grads"):
201
+ model.enable_input_require_grads()
202
+ else:
203
+
204
+ def make_inputs_require_grad(module, input, output):
205
+ output.requires_grad_(True)
206
+
207
+ model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
208
+
209
+ # get peft model with the given config
210
+ model = get_peft_model(model, peft_config)
211
+ if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
212
+ peft_module_casting_to_bf16(model)
213
+ # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
214
+ self._peft_has_been_casted_to_bf16 = True
215
+
216
+ # For models that use gradient_checkpointing, we need to attach a hook that enables input
217
+ # to explicitly have `requires_grad=True`; otherwise training will either fail
218
+ # silently or crash outright.
219
+ elif args.gradient_checkpointing:
220
+ # For backward compatibility with older versions of transformers
221
+ if hasattr(model, "enable_input_require_grads"):
222
+ model.enable_input_require_grads()
223
+ else:
224
+
225
+ def make_inputs_require_grad(module, input, output):
226
+ output.requires_grad_(True)
227
+
228
+ model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
229
+
230
+ if args.generate_during_eval and not (is_wandb_available() or is_comet_available()):
231
+ raise ValueError(
232
+ "`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
233
+ " Please install `wandb` or `comet-ml` to resolve."
234
+ )
235
+
236
+ if model is not None:
237
+ self.is_encoder_decoder = model.config.is_encoder_decoder
238
+ elif args.is_encoder_decoder is None:
239
+ raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.")
240
+ else:
241
+ self.is_encoder_decoder = args.is_encoder_decoder
242
+
243
+ if self.is_encoder_decoder:
244
+ self.decoder_start_token_id = model.config.decoder_start_token_id
245
+ self.pad_token_id = model.config.pad_token_id
246
+
247
+ if processing_class is None:
248
+ raise ValueError("processing_class must be specified to tokenize a CPO dataset.")
249
+ if args.max_length is None:
250
+ logger.warning(
251
+ "`max_length` is not set in the CPOConfig's init;"
252
+ " it will default to `512`, but you should set it yourself in the future.",
253
+ )
254
+ max_length = 512
255
+ else:
256
+ max_length = args.max_length
257
+
258
+ if args.max_completion_length is None and self.is_encoder_decoder:
259
+ logger.warning(
260
+ "When using an encoder-decoder architecture, you should set `max_completion_length` in the CPOConfig's init;"
261
+ " it will default to `128`, but you should set it yourself in the future.",
262
+ )
263
+ max_completion_length = 128
264
+ else:
265
+ max_completion_length = args.max_completion_length
266
+
267
+ if data_collator is None:
268
+ data_collator = DPODataCollatorWithPadding(
269
+ pad_token_id=processing_class.pad_token_id,
270
+ is_encoder_decoder=self.is_encoder_decoder,
271
+ )
272
+
273
+ if args.remove_unused_columns:
274
+ args.remove_unused_columns = False
275
+ # warn users
276
+ logger.warning(
277
+ "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments;"
278
+ " we have set it for you, but you should set it yourself in the future.",
279
+ )
280
+
281
+ self.use_dpo_data_collator = True
282
+ else:
283
+ self.use_dpo_data_collator = False
284
+
285
+ # Disable dropout in the model
286
+ if args.disable_dropout:
287
+ disable_dropout_in_model(model)
288
+
289
+ self.max_length = max_length
290
+ self.generate_during_eval = args.generate_during_eval
291
+ self.truncation_mode = args.truncation_mode
292
+ self.max_completion_length = max_completion_length
293
+ self.processing_class = processing_class
294
+
295
+ if processing_class.pad_token is None:
296
+ processing_class.pad_token = processing_class.eos_token
297
+ self.pad_token_id = processing_class.pad_token_id
298
+
299
+ if args.loss_type in ["hinge", "ipo"] and args.label_smoothing > 0:
300
+ logger.warning(
301
+ f"You are using the {args.loss_type} loss type, which does not support label smoothing. The "
302
+ "`label_smoothing` parameter will be ignored. Set `label_smoothing` to `0.0` to remove this warning.",
303
+ )
304
+ if args.loss_type == "kto_pair":
305
+ raise ValueError("Support for kto_pair has been removed in CPOTrainer. Please use KTOTrainer.")
306
+
307
+ self.beta = args.beta
308
+ self.label_smoothing = args.label_smoothing
309
+ self.loss_type = args.loss_type
310
+ self.cpo_alpha = args.cpo_alpha
311
+ self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
312
+ self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0)
313
+ if self.aux_loss_enabled and self.aux_loss_coef == 0.0:
314
+ logger.warning(
315
+ "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to "
316
+ "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value "
317
+ "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary "
318
+ "loss.",
319
+ )
320
+
321
+ if args.loss_type == "simpo":
322
+ self.simpo_gamma = args.simpo_gamma
323
+
324
+ # AlphaPO parameter for reward shaping
325
+ self.alpha = args.alpha
326
+
327
+ self._stored_metrics = defaultdict(lambda: defaultdict(list))
328
+
329
+ # Compute that only on the main process for faster data processing.
330
+ # see: https://github.com/huggingface/trl/pull/1255
331
+ with PartialState().main_process_first():
332
+ # Extract the prompt if needed, and apply the chat template if needed
333
+ train_dataset = train_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc)
334
+ train_dataset = train_dataset.map(
335
+ maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}, num_proc=args.dataset_num_proc
336
+ )
337
+ if eval_dataset is not None:
338
+ eval_dataset = eval_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc)
339
+ eval_dataset = eval_dataset.map(
340
+ maybe_apply_chat_template,
341
+ fn_kwargs={"tokenizer": processing_class},
342
+ num_proc=args.dataset_num_proc,
343
+ )
344
+
345
+ # tokenize the dataset
346
+ train_dataset = train_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc)
347
+ if eval_dataset is not None:
348
+ eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc)
349
+
350
+ # Transformers explicitly set use_reentrant=True in the past to silence a PyTorch warning, but the default was
351
+ # never updated once PyTorch switched to recommending use_reentrant=False. Until that change lands upstream
352
+ # (see https://github.com/huggingface/transformers/pull/43203) and is released (most likely in 5.0.0), we
353
+ # default to the recommended non-reentrant behavior here, while preserving any user-provided value.
354
+ if args.gradient_checkpointing and Version(transformers.__version__) < Version("5.0.0"):
355
+ args.gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs or {}
356
+ args.gradient_checkpointing_kwargs.setdefault("use_reentrant", False)
357
+
358
+ super().__init__(
359
+ model=model,
360
+ args=args,
361
+ data_collator=data_collator,
362
+ train_dataset=train_dataset,
363
+ eval_dataset=eval_dataset,
364
+ processing_class=processing_class,
365
+ model_init=model_init,
366
+ compute_metrics=compute_metrics,
367
+ callbacks=callbacks,
368
+ optimizers=optimizers,
369
+ preprocess_logits_for_metrics=preprocess_logits_for_metrics,
370
+ )
371
+
372
+ # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
373
+ # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
374
+ # self.model_accepts_loss_kwargs to False to enable scaling.
375
+ self.model_accepts_loss_kwargs = False
376
+
377
+ # Add tags for models that have been loaded with the correct transformers version
378
+ if hasattr(self.model, "add_model_tags"):
379
+ self.model.add_model_tags(self._tag_names)
380
+
381
+ if not hasattr(self, "accelerator"):
382
+ raise AttributeError(
383
+ "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
384
+ )
385
+
386
+ def build_tokenized_answer(self, prompt, answer):
387
+ """
388
+ Llama tokenizer does not satisfy `enc(a + b) = enc(a) + enc(b)`. It does, however, ensure `enc(a + b) = enc(a) + enc(a +
389
+ b)[len(enc(a)):]`. Reference:
390
+ https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
391
+ """
392
+
393
+ full_tokenized = self.processing_class(prompt + answer, add_special_tokens=False)
394
+ prompt_input_ids = self.processing_class(prompt, add_special_tokens=False)["input_ids"]
395
+
396
+ answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :]
397
+ answer_attention_mask = full_tokenized["attention_mask"][len(prompt_input_ids) :]
398
+
399
+ # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]`
400
+ full_concat_input_ids = np.concatenate([prompt_input_ids, answer_input_ids])
401
+
402
+ # Prepare input tokens for token by token comparison
403
+ full_input_ids = np.array(full_tokenized["input_ids"])
404
+
405
+ if len(full_input_ids) != len(full_concat_input_ids):
406
+ raise ValueError("The full tokenized sequence and the concatenation of the tokenized prompt and extracted answer should have the same length.")
407
+
408
+ # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens
409
+ # can be merged together when tokenizing prompt+answer. This could result
410
+ # in the last token from the prompt being different when tokenized on its own
411
+ # vs when done as prompt+answer.
412
+ response_token_ids_start_idx = len(prompt_input_ids)
413
+
414
+ # If the prompt tokenized on its own differs from the prompt portion of prompt+answer, then the
415
+ # last token has changed due to merging.
416
+ if prompt_input_ids != full_tokenized["input_ids"][:response_token_ids_start_idx]:
417
+ response_token_ids_start_idx -= 1
418
+
419
+ prompt_input_ids = full_tokenized["input_ids"][:response_token_ids_start_idx]
420
+ prompt_attention_mask = full_tokenized["attention_mask"][:response_token_ids_start_idx]
421
+
422
+ if len(prompt_input_ids) != len(prompt_attention_mask):
423
+ raise ValueError("Prompt input ids and attention mask should have the same length.")
424
+
425
+ answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:]
426
+ answer_attention_mask = full_tokenized["attention_mask"][response_token_ids_start_idx:]
427
+
428
+ return dict(
429
+ prompt_input_ids=prompt_input_ids,
430
+ prompt_attention_mask=prompt_attention_mask,
431
+ input_ids=answer_input_ids,
432
+ attention_mask=answer_attention_mask,
433
+ )
434
+
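A minimal sketch of the merge behavior this helper guards against (the checkpoint is an assumption; any SentencePiece-style tokenizer such as Llama's exhibits it):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")  # assumed checkpoint
    prompt, answer = "The answer is", " 42"
    full = tok(prompt + answer, add_special_tokens=False)["input_ids"]
    parts = (tok(prompt, add_special_tokens=False)["input_ids"]
             + tok(answer, add_special_tokens=False)["input_ids"])
    # enc(prompt + answer) may differ from enc(prompt) + enc(answer) because
    # tokens can merge at the boundary; the suffix identity still holds.
    print(full == parts)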
435
+ def tokenize_row(self, feature, model: PreTrainedModel | nn.Module | None = None) -> dict:
436
+ """Tokenize a single row from a CPO specific dataset.
437
+
438
+ At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
439
+ chosen or prompt + rejected response is too long. First we truncate the prompt; if it's still too long,
440
+ we truncate the chosen/rejected.
441
+
442
+ We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length
443
+ of the prompt and the chosen/rejected response, with `-100` for the prompt tokens.
444
+ """
445
+ batch = {}
446
+ prompt = feature["prompt"]
447
+ chosen = feature["chosen"]
448
+ rejected = feature["rejected"]
449
+
450
+ if not self.is_encoder_decoder:
451
+ # Check issues below for more details
452
+ # 1. https://github.com/huggingface/trl/issues/907
453
+ # 2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
454
+ # 3. https://github.com/LianjiaTech/BELLE/issues/337
455
+
456
+ if not isinstance(prompt, str):
457
+ raise ValueError(f"prompt should be a str but got {type(prompt)}")
458
+ prompt_tokens = self.processing_class(prompt, add_special_tokens=False)
459
+ prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()}
460
+
461
+ if not isinstance(chosen, str):
462
+ raise ValueError(f"chosen should be a str but got {type(chosen)}")
463
+ chosen_tokens = self.build_tokenized_answer(prompt, chosen)
464
+
465
+ if not isinstance(rejected, str):
466
+ raise ValueError(f"rejected should be a str but got {type(rejected)}")
467
+ rejected_tokens = self.build_tokenized_answer(prompt, rejected)
468
+
469
+ # Last prompt token might get merged by tokenizer and
470
+ # it should not be included for generation if that happens
471
+ prompt_len_input_ids = len(prompt_tokens["prompt_input_ids"])
472
+
473
+ chosen_prompt_len_input_ids = len(chosen_tokens["prompt_input_ids"])
474
+ rejected_prompt_len_input_ids = len(rejected_tokens["prompt_input_ids"])
475
+ prompt_len_input_ids = min(chosen_prompt_len_input_ids, rejected_prompt_len_input_ids)
476
+
477
+ for k, v in prompt_tokens.items():
478
+ prompt_tokens[k] = v[:prompt_len_input_ids]
479
+
480
+ # Make sure the prompts differ by at most one token
481
+ # and that their lengths differ by at most 1
482
+ num_diff_tokens = sum(
483
+ a != b
484
+ for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"], strict=True)
485
+ )
486
+ num_diff_len = abs(chosen_prompt_len_input_ids - rejected_prompt_len_input_ids)
487
+ if num_diff_tokens > 1 or num_diff_len > 1:
488
+ raise ValueError(
489
+ "Chosen and rejected prompt_input_ids might only differ on the "
490
+ "last token due to tokenizer merge ops."
491
+ )
492
+
493
+ # add BOS token to head of prompt. Avoid adding if it's already there
494
+ prompt_tokens, chosen_tokens, rejected_tokens = add_bos_token_if_needed(
495
+ self.processing_class.bos_token_id,
496
+ prompt_len_input_ids,
497
+ prompt_tokens,
498
+ chosen_prompt_len_input_ids,
499
+ chosen_tokens,
500
+ rejected_prompt_len_input_ids,
501
+ rejected_tokens,
502
+ )
503
+
504
+ # add EOS token to end of answer. Avoid adding if it's already there
505
+ chosen_tokens, rejected_tokens = add_eos_token_if_needed(
506
+ self.processing_class.eos_token_id, chosen_tokens, rejected_tokens
507
+ )
508
+
509
+ longer_response_length = max(len(chosen_tokens["input_ids"]), len(rejected_tokens["input_ids"]))
510
+
511
+ # if combined sequence is too long, truncate the response
512
+ for answer_tokens in [chosen_tokens, rejected_tokens]:
513
+ if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length:
514
+ for k in ["input_ids", "attention_mask"]:
515
+ answer_tokens[k] = answer_tokens[k][: self.max_length - longer_response_length]
516
+
517
+ # Create labels
518
+ chosen_sequence_tokens = {
519
+ k: chosen_tokens[f"prompt_{k}"] + chosen_tokens[k] for k in ["input_ids", "attention_mask"]
520
+ }
521
+ rejected_sequence_tokens = {
522
+ k: rejected_tokens[f"prompt_{k}"] + rejected_tokens[k] for k in ["input_ids", "attention_mask"]
523
+ }
524
+ chosen_sequence_tokens["labels"] = chosen_sequence_tokens["input_ids"][:]
525
+ chosen_sequence_tokens["labels"][: len(chosen_tokens["prompt_input_ids"])] = [-100] * len(
526
+ chosen_tokens["prompt_input_ids"]
527
+ )
528
+ rejected_sequence_tokens["labels"] = rejected_sequence_tokens["input_ids"][:]
529
+ rejected_sequence_tokens["labels"][: len(rejected_tokens["prompt_input_ids"])] = [-100] * len(
530
+ rejected_tokens["prompt_input_ids"]
531
+ )
532
+
533
+ for k, toks in {
534
+ "chosen_": chosen_sequence_tokens,
535
+ "rejected_": rejected_sequence_tokens,
536
+ "": prompt_tokens,
537
+ }.items():
538
+ for type_key, tokens in toks.items():
539
+ if type_key == "token_type_ids":
540
+ continue
541
+ batch[f"{k}{type_key}"] = tokens
542
+
543
+ else:
544
+ chosen_tokens = self.processing_class(
545
+ chosen, truncation=True, max_length=self.max_completion_length, add_special_tokens=True
546
+ )
547
+ rejected_tokens = self.processing_class(
548
+ rejected, truncation=True, max_length=self.max_completion_length, add_special_tokens=True
549
+ )
550
+ prompt_tokens = self.processing_class(prompt, add_special_tokens=True)
551
+
552
+ batch["chosen_labels"] = chosen_tokens["input_ids"]
553
+ batch["rejected_labels"] = rejected_tokens["input_ids"]
554
+ batch["prompt_input_ids"] = prompt_tokens["input_ids"]
555
+ batch["prompt_attention_mask"] = prompt_tokens["attention_mask"]
556
+
557
+ if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
558
+ batch["rejected_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
559
+ labels=torch.tensor(batch["rejected_labels"])
560
+ )
561
+ batch["chosen_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
562
+ labels=torch.tensor(batch["chosen_labels"])
563
+ )
564
+
565
+ return batch
566
+
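The label construction above can be sanity-checked in isolation; a minimal standalone sketch (made-up token ids):

    # Prompt tokens are masked with -100 so the NLL/CPO loss only covers the completion.
    prompt_ids = [101, 102, 103]
    answer_ids = [7, 8, 9, 2]
    labels = (prompt_ids + answer_ids)[:]
    labels[: len(prompt_ids)] = [-100] * len(prompt_ids)
    assert labels == [-100, -100, -100, 7, 8, 9, 2]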
567
+ @staticmethod
568
+ def concatenated_inputs(
569
+ batch: dict[str, list | torch.LongTensor],
570
+ is_encoder_decoder: bool = False,
571
+ padding_value: int = 0,
572
+ device: torch.device | None = None,
573
+ ) -> dict[str, torch.LongTensor]:
574
+ """Concatenate the chosen and rejected inputs into a single tensor.
575
+
576
+ Args:
577
+ batch:
578
+ A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors
579
+ of shape (batch_size, sequence_length).
580
+ is_encoder_decoder:
581
+ Whether the model is an encoder-decoder model.
582
+ padding_value:
583
+ The padding value to use for the concatenated inputs_ids.
584
+ device:
585
+ The device for the concatenated inputs.
586
+
587
+ Returns:
588
+ A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'.
589
+ """
590
+ concatenated_batch = {}
591
+
592
+ if is_encoder_decoder:
593
+ max_length = max(batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1])
594
+ else:
595
+ max_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1])
596
+
597
+ for k in batch:
598
+ if k.startswith("chosen") and isinstance(batch[k], torch.Tensor):
599
+ if "labels" in k or is_encoder_decoder:
600
+ pad_value = -100
601
+ elif k.endswith("_input_ids"):
602
+ pad_value = padding_value
603
+ elif k.endswith("_attention_mask"):
604
+ pad_value = 0
605
+ concatenated_key = k.replace("chosen", "concatenated")
606
+ concatenated_batch[concatenated_key] = pad_to_length(batch[k], max_length, pad_value=pad_value)
607
+ for k in batch:
608
+ if k.startswith("rejected") and isinstance(batch[k], torch.Tensor):
609
+ if "labels" in k or is_encoder_decoder:
610
+ pad_value = -100
611
+ elif k.endswith("_input_ids"):
612
+ pad_value = padding_value
613
+ elif k.endswith("_attention_mask"):
614
+ pad_value = 0
615
+ concatenated_key = k.replace("rejected", "concatenated")
616
+ concatenated_batch[concatenated_key] = torch.cat(
617
+ (
618
+ concatenated_batch[concatenated_key],
619
+ pad_to_length(batch[k], max_length, pad_value=pad_value),
620
+ ),
621
+ dim=0,
622
+ ).to(device=device)
623
+
624
+ if is_encoder_decoder:
625
+ concatenated_batch["concatenated_input_ids"] = batch["prompt_input_ids"].repeat(2, 1).to(device=device)
626
+ concatenated_batch["concatenated_attention_mask"] = (
627
+ batch["prompt_attention_mask"].repeat(2, 1).to(device=device)
628
+ )
629
+
630
+ return concatenated_batch
631
+
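A minimal sketch of the pad-then-stack pattern used above, approximating `pad_to_length` with `torch.nn.functional.pad` (the real helper lives in `trl.trainer.utils`):

    import torch
    import torch.nn.functional as F

    chosen = torch.tensor([[1, 2, 3]])
    rejected = torch.tensor([[4, 5]])
    max_len = max(chosen.shape[1], rejected.shape[1])
    # Right-pad both to the same length, then stack along the batch dim so a
    # single forward pass can score chosen and rejected completions together.
    batch = torch.cat(
        [
            F.pad(chosen, (0, max_len - chosen.shape[1]), value=0),
            F.pad(rejected, (0, max_len - rejected.shape[1]), value=0),
        ],
        dim=0,
    )
    assert batch.tolist() == [[1, 2, 3], [4, 5, 0]]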
632
+ def cpo_loss(
633
+ self,
634
+ policy_chosen_logps: torch.FloatTensor,
635
+ policy_rejected_logps: torch.FloatTensor,
636
+ ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
637
+ """Compute the CPO loss for a batch of policy and reference model log probabilities.
638
+
639
+ Args:
640
+ policy_chosen_logps:
641
+ Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
642
+ policy_rejected_logps:
643
+ Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
644
+
645
+ Returns:
646
+ A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the CPO
647
+ loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for
648
+ the chosen and rejected responses, respectively.
649
+ """
650
+ # Apply AlphaPO reward transformation if alpha != 0
651
+ if self.alpha != 0.0:
652
+ # Compute probabilities
653
+ chosen_probs = torch.exp(policy_chosen_logps)
654
+ rejected_probs = torch.exp(policy_rejected_logps)
655
+
656
+ # Apply AlphaPO transformation: r = (1 - p^(-alpha)) / alpha
657
+ policy_chosen_rewards = (1 - chosen_probs.pow(-self.alpha)) / self.alpha
658
+ policy_rejected_rewards = (1 - rejected_probs.pow(-self.alpha)) / self.alpha
659
+
660
+ logits = (policy_chosen_rewards - policy_rejected_rewards).to(self.accelerator.device)
661
+ else:
662
+ # Standard log probability rewards when alpha = 0
663
+ logits = (policy_chosen_logps - policy_rejected_logps).to(self.accelerator.device)
664
+
665
+ # The beta is a temperature parameter for the CPO loss, typically something in the range of 0.1 to 0.5.
666
+ # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the labels and
667
+ # calculates a conservative CPO loss.
668
+
669
+ if self.loss_type == "simpo":
670
+ gamma_logratios = self.simpo_gamma / self.beta
671
+ logits = logits - gamma_logratios
672
+ # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0.
673
+ losses = (
674
+ -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
675
+ - F.logsigmoid(-self.beta * logits) * self.label_smoothing
676
+ )
677
+ elif self.loss_type == "sigmoid":
678
+ # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0.
679
+ losses = (
680
+ -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
681
+ - F.logsigmoid(-self.beta * logits) * self.label_smoothing
682
+ )
683
+ elif self.loss_type == "hinge":
684
+ losses = torch.relu(1 - self.beta * logits)
685
+ elif self.loss_type == "ipo":
686
+ # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, denoted by tau in the paper.
687
+ losses = (logits - 1 / (2 * self.beta)) ** 2
688
+ else:
689
+ raise ValueError(
690
+ f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'simpo']"
691
+ )
692
+
693
+ # Calculate rewards for logging
694
+ if self.alpha != 0.0:
695
+ # When using AlphaPO transformation, use the transformed rewards
696
+ chosen_rewards = self.beta * policy_chosen_rewards.to(self.accelerator.device).detach()
697
+ rejected_rewards = self.beta * policy_rejected_rewards.to(self.accelerator.device).detach()
698
+ else:
699
+ # Standard log probability rewards
700
+ chosen_rewards = self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach()
701
+ rejected_rewards = self.beta * (policy_rejected_logps.to(self.accelerator.device)).detach()
702
+
703
+ return losses, chosen_rewards, rejected_rewards
704
+
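The sigmoid branch above reduces to a few lines of tensor math; a minimal standalone sketch with made-up log probabilities:

    import torch
    import torch.nn.functional as F

    beta, label_smoothing = 0.1, 0.0  # illustrative values
    chosen_logps = torch.tensor([-12.0, -8.0])
    rejected_logps = torch.tensor([-14.0, -9.0])
    logits = chosen_logps - rejected_logps
    # Conservative DPO-style loss; reduces to -logsigmoid(beta * logits) when
    # label_smoothing is 0.
    losses = (
        -F.logsigmoid(beta * logits) * (1 - label_smoothing)
        - F.logsigmoid(-beta * logits) * label_smoothing
    )
    print(losses)  # lower where the chosen response outscores the rejected one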
705
+ @staticmethod
706
+ def get_batch_logps(
707
+ logits: torch.FloatTensor,
708
+ labels: torch.LongTensor,
709
+ average_log_prob: bool = False,
710
+ is_encoder_decoder: bool = False,
711
+ ) -> torch.FloatTensor:
712
+ """Compute the log probabilities of the given labels under the given logits.
713
+
714
+ Args:
715
+ logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
716
+ labels:
717
+ Labels for which to compute the log probabilities. Label tokens with a value of `-100` are ignored.
718
+ Shape: (batch_size, sequence_length)
719
+ average_log_prob:
720
+ If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the
721
+ log probabilities of the (non-masked) tokens.
722
+ is_encoder_decoder: Whether the model is an encoder-decoder model.
723
+
724
+ Returns:
725
+ A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the
726
+ given logits.
727
+ """
728
+ if logits.shape[:-1] != labels.shape:
729
+ raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.")
730
+
731
+ if not is_encoder_decoder:
732
+ labels = labels[:, 1:].clone()
733
+ logits = logits[:, :-1, :]
734
+ loss_mask = labels != -100
735
+
736
+ # dummy token; we'll ignore the losses on these tokens later
737
+ labels[labels == -100] = 0
738
+
739
+ per_token_logps = selective_log_softmax(logits, labels)
740
+
741
+ if average_log_prob:
742
+ return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
743
+ else:
744
+ return (per_token_logps * loss_mask).sum(-1)
745
+
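A minimal sketch of the masked sum/average computed above, using `log_softmax` + `gather` in place of `selective_log_softmax` (a memory-efficient equivalent from `trl.trainer.utils`):

    import torch

    logits = torch.randn(1, 4, 10)               # (batch, seq, vocab)
    labels = torch.tensor([[-100, -100, 5, 1]])  # -100 marks prompt tokens
    labels = labels[:, 1:].clone()               # shift: tokens < n predict n
    logits = logits[:, :-1, :]
    loss_mask = labels != -100
    labels[labels == -100] = 0                   # dummy ids, masked out below
    per_token = torch.log_softmax(logits, dim=-1).gather(-1, labels.unsqueeze(-1)).squeeze(-1)
    seq_logp = (per_token * loss_mask).sum(-1)   # or / loss_mask.sum(-1) for the average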
746
+ def concatenated_forward(
747
+ self, model: nn.Module, batch: dict[str, list | torch.LongTensor]
748
+ ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
749
+ """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
750
+
751
+ We do this to avoid doing two forward passes, because it's faster for FSDP.
752
+ """
753
+ concatenated_batch = self.concatenated_inputs(
754
+ batch,
755
+ is_encoder_decoder=self.is_encoder_decoder,
756
+ padding_value=self.pad_token_id,
757
+ device=self.accelerator.device,
758
+ )
759
+ len_chosen = batch["chosen_labels"].shape[0]
760
+
761
+ model_kwargs = (
762
+ {
763
+ "decoder_input_ids": self._shift_right(concatenated_batch["concatenated_labels"]),
764
+ }
765
+ if self.is_encoder_decoder
766
+ else {}
767
+ )
768
+
769
+ if self.aux_loss_enabled:
770
+ model_kwargs["output_router_logits"] = True
771
+
772
+ outputs = model(
773
+ concatenated_batch["concatenated_input_ids"],
774
+ attention_mask=concatenated_batch["concatenated_attention_mask"],
775
+ use_cache=False,
776
+ **model_kwargs,
777
+ )
778
+ all_logits = outputs.logits
779
+
780
+ def cross_entropy_loss(logits, labels):
781
+ if not self.is_encoder_decoder:
782
+ # Shift so that tokens < n predict n
783
+ logits = logits[..., :-1, :].contiguous()
784
+ labels = labels[..., 1:].contiguous()
785
+ # Flatten the tokens
786
+ loss_fct = nn.CrossEntropyLoss()
787
+ logits = logits.view(-1, logits.shape[-1])
788
+ labels = labels.view(-1)
789
+ # Enable model parallelism
790
+ labels = labels.to(logits.device)
791
+ loss = loss_fct(logits, labels)
792
+ return loss
793
+
794
+ labels = concatenated_batch["concatenated_labels"].clone()
795
+
796
+ if self.cpo_alpha == 0:
797
+ nll_loss = torch.tensor(0.0).to(self.accelerator.device)
798
+ else:
799
+ nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen])
800
+
801
+ all_logps = self.get_batch_logps(
802
+ all_logits,
803
+ concatenated_batch["concatenated_labels"],
804
+ average_log_prob=self.loss_type in ["ipo", "simpo"],
805
+ is_encoder_decoder=self.is_encoder_decoder,
806
+ )
807
+
808
+ chosen_logps = all_logps[:len_chosen]
809
+ rejected_logps = all_logps[len_chosen:]
810
+
811
+ chosen_logits = all_logits[:len_chosen]
812
+ rejected_logits = all_logits[len_chosen:]
813
+
814
+ if self.aux_loss_enabled:
815
+ return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, nll_loss, outputs.aux_loss)
816
+
817
+ return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, nll_loss)
818
+
819
+ def get_batch_loss_metrics(
820
+ self,
821
+ model,
822
+ batch: dict[str, list | torch.LongTensor],
823
+ train_eval: Literal["train", "eval"] = "train",
824
+ ):
825
+ """Compute the CPO loss and other metrics for the given batch of inputs for train or test."""
826
+ metrics = {}
827
+
828
+ forward_output = self.concatenated_forward(model, batch)
829
+ (
830
+ policy_chosen_logps,
831
+ policy_rejected_logps,
832
+ policy_chosen_logits,
833
+ policy_rejected_logits,
834
+ policy_nll_loss,
835
+ ) = forward_output[:5]
836
+ if self.aux_loss_enabled:
837
+ aux_loss = forward_output[5]
838
+
839
+ losses, chosen_rewards, rejected_rewards = self.cpo_loss(
840
+ policy_chosen_logps,
841
+ policy_rejected_logps,
842
+ )
843
+
844
+ loss = losses.mean() + self.cpo_alpha * policy_nll_loss
845
+ reward_accuracies = (chosen_rewards > rejected_rewards).float()
846
+
847
+ prefix = "eval_" if train_eval == "eval" else ""
848
+ metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(chosen_rewards).mean().item()
849
+ metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(rejected_rewards).mean().item()
850
+ metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(reward_accuracies).mean().item()
851
+ metrics[f"{prefix}rewards/margins"] = (
852
+ self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards).mean().item()
853
+ )
854
+ metrics[f"{prefix}logps/rejected"] = (
855
+ self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean().item()
856
+ )
857
+ metrics[f"{prefix}logps/chosen"] = (
858
+ self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean().item()
859
+ )
860
+ metrics[f"{prefix}logits/rejected"] = (
861
+ self.accelerator.gather_for_metrics(policy_rejected_logits.detach().mean()).mean().item()
862
+ )
863
+ metrics[f"{prefix}logits/chosen"] = (
864
+ self.accelerator.gather_for_metrics(policy_chosen_logits.detach().mean()).mean().item()
865
+ )
866
+ metrics[f"{prefix}nll_loss"] = self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean().item()
867
+
868
+ if self.aux_loss_enabled:
869
+ loss += self.aux_loss_coef * aux_loss
870
+
871
+ return loss, metrics
872
+
873
+ def compute_loss(
874
+ self,
875
+ model: PreTrainedModel | nn.Module,
876
+ inputs: dict[str, torch.Tensor | Any],
877
+ return_outputs=False,
878
+ num_items_in_batch=None,
879
+ ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
880
+ compute_loss_context_manager = (
881
+ autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
882
+ )
883
+
884
+ with compute_loss_context_manager:
885
+ loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
886
+
887
+ # force log the metrics
888
+ self.store_metrics(metrics, train_eval="train")
889
+
890
+ if return_outputs:
891
+ return (loss, metrics)
892
+ return loss
893
+
894
+ def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str:
895
+ """Generate samples from the model for the given batch of inputs."""
896
+
897
+ # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
898
+ # the torch amp context manager as some hidden states are silently casted to full precision.
899
+ generate_context_manager = (
900
+ autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
901
+ )
902
+
903
+ with generate_context_manager:
904
+ policy_output = model.generate(
905
+ input_ids=batch["prompt_input_ids"],
906
+ attention_mask=batch["prompt_attention_mask"],
907
+ max_length=self.max_length,
908
+ do_sample=True,
909
+ pad_token_id=self.processing_class.pad_token_id,
910
+ )
911
+
912
+ policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id)
913
+ policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True)
914
+
915
+ return policy_output_decoded
916
+
917
+ def prediction_step(
918
+ self,
919
+ model: PreTrainedModel | nn.Module,
920
+ inputs: dict[str, torch.Tensor | Any],
921
+ prediction_loss_only: bool,
922
+ ignore_keys: list[str] | None = None,
923
+ ):
924
+ if ignore_keys is None:
925
+ if hasattr(model, "config"):
926
+ ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
927
+ else:
928
+ ignore_keys = []
929
+
930
+ prediction_context_manager = (
931
+ autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
932
+ )
933
+
934
+ with torch.no_grad(), prediction_context_manager:
935
+ loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="eval")
936
+
937
+ # force log the metrics
938
+ self.store_metrics(metrics, train_eval="eval")
939
+
940
+ if prediction_loss_only:
941
+ return (loss.detach(), None, None)
942
+
943
+ # logits for the chosen and rejected samples from model
944
+ logits_dict = {
945
+ "eval_logits/chosen": metrics["eval_logits/chosen"],
946
+ "eval_logits/rejected": metrics["eval_logits/rejected"],
947
+ }
948
+ logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
949
+ logits = torch.tensor(logits, device=self.accelerator.device)
950
+ labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
951
+
952
+ return (loss.detach(), logits, labels)
953
+
954
+ def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
955
+ for key, value in metrics.items():
956
+ self._stored_metrics[train_eval][key].append(value)
957
+
958
+ def evaluation_loop(
959
+ self,
960
+ dataloader: DataLoader,
961
+ description: str,
962
+ prediction_loss_only: bool | None = None,
963
+ ignore_keys: list[str] | None = None,
964
+ metric_key_prefix: str = "eval",
965
+ ) -> EvalLoopOutput:
966
+ """
967
+ Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
968
+ `Trainer.evaluate()` and `Trainer.predict()`.
969
+
970
+ Works both with or without labels.
971
+ """
972
+
973
+ # Sample and save to game log if requested (for one batch to save time)
974
+ if self.generate_during_eval:
975
+ # Generate random indices within the range of the total number of samples
976
+ num_samples = len(dataloader.dataset)
977
+ random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size)
978
+
979
+ # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
980
+ random_batch_dataset = dataloader.dataset.select(random_indices)
981
+ random_batch = self.data_collator(random_batch_dataset)
982
+ random_batch = self._prepare_inputs(random_batch)
983
+
984
+ policy_output_decoded = self.generate_from_model(self.model, random_batch)
985
+
986
+ table = pd.DataFrame(
987
+ columns=["Prompt", "Policy"],
988
+ data=[
989
+ [prompt, pol[len(prompt) :]]
990
+ for prompt, pol in zip(random_batch["prompt"], policy_output_decoded, strict=True)
991
+ ],
992
+ )
993
+ if "wandb" in self.args.report_to:
994
+ wandb.log({"game_log": wandb.Table(data=table)})
995
+
996
+ if "comet_ml" in self.args.report_to:
997
+ log_table_to_comet_experiment(
998
+ name="game_log.csv",
999
+ table=table,
1000
+ )
1001
+
1002
+ # Base evaluation
1003
+ initial_output = super().evaluation_loop(
1004
+ dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix
1005
+ )
1006
+
1007
+ return initial_output
1008
+
1009
+ def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
1010
+ """
1011
+ Log `logs` on the various objects watching training, including stored metrics.
1012
+
1013
+ Args:
1014
+ logs (`dict[str, float]`):
1015
+ The values to log.
1016
+ start_time (`float`, *optional*):
1017
+ Start time of the training.
1018
+ """
1019
+ # logs either has 'loss' or 'eval_loss'
1020
+ train_eval = "train" if "loss" in logs else "eval"
1021
+ # Add averaged stored metrics to logs
1022
+ for key, metrics in self._stored_metrics[train_eval].items():
1023
+ logs[key] = torch.tensor(metrics).mean().item()
1024
+ del self._stored_metrics[train_eval]
1025
+ return super().log(logs, start_time)
1026
+
1027
+ def _shift_right(self, input_ids):
1028
+ if self.decoder_start_token_id is None:
1029
+ raise ValueError(
1030
+ "model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id."
1031
+ )
1032
+
1033
+ # shift inputs to the right
1034
+ if is_torch_fx_proxy(input_ids):
1035
+ # Item assignment is not supported natively for proxies.
1036
+ shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), self.decoder_start_token_id)
1037
+ shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
1038
+ else:
1039
+ shifted_input_ids = input_ids.new_zeros(input_ids.shape)
1040
+ shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
1041
+ shifted_input_ids[..., 0] = self.decoder_start_token_id
1042
+
1043
+ if self.pad_token_id is None:
1044
+ raise ValueError("model.config.pad_token_id has to be defined.")
1045
+ # replace possible -100 values in labels by `pad_token_id`
1046
+ shifted_input_ids.masked_fill_(shifted_input_ids == -100, self.pad_token_id)
1047
+
1048
+ return shifted_input_ids
1049
+
1050
+ # Ensure the model card is saved along with the checkpoint
1051
+ def _save_checkpoint(self, model, trial):
1052
+ if self.args.hub_model_id is None:
1053
+ model_name = Path(self.args.output_dir).name
1054
+ else:
1055
+ model_name = self.args.hub_model_id.split("/")[-1]
1056
+ self.create_model_card(model_name=model_name)
1057
+ super()._save_checkpoint(model, trial)
ICL/RL/trl_source/trl/experimental/gfpo/gfpo_config.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass, field
16
+
17
+ from ...trainer.grpo_config import GRPOConfig as _GRPOConfig
18
+
19
+
20
+ @dataclass
21
+ class GFPOConfig(_GRPOConfig):
22
+ num_remains_in_group: int | None = field(
23
+ default=None,
24
+ metadata={
25
+ "help": "Number of completions that remain in each group after the group filter function is applied. If given, `num_remains_in_group` must be >= 2."
26
+ },
27
+ )
28
+
29
+ def __post_init__(self):
30
+ super().__post_init__()
31
+
32
+ if self.num_remains_in_group is not None and self.num_remains_in_group >= self.num_generations:
33
+ raise ValueError(
34
+ f"`num_remains_in_group` ({self.num_remains_in_group}) must be less than `num_generations` ({self.num_generations})."
35
+ )
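A minimal sketch of the constraint enforced in `__post_init__` above (the import path is an assumption based on this diff's layout; values are illustrative):

    from trl.experimental.gfpo import GFPOConfig

    # Keep the top 4 of the 8 completions generated per prompt; a value >= 8
    # would raise the ValueError above.
    config = GFPOConfig(output_dir="gfpo-model", num_generations=8, num_remains_in_group=4)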
ICL/RL/trl_source/trl/experimental/gkd/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .gkd_config import GKDConfig
16
+ from .gkd_trainer import GKDTrainer
17
+
18
+
19
+ __all__ = ["GKDConfig", "GKDTrainer"]
ICL/RL/trl_source/trl/experimental/gkd/gkd_config.py ADDED
@@ -0,0 +1,112 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass, field
16
+ from typing import Any
17
+
18
+ from transformers import TrainingArguments
19
+
20
+ from ...trainer.sft_config import SFTConfig
21
+
22
+
23
+ @dataclass
24
+ class GKDConfig(SFTConfig):
25
+ """
26
+ Configuration class for [`experimental.gkd.GKDTrainer`].
27
+
28
+ This class includes only the parameters that are specific to GKD training. For a full list of training arguments,
29
+ please refer to the [`~transformers.TrainingArguments`] and [`SFTConfig`] documentation.
30
+
31
+ Args:
32
+ temperature (`float`, *optional*, defaults to `0.9`):
33
+ Temperature for sampling. The higher the temperature, the more random the completions.
34
+ lmbda (`float`, *optional*, defaults to `0.5`):
35
+ Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy
36
+ student-generated outputs).
37
+ beta (`float`, *optional*, defaults to `0.5`):
38
+ Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence loss. When
39
+ beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence.
40
+ max_new_tokens (`int`, *optional*, defaults to `128`):
41
+ Maximum number of tokens to generate per completion.
42
+ teacher_model_name_or_path (`str`, *optional*):
43
+ Model name or path of the teacher model. If `None`, the teacher model will be the same as the model being
44
+ trained.
45
+ teacher_model_init_kwargs (`dict[str, Any]`, *optional*):
46
+ Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
47
+ from a string.
48
+ disable_dropout (`bool`, *optional*, defaults to `True`):
49
+ Whether to disable dropout in the model.
50
+ seq_kd (`bool`, *optional*, defaults to `False`):
51
+ Whether to perform Sequence-Level KD (can be viewed as supervised fine-tuning on
52
+ teacher-generated output).
53
+ """
54
+
55
+ _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["teacher_model_init_kwargs"]
56
+
57
+ temperature: float = field(
58
+ default=0.9,
59
+ metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."},
60
+ )
61
+ lmbda: float = field(
62
+ default=0.5,
63
+ metadata={
64
+ "help": "Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy "
65
+ "student-generated outputs)."
66
+ },
67
+ )
68
+ beta: float = field(
69
+ default=0.5,
70
+ metadata={
71
+ "help": "Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence "
72
+ "loss. When beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL "
73
+ "Divergence."
74
+ },
75
+ )
76
+ max_new_tokens: int = field(
77
+ default=128,
78
+ metadata={"help": "Maximum number of tokens to generate per completion."},
79
+ )
80
+ teacher_model_name_or_path: str | None = field(
81
+ default=None,
82
+ metadata={
83
+ "help": "Model name or path of the teacher model. If `None`, the teacher model will be the same as the "
84
+ "model being trained."
85
+ },
86
+ )
87
+ teacher_model_init_kwargs: dict[str, Any] | None = field(
88
+ default=None,
89
+ metadata={
90
+ "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
91
+ "teacher model from a string."
92
+ },
93
+ )
94
+ disable_dropout: bool = field(
95
+ default=True,
96
+ metadata={"help": "Whether to disable dropouts in `model`."},
97
+ )
98
+ seq_kd: bool = field(
99
+ default=False,
100
+ metadata={
101
+ "help": "Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised "
102
+ "FT on teacher-generated output)."
103
+ },
104
+ )
105
+
106
+ def __post_init__(self):
107
+ super().__post_init__()
108
+ # check lmbda and beta are in the range [0, 1]
109
+ if self.lmbda < 0.0 or self.lmbda > 1.0:
110
+ raise ValueError("lmbda must be in the range [0.0, 1.0].")
111
+ if self.beta < 0.0 or self.beta > 1.0:
112
+ raise ValueError("beta must be in the range [0.0, 1.0].")
ICL/RL/trl_source/trl/experimental/gold/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .gold_config import GOLDConfig
16
+ from .gold_trainer import GOLDTrainer
17
+
18
+
19
+ __all__ = ["GOLDConfig", "GOLDTrainer"]
ICL/RL/trl_source/trl/experimental/gold/gold.py ADDED
@@ -0,0 +1,155 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # /// script
16
+ # dependencies = [
17
+ # "trl @ git+https://github.com/huggingface/trl.git",
18
+ # "peft",
19
+ # "trackio",
20
+ # ]
21
+ # ///
22
+
23
+ # docstyle-ignore
24
+ """
25
+ # Full training:
26
+ python trl/experimental/gold/gold.py \
27
+ --model_name_or_path meta-llama/Llama-3.2-1B-Instruct \
28
+ --teacher_model_name_or_path Qwen/Qwen2-1.5B-Instruct \
29
+ --dataset_name trl-lib/chatbot_arena_completions \
30
+ --learning_rate 2e-5 \
31
+ --per_device_train_batch_size 4 \
32
+ --gradient_accumulation_steps 8 \
33
+ --output_dir gold-model \
34
+ --num_train_epochs 1 \
35
+ --push_to_hub
36
+
37
+ # LoRA:
38
+ python trl/experimental/gold/gold.py \
39
+ --model_name_or_path meta-llama/Llama-3.2-1B-Instruct \
40
+ --teacher_model_name_or_path Qwen/Qwen2-1.5B-Instruct \
41
+ --dataset_name trl-lib/chatbot_arena_completions \
42
+ --learning_rate 2e-4 \
43
+ --per_device_train_batch_size 4 \
44
+ --gradient_accumulation_steps 8 \
45
+ --output_dir gold-model \
46
+ --num_train_epochs 1 \
47
+ --push_to_hub \
48
+ --use_peft \
49
+ --lora_r 64 \
50
+ --lora_alpha 16
51
+ """
52
+
53
+ import logging
54
+
55
+ from datasets import load_dataset
56
+ from transformers import AutoTokenizer, GenerationConfig
57
+
58
+ from trl import (
59
+ LogCompletionsCallback,
60
+ ModelConfig,
61
+ ScriptArguments,
62
+ TrlParser,
63
+ get_kbit_device_map,
64
+ get_peft_config,
65
+ get_quantization_config,
66
+ )
67
+ from trl.experimental.gold.gold_config import GOLDConfig
68
+ from trl.experimental.gold.gold_trainer import GOLDTrainer
69
+
70
+
71
+ logger = logging.getLogger(__name__)
72
+
73
+
74
+ if __name__ == "__main__":
75
+ parser = TrlParser((ScriptArguments, GOLDConfig, ModelConfig))
76
+ script_args, training_args, model_args = parser.parse_args_and_config()
77
+
78
+ ################
79
+ # Model & Tokenizer
80
+ ################
81
+ quantization_config = get_quantization_config(model_args)
82
+ model_kwargs = dict(
83
+ revision=training_args.student_model_revision,
84
+ trust_remote_code=model_args.trust_remote_code,
85
+ attn_implementation=model_args.attn_implementation,
86
+ torch_dtype=model_args.dtype,
87
+ use_cache=False if training_args.gradient_checkpointing else True,
88
+ device_map=get_kbit_device_map() if quantization_config is not None else None,
89
+ quantization_config=quantization_config,
90
+ )
91
+ training_args.model_init_kwargs = model_kwargs
92
+
93
+ if training_args.teacher_tokenizer_name_or_path is None and training_args.use_uld_loss:
94
+ training_args.teacher_tokenizer_name_or_path = training_args.teacher_model_name_or_path
95
+ teacher_model_kwargs = dict(
96
+ revision=model_args.model_revision,
97
+ trust_remote_code=model_args.trust_remote_code,
98
+ attn_implementation=model_args.attn_implementation,
99
+ torch_dtype=model_args.dtype,
100
+ use_cache=True,
101
+ device_map=get_kbit_device_map() if quantization_config is not None else None,
102
+ quantization_config=quantization_config,
103
+ )
104
+ training_args.teacher_model_init_kwargs = teacher_model_kwargs
105
+
106
+ tokenizer = AutoTokenizer.from_pretrained(
107
+ model_args.model_name_or_path,
108
+ revision=model_args.model_revision,
109
+ trust_remote_code=model_args.trust_remote_code,
110
+ padding_side="left",
111
+ )
112
+ if tokenizer.pad_token is None:
113
+ tokenizer.pad_token = tokenizer.eos_token
114
+
115
+ ################
116
+ # Dataset
117
+ ################
118
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
119
+
120
+ ################
121
+ # Training
122
+ ################
123
+ # Handle eval dataset - check if test split exists, fallback to validation or None
124
+ eval_dataset = None
125
+ if training_args.eval_strategy != "no":
126
+ if script_args.dataset_test_split in dataset:
127
+ eval_dataset = dataset[script_args.dataset_test_split]
128
+ elif "validation" in dataset:
129
+ eval_dataset = dataset["validation"]
130
+ elif "dev" in dataset:
131
+ eval_dataset = dataset["dev"]
132
+
133
+ trainer = GOLDTrainer(
134
+ model=model_args.model_name_or_path,
135
+ teacher_model=training_args.teacher_model_name_or_path,
136
+ args=training_args,
137
+ train_dataset=dataset[script_args.dataset_train_split],
138
+ eval_dataset=eval_dataset,
139
+ processing_class=tokenizer,
140
+ peft_config=get_peft_config(model_args),
141
+ )
142
+
143
+ if training_args.eval_strategy != "no":
144
+ generation_config = GenerationConfig(
145
+ max_new_tokens=training_args.max_completion_length, do_sample=True, temperature=training_args.temperature
146
+ )
147
+ completions_callback = LogCompletionsCallback(trainer, generation_config, num_prompts=8)
148
+ trainer.add_callback(completions_callback)
149
+
150
+ trainer.train()
151
+
152
+ # Save and push to hub
153
+ trainer.save_model(training_args.output_dir)
154
+ if training_args.push_to_hub:
155
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
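The same run can be driven programmatically instead of via the CLI; a minimal sketch reusing the names from the usage docstring above (the full script additionally wires the tokenizer, PEFT, and model init kwargs):

from datasets import load_dataset
from trl.experimental.gold import GOLDConfig, GOLDTrainer

args = GOLDConfig(
    output_dir="gold-model",
    teacher_model_name_or_path="Qwen/Qwen2-1.5B-Instruct",
    lmbda=0.5,                 # fraction of on-policy student-generated data
    beta=0.5,                  # generalized JSD interpolation coefficient
    max_completion_length=128,
)
dataset = load_dataset("trl-lib/chatbot_arena_completions")
trainer = GOLDTrainer(
    model="meta-llama/Llama-3.2-1B-Instruct",
    teacher_model=args.teacher_model_name_or_path,
    args=args,
    train_dataset=dataset["train"],
)
trainer.train()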
ICL/RL/trl_source/trl/experimental/gold/gold_config.py ADDED
@@ -0,0 +1,419 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass, field
16
+ from typing import Any
17
+
18
+ from transformers import TrainingArguments
19
+
20
+ from ...trainer.sft_config import SFTConfig
21
+
22
+
23
+ @dataclass
24
+ class GOLDConfig(SFTConfig):
25
+ r"""
26
+ Configuration class for [`GOLDTrainer`].
27
+
28
+ This class includes only the parameters that are specific to GOLD training. For a full list of training arguments,
29
+ please refer to the [`~transformers.TrainingArguments`] and [`SFTConfig`] documentation.
30
+
31
+ Args:
32
+ temperature (`float`, *optional*, defaults to `0.9`):
33
+ Temperature for sampling. The higher the temperature, the more random the completions.
34
+ lmbda (`float`, *optional*, defaults to `0.5`):
35
+ Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy
36
+ student-generated outputs).
37
+ beta (`float`, *optional*, defaults to `0.5`):
38
+ Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence loss. When
39
+ beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence.
40
+ max_completion_length (`int`, *optional*, defaults to `128`):
41
+ Maximum number of tokens to generate per completion.
42
+ teacher_model_name_or_path (`str` or `None`, *optional*, defaults to `None`):
43
+ Model name or path of the teacher model. If `None`, the teacher model will be the same as the model being
44
+ trained.
45
+ teacher_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
46
+ Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
47
+ from a string.
48
+ teacher_tokenizer_name_or_path (`str` or `None`, *optional*, defaults to `None`):
49
+ Tokenizer name or path for the teacher model. If None when using ULD loss, will use the same tokenizer as
50
+ the student model (not recommended for cross-tokenizer distillation).
51
+ disable_dropout (`bool`, *optional*, defaults to `True`):
52
+ Whether to disable dropout in the model.
53
+ seq_kd (`bool`, *optional*, defaults to `False`):
54
+ Whether to perform Sequence-Level KD (can be viewed as supervised fine-tuning on
55
+ teacher-generated output).
56
+ use_uld_loss (`bool`, *optional*, defaults to `False`):
57
+ Whether to use Universal Logit Distillation (ULD) loss instead of Generalized Jensen-Shannon Divergence
58
+ loss.
59
+ uld_crossentropy_weight (`float`, *optional*, defaults to `0.0`):
60
+ Weight for the cross-entropy loss component in ULD loss. If 0, only ULD distillation loss is used.
61
+ uld_distillation_weight (`float`, *optional*, defaults to `1.0`):
62
+ Weight for the distillation loss component in ULD loss.
63
+ uld_student_temperature (`float`, *optional*, defaults to `1.0`):
64
+ Temperature for student logits in ULD loss computation.
65
+ uld_teacher_temperature (`float`, *optional*, defaults to `1.0`):
66
+ Temperature for teacher logits in ULD loss computation.
67
+ uld_skip_student_eos (`bool`, *optional*, defaults to `True`):
68
+ Whether to skip EOS token for student in ULD loss computation.
69
+ uld_skip_teacher_eos (`bool`, *optional*, defaults to `True`):
70
+ Whether to skip EOS token for teacher in ULD loss computation.
71
+ use_vllm (`bool`, *optional*, defaults to `False`):
72
+ Whether to use vLLM for generating completions from the student model. Requires `vllm` to be installed.
73
+ vllm_mode (`str`, *optional*, defaults to `"server"`):
74
+ Mode for student vLLM integration. Either `"server"` (connect to a running TRL vLLM server) or `"colocate"`
75
+ (run vLLM in the same process).
76
+ vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`):
77
+ Host of the vLLM server for the student model (if `vllm_mode="server"`).
78
+ vllm_server_port (`int`, *optional*, defaults to `8001`):
79
+ Port of the vLLM server for the student model (if `vllm_mode="server"`).
80
+ vllm_server_timeout (`float`, *optional*, defaults to `240.0`):
81
+ Timeout for connecting to the student vLLM server (if `vllm_mode="server"`).
82
+ vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.9`):
83
+ GPU memory utilization for the colocated student vLLM engine (if `vllm_mode="colocate"`). It is recommended
84
+ to set this to a low value if the student and teacher models share the same GPU.
85
+ vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`):
86
+ Tensor parallel size for the colocated student vLLM engine (if `vllm_mode="colocate"`).
87
+ vllm_structured_outputs_regex (`str` or `None`, *optional*, defaults to `None`):
88
+ Regex for vLLM structured outputs for the student model.
89
+ vllm_sync_frequency (`int`, *optional*, defaults to `1`):
90
+ Frequency (in training steps) to synchronize student model weights to vLLM engine. Set to 1 to sync after
91
+ every step.
92
+ vllm_enable_sleep_mode (`bool`, *optional*, defaults to `False`):
93
+ Enable vLLM sleep mode to offload student weights/cache during the optimizer step. Keeps GPU memory usage
94
+ low, but waking the engine adds host–device transfer latency.
95
+ """
96
+
97
+ _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["teacher_model_init_kwargs"]
98
+
99
+ # Parameters whose default values are overridden from TrainingArguments
100
+ learning_rate: float = field(
101
+ default=1e-7,
102
+ metadata={"help": "The initial learning rate for AdamW."},
103
+ )
104
+
105
+ # GOLD-specific parameters
106
+ temperature: float = field(
107
+ default=0.9,
108
+ metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."},
109
+ )
110
+ top_p: float = field(
111
+ default=0.95,
112
+ metadata={
113
+ "help": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to "
114
+ "`top_p` or higher are kept for generation."
115
+ },
116
+ )
117
+ top_k: int = field(
118
+ default=0,
119
+ metadata={
120
+ "help": "Number of highest probability vocabulary tokens to keep for top-k-filtering. If `0`, "
121
+ "top-k-filtering is disabled and all tokens are considered."
122
+ },
123
+ )
124
+ lmbda: float = field(
125
+ default=0.5,
126
+ metadata={
127
+ "help": "Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy "
128
+ "student-generated outputs)."
129
+ },
130
+ )
131
+ beta: float = field(
132
+ default=0.5,
133
+ metadata={
134
+ "help": "Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence "
135
+ "loss. When beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL "
136
+ "Divergence."
137
+ },
138
+ )
139
+ max_completion_length: int = field(
140
+ default=128,
141
+ metadata={"help": "Maximum number of tokens to generate per completion."},
142
+ )
143
+ student_model_revision: str = field(
144
+ default="main",
145
+ metadata={
146
+ "help": "Revision of the student model to use. If not specified, the default revision of the model will be used."
147
+ },
148
+ )
149
+ teacher_model_name_or_path: str | None = field(
150
+ default=None,
151
+ metadata={
152
+ "help": "Model name or path of the teacher model. If `None`, the teacher model will be the same as the "
153
+ "model being trained."
154
+ },
155
+ )
156
+ teacher_model_init_kwargs: dict[str, Any] | None = field(
157
+ default=None,
158
+ metadata={
159
+ "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
160
+ "teacher model from a string."
161
+ },
162
+ )
163
+ teacher_tokenizer_name_or_path: str | None = field(
164
+ default=None,
165
+ metadata={
166
+ "help": "Tokenizer name or path for the teacher model. If None when using ULD loss, will use the same "
167
+ "tokenizer as the student model (not recommended for cross-tokenizer distillation)."
168
+ },
169
+ )
170
+ disable_dropout: bool = field(
171
+ default=True,
172
+ metadata={"help": "Whether to disable dropouts in `model`."},
173
+ )
174
+ seq_kd: bool = field(
175
+ default=False,
176
+ metadata={
177
+ "help": "Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised "
178
+ "FT on teacher-generated output)."
179
+ },
180
+ )
181
+ steps_per_generation: int | None = field(
182
+ default=None,
183
+ metadata={
184
+ "help": "Number of optimization steps per generation. If `None`, it defaults to gradient_accumulation_steps."
185
+ },
186
+ )
187
+
188
+ # ULD Loss parameters
189
+ use_uld_loss: bool = field(
190
+ default=False,
191
+ metadata={
192
+ "help": "Whether to use Universal Logit Distillation (ULD) loss instead of Generalized Jensen-Shannon Divergence loss."
193
+ },
194
+ )
195
+ use_extended_uld: bool = field(
196
+ default=True,
197
+ metadata={
198
+ "help": (
199
+ "Whether to enable extended ULD alignment that uses tokenizers to align and merge token "
200
+ "probabilities across student and teacher tokenizations. When True, the trainer will compute "
201
+ "token mappings and merge probabilities for split tokens; when False, ULD will use simple "
202
+ "positional truncation like in the original ULD paper."
203
+ )
204
+ },
205
+ )
206
+ uld_use_hybrid_loss: bool = field(
207
+ default=False,
208
+ metadata={
209
+ "help": (
210
+ "Whether to use a hybrid loss that combines ULD loss and JSD loss. When True, the final loss is a "
211
+ "a combination of JSD for known token mappings and ULD for unknown token mappings."
212
+ )
213
+ },
214
+ )
215
+ uld_hybrid_matched_weight: float | None = field(
216
+ default=None,
217
+ metadata={
218
+ "help": (
219
+ "Weight for the matched token loss component when using hybrid ULD + JSD loss. This weight scales "
220
+ "the JSD loss computed over tokens that have a direct mapping between student and teacher "
221
+ "tokenizations. If None, uses adaptive weighting based on vocabulary overlap. Must be set together "
222
+ "with uld_hybrid_unmatched_weight (both None or both float)."
223
+ )
224
+ },
225
+ )
226
+ uld_hybrid_unmatched_weight: float | None = field(
227
+ default=None,
228
+ metadata={
229
+ "help": (
230
+ "Weight for the unmatched token loss component when using hybrid ULD + JSD loss. This weight scales "
231
+ "the ULD loss computed over tokens that do not have a direct mapping between student and teacher "
232
+ "tokenizations. If None, uses adaptive weighting based on vocabulary overlap. Must be set together "
233
+ "with uld_hybrid_matched_weight (both None or both float)."
234
+ )
235
+ },
236
+ )
237
+ uld_crossentropy_weight: float = field(
238
+ default=0.0,
239
+ metadata={"help": "Weight for the cross-entropy loss component in ULD loss."},
240
+ )
241
+ uld_distillation_weight: float = field(
242
+ default=1.0,
243
+ metadata={"help": "Weight for the distillation loss component in ULD loss."},
244
+ )
245
+ uld_student_temperature: float = field(
246
+ default=1.0,
247
+ metadata={"help": "Temperature for student logits in ULD loss computation."},
248
+ )
249
+ uld_teacher_temperature: float = field(
250
+ default=1.0,
251
+ metadata={"help": "Temperature for teacher logits in ULD loss computation."},
252
+ )
253
+
254
+ uld_skip_student_eos: bool = field(
255
+ default=True,
256
+ metadata={"help": "Whether to skip EOS token for student in ULD loss computation."},
257
+ )
258
+ uld_skip_teacher_eos: bool = field(
259
+ default=True,
260
+ metadata={"help": "Whether to skip EOS token for teacher in ULD loss computation."},
261
+ )
262
+
263
+ # transformers paged attention
264
+ use_transformers_paged: bool = field(
265
+ default=False,
266
+ metadata={
267
+ "help": "Whether to use the `transformers` paged implementation for generation. If set to `True`, the "
268
+ "`transformers` paged implementation will be used for generation instead of the default padded "
269
+ "implementation."
270
+ },
271
+ )
272
+
273
+ # vLLM parameters
274
+ use_vllm: bool = field(
275
+ default=False,
276
+ metadata={"help": "Whether to use vLLM for generating completions. Requires `vllm` to be installed."},
277
+ )
278
+ vllm_mode: str = field(
279
+ default="server",
280
+ metadata={
281
+ "help": 'Mode for vLLM integration. Either "server" (connect to a running TRL vLLM server) or "colocate" (run vLLM in the same process).'
282
+ },
283
+ )
284
+ vllm_server_host: str = field(
285
+ default="0.0.0.0",
286
+ metadata={"help": 'Host of the vLLM server when `vllm_mode="server"`.'},
287
+ )
288
+ vllm_server_port: int = field(
289
+ default=8001,
290
+ metadata={"help": 'Port of the vLLM server when `vllm_mode="server"`.'},
291
+ )
292
+ vllm_server_timeout: float = field(
293
+ default=240.0,
294
+ metadata={"help": 'Timeout (in seconds) for connecting to the vLLM server when `vllm_mode="server"`.'},
295
+ )
296
+ vllm_gpu_memory_utilization: float = field(
297
+ default=0.9,
298
+ metadata={
299
+ "help": 'GPU memory utilization for the colocated vLLM engine when `vllm_mode="colocate"`. Lower values reduce contention when sharing a device with the student/teacher models.'
300
+ },
301
+ )
302
+ vllm_tensor_parallel_size: int = field(
303
+ default=1,
304
+ metadata={"help": 'Tensor parallel size for the colocated vLLM engine when `vllm_mode="colocate"`.'},
305
+ )
306
+ vllm_structured_outputs_regex: str | None = field(
307
+ default=None,
308
+ metadata={"help": "Regex pattern used for vLLM structured outputs (optional)."},
309
+ )
310
+ vllm_sync_frequency: int = field(
311
+ default=1,
312
+ metadata={
313
+ "help": "Frequency (in training steps) to synchronize model weights to the vLLM engine. Set to 1 to sync after every step."
314
+ },
315
+ )
316
+ vllm_enable_sleep_mode: bool = field(
317
+ default=False,
318
+ metadata={
319
+ "help": "Enable vLLM sleep mode to offload student weights/cache during the optimizer step. Keeps GPU "
320
+ "memory usage low, but waking the engine adds host–device transfer latency."
321
+ },
322
+ )
323
+ # Parameters that control the logging
324
+ log_completions: bool = field(
325
+ default=False,
326
+ metadata={
327
+ "help": "Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is "
328
+ "installed, it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`."
329
+ },
330
+ )
331
+ log_completions_steps: int = field(
332
+ default=100,
333
+ metadata={
334
+ "help": "Number of steps between logging (prompt, completion) pairs. Only used if `log_completions` is "
335
+ "set to `True`."
336
+ },
337
+ )
338
+ num_completions_to_print: int | None = field(
339
+ default=None,
340
+ metadata={"help": "Number of completions to print with `rich`. If `None`, all completions are logged."},
341
+ )
342
+ wandb_entity: str | None = field(
343
+ default=None,
344
+ metadata={"help": ("The entity to store runs under.")},
345
+ )
346
+ wandb_project: str | None = field(
347
+ default=None,
348
+ metadata={"help": ("The project to store runs under.")},
349
+ )
350
+ wandb_run_group: str | None = field(
351
+ default=None,
352
+ metadata={"help": ("The group to store runs under.")},
353
+ )
354
+ wandb_log_unique_prompts: bool = field(
355
+ default=True,
356
+ metadata={
357
+ "help": ("Whether to log the unique prompts to wandb. This will create a new run for each unique prompt.")
358
+ },
359
+ )
360
+ callbacks: list[str] = field(
361
+ default_factory=lambda: [],
362
+ metadata={"help": "The callbacks to run during training."},
363
+ )
364
+ hub_model_revision: str | None = field(
365
+ default="main", metadata={"help": "The Hub model branch to push the model to."}
366
+ )
368
+ overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."})
369
+ push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."})
370
+ trl_project: str = field(
371
+ default="smollm3",
372
+ metadata={
373
+ "help": "The TRL project to use for evaluation. This is used to determine the path to the evaluation script."
374
+ },
375
+ )
376
+
377
+ def __post_init__(self):
378
+ super().__post_init__()
379
+ # check lmbda and beta are in the range [0, 1]
380
+ if self.lmbda < 0.0 or self.lmbda > 1.0:
381
+ raise ValueError("lmbda must be in the range [0.0, 1.0].")
382
+ if self.beta < 0.0 or self.beta > 1.0:
383
+ raise ValueError("beta must be in the range [0.0, 1.0].")
384
+
385
+ # Validate that max_length is sufficient for max_completion_length
386
+ if self.max_length is not None and self.max_completion_length >= self.max_length:
387
+ raise ValueError(
388
+ f"max_completion_length ({self.max_completion_length}) must be smaller than max_length ({self.max_length}) "
389
+ f"to leave room for the prompt. Consider increasing max_length or reducing max_completion_length."
390
+ )
391
+
392
+ if self.steps_per_generation is None:
393
+ self.steps_per_generation = self.gradient_accumulation_steps
394
+
395
+ # Validate ULD parameters
396
+ if self.use_uld_loss:
397
+ if self.uld_crossentropy_weight < 0.0:
398
+ raise ValueError("uld_crossentropy_weight must be non-negative.")
399
+ if self.uld_distillation_weight < 0.0:
400
+ raise ValueError("uld_distillation_weight must be non-negative.")
401
+ if self.uld_student_temperature <= 0.0:
402
+ raise ValueError("uld_student_temperature must be positive.")
403
+ if self.uld_teacher_temperature <= 0.0:
404
+ raise ValueError("uld_teacher_temperature must be positive.")
405
+
406
+ # Validate hybrid loss weights - both must be None or both must be set
407
+ if self.uld_use_hybrid_loss:
408
+ if (self.uld_hybrid_matched_weight is None) != (self.uld_hybrid_unmatched_weight is None):
409
+ raise ValueError(
410
+ "uld_hybrid_matched_weight and uld_hybrid_unmatched_weight must both be None (for adaptive "
411
+ "weighting) or both be set to numeric values. Got uld_hybrid_matched_weight="
412
+ f"{self.uld_hybrid_matched_weight} and uld_hybrid_unmatched_weight="
413
+ f"{self.uld_hybrid_unmatched_weight}."
414
+ )
415
+ if self.uld_hybrid_matched_weight is not None:
416
+ if self.uld_hybrid_matched_weight < 0.0:
417
+ raise ValueError("uld_hybrid_matched_weight must be non-negative.")
418
+ if self.uld_hybrid_unmatched_weight < 0.0:
419
+ raise ValueError("uld_hybrid_unmatched_weight must be non-negative.")
ICL/RL/trl_source/trl/experimental/grpo_with_replay_buffer/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .grpo_with_replay_buffer_config import GRPOWithReplayBufferConfig
16
+ from .grpo_with_replay_buffer_trainer import GRPOWithReplayBufferTrainer, ReplayBuffer
ICL/RL/trl_source/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_config.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass, field
16
+
17
+ from ...trainer.grpo_config import GRPOConfig
18
+
19
+
20
+ @dataclass
21
+ class GRPOWithReplayBufferConfig(GRPOConfig):
22
+ """
23
+ New Parameters:
24
+ replay_buffer_size (`int`, *optional*, defaults to `64`):
25
+ A cache that stores the rollouts with the highest advantage scores and variance per group. If a new
26
+ group has 0 variance, it is replaced with a group sampled from the replay buffer.
27
+ """
28
+
29
+ replay_buffer_size: int = field(
30
+ default=64,
31
+ metadata={
32
+ "help": "A cache that stores the rollouts with the highest advantage scores and variance per group. If a new group has 0 variance, it is replaced with a group sampled from the replay buffer."
33
+ },
34
+ )
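A minimal sketch of enabling the buffer; per the trainer's `__init__` in the next file, `replay_buffer_size=0` disables it and recovers plain GRPO behavior:

from trl.experimental.grpo_with_replay_buffer import (
    GRPOWithReplayBufferConfig,
    GRPOWithReplayBufferTrainer,
)

config = GRPOWithReplayBufferConfig(
    output_dir="grpo-replay",
    replay_buffer_size=64,  # 0 would turn the buffer off entirely
)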
ICL/RL/trl_source/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_trainer.py ADDED
@@ -0,0 +1,731 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import heapq
16
+ from typing import Any
17
+
18
+ import torch
19
+ from accelerate.utils import gather_object
20
+
21
+ from ...data_utils import apply_chat_template, prepare_multimodal_messages
22
+ from ...models.utils import disable_gradient_checkpointing
23
+ from ...trainer.grpo_trainer import GRPOTrainer
24
+ from ...trainer.utils import nanmax, nanmin, nanstd, pad
25
+ from .grpo_with_replay_buffer_config import GRPOWithReplayBufferConfig
26
+
27
+
28
+ class ReplayBuffer:
29
+ """
30
+ A simple replay buffer to store and sample previously seen rollouts.
31
+ """
32
+
33
+ def __init__(self, max_size: int):
34
+ self.max_size = max_size
35
+ self.heap = []  # Min-heap of (score, tiebreaker, data) tuples
36
+
37
+ def add(self, scores: list[float], data: list[dict]):
38
+ for score, datum in zip(scores, data, strict=True):
39
+ if len(self.heap) < self.max_size:
40
+ heapq.heappush(self.heap, (score, id(datum), datum))  # id() breaks score ties so dicts are never compared
41
+ else:
42
+ # Only add if score is better than worst (minimum) item
43
+ if score > self.heap[0][0]:
44
+ heapq.heapreplace(self.heap, (score, id(datum), datum))
45
+
46
+ def sample(self, num_samples: int) -> list[dict[str, torch.Tensor]] | None:
47
+ if not self.heap:
48
+ return None
49
+
50
+ # Sample by normalized scores
51
+ scores = torch.tensor([item[0] for item in self.heap], dtype=torch.float32)
52
+ probabilities = scores / scores.sum()
53
+ replacement = False
54
+ if num_samples > len(self.heap):
55
+ replacement = True
56
+ chosen_indices = torch.multinomial(probabilities, num_samples, replacement=replacement).tolist()
57
+ return [self.heap[i][-1] for i in chosen_indices]
58
+
59
+
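# Standalone sketch of the buffer semantics above (illustrative scores and payloads):
#
#   buf = ReplayBuffer(max_size=2)
#   buf.add(scores=[1.0, 3.0, 2.0], data=[{"id": "a"}, {"id": "b"}, {"id": "c"}])
#   # The min-heap keeps the two highest-scoring entries ("b" and "c"); "a" is evicted.
#   picked = buf.sample(num_samples=2)  # drawn with probability proportional to score
#
# Sampling falls back to drawing with replacement only when more samples are
# requested than the buffer currently holds.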
60
+ class GRPOWithReplayBufferTrainer(GRPOTrainer):
61
+ def __init__(self, args: GRPOWithReplayBufferConfig | None = None, **kwargs):
62
+ super().__init__(args=args, **kwargs)
63
+ self.replay_buffer = ReplayBuffer(args.replay_buffer_size) if args.replay_buffer_size > 0 else None
64
+
65
+ def _generate_and_score_completions(
66
+ self, inputs: list[dict[str, torch.Tensor | Any]]
67
+ ) -> dict[str, torch.Tensor | Any]:
68
+ device = self.accelerator.device
69
+ mode = "train" if self.model.training else "eval"
70
+
71
+ prompts = [x["prompt"] for x in inputs]
72
+
73
+ if "images" in inputs[0]:
74
+ images = [example.get("images") for example in inputs]
75
+ elif "image" in inputs[0]:
76
+ images = [[example.get("image")] if example.get("image") is not None else None for example in inputs]
77
+ else:
78
+ images = None
79
+ # Transformers requires at least one image in the batch, otherwise it throws an error
80
+ if images is not None and all(img_list == [] for img_list in images):
81
+ images = None
82
+
83
+ # If the prompts are conversational and the inputs contain images, we need to convert the prompts from
84
+ # [{"role": "user", "content": "What color is the sky?"}] to
85
+ # [{"role": "user", "content": [{"type": "image", "image": <Image>}, {"type": "text", "text": "What color is the sky?"}]}]
86
+ if images is not None:
87
+ prompts = [
88
+ prepare_multimodal_messages(prompt, image_list)
89
+ for prompt, image_list in zip(prompts, images, strict=True)
90
+ ]
91
+
92
+ (
93
+ prompt_ids_list,
94
+ completion_ids_list,
95
+ tool_mask_list,
96
+ completions,
97
+ num_items_in_batch,
98
+ sampling_per_token_logps_list,
99
+ extra_fields,
100
+ ) = self._generate(prompts)
101
+
102
+ # Convert lists of token IDs to padded tensors
103
+ prompt_ids = [torch.tensor(ids, device=device) for ids in prompt_ids_list]
104
+ prompt_mask = [torch.ones_like(ids, dtype=torch.long) for ids in prompt_ids]
105
+ prompt_ids = pad(prompt_ids, padding_value=self.pad_token_id, padding_side="left")
106
+ prompt_mask = pad(prompt_mask, padding_value=0, padding_side="left")
107
+ completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids_list]
108
+ completion_mask = [torch.ones_like(ids, dtype=torch.long) for ids in completion_ids]
109
+ completion_ids = pad(completion_ids, padding_value=self.pad_token_id, padding_side="right")
110
+ completion_mask = pad(completion_mask, padding_value=0, padding_side="right")
111
+ if sampling_per_token_logps_list is not None:
112
+ sampling_per_token_logps = [torch.tensor(logps, device=device) for logps in sampling_per_token_logps_list]
113
+ sampling_per_token_logps = pad(sampling_per_token_logps, padding_value=0.0, padding_side="right")
114
+ else:
115
+ sampling_per_token_logps = None
116
+ if self.tools:
117
+ tool_mask = [torch.tensor(mask, device=device) for mask in tool_mask_list]
118
+ tool_mask = pad(tool_mask, padding_value=1, padding_side="right") # 0 for tool result tokens, 1 elsewhere
119
+
120
+ # If mask_truncated_completions is enabled, zero out truncated completions in completion_mask
121
+ if self.mask_truncated_completions:
122
+ eos_and_pad = [self.eos_token_id, self.pad_token_id]
123
+ is_truncated = torch.tensor([ids[-1] not in eos_and_pad for ids in completion_ids_list], device=device)
124
+ completion_mask = completion_mask * (~is_truncated).unsqueeze(1).int()
125
+
126
+ # Concatenate prompt_mask with completion_mask for logit computation
127
+ prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1) # (B, P+C)
128
+ attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) # (B, P+C)
129
+
130
+ logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens
131
+ batch_size = self.args.per_device_train_batch_size if mode == "train" else self.args.per_device_eval_batch_size
132
+
133
+ num_images = [len(img_list) for img_list in images] if images is not None else None
134
+
135
+ # Get forward_kwargs for models with multimodal inputs
136
+ if images is not None:
137
+ prompts_text = [
138
+ apply_chat_template(
139
+ {"prompt": prompt}, self.processing_class, tools=self.tools, **self.chat_template_kwargs
140
+ )["prompt"]
141
+ for prompt in prompts
142
+ ]
143
+ prompt_inputs = self.processing_class(images=images, text=prompts_text, padding=True, return_tensors="pt")
144
+ prompt_inputs = super()._prepare_inputs(prompt_inputs)
145
+ forward_kwargs = {k: v for k, v in prompt_inputs.items() if k not in ["input_ids", "attention_mask"]}
146
+ else:
147
+ forward_kwargs = {}
148
+
149
+ # If token_type_ids are used, extend them with zeros for the completion part
150
+ if "token_type_ids" in forward_kwargs:
151
+ token_type_ids = forward_kwargs["token_type_ids"]
152
+ forward_kwargs["token_type_ids"] = torch.cat(
153
+ [token_type_ids, token_type_ids.new_zeros(completion_ids.shape)], dim=1
154
+ )
155
+
156
+ # When gradient checkpointing is enabled with use_reentrant=True (non default), calling the model inside a
157
+ # torch.no_grad() block triggers a harmless PyTorch warning ("None of the inputs have requires_grad=True").
158
+ # Temporarily disable checkpointing to avoid this warning during inference.
159
+ with torch.no_grad(), disable_gradient_checkpointing(self.model, self.args.gradient_checkpointing_kwargs):
160
+ # If the generation and optimization steps are misaligned—i.e., if generation does not occur at the end of
161
+ # a full optimizer step (when gradient_accumulation_steps is not a multiple of generate_every)—then the
162
+ # samples may come from an earlier version of the model. In that case, we need to track old_per_token_logps
163
+ # for importance sampling. If the steps are aligned, importance sampling isn't necessary and we set
164
+ # old_per_token_logps to None.
165
+ # When using vLLM, we always compute old_per_token_logps for importance sampling, it was shown that the
166
+ # distribution mismatch between vLLM and the training model can be large and harm the training.
167
+ generate_every = self.args.steps_per_generation * self.num_iterations # generation frequency
168
+ if self.args.gradient_accumulation_steps % generate_every != 0 or (
169
+ self.use_vllm and self.vllm_importance_sampling_correction
170
+ ):
171
+ old_per_token_logps, _ = self._get_per_token_logps_and_entropies(
172
+ self.model,
173
+ prompt_completion_ids,
174
+ attention_mask,
175
+ logits_to_keep,
176
+ batch_size,
177
+ num_images=num_images,
178
+ **forward_kwargs, # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
179
+ )
180
+ else:
181
+ old_per_token_logps = None
182
+
183
+ # Compute the importance sampling ratio when using vLLM, to correct for potential distribution mismatch
184
+ if self.use_vllm and self.vllm_importance_sampling_correction:
185
+ importance_sampling_ratio = torch.exp(old_per_token_logps - sampling_per_token_logps)
186
+ importance_sampling_ratio = torch.clamp(
187
+ importance_sampling_ratio, max=self.vllm_importance_sampling_cap
188
+ )
189
+
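# Worked example (illustrative numbers): if the training model assigns a token
# log-prob of -1.0 while vLLM sampled it with log-prob -1.5, the ratio is
# exp(-1.0 - (-1.5)) = exp(0.5) ~= 1.65. Clamping at vllm_importance_sampling_cap
# bounds how strongly any single token can be upweighted by this correction.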
190
+ # Compute the per-token log probabilities for the reference model
191
+ if self.beta != 0.0:
192
+ if self.ref_model is not None:
193
+ ref_per_token_logps, _ = self._get_per_token_logps_and_entropies(
194
+ self.ref_model,
195
+ prompt_completion_ids,
196
+ attention_mask,
197
+ logits_to_keep,
198
+ batch_size=batch_size,
199
+ num_images=num_images,
200
+ **forward_kwargs, # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
201
+ )
202
+ else:
203
+ with self.accelerator.unwrap_model(self.model).disable_adapter():
204
+ ref_per_token_logps, _ = self._get_per_token_logps_and_entropies(
205
+ self.model,
206
+ prompt_completion_ids,
207
+ attention_mask,
208
+ logits_to_keep,
209
+ batch_size=batch_size,
210
+ num_images=num_images,
211
+ **forward_kwargs, # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
212
+ )
213
+ else:
214
+ ref_per_token_logps = None
215
+
216
+ # Decode
217
+ prompts_text = self.processing_class.batch_decode(prompt_ids, skip_special_tokens=True)
218
+ completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True)
219
+
220
+ # Merge extra_fields from rollout_func into inputs for reward functions
221
+ if extra_fields:
222
+ for i, inp in enumerate(inputs):
223
+ for key, values in extra_fields.items():
224
+ if isinstance(values, list) and i < len(values):
225
+ inp[key] = values[i]
226
+ elif not isinstance(values, list):
227
+ inp[key] = values
228
+
229
+ # Calculate rewards for each reward function. rewards_per_func aggregates rewards across all processes. This is
230
+ # important because rewards will be normalized per group, and completions are distributed. We will later slice
231
+ # rewards_per_func to extract each process's subset.
232
+ rewards_per_func = self._calculate_rewards(inputs, prompts, completions, completion_ids_list)
233
+
234
+ # Apply weights to each reward function's output and sum
235
+ rewards = (rewards_per_func * self.reward_weights.to(device).unsqueeze(0)).nansum(dim=1)
236
+
237
+ # Compute grouped-wise rewards
238
+ mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
239
+
240
+ # Normalize the rewards to compute the advantages
241
+ mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
242
+ advantages = rewards - mean_grouped_rewards
243
+
244
+ grouped_std_rewards = rewards.view(-1, self.num_generations).std(dim=1)
245
+ grouped_std_rewards = grouped_std_rewards.repeat_interleave(self.num_generations, dim=0)
246
+
247
+ if self.scale_rewards in ["group", "none"]:
248
+ # If self.scale_rewards = "none", we'll still log group level std
249
+ std_rewards = grouped_std_rewards.clone()
250
+ elif self.scale_rewards == "batch":
251
+ # Compute global std
252
+ std_rewards = rewards.std().expand_as(rewards)
253
+ else:
254
+ raise ValueError(
255
+ f"Invalid value for scale_rewards: {self.scale_rewards}. Must be one of 'batch', 'group', or 'none'."
256
+ )
257
+
258
+ is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards))
259
+ if self.scale_rewards != "none":
260
+ advantages = advantages / (std_rewards + 1e-4)
261
+
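# Worked example (illustrative): group rewards [1, 0, 1, 0] give mean 0.5 and
# (unbiased) std ~= 0.577, so raw advantages of +-0.5 are scaled to roughly +-0.87.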
262
+ # Slice to keep only the local part of the data
263
+ process_slice = slice(
264
+ self.accelerator.process_index * len(prompts),
265
+ (self.accelerator.process_index + 1) * len(prompts),
266
+ )
267
+ all_process_advantages = advantages.clone() # keep the aggregated advantages for logging
268
+ advantages = advantages[process_slice]
269
+ grouped_std_rewards = grouped_std_rewards[process_slice]
270
+
271
+ # Calculate mean reward per function, but only for samples where the function was applied (non-NaN values)
272
+ for i, reward_func_name in enumerate(self.reward_func_names):
273
+ mean_rewards = torch.nanmean(rewards_per_func[:, i]).item()
274
+ self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards)
275
+ std_func_rewards = nanstd(rewards_per_func[:, i]).item()
276
+ self._metrics[mode][f"rewards/{reward_func_name}/std"].append(std_func_rewards)
277
+ self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item())
278
+ self._metrics[mode]["reward_std"].append(std_rewards.mean().item())
279
+ self._metrics[mode]["frac_reward_zero_std"].append(is_std_zero.float().mean().item())
280
+
281
+ # Log prompt and completion texts
282
+ self._logs["prompt"].extend(gather_object(prompts_text))
283
+ self._logs["completion"].extend(gather_object(completions_text))
284
+ for i, name in enumerate(self.reward_func_names):
285
+ self._logs["rewards"][name].extend(rewards_per_func[:, i].tolist())
286
+ self._logs["advantages"].extend(all_process_advantages.tolist())
287
+
288
+ if images is not None:
289
+ self._logs["images"].extend(gather_object(images))
290
+
291
+ if self.use_vllm and self.vllm_importance_sampling_correction:
292
+ delta = torch.abs(old_per_token_logps - sampling_per_token_logps)
293
+ mask = completion_mask.bool() if not self.tools else (completion_mask * tool_mask).bool()
294
+ delta = delta[mask]
295
+ mean_delta = torch.mean(delta) if delta.numel() > 0 else torch.tensor(0.0, device=device)
296
+ max_delta = torch.max(delta) if delta.numel() > 0 else torch.tensor(0.0, device=device)
297
+ self._metrics[mode]["sampling/sampling_logp_difference/mean"].append(
298
+ self.accelerator.gather(mean_delta).mean().item()
299
+ )
300
+ self._metrics[mode]["sampling/sampling_logp_difference/max"].append(
301
+ self.accelerator.gather(max_delta).max().item()
302
+ )
303
+
304
+ flat_is_ratio = importance_sampling_ratio[mask]
305
+ min_importance_sampling_ratio = (
306
+ torch.min(flat_is_ratio) if flat_is_ratio.numel() > 0 else torch.tensor(0.0, device=device)
307
+ )
308
+ mean_importance_sampling_ratio = (
309
+ torch.mean(flat_is_ratio) if flat_is_ratio.numel() > 0 else torch.tensor(0.0, device=device)
310
+ )
311
+ max_importance_sampling_ratio = (
312
+ torch.max(flat_is_ratio) if flat_is_ratio.numel() > 0 else torch.tensor(0.0, device=device)
313
+ )
314
+ self._metrics[mode]["sampling/importance_sampling_ratio/min"].append(
315
+ nanmin(self.accelerator.gather(min_importance_sampling_ratio)).item()
316
+ )
317
+ self._metrics[mode]["sampling/importance_sampling_ratio/mean"].append(
318
+ self.accelerator.gather(mean_importance_sampling_ratio).nanmean().item()
319
+ )
320
+ self._metrics[mode]["sampling/importance_sampling_ratio/max"].append(
321
+ nanmax(self.accelerator.gather(max_importance_sampling_ratio)).item()
322
+ )
323
+ outputs_after_sampling_buffer = self.update_with_replay_buffer(
324
+ advantages,
325
+ grouped_std_rewards,
326
+ prompt_ids,
327
+ prompt_mask,
328
+ completion_ids,
329
+ completion_mask,
330
+ forward_kwargs,
331
+ num_items_in_batch,
332
+ old_per_token_logps,
333
+ ref_per_token_logps,
334
+ importance_sampling_ratio if self.use_vllm and self.vllm_importance_sampling_correction else None,
335
+ )
336
+ if outputs_after_sampling_buffer is not None:
337
+ return outputs_after_sampling_buffer
338
+ else:
339
+ output = {
340
+ "prompt_ids": prompt_ids,
341
+ "prompt_mask": prompt_mask,
342
+ "completion_ids": completion_ids,
343
+ "completion_mask": completion_mask,
344
+ "advantages": advantages,
345
+ "num_items_in_batch": num_items_in_batch,
346
+ }
347
+ if old_per_token_logps is not None:
348
+ output["old_per_token_logps"] = old_per_token_logps
349
+ if self.use_vllm and self.vllm_importance_sampling_correction:
350
+ output["importance_sampling_ratio"] = importance_sampling_ratio
351
+ if ref_per_token_logps is not None:
352
+ output["ref_per_token_logps"] = ref_per_token_logps
353
+ if "pixel_values" in forward_kwargs:
354
+ output["pixel_values"] = forward_kwargs["pixel_values"]
355
+ if "image_grid_thw" in forward_kwargs:
356
+ output["image_grid_thw"] = forward_kwargs["image_grid_thw"]
357
+ if "pixel_attention_mask" in forward_kwargs:
358
+ output["pixel_attention_mask"] = forward_kwargs["pixel_attention_mask"]
359
+ if "image_sizes" in forward_kwargs:
360
+ output["image_sizes"] = forward_kwargs["image_sizes"]
361
+ if "token_type_ids" in forward_kwargs:
362
+ output["token_type_ids"] = forward_kwargs["token_type_ids"]
363
+ if images is not None:
364
+ output["num_images"] = num_images
365
+ if self.tools:
366
+ output["tool_mask"] = tool_mask
367
+ return output
368
+
369
+ def slice_group_data(
370
+ self, data: torch.Tensor, mask: torch.Tensor, group_idx: int
371
+ ) -> tuple[torch.Tensor, torch.Tensor]:
372
+ """
373
+ Slices the input data and mask tensors for a specific group index. Also trims the sequence length to the
374
+ maximum length in the group based on the mask.
375
+
376
+ Args:
377
+ data: Tensor of shape (num_groups * num_generations, seq_length)
378
+ mask: Tensor of shape (num_groups * num_generations, seq_length)
379
+ group_idx: Index of the group to slice
380
+ Returns:
381
+ Tuple of (sliced_data, sliced_mask) for the specified group, with sequence length trimmed to the maximum
382
+ length in the group.
383
+ """
384
+ start_idx = group_idx * self.num_generations
385
+ end_idx = (group_idx + 1) * self.num_generations
386
+ group_data = data[start_idx:end_idx]
387
+ group_mask = mask[start_idx:end_idx]
388
+ group_max_len = group_mask.sum(dim=1).max().item()
389
+ return group_data[:, :group_max_len], group_mask[:, :group_max_len]
390
+
391
+ def update_replay_buffer(
392
+ self,
393
+ groups_with_variance: torch.Tensor,
394
+ group_advantages: torch.Tensor,
395
+ group_std_rewards: torch.Tensor,
396
+ prompt_ids: torch.Tensor,
397
+ prompt_mask: torch.Tensor,
398
+ completion_ids: torch.Tensor,
399
+ completion_mask: torch.Tensor,
400
+ forward_kwargs: dict,
401
+ optional_vision_fields: list[str] = None,
402
+ old_per_token_logps: torch.Tensor | None = None,
403
+ ref_per_token_logps: torch.Tensor | None = None,
404
+ importance_sampling_ratio: float | None = None,
405
+ ) -> None:
406
+ """
407
+ Update the replay buffer with groups that have reward variance (std > 0).
408
+
409
+ Args:
410
+ groups_with_variance: Boolean tensor indicating which groups have reward variance
411
+ group_advantages: Tensor of shape (num_groups, num_generations) containing advantage values
412
+ group_std_rewards: Tensor of shape (num_groups, num_generations) containing std of rewards per group
413
+ prompt_ids: Tensor containing prompt token IDs
414
+ prompt_mask: Tensor containing prompt attention masks
415
+ completion_ids: Tensor containing completion token IDs
416
+ completion_mask: Tensor containing completion attention masks
417
+ forward_kwargs: Dictionary containing additional prompt inputs (vision data, etc.)
418
+ optional_vision_fields: List of optional vision-related fields to include if present in forward_kwargs
419
+ old_per_token_logps: Optional tensor of old per-token log probabilities
420
+ ref_per_token_logps: Optional tensor of reference per-token log probabilities
421
+ importance_sampling_ratio: Optional importance sampling correction ratio
422
+ """
423
+ # Prepare buffered outputs for groups with variance
424
+ buffered_outputs = []
425
+ for group_idx in groups_with_variance.nonzero(as_tuple=True)[0].unique().tolist():
426
+ group_prompt_ids, group_prompt_mask = self.slice_group_data(prompt_ids, prompt_mask, group_idx)
427
+ group_completion_ids, group_completion_mask = self.slice_group_data(
428
+ completion_ids, completion_mask, group_idx
429
+ )
430
+
431
+ # Store unpadded data in the buffer
432
+ buffered_output = {
433
+ "prompt_ids": group_prompt_ids,
434
+ "completion_ids": group_completion_ids,
435
+ "advantages": group_advantages[group_idx].tolist(),
436
+ "prompt_mask": group_prompt_mask,
437
+ "completion_mask": group_completion_mask,
438
+ }
439
+
440
+ # Add optional fields if they exist
441
+ optional_fields = {
442
+ "old_per_token_logps": old_per_token_logps if old_per_token_logps is not None else None,
443
+ "ref_per_token_logps": ref_per_token_logps if ref_per_token_logps is not None else None,
444
+ }
445
+
446
+ for field_name, field_data in optional_fields.items():
447
+ if field_data is not None:
448
+ buffered_output[field_name] = self.slice_group_data(field_data, completion_mask, group_idx)[0]
449
+
450
+ # Add importance sampling if needed
451
+ if self.use_vllm and self.vllm_importance_sampling_correction:
452
+ buffered_output["importance_sampling_ratio"] = importance_sampling_ratio
453
+
454
+ if optional_vision_fields:
455
+ # Add vision-related fields if they exist
456
+ for field_name in optional_vision_fields:
457
+ if field_name in forward_kwargs:
458
+ buffered_output[field_name] = self.slice_group_data(
459
+ forward_kwargs[field_name], prompt_mask, group_idx
460
+ )[0]
461
+
462
+ buffered_outputs.append(buffered_output)
463
+
464
+ if groups_with_variance.any():
465
+ # Calculate replay buffer scores for groups with variance
466
+ replay_buffer_scores = (group_advantages.abs() * group_std_rewards).sum(dim=-1)[groups_with_variance]
467
+ # Add all groups to replay buffer at once (batch operation)
468
+ self.replay_buffer.add(replay_buffer_scores.tolist(), buffered_outputs)
469
+
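The buffer priority used by `update_replay_buffer` is worth seeing in isolation: each group scores the sum over its generations of `|advantage| * reward std`, so groups whose rewards all tie score zero. A minimal sketch with made-up numbers:

```python
import torch

# Priority used when adding groups to the buffer: groups whose rewards all tie
# (std == 0) carry no learning signal and score zero.
group_advantages = torch.tensor([[1.2, -1.2], [0.0, 0.0]])   # (num_groups, num_generations)
group_std_rewards = torch.tensor([[0.5, 0.5], [0.0, 0.0]])

scores = (group_advantages.abs() * group_std_rewards).sum(dim=-1)
print(scores.tolist())  # [1.2, 0.0]
```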
470
+ def sample_from_replay_buffer(
471
+ self, num_samples: int, optional_vision_fields: list[str] | None = None, optional_tensor_fields: list[str] | None = None
472
+ ) -> dict[str, list]:
473
+ """
474
+ Sample groups from the replay buffer.
475
+
476
+ Args:
477
+ num_samples: Number of samples to draw from the replay buffer
478
+ optional_vision_fields: List of optional vision-related fields to include if present in sampled data
479
+ optional_tensor_fields: List of optional tensor fields to include if present in sampled data
480
+ Returns:
481
+ Dictionary mapping each field name to a list of per-group data sampled from the replay buffer
482
+ """
483
+ sampled = self.replay_buffer.sample(num_samples=num_samples)
484
+
485
+ # Extract and concatenate sampled data
486
+ sampled_data = {
487
+ "prompt_ids": [],
488
+ "prompt_mask": [],
489
+ "completion_ids": [],
490
+ "completion_mask": [],
491
+ "advantages": [],
492
+ }
493
+
494
+ all_optional_fields = (optional_tensor_fields or []) + (optional_vision_fields or [])
495
+ # Initialize containers for optional fields if they exist in sampled data
496
+ for field in all_optional_fields:
497
+ if sampled and field in sampled[0]:
498
+ sampled_data[field] = []
499
+
500
+ # Extract data from each sampled item
501
+ for item in sampled:
502
+ # Handle core fields
503
+ for key in ["prompt_ids", "prompt_mask", "completion_ids", "completion_mask"]:
504
+ sampled_data[key].append(item[key])
505
+
506
+ # Handle advantages (list, not tensor)
507
+ sampled_data["advantages"].append(item["advantages"])
508
+
509
+ # Handle optional fields
510
+ for field in all_optional_fields:
511
+ if field in item:
512
+ sampled_data[field].append(item[field])
513
+
514
+ return sampled_data
515
+
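Schematically, `sample_from_replay_buffer` converts the buffer's list of per-group dicts into one dict of per-field lists. A sketch with string stand-ins for the tensors (the shapes shown are illustrative):

```python
# Each buffer entry is one group; the output groups values by field name.
sampled = [
    {"prompt_ids": "<tensor (2, 7)>", "completion_ids": "<tensor (2, 5)>", "advantages": [0.8, -0.8]},
    {"prompt_ids": "<tensor (2, 9)>", "completion_ids": "<tensor (2, 3)>", "advantages": [1.1, -1.1]},
]
sampled_data = {key: [item[key] for item in sampled] for key in sampled[0]}
print(sampled_data["advantages"])  # [[0.8, -0.8], [1.1, -1.1]]
```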
516
+ def update_with_replay_buffer(
517
+ self,
518
+ group_advantages: torch.Tensor,
519
+ group_std_rewards: torch.Tensor,
520
+ prompt_ids: torch.Tensor,
521
+ prompt_mask: torch.Tensor,
522
+ completion_ids: torch.Tensor,
523
+ completion_mask: torch.Tensor,
524
+ forward_kwargs: dict,
525
+ num_items_in_batch: int,
526
+ old_per_token_logps: torch.Tensor | None = None,
527
+ ref_per_token_logps: torch.Tensor | None = None,
528
+ importance_sampling_ratio: float | None = None,
529
+ ) -> dict | None:
530
+ """
531
+ Update current batch data with samples from replay buffer.
532
+
533
+ Groups with reward variance (std > 0) are added to the replay buffer, while groups whose rewards all tie
534
+ (std == 0) are replaced with samples drawn from the buffer to improve training stability.
535
+
536
+ Args:
537
+ group_advantages: Tensor of shape (num_groups, num_generations) containing advantage values
538
+ group_std_rewards: Tensor of shape (num_groups, num_generations) containing std of rewards per group
539
+ prompt_ids: Tensor containing prompt token IDs
540
+ prompt_mask: Tensor containing prompt attention masks
541
+ completion_ids: Tensor containing completion token IDs
542
+ completion_mask: Tensor containing completion attention masks
543
+ forward_kwargs: Dictionary containing additional prompt inputs (vision data, etc.)
544
+ num_items_in_batch: Number of items in the current batch
545
+ old_per_token_logps: Optional tensor of old per-token log probabilities
546
+ ref_per_token_logps: Optional tensor of reference per-token log probabilities
547
+ importance_sampling_ratio: Optional importance sampling correction ratio
548
+
+ Returns:
+ Updated batch dictionary after buffer mixing, or `None` if the replay buffer is disabled or no group
+ qualifies for replacement.
+ """
549
+ if self.replay_buffer.max_size <= 0:
550
+ return
551
+
552
+ # Groups to consider for adding to the replay buffer
553
+ groups_with_variance = group_std_rewards.max(dim=0).values > 0
554
+ # Groups to replace from the replay buffer
555
+ groups_without_variance = ~groups_with_variance
556
+
557
+ # Track which optional fields are present in sampled data
558
+ optional_tensor_fields = ["old_per_token_logps", "ref_per_token_logps"]
559
+ vision_fields = ["pixel_values", "image_grid_thw", "pixel_attention_mask", "image_sizes"]
560
+
561
+ self.update_replay_buffer(
562
+ groups_with_variance,
563
+ group_advantages,
564
+ group_std_rewards,
565
+ prompt_ids,
566
+ prompt_mask,
567
+ completion_ids,
568
+ completion_mask,
569
+ forward_kwargs,
570
+ vision_fields,
571
+ old_per_token_logps,
572
+ ref_per_token_logps,
573
+ importance_sampling_ratio,
574
+ )
575
+
576
+ # Sample from replay buffer to replace groups with variance
577
+ num_groups_to_replace = groups_without_variance.sum().item()
578
+ if not num_groups_to_replace:
579
+ return
580
+
581
+ sampled_data = self.sample_from_replay_buffer(
582
+ num_samples=num_groups_to_replace,
583
+ optional_vision_fields=vision_fields,
584
+ optional_tensor_fields=optional_tensor_fields,
585
+ )
586
+
587
+ # Pad sampled data if they are shorter than the current batch sequences
588
+ # Or pad the current batch if sampled are longer
589
+ current_batch_prompt_seq_len = prompt_ids.size(1)
590
+ current_batch_completion_seq_len = completion_ids.size(1)
591
+
592
+ groups_to_replace_idxs = groups_without_variance.nonzero(as_tuple=True)[0].unique().tolist()
593
+
594
+ # Determine target (max) sequence lengths once
595
+ sampled_prompt_lengths = [t.size(1) for t in sampled_data["prompt_ids"]]
596
+ sampled_completion_lengths = [t.size(1) for t in sampled_data["completion_ids"]]
597
+ target_prompt_len = max([current_batch_prompt_seq_len] + sampled_prompt_lengths)
598
+ target_completion_len = max([current_batch_completion_seq_len] + sampled_completion_lengths)
599
+
600
+ # If any sampled prompt is longer, pad the whole batch prompt tensors once (left padding)
601
+ if target_prompt_len > current_batch_prompt_seq_len:
602
+ prompt_ids = pad(
603
+ list(prompt_ids.unbind(0)),
604
+ padding_value=self.pad_token_id,
605
+ pad_to_multiple_of=target_prompt_len,
606
+ padding_side="left",
607
+ )
608
+ prompt_mask = pad(
609
+ list(prompt_mask.unbind(0)), padding_value=0, pad_to_multiple_of=target_prompt_len, padding_side="left"
610
+ )
611
+ # If any sampled completion is longer, pad the whole batch completion tensors once (right padding)
612
+ if target_completion_len > current_batch_completion_seq_len:
613
+ completion_ids = pad(
614
+ list(completion_ids.unbind(0)),
615
+ padding_value=self.pad_token_id,
616
+ pad_to_multiple_of=target_completion_len,
617
+ padding_side="right",
618
+ )
619
+ completion_mask = pad(
620
+ list(completion_mask.unbind(0)),
621
+ padding_value=0,
622
+ pad_to_multiple_of=target_completion_len,
623
+ padding_side="right",
624
+ )
625
+ if old_per_token_logps is not None:
626
+ old_per_token_logps = pad(
627
+ list(old_per_token_logps.unbind(0)),
628
+ padding_value=0.0,
629
+ pad_to_multiple_of=target_completion_len,
630
+ padding_side="right",
631
+ )
632
+ if ref_per_token_logps is not None:
633
+ ref_per_token_logps = pad(
634
+ list(ref_per_token_logps.unbind(0)),
635
+ padding_value=0.0,
636
+ pad_to_multiple_of=target_completion_len,
637
+ padding_side="right",
638
+ )
639
+
640
+ # Replace per-group data, padding only sampled groups that are shorter than the target
641
+ for i, group_idx in enumerate(groups_to_replace_idxs):
642
+ start_idx = group_idx * self.num_generations
643
+ end_idx = (group_idx + 1) * self.num_generations
644
+ idx_range = slice(start_idx, end_idx)
645
+
646
+ # Pad sampled prompt to target length if needed
647
+ if sampled_data["prompt_ids"][i].size(1) < target_prompt_len:
648
+ sampled_data["prompt_ids"][i] = pad(
649
+ sampled_data["prompt_ids"][i],
650
+ padding_value=self.pad_token_id,
651
+ pad_to_multiple_of=target_prompt_len,
652
+ padding_side="left",
653
+ )
654
+ sampled_data["prompt_mask"][i] = pad(
655
+ sampled_data["prompt_mask"][i],
656
+ padding_value=0,
657
+ pad_to_multiple_of=target_prompt_len,
658
+ padding_side="left",
659
+ )
660
+
661
+ # Pad sampled completion to target length if needed
662
+ if sampled_data["completion_ids"][i].size(1) < target_completion_len:
663
+ sampled_data["completion_ids"][i] = pad(
664
+ sampled_data["completion_ids"][i],
665
+ padding_value=self.pad_token_id,
666
+ pad_to_multiple_of=target_completion_len,
667
+ padding_side="right",
668
+ )
669
+ sampled_data["completion_mask"][i] = pad(
670
+ sampled_data["completion_mask"][i],
671
+ padding_value=0,
672
+ pad_to_multiple_of=target_completion_len,
673
+ padding_side="right",
674
+ )
675
+ if "old_per_token_logps" in sampled_data:
676
+ sampled_data["old_per_token_logps"][i] = pad(
677
+ sampled_data["old_per_token_logps"][i],
678
+ padding_value=0.0,
679
+ pad_to_multiple_of=target_completion_len,
680
+ padding_side="right",
681
+ )
682
+ if "ref_per_token_logps" in sampled_data:
683
+ sampled_data["ref_per_token_logps"][i] = pad(
684
+ sampled_data["ref_per_token_logps"][i],
685
+ padding_value=0.0,
686
+ pad_to_multiple_of=target_completion_len,
687
+ padding_side="right",
688
+ )
689
+
690
+ # Assign (replace) group slice
691
+ prompt_ids[idx_range] = sampled_data["prompt_ids"][i]
692
+ prompt_mask[idx_range] = sampled_data["prompt_mask"][i]
693
+ completion_ids[idx_range] = sampled_data["completion_ids"][i]
694
+ completion_mask[idx_range] = sampled_data["completion_mask"][i]
695
+ group_advantages[group_idx] = sampled_data["advantages"][i]
696
+
697
+ if "old_per_token_logps" in sampled_data:
698
+ old_per_token_logps[idx_range] = sampled_data["old_per_token_logps"][i]
699
+ if "ref_per_token_logps" in sampled_data:
700
+ ref_per_token_logps[idx_range] = sampled_data["ref_per_token_logps"][i]
701
+
702
+ for field in vision_fields:
703
+ if field in sampled_data and field in forward_kwargs:
704
+ forward_kwargs[field][idx_range] = sampled_data[field][i]
705
+
706
+ # Prepare final outputs after sampling and replacement
707
+ outputs_after_sampling_buffer = {
708
+ "prompt_ids": prompt_ids,
709
+ "prompt_mask": prompt_mask,
710
+ "completion_ids": completion_ids,
711
+ "completion_mask": completion_mask,
712
+ "advantages": group_advantages,
713
+ }
714
+
715
+ # Replace optional tensor fields if they exist
716
+ for field in optional_tensor_fields:
717
+ if field in sampled_data:
718
+ outputs_after_sampling_buffer[field] = (
719
+ old_per_token_logps if field == "old_per_token_logps" else ref_per_token_logps
720
+ )
721
+
722
+ # Replace vision fields if they exist
723
+ for field in vision_fields:
724
+ if field in sampled_data and field in forward_kwargs:
725
+ outputs_after_sampling_buffer[field] = forward_kwargs[field]
726
+
727
+ outputs_after_sampling_buffer["num_items_in_batch"] = num_items_in_batch
728
+ if self.use_vllm and self.vllm_importance_sampling_correction:
729
+ outputs_after_sampling_buffer["importance_sampling_ratio"] = importance_sampling_ratio
730
+
731
+ return outputs_after_sampling_buffer
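The padding sides chosen above matter: prompts are left-padded and completions right-padded so that, after concatenation, the prompt tail stays adjacent to the first completion token. A minimal sketch using plain `torch.nn.functional.pad` (the trainer uses its own `pad` helper; values here are illustrative):

```python
import torch
import torch.nn.functional as F

# Left-pad the prompt, right-pad the completion: after concatenation the
# position of the first generated token is preserved.
prompt = torch.tensor([[5, 6, 7]])
completion = torch.tensor([[8, 9]])
pad_id = 0

prompt_padded = F.pad(prompt, (2, 0), value=pad_id)          # left-pad to length 5
completion_padded = F.pad(completion, (0, 3), value=pad_id)  # right-pad to length 5
print(torch.cat([prompt_padded, completion_padded], dim=1).tolist())
# [[0, 0, 5, 6, 7, 8, 9, 0, 0, 0]]
```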
ICL/RL/trl_source/trl/experimental/gspo_token/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .grpo_trainer import GRPOTrainer
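Downstream, this is intended as a drop-in swap for the standard trainer import. A usage sketch assuming an installed `trl` with this experimental module; the config value shown is the extra level this subclass handles:

```python
from trl import GRPOConfig
from trl.experimental.gspo_token import GRPOTrainer  # overrides only the loss

config = GRPOConfig(output_dir="out", importance_sampling_level="sequence_token")
```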
ICL/RL/trl_source/trl/experimental/gspo_token/grpo_trainer.py ADDED
@@ -0,0 +1,157 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+
17
+ from ...trainer.grpo_trainer import GRPOTrainer as _GRPOTrainer
18
+ from ...trainer.utils import nanmax, nanmin
19
+
20
+
21
+ class GRPOTrainer(_GRPOTrainer):
22
+ def _compute_loss(self, model, inputs):
23
+ # Compute the per-token log probabilities for the model
24
+ prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
25
+ completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"]
26
+ input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
27
+ attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
28
+ logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens
29
+
30
+ # Compute the per_token_logps and the entropy at each position in the completion
31
+ per_token_logps, entropies = self._get_per_token_logps_and_entropies(
32
+ model,
33
+ input_ids,
34
+ attention_mask,
35
+ logits_to_keep,
36
+ compute_entropy=True,
37
+ pixel_values=inputs.get("pixel_values"),
38
+ image_grid_thw=inputs.get("image_grid_thw"),
39
+ num_images=inputs.get("num_images"),
40
+ pixel_attention_mask=inputs.get("pixel_attention_mask"),
41
+ image_sizes=inputs.get("image_sizes"),
42
+ token_type_ids=inputs.get("token_type_ids"),
43
+ )
44
+
45
+ if self.top_entropy_quantile < 1.0:
46
+ entropy_mask = self.get_high_entropy_mask(entropies, completion_mask, 1 - self.top_entropy_quantile)
47
+ else:
48
+ entropy_mask = None
49
+
50
+ # Compute the KL divergence between the model and the reference model
51
+ if self.beta != 0.0:
52
+ ref_per_token_logps = inputs["ref_per_token_logps"]
53
+ per_token_kl = (
54
+ torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1
55
+ )
56
+
57
+ # Compute the loss
58
+ advantages = inputs["advantages"]
59
+ # When num_iterations == 1 and steps_per_generation <= gradient_accumulation_steps,
60
+ # old_per_token_logps == per_token_logps. In this case we can skip its computation
61
+ # (see _generate_and_score_completions) and instead use per_token_logps.detach().
62
+ # The exception is when using vLLM, where we always compute old_per_token_logps
63
+ # for importance sampling
64
+ old_per_token_logps = inputs.get("old_per_token_logps")
65
+ old_per_token_logps = per_token_logps.detach() if old_per_token_logps is None else old_per_token_logps
66
+
67
+ log_ratio = per_token_logps - old_per_token_logps
68
+ if self.importance_sampling_level == "token":
69
+ log_importance_weights = log_ratio
70
+ elif self.importance_sampling_level == "sequence":
71
+ log_importance_weights = (log_ratio * completion_mask).sum(-1) / completion_mask.sum(-1).clamp(min=1.0)
72
+ log_importance_weights = log_importance_weights.unsqueeze(-1)
73
+ elif self.importance_sampling_level == "sequence_token":
74
+ # GSPO-token: sg[si(θ)] * πθ(yi,t)/sg[πθ(yi,t)]
75
+ seq_level_log_weight = (log_ratio * completion_mask).sum(-1) / completion_mask.sum(-1).clamp(min=1.0)
76
+ seq_level_log_weight = seq_level_log_weight.detach().unsqueeze(-1) # Stop gradient
77
+ log_importance_weights = per_token_logps - per_token_logps.detach() + seq_level_log_weight
78
+ else:
79
+ raise ValueError(
80
+ f"Unknown importance sampling level: {self.importance_sampling_level}. Possible values are 'token' "
81
+ "and 'sequence'."
82
+ )
83
+ # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
84
+ # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
85
+
86
+ coef_1 = torch.exp(log_importance_weights)
87
+ coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
88
+
89
+ # Two-sided clipping
90
+ if self.args.delta is not None:
91
+ coef_1 = torch.clamp(coef_1, max=self.args.delta)
92
+
93
+ per_token_loss1 = coef_1 * advantages.unsqueeze(1)
94
+ per_token_loss2 = coef_2 * advantages.unsqueeze(1)
95
+ per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
96
+ if entropy_mask is not None:
97
+ per_token_loss = per_token_loss * entropy_mask
98
+
99
+ if self.use_vllm and self.vllm_importance_sampling_correction:
100
+ per_token_loss = per_token_loss * inputs["importance_sampling_ratio"]
101
+
102
+ if self.beta != 0.0:
103
+ per_token_loss = per_token_loss + self.beta * per_token_kl
104
+
105
+ mode = "train" if self.model.training else "eval"
106
+ if self.loss_type == "grpo":
107
+ loss = ((per_token_loss * completion_mask).sum(-1) / completion_mask.sum(-1).clamp(min=1.0)).mean()
108
+ normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0 # no accum in eval
109
+ loss = loss / normalizer
110
+ elif self.loss_type == "bnpo":
111
+ loss = (per_token_loss * completion_mask).sum() / completion_mask.sum().clamp(min=1.0)
112
+ normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0 # no accum in eval
113
+ loss = loss / normalizer
114
+ elif self.loss_type == "dr_grpo":
115
+ loss = (per_token_loss * completion_mask).sum() / (per_token_loss.size(0) * self.max_completion_length)
116
+ normalizer = self.current_gradient_accumulation_steps if mode == "train" else 1.0 # no accum in eval
117
+ loss = loss / normalizer
118
+ elif self.loss_type == "dapo":
119
+ normalizer = inputs["num_items_in_batch"] / self.accelerator.num_processes
120
+ loss = (per_token_loss * completion_mask).sum() / normalizer
121
+ else:
122
+ raise ValueError(f"Unknown loss type: {self.loss_type}")
123
+
124
+ # Log the metrics
125
+ completion_token_count = completion_mask.sum().clamp(min=1.0)
126
+
127
+ def masked_batch_mean(x):
128
+ if x.shape[1] == 1: # when importance_sampling_level == "sequence"
129
+ return x.mean()
130
+ else:
131
+ return (x * completion_mask).sum() / completion_token_count
132
+
133
+ if self.beta != 0.0:
134
+ mean_kl = masked_batch_mean(per_token_kl)
135
+ self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).nanmean().item())
136
+
137
+ mean_entropy = masked_batch_mean(entropies)
138
+ self._metrics[mode]["entropy"].append(self.accelerator.gather(mean_entropy).nanmean().item())
139
+
140
+ # Compute the clipped probability ratios
141
+ is_low_clipped = (coef_1 < 1 - self.epsilon_low) & (advantages.unsqueeze(1) < 0)
142
+ is_high_clipped = (coef_1 > 1 + self.epsilon_high) & (advantages.unsqueeze(1) > 0)
143
+ is_region_clipped = is_low_clipped | is_high_clipped
144
+
145
+ low_clip = masked_batch_mean(is_low_clipped.float())
146
+ high_clip = masked_batch_mean(is_high_clipped.float())
147
+ clip_ratio = masked_batch_mean(is_region_clipped.float())
148
+
149
+ gathered_low_clip = self.accelerator.gather(low_clip)
150
+ self._metrics[mode]["clip_ratio/low_mean"].append(gathered_low_clip.nanmean().item())
151
+ self._metrics[mode]["clip_ratio/low_min"].append(nanmin(gathered_low_clip).item())
152
+ gathered_high_clip = self.accelerator.gather(high_clip)
153
+ self._metrics[mode]["clip_ratio/high_mean"].append(gathered_high_clip.nanmean().item())
154
+ self._metrics[mode]["clip_ratio/high_max"].append(nanmax(gathered_high_clip).item())
155
+ gathered_clip_ratio = self.accelerator.gather(clip_ratio)
156
+ self._metrics[mode]["clip_ratio/region_mean"].append(gathered_clip_ratio.nanmean().item())
157
+ return loss
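The `sequence_token` branch is the core of this file; a small sketch of the stop-gradient identity it relies on (random toy tensors, not trainer state): the *value* of the weight equals the sequence-level ratio, while the *gradient* flows per token.

```python
import torch

# log_w = (log pi - sg[log pi]) + sg[seq-level log ratio]: zero in value for the
# first term, so exp(log_w) numerically equals the sequence-level ratio.
per_token_logps = torch.randn(2, 4, requires_grad=True)
old_per_token_logps = per_token_logps.detach() + 0.1 * torch.randn(2, 4)
mask = torch.ones(2, 4)

log_ratio = per_token_logps - old_per_token_logps
seq_log_w = (log_ratio * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)
log_w = per_token_logps - per_token_logps.detach() + seq_log_w.detach().unsqueeze(-1)
coef = torch.exp(log_w)
print(torch.allclose(coef, torch.exp(seq_log_w).unsqueeze(-1).expand_as(coef)))  # True
```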
ICL/RL/trl_source/trl/experimental/judges/__init__.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .judges import (
16
+ AllTrueJudge,
17
+ BaseBinaryJudge,
18
+ BaseJudge,
19
+ BasePairwiseJudge,
20
+ BaseRankJudge,
21
+ HfPairwiseJudge,
22
+ OpenAIPairwiseJudge,
23
+ PairRMJudge,
24
+ )
25
+
26
+
27
+ __all__ = [
28
+ "AllTrueJudge",
29
+ "BaseBinaryJudge",
30
+ "BaseJudge",
31
+ "BasePairwiseJudge",
32
+ "BaseRankJudge",
33
+ "HfPairwiseJudge",
34
+ "OpenAIPairwiseJudge",
35
+ "PairRMJudge",
36
+ ]
ICL/RL/trl_source/trl/experimental/judges/judges.py ADDED
@@ -0,0 +1,482 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import concurrent.futures
16
+ import logging
17
+ from abc import ABC, abstractmethod
18
+
19
+ import numpy as np
20
+ from accelerate import Accelerator
21
+ from huggingface_hub import InferenceClient
22
+ from packaging.version import Version
23
+ from transformers.utils import is_openai_available
24
+
25
+ from ...import_utils import is_llm_blender_available
26
+
27
+
28
+ DEFAULT_PAIRWISE_SYSTEM_PROMPT = '''I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
29
+
30
+ ## Instruction
31
+
32
+ {{
33
+ "instruction": """{prompt}""",
34
+ }}
35
+
36
+ ## Model Outputs
37
+
38
+ Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
39
+
40
+ {{
41
+ {{
42
+ "model_identifier": "0",
43
+ "output": """{response0}"""
44
+ }},
45
+ {{
46
+ "model_identifier": "1",
47
+ "output": """{response1}"""
48
+ }}
49
+ }}
50
+
51
+ ## Task
52
+
53
+ Evaluate the models on the basis of the quality and relevance of their results, and select the model that generated the best result. Reply with the identifier of the best model. Our evaluation will only take into account the first character of your answer, so make sure it contains only one of the identifiers and nothing else (no quotation marks, no spaces, no new lines, ...).
54
+ '''
55
+
56
+
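Note the doubled braces in the template above: they escape `str.format`, so the JSON-style braces survive while only `{prompt}`, `{response0}`, and `{response1}` are substituted. A minimal check:

```python
# Doubled braces render as literal braces; single-brace fields are filled in.
template = '{{"instruction": """{prompt}"""}}'
print(template.format(prompt="The capital of France is"))
# {"instruction": """The capital of France is"""}
```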
57
+ def _ensure_llm_blender_importable() -> None:
58
+ """
59
+ Pre-import shim to work around a known `llm-blender` issue.
60
+
61
+ As of `llm-blender` v0.0.2 (see upstream issue: https://github.com/yuchenlin/LLM-Blender/issues/33), importing
62
+ `llm_blender` may fail on `transformers` >= 5.0.0 because it unconditionally accesses
63
+ `transformers.utils.hub.TRANSFORMERS_CACHE`.
64
+
65
+ We set this attribute to a dummy value before importing `llm_blender` so that the import succeeds. This helper is
66
+ intentionally a no-op on older `transformers` versions.
67
+
68
+ This shim can be removed once the upstream issue is fixed and the minimum required `llm-blender` version includes
69
+ that fix.
70
+ """
71
+ import transformers.utils.hub
72
+
73
+ if Version(transformers.__version__) >= Version("5.0.0"):
74
+ transformers.utils.hub.TRANSFORMERS_CACHE = None # unused; just needs to exist
75
+
76
+
77
+ class BaseJudge(ABC):
78
+ """
79
+ Base class for judges. The subclasses of this class should implement the `judge` method.
80
+ """
81
+
82
+ @abstractmethod
83
+ def judge(self, prompts: list[str], completions: list[str], shuffle_order: bool = True) -> list:
84
+ raise NotImplementedError("Judge subclasses must implement the `judge` method.")
85
+
86
+
87
+ class BaseRankJudge(ABC):
88
+ """
89
+ Base class for LLM ranking judges.
90
+
91
+ **Example**:
92
+ ```python
93
+ class MyRankJudge(BaseRankJudge):
94
+ def judge(self, prompts, completions, shuffle_order=True):
95
+ return ... # Your ranking logic here
96
+
97
+
98
+ judge = MyRankJudge()
99
+ judge.judge(
100
+ prompts=["The capital of France is", "The capital of Germany is"],
101
+ completions=[[" Paris", " Marseille", "Lyon"], [" Munich", " Berlin"]],
102
+ ) # [[0, 1, 2], [1, 0]]
103
+ ```
104
+ """
105
+
106
+ @abstractmethod
107
+ def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[list[int]]:
108
+ """
109
+ Judge the completion for the given prompts and return the ranks of each completion.
110
+
111
+ Args:
112
+ prompts (`list[str]`):
113
+ List of prompts.
114
+ completions (`list[list[str]]`):
115
+ List of completions list, where each element is a list of completions for the corresponding prompt.
116
+ shuffle_order (`bool`, *optional*, defaults to `True`):
117
+ Whether to shuffle the order of the completions to avoid positional bias.
118
+
119
+ Returns:
120
+ `list[list[int]]`:
121
+ List of lists of idxs, where each list contains the ranks of the completions for the corresponding
122
+ prompt. E.g., `[1, 2, 0]` means that the second completion (`idx=1`) is the best, followed by the
123
+ third, and then the first.
124
+ """
125
+ raise NotImplementedError("Judge subclasses must implement the `judge` method.")
126
+
127
+
128
+ class BasePairwiseJudge(BaseJudge):
129
+ """
130
+ Base class for pairwise judges.
131
+ """
132
+
133
+ @abstractmethod
134
+ def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[int]:
135
+ """
136
+ Judge the completion pairs for the given prompts.
137
+
138
+ Args:
139
+ prompts (`list[str]`):
140
+ List of prompts.
141
+ completions (`list[list[str]]`):
142
+ List of completions pairs, where each element is a pair of completions for the corresponding prompt.
143
+ shuffle_order (`bool`, *optional*, defaults to `True`):
144
+ Whether to shuffle the order of the completions to avoid positional bias.
145
+
146
+ Returns:
147
+ `list[int]`:
148
+ List of idxs, where each idx is the rank of the best completion for the corresponding prompt. E.g., `1`
149
+ means that the second completion (`idx=1`) is the best.
150
+
151
+ Note:
152
+ If the judge returns `-1` for any prompt, it indicates that the inner process used to compute the
153
+ preference has failed. For instance, this could occur if the underlying language model returned an invalid
154
+ answer. In such cases, the caller should handle these invalid indices appropriately, possibly by
155
+ implementing fallback logic or error handling.
156
+ """
157
+ raise NotImplementedError("Judge subclasses must implement the `judge` method.")
158
+
159
+
160
+ class BaseBinaryJudge(BaseJudge):
161
+ """
162
+ Base class for binary judges.
163
+ """
164
+
165
+ @abstractmethod
166
+ def judge(
167
+ self,
168
+ prompts: list[str],
169
+ completions: list[str],
170
+ gold_completions: list[str] | None = None,
171
+ shuffle_order: bool = True,
172
+ ) -> list[int]:
173
+ """
174
+ Judge the completion for a given prompt. Used to assess if a completion satisfies a constraint.
175
+
176
+ This base class should be used to implement binary evaluations as done in section 4.1.4 of the [CGPO
177
+ paper](https://huggingface.co/papers/2409.20370). It is relevant for assessing whether a prompt-completion pair
178
+ satisfies a specific constraint.
179
+
180
+ Args:
181
+ prompts (`list[str]`): List of prompts.
182
+ completions (`list[str]`): List of completions.
183
+ gold_completions (`list[str]`, *optional*): List of gold completions if they exist.
184
+ shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias.
185
+
186
+ Returns:
187
+ list[int]: A list of binary labels:
188
+ - 1 indicates that the completion satisfies the evaluated constraint.
189
+ - 0 indicates that the completion does not satisfy the evaluated constraint.
190
+
191
+ Note:
192
+ If the judge returns -1 for any prompt, it indicates that the inner process used to compute the preference
193
+ has failed. For instance, this could occur if the underlying language model or rule based constraint
194
+ returned an invalid answer. In such cases, the caller should handle these invalid indices appropriately,
195
+ possibly by implementing fallback logic or error handling.
196
+ """
197
+ raise NotImplementedError("Judge subclasses must implement the `judge` method.")
198
+
199
+
200
+ class PairRMJudge(BasePairwiseJudge):
201
+ # docstyle-ignore
202
+ """
203
+ LLM judge based on the PairRM model from AllenAI.
204
+
205
+ This judge uses the PairRM model to rank pairs of completions for given prompts. It's designed for pairwise
206
+ comparison of language model outputs. The PairRM model is loaded using the llm-blender library and runs on the
207
+ default Accelerator device.
208
+
209
+ **Attributes**:
210
+
211
+ blender (`llm_blender.Blender`):
212
+ An instance of the Blender class from llm-blender.
213
+
214
+ **Example**:
215
+ ```python
216
+ >>> pairrm_judge = PairRMJudge()
217
+ >>> prompts = ["Translate 'hello' to French", "What's the capital of Japan?"]
218
+ >>> completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]]
219
+ >>> results = pairrm_judge.judge(prompts, completions)
220
+ >>> print(results) # [0, 1] (indicating the first completion is preferred for the first prompt and the second for the second)
221
+ ```
222
+
223
+ > [!TIP]
224
+ > This class requires the llm-blender library to be installed. Install it with: `pip install llm-blender`.
225
+ """
226
+
227
+ def __init__(self):
228
+ if not is_llm_blender_available():
229
+ raise ValueError("llm-blender is not installed. Please install it with `pip install llm-blender`.")
230
+ import transformers
231
+
232
+ if Version(transformers.__version__) >= Version("5.0.0"):
233
+ raise RuntimeError(
234
+ "llm-blender currently supports transformers < 5.0.0. Please install a compatible version: `pip install 'transformers<5.0.0'`. Check the issue tracker for updates: https://github.com/huggingface/trl/issues/4918"
235
+ )
236
+ _ensure_llm_blender_importable()
237
+ import llm_blender
238
+
239
+ self.blender = llm_blender.Blender()
240
+ self.blender.loadranker("llm-blender/PairRM", device=Accelerator().device)
241
+
242
+ def judge(
243
+ self,
244
+ prompts: list[str],
245
+ completions: list[list[str]],
246
+ shuffle_order: bool = True,
247
+ return_scores: bool = False,
248
+ temperature: float = 1.0,
249
+ ) -> list[int | float]:
250
+ """
251
+ Judge the completion pairs for the given prompts using the PairRM model.
252
+
253
+ Args:
254
+ prompts (`list[str]`):
255
+ List of prompts to judge.
256
+ completions (`list[list[str]]`):
257
+ List of completion pairs for each prompt.
258
+ shuffle_order (`bool`, *optional*, defaults to `True`):
259
+ Whether to shuffle the order of the completions to avoid positional bias.
260
+ return_scores (`bool`, *optional*, defaults to `False`):
261
+ If `True`, return probability scores of the first completion instead of ranks (i.e. a *soft-judge*).
262
+ temperature (`float`, *optional*, defaults to `1.0`):
263
+ Temperature for scaling logits if `return_scores` is True.
264
+
265
+ Returns:
266
+ `list[int | float]`:
267
+ If `return_scores` is `False`, returns a list of ranks (`0` or `1`) for each prompt, indicating which
268
+ completion is preferred. If `return_scores` is `True`, returns softmax probabilities for the first
269
+ completion.
270
+
271
+ Raises:
272
+ `ValueError`:
273
+ If the number of completions per prompt is not exactly 2.
274
+
275
+ Note:
276
+ Unlike llm-blender, ranks are 0-indexed (`0` means the first completion is preferred).
277
+ """
278
+
279
+ if len(completions[0]) != 2:
280
+ raise ValueError("PairRM judge requires exactly 2 completions per prompt.")
281
+
282
+ # Shuffle the order of the completions to avoid positional bias
283
+ if shuffle_order:
284
+ flip_mask = np.random.choice([True, False], size=len(prompts))
285
+ completions = [pair[::-1] if flip else pair for flip, pair in zip(flip_mask, completions, strict=True)]
286
+
287
+ # Rank the completions
288
+ ranks = self.blender.rank(prompts, completions, return_scores=return_scores, disable_tqdm=True)
289
+ if not return_scores:
290
+ ranks -= 1 # PairRM rank is 1-indexed, so we subtract 1 to make it 0-indexed
291
+ else:
292
+ # scale the logits by temperature
293
+ ranks /= temperature
294
+
295
+ # Flip back the ranks or scores to the original order if needed
296
+ if shuffle_order:
297
+ ranks[flip_mask] = ranks[flip_mask][:, ::-1]
298
+
299
+ # Return the ranks or score probability
300
+ if return_scores:
301
+ logit_max = np.amax(ranks, axis=-1, keepdims=True)
302
+ exp_logit_shifted = np.exp(ranks - logit_max)
303
+ probs = exp_logit_shifted / np.sum(exp_logit_shifted, axis=-1, keepdims=True)
304
+ return probs[:, 0].tolist()
305
+ else:
306
+ return ranks[:, 0].tolist()
307
+
308
+
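The `return_scores` path above is a temperature-scaled, numerically stable softmax over the pair logits. A NumPy sketch with made-up logits:

```python
import numpy as np

# Logits for (completion_0, completion_1) per prompt, temperature-scaled, then
# a max-shifted softmax gives P(first completion wins).
logits = np.array([[2.0, 1.0], [0.5, 3.0]])
temperature = 2.0
scaled = logits / temperature
shifted = np.exp(scaled - np.amax(scaled, axis=-1, keepdims=True))
probs = shifted / shifted.sum(axis=-1, keepdims=True)
print(probs[:, 0].round(3).tolist())  # [0.622, 0.223]
```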
309
+ class HfPairwiseJudge(BasePairwiseJudge):
310
+ """
311
+ Pairwise judge based on the Hugging Face API with chat completion.
312
+
313
+ This judge is relevant for assessing the quality of chat models, where the completion is a response to a given prompt.
314
+
315
+ Args:
316
+ model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3-70B-Instruct"`):
317
+ Model to use for the judge.
318
+ token (`str`, *optional*):
319
+ Hugging Face API token to use for the [`huggingface_hub.InferenceClient`].
320
+ system_prompt (`str`, *optional*):
321
+ The system prompt to be used for the judge. If not provided, a default prompt is used. Note that the system
322
+ prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the
323
+ inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token
324
+ response.
325
+ """
326
+
327
+ def __init__(
328
+ self,
329
+ model="meta-llama/Meta-Llama-3-70B-Instruct",
330
+ token: str | None = None,
331
+ system_prompt: str | None = None,
332
+ ):
333
+ self.client = InferenceClient(model=model, token=token)
334
+ self.system_prompt = system_prompt or DEFAULT_PAIRWISE_SYSTEM_PROMPT
335
+
336
+ def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[int]:
337
+ # Shuffle the order of the completions to avoid positional bias
338
+ if shuffle_order:
339
+ flip_mask = np.random.choice([True, False], size=len(prompts))
340
+ completions = [pair[::-1] if flip else pair for flip, pair in zip(flip_mask, completions, strict=True)]
341
+
342
+ # Define a function to get the rank for a single prompt, will be called concurrently
343
+ def get_rank(prompt, candidates):
344
+ content = self.system_prompt.format(prompt=prompt, response0=candidates[0], response1=candidates[1])
345
+ completion = self.client.chat_completion(messages=[{"role": "user", "content": content}], max_tokens=1)
346
+ response = completion.choices[0].message.content
347
+ if response in ["0", "1"]:
348
+ return int(response)
349
+ else:
350
+ logging.debug(f"Invalid response from the judge model: '{response}'. Returning -1.")
351
+ return -1
352
+
353
+ # Call the completions concurrently
354
+ with concurrent.futures.ThreadPoolExecutor() as executor:
355
+ ranks = list(executor.map(get_rank, prompts, completions))
356
+
357
+ # Flip back the ranks to the original order if needed
358
+ if shuffle_order:
359
+ ranks = [ranks[i] if not flip else 1 - ranks[i] for i, flip in enumerate(flip_mask)]
360
+
361
+ # Return the ranks
362
+ return ranks
363
+
364
+
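The shuffle/flip-back round trip used by the API judges is easy to get wrong; a sketch of the mapping (note that a failure code of `-1` would map to `2` under `1 - r`, so callers should filter failures before trusting flipped ranks):

```python
import numpy as np

# Pairs are randomly swapped before judging; flipped verdicts map back via 1 - r.
flip_mask = np.array([False, True, True])
raw_ranks = [0, 0, 1]  # judged on the (possibly swapped) pairs
ranks = [r if not flip else 1 - r for r, flip in zip(raw_ranks, flip_mask)]
print(ranks)  # [0, 1, 0]
```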
365
+ class OpenAIPairwiseJudge(BasePairwiseJudge):
366
+ """
367
+ Judge based on the OpenAI API.
368
+
369
+ This judge is relevant for assessing the quality of chat models, where the completion is a response to a given prompt.
370
+
371
+ Args:
372
+ model (`str`, *optional*, defaults to `"gpt-4-turbo-preview"`):
373
+ Model to use for the judge.
374
+ system_prompt (`str`, *optional*):
375
+ System prompt to be used for the judge. If not provided, a default prompt is used. Note that the system
376
+ prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the
377
+ inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token
378
+ response.
379
+ max_requests (`int` or `None`, *optional*, defaults to `1000`):
380
+ Maximum number of requests to make to the OpenAI API. If set to `None`, there is no limit.
381
+ """
382
+
383
+ def __init__(
384
+ self, model="gpt-4-turbo-preview", system_prompt: str | None = None, max_requests: int | None = 1_000
385
+ ):
386
+ if not is_openai_available():
387
+ raise ValueError("OpenAI client is not installed. Please install it with 'pip install openai'.")
388
+ from openai import OpenAI
389
+
390
+ self.client = OpenAI()
391
+ self.model = model
392
+ self.system_prompt = system_prompt or DEFAULT_PAIRWISE_SYSTEM_PROMPT
393
+ self.max_requests = max_requests
394
+ self.num_requests = 0
395
+ self._warned = False
396
+
397
+ def judge(self, prompts: list[str], completions: list[list[str]], shuffle_order: bool = True) -> list[int]:
398
+ # Check if the limit of requests is reached, if so, use random choice instead
399
+ if self.max_requests is not None and self.num_requests >= self.max_requests:
400
+ if not self._warned: # Print the warning only once
401
+ logging.warning(
402
+ f"Reached the maximum number of requests ({self.max_requests}). From now on, returning -1 instead. "
403
+ " To increase the limit, set `max_requests` to a higher value, or to `None` for no limit."
404
+ )
405
+ self._warned = True
406
+ return [-1] * len(prompts)
407
+
408
+ # Shuffle the order of the completions to avoid positional bias
409
+ if shuffle_order:
410
+ flip_mask = np.random.choice([True, False], size=len(prompts))
411
+ completions = [pair[::-1] if flip else pair for flip, pair in zip(flip_mask, completions, strict=True)]
412
+
413
+ # Define a function to get the rank for a single prompt, will be called concurrently
414
+ def get_rank(prompt, candidates):
415
+ content = self.system_prompt.format(prompt=prompt, response0=candidates[0], response1=candidates[1])
416
+ messages = [{"role": "user", "content": content}]
417
+ completion = self.client.chat.completions.create(model=self.model, messages=messages, max_tokens=1)
418
+ response = completion.choices[0].message.content
419
+ if response in ["0", "1"]:
420
+ return int(response)
421
+ else:
422
+ logging.debug(f"Invalid response from the judge model: '{response}'. Returning -1.")
423
+ return -1
424
+
425
+ # Call the completions concurrently
426
+ with concurrent.futures.ThreadPoolExecutor() as executor:
427
+ ranks = list(executor.map(get_rank, prompts, completions))
428
+
429
+ # Flip back the ranks to the original order if needed
430
+ if shuffle_order:
431
+ ranks = [ranks[i] if not flip else 1 - ranks[i] for i, flip in enumerate(flip_mask)]
432
+
433
+ # Update the number of requests
434
+ self.num_requests += len(prompts)
435
+
436
+ # Return the ranks
437
+ return ranks
438
+
439
+
440
+ class AllTrueJudge(BaseBinaryJudge):
441
+ """
442
+ Unify the decision of multiple [`experimental.judges.BaseBinaryJudge`] instances.
443
+
444
+ Returns `1` only if all inner binary judges return `1`. If any judge returns `0`, it returns `0`. If any judge
445
+ returns `-1`, indicating a failure in its process, this judge will also return `-1`.
446
+
447
+ Implements the Mixture of Judges as described in the [CGPO paper](https://huggingface.co/papers/2409.20370).
448
+
449
+ Args:
450
+ judges (`list` of [`experimental.judges.BaseBinaryJudge`]):
451
+ A list of [`experimental.judges.BaseBinaryJudge`] instances whose decisions will be unified.
452
+ """
453
+
454
+ def __init__(self, judges: list[BaseBinaryJudge]):
455
+ self.judges = judges
456
+
457
+ def judge(
458
+ self,
459
+ prompts: list[str],
460
+ completions: list[str],
461
+ gold_completions: list[str] | None = None,
462
+ shuffle_order: bool = True,
463
+ ) -> list[int]:
464
+ all_binary_judgments = [
465
+ judge.judge(prompts, completions, gold_completions, shuffle_order) for judge in self.judges
466
+ ]
467
+ output = []
468
+ for binary_judgments in zip(*all_binary_judgments, strict=True):
469
+ # Check that all values are in {0, 1, -1}
470
+ if any(binary_judgment not in {0, 1, -1} for binary_judgment in binary_judgments):
471
+ raise ValueError(
472
+ f"Invalid binary judgment: {binary_judgments}, expected list of values in {{0, 1, -1}}."
473
+ )
474
+
475
+ # Unify the decision
476
+ if -1 in binary_judgments:
477
+ output.append(-1)
478
+ elif all(binary_judgment == 1 for binary_judgment in binary_judgments):
479
+ output.append(1)
480
+ else:
481
+ output.append(0)
482
+ return output
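The unification rule of `AllTrueJudge`, reduced to a pure function for clarity (a sketch, not part of the module): any failure (`-1`) dominates, and a unanimous `1` is required for a positive verdict.

```python
def unify(binary_judgments):
    # -1 (failure) propagates; otherwise require all judges to return 1.
    if -1 in binary_judgments:
        return -1
    return 1 if all(j == 1 for j in binary_judgments) else 0

print([unify(row) for row in [(1, 1), (1, 0), (1, -1)]])  # [1, 0, -1]
```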
ICL/RL/trl_source/trl/experimental/kto/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .kto_config import KTOConfig
16
+ from .kto_trainer import KTOTrainer
17
+
18
+
19
+ __all__ = ["KTOConfig", "KTOTrainer"]
ICL/RL/trl_source/trl/experimental/kto/kto_config.py ADDED
@@ -0,0 +1,171 @@
1
+ # Copyright 2020-2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass, field
16
+ from typing import Any
17
+
18
+ from transformers import TrainingArguments
19
+
20
+
21
+ @dataclass
22
+ class KTOConfig(TrainingArguments):
23
+ r"""
24
+ Configuration class for the [`experimental.kto.KTOTrainer`].
25
+
26
+ This class includes only the parameters that are specific to KTO training. For a full list of training arguments,
27
+ please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
28
+ differ from those in [`~transformers.TrainingArguments`].
29
+
30
+ Using [`~transformers.HfArgumentParser`] we can turn this class into
31
+ [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
32
+ command line.
33
+
34
+ Parameters:
35
+ max_length (`int` or `None`, *optional*, defaults to `1024`):
36
+ Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
37
+ to use the default data collator.
38
+ beta (`float`, *optional*, defaults to `0.1`):
39
+ Parameter controlling the deviation from the reference model. Higher β means less deviation from the
40
+ reference model.
41
+ loss_type (`str`, *optional*, defaults to `"kto"`):
42
+ Type of loss to use. Possible values are:
43
+
44
+ - `"kto"`: KTO loss from the [KTO](https://huggingface.co/papers/2402.01306) paper.
45
+ - `"apo_zero_unpaired"`: Unpaired variant of APO-zero loss from the
46
+ [APO](https://huggingface.co/papers/2408.06266) paper.
47
+
48
+ desirable_weight (`float`, *optional*, defaults to `1.0`):
49
+ Desirable losses are weighed by this factor to counter an unequal number of desirable and undesirable pairs.
50
+ undesirable_weight (`float`, *optional*, defaults to `1.0`):
51
+ Undesirable losses are weighed by this factor to counter an unequal number of desirable and undesirable pairs.
52
+ generate_during_eval (`bool`, *optional*, defaults to `False`):
53
+ If `True`, generates and logs completions from both the model and the reference model to W&B or Comet
54
+ during evaluation.
55
+ precompute_ref_log_probs (`bool`, *optional*, defaults to `False`):
56
+ Whether to precompute reference model log probabilities for training and evaluation datasets. This is
57
+ useful when training without the reference model to reduce the total GPU memory needed.
58
+ model_init_kwargs (`dict[str, Any]`, *optional*):
59
+ Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
60
+ string.
61
+ dataset_num_proc (`int`, *optional*):
62
+ Number of processes to use for processing the dataset.
63
+ disable_dropout (`bool`, *optional*, defaults to `True`):
64
+ Whether to disable dropout in the model and reference model.
65
+ """
66
+
67
+ _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
68
+
69
+ # Parameters whose default values are overridden from TrainingArguments
70
+ learning_rate: float = field(
71
+ default=1e-6,
72
+ metadata={"help": "The initial learning rate for AdamW."},
73
+ )
74
+ logging_steps: float = field(
75
+ default=10,
76
+ metadata={
77
+ "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
78
+ "will be interpreted as ratio of total training steps."
79
+ },
80
+ )
81
+ gradient_checkpointing: bool = field(
82
+ default=True,
83
+ metadata={
84
+ "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
85
+ },
86
+ )
87
+ bf16: bool | None = field(
88
+ default=None,
89
+ metadata={
90
+ "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
91
+ "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
92
+ "`fp16` is not set."
93
+ },
94
+ )
95
+ # Transformers 4.57.0 introduced a bug that caused the dtype of `lr_scheduler_kwargs` to be unparsable. This issue
96
+ # was fixed in https://github.com/huggingface/transformers/pull/41322 and released in 4.57.5. We add a temporary
97
+ # workaround here, which can be removed once we drop support for versions older than 4.57.5.
98
+ lr_scheduler_kwargs: dict | str | None = field(
99
+ default=None,
100
+ metadata={
101
+ "help": "Additional parameters for the lr_scheduler, such as {'num_cycles': 1} for cosine with hard "
102
+ "restarts."
103
+ },
104
+ )
105
+
106
+ max_length: int | None = field(
107
+ default=1024,
108
+ metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
109
+ )
110
+ beta: float = field(
111
+ default=0.1,
112
+ metadata={
113
+ "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
114
+ "the reference model."
115
+ },
116
+ )
117
+ loss_type: str = field(
118
+ default="kto",
119
+ metadata={
120
+ "help": "Type of loss to use.",
121
+ "choices": ["kto", "apo_zero_unpaired"],
122
+ },
123
+ )
124
+ desirable_weight: float = field(
125
+ default=1.0,
126
+ metadata={
127
+ "help": "Desirable losses are weighed by this factor to counter unequal number of desirable and "
128
+ "undesirable pairs.",
129
+ },
130
+ )
131
+ undesirable_weight: float = field(
132
+ default=1.0,
133
+ metadata={
134
+ "help": "Undesirable losses are weighed by this factor to counter unequal number of desirable and "
135
+ "undesirable pairs.",
136
+ },
137
+ )
138
+ generate_during_eval: bool = field(
139
+ default=False,
140
+ metadata={
141
+ "help": "If `True`, generates and logs completions from both the model and the reference model to W&B "
142
+ "during evaluation."
143
+ },
144
+ )
145
+ disable_dropout: bool = field(
146
+ default=True,
147
+ metadata={"help": "Whether to disable dropout in the model."},
148
+ )
149
+ precompute_ref_log_probs: bool = field(
150
+ default=False,
151
+ metadata={
152
+ "help": "Whether to precompute reference model log probabilities for training and evaluation datasets. "
153
+ "This is useful when training without the reference model to reduce the total GPU memory needed."
154
+ },
155
+ )
156
+ model_init_kwargs: dict[str, Any] | None = field(
157
+ default=None,
158
+ metadata={
159
+ "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
160
+ "from a string."
161
+ },
162
+ )
163
+ dataset_num_proc: int | None = field(
164
+ default=None,
165
+ metadata={"help": "Number of processes to use for processing the dataset."},
166
+ )
167
+
168
+ def __post_init__(self):
169
+ self.bf16 = not self.fp16 if self.bf16 is None else self.bf16
170
+
171
+ super().__post_init__()
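As the docstring notes, the config can be driven from the command line via `HfArgumentParser`. A sketch assuming an installed `trl` with this experimental module; the flag values are illustrative only:

```python
from transformers import HfArgumentParser

from trl.experimental.kto import KTOConfig

parser = HfArgumentParser(KTOConfig)
(config,) = parser.parse_args_into_dataclasses(
    # keep bf16 off so the sketch runs on hardware without bf16 support
    args=["--output_dir", "kto-out", "--beta", "0.2", "--bf16", "false"]
)
print(config.beta, config.loss_type)  # 0.2 kto
```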
ICL/RL/trl_source/trl/experimental/kto/kto_trainer.py ADDED
@@ -0,0 +1,1511 @@
# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import random
import textwrap
from collections import defaultdict
from collections.abc import Callable
from contextlib import contextmanager, nullcontext
from operator import itemgetter
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from accelerate import PartialState, logging
from accelerate.utils import tqdm
from datasets import Dataset, concatenate_datasets
from packaging.version import Version
from torch import autocast
from torch.utils.data import DataLoader, SequentialSampler
from transformers import (
    BaseImageProcessor,
    DataCollator,
    FeatureExtractionMixin,
    PreTrainedModel,
    PreTrainedTokenizerBase,
    ProcessorMixin,
    TrainerCallback,
    TrainingArguments,
    is_comet_available,
    is_wandb_available,
)
from transformers.trainer_utils import EvalLoopOutput, has_length
from transformers.utils import is_peft_available

from ...data_utils import maybe_apply_chat_template, maybe_extract_prompt, maybe_unpair_preference_dataset
from ...import_utils import is_liger_kernel_available
from ...models.utils import create_reference_model, peft_module_casting_to_bf16, prepare_deepspeed
from ...trainer.base_trainer import BaseTrainer
from ...trainer.utils import (
    create_model_from_path,
    disable_dropout_in_model,
    log_table_to_comet_experiment,
    pad_to_length,
    selective_log_softmax,
)
from ..utils import DPODataCollatorWithPadding
from .kto_config import KTOConfig


if is_liger_kernel_available():
    from liger_kernel.chunked_loss import LigerFusedLinearKTOLoss

if is_peft_available():
    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training

if is_wandb_available():
    import wandb


if TYPE_CHECKING:
    from transformers import PreTrainedModel, PreTrainedTokenizer


logger = logging.get_logger(__name__)

RUNNING_NAME = "running.pt"


def _get_kl_dataset(batch: dict[str, list[Any]]) -> dict[str, list[Any]]:
    """
    Creates mismatched pairs of prompts and completions for the KL dataset by adding a +1 offset to the order of
    completions. For best results, the mismatched outputs y' used to estimate the KL term for a batch should be the
    same set as the matched outputs y used to estimate the rewards in that batch, just paired with different x.
    """
    batch["answer_input_ids"] = [batch["answer_input_ids"][-1]] + batch["answer_input_ids"][:-1]
    batch["answer_attention_mask"] = [batch["answer_attention_mask"][-1]] + batch["answer_attention_mask"][:-1]
    return batch
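
# Illustrative sketch (not part of the trainer logic): for a batch of three matched
# pairs, `_get_kl_dataset` rotates the completions by one position, so each prompt is
# paired with a mismatched completion drawn from the same batch:
#     before: (x1, y1), (x2, y2), (x3, y3)
#     after:  (x1, y3), (x2, y1), (x3, y2)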


def _tokenize(
    batch: dict[str, list[Any]],
    tokenizer: "PreTrainedTokenizer",
) -> dict[str, list[Any]]:
    """Tokenize a batch from a KTO specific dataset."""
    prompt_tokenized = tokenizer(batch["prompt"], add_special_tokens=False)
    prompt_input_ids = prompt_tokenized["input_ids"]
    prompt_attention_mask = prompt_tokenized["attention_mask"]
    prompt_and_completion = [
        prompt + completion for prompt, completion in zip(batch["prompt"], batch["completion"], strict=True)
    ]
    full_tokenized = tokenizer(prompt_and_completion, add_special_tokens=False)
    full_input_ids = full_tokenized["input_ids"]
    full_attention_mask = full_tokenized["attention_mask"]

    answer_input_ids = [f[len(p) :] for f, p in zip(full_input_ids, prompt_input_ids, strict=True)]
    answer_attention_mask = [f[len(p) :] for f, p in zip(full_attention_mask, prompt_attention_mask, strict=True)]

    # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]`
    full_concat_input_ids = [np.concatenate([p, a]) for p, a in zip(prompt_input_ids, answer_input_ids, strict=True)]
    # Prepare input tokens for token by token comparison
    full_input_ids = [np.array(f) for f in full_input_ids]
    for full, concat in zip(full_input_ids, full_concat_input_ids, strict=True):
        if len(full) != len(concat):
            raise ValueError(
                "The elements in 'full_input_ids' and 'full_concat_input_ids' must have the same pairwise length."
            )

    # On some tokenizers, like the Llama-2 tokenizer, there are occasions where tokens
    # can be merged together when tokenizing prompt+answer. This can result in the last
    # token of the prompt being different when tokenized on its own vs. as part of
    # prompt+answer.
    response_token_ids_start_idx = [len(p) for p in prompt_input_ids]

    # If the standalone prompt tokens differ from the corresponding prefix of the
    # prompt+answer tokens, the last prompt token was changed by such a merge, so we
    # move the response start index back by one.
    for idx, (p, f, r) in enumerate(zip(prompt_input_ids, full_input_ids, response_token_ids_start_idx, strict=True)):
        if not np.array_equal(p, f[:r]):
            response_token_ids_start_idx[idx] -= 1

    prompt_input_ids = [f[:r] for f, r in zip(full_input_ids, response_token_ids_start_idx, strict=True)]
    prompt_attention_mask = [f[:r] for f, r in zip(full_attention_mask, response_token_ids_start_idx, strict=True)]

    for p, m in zip(prompt_input_ids, prompt_attention_mask, strict=True):
        if len(p) != len(m):
            raise ValueError("Prompt input ids and attention mask should have the same length.")

    answer_input_ids = [f[r:] for f, r in zip(full_input_ids, response_token_ids_start_idx, strict=True)]
    answer_attention_mask = [f[r:] for f, r in zip(full_attention_mask, response_token_ids_start_idx, strict=True)]

    output = dict(
        prompt_input_ids=prompt_input_ids,
        prompt_attention_mask=prompt_attention_mask,
        answer_input_ids=answer_input_ids,
        answer_attention_mask=answer_attention_mask,
    )

    return output
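
# Illustrative sketch (hypothetical token ids, not from any real tokenizer): suppose
# enc("Hello") == [101] but enc("Hello world") == [100, 205] because the tokenizer
# merges across the prompt/answer boundary. Then the standalone prompt tokens no
# longer match the prefix of the joint tokenization, and the adjustment above moves
# the response start index back by one so both slices stay consistent with the joint
# tokenization.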


def _process_tokens(example: dict[str, Any], model: "PreTrainedModel" = None, **kwargs) -> dict:
    """Process tokens of a KTO specific dataset.

    At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
    completion sequence is too long. We truncate from the end (completion) to fit within max_length.

    We also create the labels for the completion responses, which are of length equal to the sum of the length of the
    prompt and the completion response, with `-100` for the prompt tokens.
    """
    prompt = example["prompt"]
    completion = example["completion"]

    batch = {
        f"{kwargs['prefix']}prompt": prompt,
        f"{kwargs['prefix']}completion": completion,
        f"{kwargs['prefix']}label": example["label"],
    }

    # Check issues below for more details
    # 1. https://github.com/huggingface/trl/issues/907
    # 2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
    # 3. https://github.com/LianjiaTech/BELLE/issues/337

    if not isinstance(prompt, str):
        raise ValueError(f"prompt should be a str but got {type(prompt)}")

    if not isinstance(completion, str):
        raise ValueError(f"completion should be a str but got {type(completion)}")

    # keys of format prompt_* refer to just the prompt and answer_* refer to just the answer
    all_tokens = {
        "prompt_input_ids": example["prompt_input_ids"],
        "prompt_attention_mask": example["prompt_attention_mask"],
        "answer_input_ids": example["answer_input_ids"],
        "answer_attention_mask": example["answer_attention_mask"],
    }

    # calculate max length by checking if BOS/EOS is already there
    max_length = kwargs["max_length"]
    bos_token_id = kwargs["tokenizer"].bos_token_id
    eos_token_id = kwargs["tokenizer"].eos_token_id
    if len(all_tokens["prompt_input_ids"]) > 0 and bos_token_id != all_tokens["prompt_input_ids"][0]:
        max_length -= 1
    if len(all_tokens["answer_input_ids"]) > 0 and eos_token_id != all_tokens["answer_input_ids"][-1]:
        max_length -= 1

    # if combined sequence is too long, truncate the completion (answer) from the end
    prompt_length = len(all_tokens["prompt_input_ids"])
    completion_length = len(all_tokens["answer_input_ids"])
    if prompt_length + completion_length > max_length:
        max_completion_length = max_length - prompt_length
        for k in ["answer_input_ids", "answer_attention_mask"]:
            all_tokens[k] = all_tokens[k][:max_completion_length]

    # Keep all input_ids and attention masks as is, then check whether BOS/EOS tokens need to be added
    batch[f"{kwargs['prefix']}prompt_input_ids"] = all_tokens["prompt_input_ids"]
    batch[f"{kwargs['prefix']}prompt_attention_mask"] = all_tokens["prompt_attention_mask"]
    batch[f"{kwargs['prefix']}completion_input_ids"] = all_tokens["prompt_input_ids"] + all_tokens["answer_input_ids"]
    batch[f"{kwargs['prefix']}completion_attention_mask"] = (
        all_tokens["prompt_attention_mask"] + all_tokens["answer_attention_mask"]
    )

    # add BOS, which affects both prompt and the full completion
    if bos_token_id is not None:
        if len(all_tokens["prompt_input_ids"]) == 0 or bos_token_id != all_tokens["prompt_input_ids"][0]:
            batch[f"{kwargs['prefix']}prompt_input_ids"] = [bos_token_id] + batch[
                f"{kwargs['prefix']}prompt_input_ids"
            ]
            batch[f"{kwargs['prefix']}prompt_attention_mask"] = [1] + batch[f"{kwargs['prefix']}prompt_attention_mask"]
            batch[f"{kwargs['prefix']}completion_input_ids"] = [bos_token_id] + batch[
                f"{kwargs['prefix']}completion_input_ids"
            ]
            batch[f"{kwargs['prefix']}completion_attention_mask"] = [1] + batch[
                f"{kwargs['prefix']}completion_attention_mask"
            ]
    # add EOS, which affects only the full completion
    if len(all_tokens["answer_input_ids"]) == 0 or eos_token_id != all_tokens["answer_input_ids"][-1]:
        batch[f"{kwargs['prefix']}completion_input_ids"] = batch[f"{kwargs['prefix']}completion_input_ids"] + [
            eos_token_id
        ]
        batch[f"{kwargs['prefix']}completion_attention_mask"] = batch[
            f"{kwargs['prefix']}completion_attention_mask"
        ] + [1]

    batch[f"{kwargs['prefix']}completion_labels"] = batch[f"{kwargs['prefix']}completion_input_ids"][:]
    batch[f"{kwargs['prefix']}completion_labels"][: len(batch[f"{kwargs['prefix']}prompt_input_ids"])] = [-100] * len(
        batch[f"{kwargs['prefix']}prompt_input_ids"]
    )

    return batch
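
# Illustrative sketch: for a 3-token prompt [p0, p1, p2] and a 2-token completion
# [c0, c1] (with an EOS appended), `_process_tokens` yields
#     completion_input_ids = [p0, p1, p2, c0, c1, eos]
#     completion_labels    = [-100, -100, -100, c0, c1, eos]
# so only the completion tokens contribute to the log-probabilities.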


class KTOTrainer(BaseTrainer):
    r"""
    Initialize KTOTrainer.

    Args:
        model ([`~transformers.PreTrainedModel`]):
            The model to train, preferably an [`~transformers.AutoModelForCausalLM`].
        ref_model ([`~transformers.PreTrainedModel`]):
            Hugging Face transformer model with a causal language modeling head. Used for implicit reward computation
            and loss. If no reference model is provided, the trainer will create a reference model with the same
            architecture as the model to be optimized.
        args ([`experimental.kto.KTOConfig`]):
            The arguments to use for training.
        train_dataset ([`~datasets.Dataset`]):
            The dataset to use for training.
        eval_dataset ([`~datasets.Dataset`]):
            The dataset to use for evaluation.
        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
            Processing class used to process the data. If provided, will be used to automatically process the inputs
            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
            reuse the fine-tuned model.
        data_collator ([`~transformers.DataCollator`], *optional*):
            The data collator to use for training. If None is specified, the default data collator
            ([`experimental.utils.DPODataCollatorWithPadding`]) will be used which will pad the sequences to the
            maximum length of the sequences in the batch, given a dataset of paired sequences.
        model_init (`Callable[[], transformers.PreTrainedModel]`):
            The model initializer to use for training. If None is specified, the default model initializer will be
            used.
        callbacks (`list[transformers.TrainerCallback]`):
            The callbacks to use for training.
        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
            The optimizer and scheduler to use for training.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
            The function to use to preprocess the logits before computing the metrics.
        peft_config (`dict`, defaults to `None`):
            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
            a PEFT model.
        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
            The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping
            metric names to metric values.
        model_adapter_name (`str`, defaults to `None`):
            Name of the train target PEFT adapter, when using LoRA with multiple adapters.
        ref_adapter_name (`str`, defaults to `None`):
            Name of the reference PEFT adapter, when using LoRA with multiple adapters.
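
    Example (illustrative sketch; the model name, dataset name, and import path are placeholders/assumptions, not
    requirements of the trainer):

    ```python
    from datasets import load_dataset
    from transformers import AutoTokenizer
    from trl.experimental.kto import KTOConfig, KTOTrainer

    model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # any causal LM
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # A preference dataset; paired datasets are unpaired automatically by the trainer
    train_dataset = load_dataset("trl-lib/kto-mix-14k", split="train")

    trainer = KTOTrainer(
        model=model_id,
        args=KTOConfig(output_dir="kto-model", per_device_train_batch_size=4),
        processing_class=tokenizer,
        train_dataset=train_dataset,
    )
    trainer.train()
    ```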
    """

    _tag_names = ["trl", "kto"]
    _name = "KTO"
    _paper = {
        "title": "KTO: Model Alignment as Prospect Theoretic Optimization",
        "id": "2402.01306",
        # docstyle-ignore
        "citation": textwrap.dedent("""\
        @article{ethayarajh2024kto,
            title = {{KTO: Model Alignment as Prospect Theoretic Optimization}},
            author = {Kawin Ethayarajh and Winnie Xu and Niklas Muennighoff and Dan Jurafsky and Douwe Kiela},
            year = 2024,
            eprint = {arXiv:2402.01306},
        }"""),
    }

    def __init__(
        self,
        model: PreTrainedModel | nn.Module | str = None,
        ref_model: PreTrainedModel | nn.Module | str | None = None,
        args: KTOConfig = None,
        train_dataset: Dataset | None = None,
        eval_dataset: Dataset | dict[str, Dataset] | None = None,
        processing_class: PreTrainedTokenizerBase
        | BaseImageProcessor
        | FeatureExtractionMixin
        | ProcessorMixin
        | None = None,
        data_collator: DataCollator | None = None,
        model_init: Callable[[], PreTrainedModel] | None = None,
        callbacks: list[TrainerCallback] | None = None,
        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
        preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
        peft_config: dict | None = None,
        compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
        model_adapter_name: str | None = None,
        ref_adapter_name: str | None = None,
    ):
        if type(args) is TrainingArguments:
            raise ValueError("Please use `KTOConfig` instead of `TrainingArguments`.")

        if not isinstance(model, str) and ref_model is model:
            raise ValueError(
                "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
                "same as `model`, you must pass a copy of it, or `None` if you use PEFT."
            )

        # Model initialization
        if isinstance(model, str):
            model_init_kwargs = args.model_init_kwargs or {}
            # Distributed training requires device_map=None ("auto" fails)
            if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]:
                model_init_kwargs["device_map"] = None
            model = create_model_from_path(model, **model_init_kwargs)
        else:
            if args.model_init_kwargs is not None:
                logger.warning(
                    "You passed `model_init_kwargs` to the KTOConfig, but your model is already instantiated. "
                    "The `model_init_kwargs` will be ignored."
                )

        # Reference model initialization
        if isinstance(ref_model, str):
            ref_model_init_kwargs = args.model_init_kwargs or {}
            # Distributed training requires device_map=None ("auto" fails)
            if args.distributed_state.distributed_type in ["MULTI_GPU", "DEEPSPEED"]:
                ref_model_init_kwargs["device_map"] = None
            ref_model = create_model_from_path(ref_model, **ref_model_init_kwargs)

        # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
        # has been called in order to properly call autocast if needed.
        self._peft_has_been_casted_to_bf16 = False

        if not is_peft_available() and peft_config is not None:
            raise ValueError(
                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it with `pip install peft` to use the PEFT models"
            )
        elif is_peft_available() and peft_config is not None:
            if isinstance(model, PeftModel):
                raise ValueError(
                    "You passed a `PeftModel` instance together with a `peft_config` to the trainer. Please first "
                    "merge and unload the existing adapter, save the resulting base model, and then pass that base "
                    "model along with the new `peft_config` to the trainer."
                )

            if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False):
                _support_gc_kwargs = hasattr(
                    args, "gradient_checkpointing_kwargs"
                ) and "gradient_checkpointing_kwargs" in list(
                    inspect.signature(prepare_model_for_kbit_training).parameters
                )

                prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing}

                if _support_gc_kwargs:
                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs

                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
            elif args.gradient_checkpointing:
                # For backward compatibility with older versions of transformers
                if hasattr(model, "enable_input_require_grads"):
                    model.enable_input_require_grads()
                else:

                    def make_inputs_require_grad(module, input, output):
                        output.requires_grad_(True)

                    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

            # get peft model with the given config
            model = get_peft_model(model, peft_config)
            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
                peft_module_casting_to_bf16(model)
                # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
                self._peft_has_been_casted_to_bf16 = True

        # For models that use gradient_checkpointing, we need to attach a hook that enables input
        # to explicitly have `requires_grad=True`, otherwise training will either silently
        # fail or completely fail.
        elif args.gradient_checkpointing:
            # For backward compatibility with older versions of transformers
            if hasattr(model, "enable_input_require_grads"):
                model.enable_input_require_grads()
            else:

                def make_inputs_require_grad(module, input, output):
                    output.requires_grad_(True)

                model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

        if args.generate_during_eval and not (is_wandb_available() or is_comet_available()):
            raise ValueError(
                "`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
                " Please install `wandb` or `comet-ml` to resolve."
            )

        # KTO only supports causal language models, not encoder-decoder models
        if model is not None and hasattr(model.config, "is_encoder_decoder") and model.config.is_encoder_decoder:
            raise ValueError(
                "KTO only supports causal language models. Encoder-decoder models are not supported. "
                "Please use a causal LM (e.g., GPT, Llama, Mistral) instead of an encoder-decoder model (e.g., T5, BART)."
            )

        self.is_peft_model = is_peft_available() and isinstance(model, PeftModel)
        self.model_adapter_name = model_adapter_name
        self.ref_adapter_name = ref_adapter_name

        if ref_model:
            self.ref_model = ref_model
        elif self.is_peft_model or args.precompute_ref_log_probs:
            # The `model` with adapters turned off will be used as the reference model
            self.ref_model = None
        else:
            self.ref_model = create_reference_model(model)

        if processing_class is None:
            raise ValueError(
                "A `processing_class` must be specified when using the default DPODataCollatorWithPadding"
            )
        if args.max_length is None:
            logger.warning(
                "When using DPODataCollatorWithPadding, you should set `max_length` in the KTOTrainer's init;"
                " it will be set to `512` by default, but you should do it yourself in the future.",
            )
            max_length = 512
        if args.max_length is not None:
            max_length = args.max_length

        if data_collator is None:
            data_collator = DPODataCollatorWithPadding(
                pad_token_id=processing_class.pad_token_id,
            )

            if args.remove_unused_columns:
                args.remove_unused_columns = False
                # warn users
                logger.warning(
                    "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your KTOConfig;"
                    " we have set it for you, but you should do it yourself in the future.",
                )

            self.use_dpo_data_collator = True
        else:
            self.use_dpo_data_collator = False

        # Disable dropout in the model and reference model
        if args.disable_dropout:
            disable_dropout_in_model(model)
            if self.ref_model is not None:
                disable_dropout_in_model(self.ref_model)

        self.loss_type = args.loss_type
        self.max_length = max_length
        self.generate_during_eval = args.generate_during_eval
        self.processing_class = processing_class
        self.precompute_ref_log_probs = args.precompute_ref_log_probs

        # Not all losses require a KL calculation
        self.calculate_KL = True
        if self.loss_type in ["apo_zero_unpaired"]:
            self.calculate_KL = False

        # Since ref log probs are precomputed on the first call to get_train/eval_dataloader,
        # keep track of the first call to avoid recomputation on subsequent calls
        self._precomputed_train_ref_log_probs = False
        self._precomputed_eval_ref_log_probs = False

        # metric
        self._stored_metrics = defaultdict(lambda: defaultdict(list))

        # KTO parameter
        self.beta = args.beta
        self.desirable_weight = args.desirable_weight
        self.undesirable_weight = args.undesirable_weight
        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
        self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0)
        if self.aux_loss_enabled and self.aux_loss_coef == 0.0:
            logger.warning(
                "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to "
                "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value "
                "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary "
                "loss.",
            )

        # Compute that only on the main process for faster data processing.
        # see: https://github.com/huggingface/trl/pull/1255
        with PartialState().main_process_first():
            # Extract the prompt if needed
            train_dataset = train_dataset.map(
                maybe_extract_prompt, num_proc=args.dataset_num_proc, desc="Extracting prompt from train dataset"
            )
            # Unpair the dataset if needed
            train_dataset = maybe_unpair_preference_dataset(
                train_dataset, args.dataset_num_proc, desc="Unpairing train dataset"
            )
            # Apply the chat template if needed
            train_dataset = train_dataset.map(
                maybe_apply_chat_template,
                fn_kwargs={"tokenizer": processing_class},
                num_proc=args.dataset_num_proc,
                desc="Applying chat template to train dataset",
            )
            if eval_dataset is not None:
                eval_dataset = eval_dataset.map(
                    maybe_extract_prompt, num_proc=args.dataset_num_proc, desc="Extracting prompt from eval dataset"
                )
                eval_dataset = maybe_unpair_preference_dataset(
                    eval_dataset, args.dataset_num_proc, desc="Unpairing eval dataset"
                )
                eval_dataset = eval_dataset.map(
                    maybe_apply_chat_template,
                    fn_kwargs={"tokenizer": processing_class},
                    num_proc=args.dataset_num_proc,
                    desc="Applying chat template to eval dataset",
                )

            # Tokenize and prepare the training datasets
            train_dataset = train_dataset.map(
                _tokenize,
                batched=True,
                fn_kwargs={"tokenizer": self.processing_class},
                num_proc=args.dataset_num_proc,
                desc="Tokenizing train dataset",
            )

            fn_kwargs = {
                "prefix": "",
                "tokenizer": self.processing_class,
                "max_length": self.max_length,
            }

            train_dataset = train_dataset.map(
                _process_tokens,
                fn_kwargs=fn_kwargs,
                num_proc=args.dataset_num_proc,
                desc="Processing tokenized train dataset",
            )

            # Tokenize and prepare the eval datasets
            if eval_dataset is not None:
                eval_dataset = eval_dataset.map(
                    _tokenize,
                    fn_kwargs={"tokenizer": self.processing_class},
                    batched=True,
                    num_proc=args.dataset_num_proc,
                    desc="Tokenizing eval dataset",
                )

                eval_dataset = eval_dataset.map(
                    _process_tokens,
                    fn_kwargs=fn_kwargs,
                    num_proc=args.dataset_num_proc,
                    desc="Processing tokenized eval dataset",
                )

            # Get KL datasets if needed
            if self.calculate_KL:
                if args.per_device_train_batch_size <= 1:
                    raise ValueError(
                        "Actual (not effective) batch size must be > 1. KTO will not work properly because the KL term will be equivalent to the implied reward."
                    )

                # create pairs for estimating the KL term by flipping the matched pairs in each batch of size total_batch_size
                # i.e., (x_1, y_1), ..., (x_n, y_n) --> (x_1, y_n), ..., (x_n, y_1) = (x'_1, y'_1), ..., (x'_n, y'_n)
                train_kl_dataset = train_dataset.map(
                    _get_kl_dataset,
                    batched=True,
                    batch_size=args.per_device_train_batch_size,
                    num_proc=args.dataset_num_proc,
                    desc="Extracting KL train dataset",
                )

                fn_kwargs["prefix"] = "KL_"
                train_kl_dataset = train_kl_dataset.map(
                    _process_tokens,
                    fn_kwargs=fn_kwargs,
                    num_proc=args.dataset_num_proc,
                    remove_columns=[c for c in train_kl_dataset.column_names if c in train_dataset.column_names],
                    desc="Processing tokenized train KL dataset",
                )

                # merge the datasets
                train_dataset = concatenate_datasets([train_dataset, train_kl_dataset], axis=1)

                if eval_dataset is not None:
                    # Get KL dataset
                    eval_kl_dataset = eval_dataset.map(
                        _get_kl_dataset,
                        batched=True,
                        batch_size=args.per_device_train_batch_size,
                        num_proc=args.dataset_num_proc,
                        desc="Extracting eval KL dataset",
                    )

                    eval_kl_dataset = eval_kl_dataset.map(
                        _process_tokens,
                        fn_kwargs=fn_kwargs,
                        num_proc=args.dataset_num_proc,
                        remove_columns=[c for c in eval_kl_dataset.column_names if c in eval_dataset.column_names],
                        desc="Processing tokenized eval KL dataset",
                    )

                    # merge the datasets
                    eval_dataset = concatenate_datasets([eval_dataset, eval_kl_dataset], axis=1)

            # calculate dataset desirability balance
            num_desirable = max(sum(train_dataset["label"]), 1)
            num_undesirable = max(len(train_dataset["label"]) - num_desirable, 1)  # "label" is binary

            if num_desirable != num_undesirable:
                # The lower and upper bounds come from Eq. (8) of https://huggingface.co/papers/2402.01306
                des_weight_lower_bound = round((num_undesirable * self.undesirable_weight / num_desirable) * 1, 2)
                des_weight_upper_bound = round((num_undesirable * self.undesirable_weight / num_desirable) * 1.33, 2)
                und_weight_lower_bound = round((num_desirable * self.desirable_weight / num_undesirable) / 1.33, 2)
                und_weight_upper_bound = round((num_desirable * self.desirable_weight / num_undesirable) / 1, 2)

                des_weight_in_range = des_weight_lower_bound <= self.desirable_weight <= des_weight_upper_bound
                und_weight_in_range = und_weight_lower_bound <= self.undesirable_weight <= und_weight_upper_bound

                if not (des_weight_in_range or und_weight_in_range):
                    logger.warning(
                        "You have different amounts of desirable/positive and undesirable/negative examples but the "
                        "weights on the desirable and undesirable losses don't seem to be in an ideal range. Based "
                        "on your data, we recommend EITHER "
                        f"desirable_weight in [{des_weight_lower_bound}, {des_weight_upper_bound}] or "
                        f"undesirable_weight in [{und_weight_lower_bound}, {und_weight_upper_bound}] (but NOT BOTH). "
                        "See the documentation on how to optimally set these weights.",
                    )
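
            # Illustrative example (hypothetical counts): with 1,000 desirable and 250
            # undesirable examples and undesirable_weight=1.0, the recommended range for
            # desirable_weight is [250 * 1.0 / 1000, 250 * 1.0 / 1000 * 1.33] = [0.25, 0.33].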

        # Transformers explicitly set use_reentrant=True in the past to silence a PyTorch warning, but the default was
        # never updated once PyTorch switched to recommending use_reentrant=False. Until that change lands upstream
        # (see https://github.com/huggingface/transformers/pull/43203) and is released (most likely in 5.0.0), we
        # default to the recommended non-reentrant behavior here, while preserving any user-provided value.
        if args.gradient_checkpointing and Version(transformers.__version__) < Version("5.0.0"):
            args.gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs or {}
            args.gradient_checkpointing_kwargs.setdefault("use_reentrant", False)

        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            processing_class=processing_class,
            model_init=model_init,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
            optimizers=optimizers,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        )

        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
        # self.model_accepts_loss_kwargs to False to enable scaling.
        self.model_accepts_loss_kwargs = False

        # Add tags for models that have been loaded with the correct transformers version
        if hasattr(self.model, "add_model_tags"):
            self.model.add_model_tags(self._tag_names)

        if not hasattr(self, "accelerator"):
            raise AttributeError(
                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
            )

        # DeepSpeed ZeRO-3 does not support precompute_ref_log_probs
        if self.is_deepspeed_enabled:
            if self.accelerator.state.deepspeed_plugin.zero_stage == 3 and self.precompute_ref_log_probs:
                raise ValueError(
                    "You cannot use `precompute_ref_log_probs=True` with DeepSpeed ZeRO-3. Please set `precompute_ref_log_probs=False`."
                )

        if self.ref_model is None:
            if not (self.is_peft_model or self.precompute_ref_log_probs):
                raise ValueError(
                    "No reference model and model is not a Peft model. Try setting `precompute_ref_log_probs=True`"
                )
        else:
            if self.is_deepspeed_enabled:
                self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
            else:
                self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)

        # Import Liger kernel if enabled
        if self.args.use_liger_kernel:
            if not is_liger_kernel_available():
                raise ImportError(
                    "You set `use_liger_kernel=True` but the liger kernel is not available. "
                    "Please install liger-kernel first: `pip install liger-kernel`"
                )
            if self.loss_type in ["apo_zero_unpaired"]:
                raise ValueError(
                    "You cannot set `loss_type='apo_zero_unpaired'` with liger-kernel. "
                    "Only KTO loss is supported with liger-kernel."
                )
            if self.precompute_ref_log_probs:
                raise ValueError(
                    "You cannot use `precompute_ref_log_probs=True` with liger kernel. Please set "
                    "`precompute_ref_log_probs=False`."
                )
            if self.is_peft_model or self.ref_adapter_name is not None:
                raise ValueError(
                    "You cannot use `use_liger_kernel=True` with Peft models. Please set `use_liger_kernel=False`."
                )
            self.kto_loss_fn = LigerFusedLinearKTOLoss(beta=self.beta, use_ref_model=(self.ref_model is not None))

    @contextmanager
    def null_ref_context(self):
        """Context manager for handling null reference model (that is, peft adapter manipulation)."""
        with (
            self.accelerator.unwrap_model(self.model).disable_adapter()
            if self.is_peft_model and not self.ref_adapter_name
            else nullcontext()
        ):
            if self.ref_adapter_name:
                self.model.set_adapter(self.ref_adapter_name)
            yield
            if self.ref_adapter_name:
                self.model.set_adapter(self.model_adapter_name or "default")
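
    # Illustrative usage sketch (assuming a PEFT model whose base weights serve as the
    # reference): inside the trainer, reference logits can be computed as
    #     with self.null_ref_context():
    #         ref_logits = self.model(input_ids, attention_mask=attention_mask).logits
    # since disabling the adapters makes the policy model behave like the frozen
    # reference, avoiding a second model copy in memory.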

    def get_train_dataloader(self) -> DataLoader:
        """
        Returns the training [`~torch.utils.data.DataLoader`].

        Overrides `transformers.Trainer.get_train_dataloader` to precompute `ref_log_probs`.
        """

        if self.precompute_ref_log_probs and not self._precomputed_train_ref_log_probs:
            dataloader_params = {
                "batch_size": self.args.per_device_train_batch_size,
                "collate_fn": self.data_collator,
                "num_workers": self.args.dataloader_num_workers,
                "pin_memory": self.args.dataloader_pin_memory,
                "shuffle": False,
            }

            # prepare dataloader
            data_loader = self.accelerator.prepare(DataLoader(self.train_dataset, **dataloader_params))
            reference_completion_logps = []
            reference_KL_logps = []

            for padded_batch in tqdm(iterable=data_loader, desc="Train dataset reference log probs"):
                reference_completion_logp, reference_KL_logp = self.compute_reference_log_probs(padded_batch)

                reference_completion_logp = self.accelerator.gather_for_metrics(reference_completion_logp)
                reference_completion_logps.append(reference_completion_logp.cpu())

                if self.calculate_KL:
                    reference_KL_logp = self.accelerator.gather_for_metrics(reference_KL_logp)
                    reference_KL_logps.append(reference_KL_logp.cpu())

            self.train_dataset = self.train_dataset.add_column(
                name="reference_logps", column=torch.cat(reference_completion_logps).float().numpy()
            )

            if self.calculate_KL:
                self.train_dataset = self.train_dataset.add_column(
                    name="reference_KL_logps", column=torch.cat(reference_KL_logps).float().numpy()
                )

            self._precomputed_train_ref_log_probs = True

        return super().get_train_dataloader()

    def get_eval_dataloader(self, eval_dataset: Dataset | None = None) -> DataLoader:
        """
        Returns the evaluation [`~torch.utils.data.DataLoader`].

        Overrides `transformers.Trainer.get_eval_dataloader` to precompute `ref_log_probs`.

        Args:
            eval_dataset (`torch.utils.data.Dataset`, *optional*):
                If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted
                by the `model.forward()` method are automatically removed. It must implement `__len__`.
        """
        if eval_dataset is None and self.eval_dataset is None:
            raise ValueError("Trainer: evaluation requires an eval_dataset.")
        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset

        if self.precompute_ref_log_probs and not self._precomputed_eval_ref_log_probs:
            dataloader_params = {
                "batch_size": self.args.per_device_eval_batch_size,
                "collate_fn": self.data_collator,
                "num_workers": self.args.dataloader_num_workers,
                "pin_memory": self.args.dataloader_pin_memory,
                "shuffle": False,
            }

            # prepare dataloader
            data_loader = self.accelerator.prepare(DataLoader(eval_dataset, **dataloader_params))

            reference_completion_logps = []
            reference_KL_logps = []

            for padded_batch in tqdm(iterable=data_loader, desc="Eval dataset reference log probs"):
                reference_completion_logp, reference_KL_logp = self.compute_reference_log_probs(padded_batch)

                reference_completion_logp = self.accelerator.gather_for_metrics(reference_completion_logp)
                reference_completion_logps.append(reference_completion_logp.cpu())

                if self.calculate_KL:
                    reference_KL_logp = self.accelerator.gather_for_metrics(reference_KL_logp)
                    reference_KL_logps.append(reference_KL_logp.cpu())

            eval_dataset = eval_dataset.add_column(
                name="reference_logps", column=torch.cat(reference_completion_logps).float().numpy()
            )
            if self.calculate_KL:
                eval_dataset = eval_dataset.add_column(
                    name="reference_KL_logps", column=torch.cat(reference_KL_logps).float().numpy()
                )

            # Save the calculated `reference_logps` and `reference_KL_logps` to the eval_dataset for subsequent runs
            if self.eval_dataset is not None:
                self.eval_dataset = eval_dataset
            self._precomputed_eval_ref_log_probs = True

        return super().get_eval_dataloader(eval_dataset=eval_dataset)

    def compute_reference_log_probs(self, padded_batch: dict) -> tuple[torch.FloatTensor, torch.FloatTensor | None]:
        """Computes log probabilities of the reference model for a single padded batch of a KTO specific dataset."""
        with torch.no_grad():
            if self.ref_model is None:
                with self.null_ref_context():
                    completion_logits = self.model(
                        padded_batch["completion_input_ids"],
                        attention_mask=padded_batch["completion_attention_mask"],
                    ).logits

                    if self.calculate_KL:
                        KL_logits = self.model(
                            padded_batch["KL_completion_input_ids"],
                            attention_mask=padded_batch["KL_completion_attention_mask"],
                        ).logits
            else:
                completion_logits = self.ref_model(
                    padded_batch["completion_input_ids"], attention_mask=padded_batch["completion_attention_mask"]
                ).logits

                if self.calculate_KL:
                    KL_logits = self.ref_model(
                        padded_batch["KL_completion_input_ids"],
                        attention_mask=padded_batch["KL_completion_attention_mask"],
                    ).logits

        completion_logps = self.get_batch_logps(
            completion_logits,
            padded_batch["completion_labels"],
            average_log_prob=False,
        )

        if self.calculate_KL:
            KL_logps = self.get_batch_logps(
                KL_logits,
                padded_batch["KL_completion_labels"],
                average_log_prob=False,
            )
        else:
            KL_logps = None

        return completion_logps, KL_logps

    @staticmethod
    def get_batch_logps(
        logits: torch.FloatTensor,
        labels: torch.LongTensor,
        average_log_prob: bool = False,
    ) -> torch.FloatTensor:
        """Compute the log probabilities of the given labels under the given logits.

        Args:
            logits:
                Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
            labels:
                Labels for which to compute the log probabilities. Label tokens with a value of `-100` are ignored.
                Shape: (batch_size, sequence_length)
            average_log_prob:
                If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the
                log probabilities of the (non-masked) tokens.

        Returns:
            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the
            given logits.
        """
        if logits.shape[:-1] != labels.shape:
            raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.")

        # For causal LM, shift labels and logits by one position
        labels = labels[:, 1:].clone()
        logits = logits[:, :-1, :]

        loss_mask = labels != -100

        # dummy token; we'll ignore the losses on these tokens later
        labels[labels == -100] = 0

        per_token_logps = selective_log_softmax(logits, labels)

        if average_log_prob:
            return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
        else:
            return (per_token_logps * loss_mask).sum(-1)
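
    # Illustrative sketch of the shift in `get_batch_logps`: for logits of shape
    # (B, T, V) and labels of shape (B, T), logits[:, t] scores the token at
    # labels[:, t + 1], so the labels are shifted left and the last logit step is
    # dropped. Positions labeled -100 are replaced by a dummy token and zeroed by
    # loss_mask, so they never contribute to the summed log-probability.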

    def forward(
        self, model: nn.Module, batch: dict[str, list | torch.LongTensor]
    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
        KL_logps = self._compute_kl_logps(model, batch)

        model_kwargs = {}
        if self.aux_loss_enabled:
            model_kwargs["output_router_logits"] = True

        outputs = model(
            batch["completion_input_ids"],
            attention_mask=batch["completion_attention_mask"],
            **model_kwargs,
        )
        completion_logits = outputs.logits

        completion_logps = self.get_batch_logps(
            completion_logits,
            batch["completion_labels"],
            average_log_prob=False,
        )

        if completion_logps.shape[0] != len(batch["label"]):
            raise ValueError(
                "There is a mismatch between the number of examples in this batch and the number of "
                "examples for which an output sequence was predicted."
            )

        # Use torch.nonzero for efficient tensor index selection
        device = completion_logits.device
        labels = torch.as_tensor(batch["label"], dtype=torch.bool, device=device)
        chosen_idx = torch.nonzero(labels, as_tuple=False).view(-1)
        rejected_idx = torch.nonzero(~labels, as_tuple=False).view(-1)

        # Use index_select for efficient CUDA operations
        chosen_logps = completion_logps.index_select(0, chosen_idx)
        rejected_logps = completion_logps.index_select(0, rejected_idx)

        chosen_logits = completion_logits.index_select(0, chosen_idx)
        rejected_logits = completion_logits.index_select(0, rejected_idx)

        if self.aux_loss_enabled:
            return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, KL_logps, outputs.aux_loss)
        else:
            return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, KL_logps)
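
    # Illustrative sketch of the index selection in `forward`: for a batch with
    # label == [1, 0, 1], chosen_idx == [0, 2] and rejected_idx == [1], so
    # index_select routes the log-probs of desirable completions to the chosen side
    # and the remainder to the rejected side.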

    def kto_loss(
        self,
        policy_chosen_logps: torch.FloatTensor,
        policy_rejected_logps: torch.FloatTensor,
        policy_KL_logps: torch.FloatTensor,
        reference_chosen_logps: torch.FloatTensor,
        reference_rejected_logps: torch.FloatTensor,
        reference_KL_logps: torch.FloatTensor,
    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
        """Compute the KTO loss for a batch of policy and reference model log probabilities.

        Args:
            policy_chosen_logps:
                Log probabilities of the policy model for the chosen responses. Shape: (num(chosen) in batch_size,)
            policy_rejected_logps:
                Log probabilities of the policy model for the rejected responses. Shape: (num(rejected) in batch_size,)
            policy_KL_logps: Log probabilities of the policy model for the KL responses. Shape: (batch_size,)
            reference_chosen_logps:
                Log probabilities of the reference model for the chosen responses. Shape: (num(chosen) in batch_size,)
            reference_rejected_logps:
                Log probabilities of the reference model for the rejected responses. Shape: (num(rejected) in
                batch_size,)
            reference_KL_logps: Log probabilities of the reference model for the KL responses. Shape: (batch_size,)

        Returns:
            A tuple of four tensors: (losses, chosen_rewards, rejected_rewards, KL). The losses tensor contains the KTO
            loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for
            the chosen and rejected responses, respectively. The KL tensor contains the detached KL divergence estimate
            between the policy and reference models.
        """
        if self.calculate_KL:
            kl = (policy_KL_logps - reference_KL_logps).mean().detach()
            kl = self.accelerator.gather_for_metrics(kl).mean().clamp(min=0)
        else:
            kl = torch.zeros(1).to(policy_chosen_logps.device)

        # Chosen losses
        if policy_chosen_logps.shape[0] != 0 or reference_chosen_logps.shape[0] != 0:
            chosen_logratios = policy_chosen_logps - reference_chosen_logps

            if self.loss_type == "kto":
                # Eqn (7) of the KTO paper (https://huggingface.co/papers/2402.01306)
                chosen_losses = 1 - F.sigmoid(self.beta * (chosen_logratios - kl))
            elif self.loss_type == "apo_zero_unpaired":
                # Unpaired variant of Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266)
                # Use this loss when you believe the chosen outputs are better than your model's default output
                chosen_losses = 1 - F.sigmoid(self.beta * chosen_logratios)

            chosen_rewards = self.beta * chosen_logratios.detach()

        else:
            # lists can't be empty -- if they are, then accelerate.gather will hang
            chosen_losses = torch.Tensor([]).to(self.accelerator.device)
            chosen_rewards = torch.Tensor([]).to(self.accelerator.device)

        # Rejected losses
        if policy_rejected_logps.shape[0] != 0 or reference_rejected_logps.shape[0] != 0:
            rejected_logratios = policy_rejected_logps - reference_rejected_logps

            if self.loss_type == "kto":
                rejected_losses = 1 - F.sigmoid(self.beta * (kl - rejected_logratios))
            elif self.loss_type == "apo_zero_unpaired":
                rejected_losses = F.sigmoid(self.beta * rejected_logratios)

            rejected_rewards = self.beta * rejected_logratios.detach()
        else:
            # lists can't be empty -- if they are, then accelerate.gather will hang
            rejected_losses = torch.Tensor([]).to(self.accelerator.device)
            rejected_rewards = torch.Tensor([]).to(self.accelerator.device)

        losses = torch.cat(
            (self.desirable_weight * chosen_losses, self.undesirable_weight * rejected_losses),
            0,
        )

        return losses, chosen_rewards, rejected_rewards, kl
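
    # Compact restatement of the loss above (sketch): with the log-ratio
    # l = log pi(y|x) - log pi_ref(y|x) and z the batch-level KL estimate,
    #     desirable:   loss = desirable_weight   * (1 - sigmoid(beta * (l - z)))
    #     undesirable: loss = undesirable_weight * (1 - sigmoid(beta * (z - l)))
    # so a desirable completion is pushed to a reward (beta * l) above the KL
    # baseline and an undesirable one below it.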

    def _compute_kl_logps(self, model, batch):
        """Compute KL log probabilities for a given batch."""
        KL_logps = None
        if self.calculate_KL:
            KL_model_kwargs = {
                "input_ids": batch["KL_completion_input_ids"],
                "attention_mask": batch["KL_completion_attention_mask"],
            }

            with torch.no_grad():
                KL_logits = model(**KL_model_kwargs).logits

            KL_logps = self.get_batch_logps(
                KL_logits,
                batch["KL_completion_labels"],
                average_log_prob=False,
            )
        return KL_logps

    def _compute_loss_liger(self, model, batch):
        """
        Compute the KTO loss using the Liger-Kernel's LigerFusedLinearKTOLoss.

        Args:
            model:
                The policy model used for generating log probabilities and outputs. Must be a causal language model,
                since encoder-decoder models are not supported by KTO.
            batch: A dictionary containing the input data and labels for the batch.

        Returns:
            A dictionary containing the following keys:
                - "loss": The computed KTO loss for the batch.
                - "chosen_logits_sum": Sum of the logits for the chosen responses from the policy model.
                - "rejected_logits_sum": Sum of the logits for the rejected responses from the policy model.
                - "chosen_logps_sum": Sum of the log probabilities of the chosen responses from the policy model.
                - "rejected_logps_sum": Sum of the log probabilities of the rejected responses from the policy model.
                - "chosen_rewards_sum": Sum of the rewards for the chosen responses.
                - "rejected_rewards_sum": Sum of the rewards for the rejected responses.
                - "kl": The KL divergence between the policy and reference models (detached).

            If auxiliary loss is enabled, the dictionary will also include:
                - "aux_loss": The auxiliary loss from the model outputs.
        """
        policy_KL_logps = self._compute_kl_logps(model, batch)
        reference_KL_logps = self._compute_kl_logps(self.ref_model, batch)
        if self.calculate_KL:
            kl = (policy_KL_logps - reference_KL_logps).mean().detach()
            kl = self.accelerator.gather_for_metrics(kl).mean().clamp(min=0)
        else:
            kl = torch.zeros(1).to(self.accelerator.device)

        model_kwargs = {}
        if self.aux_loss_enabled:
            model_kwargs["output_router_logits"] = True

        # skip the lm head and get the last hidden state
        base_model = model.get_decoder()
        outputs = base_model(
            batch["completion_input_ids"],
            attention_mask=batch["completion_attention_mask"],
            use_cache=False,
            **model_kwargs,
        )

        # reference model
        ref_base_model = self.ref_model.get_decoder()
        ref_outputs = ref_base_model(
            batch["completion_input_ids"],
            attention_mask=batch["completion_attention_mask"],
            use_cache=False,
            **model_kwargs,
        )
        lm_head = model.get_output_embeddings()
        ref_lm_head = self.ref_model.get_output_embeddings()

        (
            loss,
            (
                chosen_logps_sum,
                rejected_logps_sum,
                chosen_logits_sum,
                rejected_logits_sum,
                chosen_rewards_sum,
                rejected_rewards_sum,
            ),
        ) = self.kto_loss_fn(
            _input=outputs.last_hidden_state[:, :-1],
            lin_weight=lm_head.weight,
            target=batch["completion_labels"][:, 1:],
            bias=lm_head.bias if hasattr(lm_head, "bias") else None,
            preference_labels=torch.tensor(batch["label"], dtype=torch.bool).to(self.accelerator.device),
            ref_input=ref_outputs.last_hidden_state[:, :-1],
            ref_weight=ref_lm_head.weight,
            ref_bias=ref_lm_head.bias if hasattr(ref_lm_head, "bias") else None,
            kl=kl,
        )

        output = {
            "loss": loss,
            "chosen_logits_sum": chosen_logits_sum,
            "rejected_logits_sum": rejected_logits_sum,
            "chosen_logps_sum": chosen_logps_sum,
            "rejected_logps_sum": rejected_logps_sum,
            "chosen_rewards_sum": chosen_rewards_sum,
            "rejected_rewards_sum": rejected_rewards_sum,
            "kl": kl,
        }
        if self.aux_loss_enabled:
            output["aux_loss"] = outputs.aux_loss

        return output

    def get_batch_loss_metrics(
        self,
        model,
        batch: dict[str, list | torch.LongTensor],
    ):
        """Compute the KTO loss and other metrics for the given batch of inputs for train or test."""
        metrics = {}
        batch = {k: (v.to(self.accelerator.device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}

        labels = torch.tensor(batch["label"])
        num_chosen = labels.sum().to(self.accelerator.device)
        num_rejected = (len(labels) - num_chosen).to(self.accelerator.device)

        if self.args.use_liger_kernel:
            model_output = self._compute_loss_liger(model, batch)
            losses = model_output["loss"]
            policy_chosen_logits = model_output["chosen_logits_sum"]
            policy_rejected_logits = model_output["rejected_logits_sum"]
            policy_chosen_logps = model_output["chosen_logps_sum"]
            policy_rejected_logps = model_output["rejected_logps_sum"]
            chosen_rewards = model_output["chosen_rewards_sum"]
            rejected_rewards = model_output["rejected_rewards_sum"]
            kl = model_output["kl"]
            if self.aux_loss_enabled:
                aux_loss = model_output["aux_loss"]
        else:
            forward_output = self.forward(model, batch)
            (
                policy_chosen_logps,
                policy_rejected_logps,
                policy_chosen_logits,
                policy_rejected_logits,
                policy_KL_logps,
            ) = forward_output[:5]
            if self.aux_loss_enabled:
                aux_loss = forward_output[5]

            # if reference_logps are in the batch, use them; otherwise, use the reference model
            if "reference_logps" in batch:
                # Convert Python lists to tensor indices for efficient CUDA operations
                device = batch["reference_logps"].device
                labels = torch.as_tensor(batch["label"], dtype=torch.bool, device=device)
                chosen_idx = torch.nonzero(labels, as_tuple=False).view(-1)
                rejected_idx = torch.nonzero(~labels, as_tuple=False).view(-1)

                # Use index_select for efficient CUDA operations
                reference_chosen_logps = batch["reference_logps"].index_select(0, chosen_idx)
                reference_rejected_logps = batch["reference_logps"].index_select(0, rejected_idx)
                if self.calculate_KL:
                    reference_KL_logps = batch["reference_KL_logps"]
                else:
                    reference_KL_logps = None
            else:
                with torch.no_grad():
                    if self.ref_model is None:
                        with self.null_ref_context():
                            (
                                reference_chosen_logps,
                                reference_rejected_logps,
                                _,
                                _,
                                reference_KL_logps,
                            ) = self.forward(self.model, batch)[:5]
                    else:
                        (
                            reference_chosen_logps,
                            reference_rejected_logps,
                            _,
                            _,
                            reference_KL_logps,
                        ) = self.forward(self.ref_model, batch)[:5]

            losses, chosen_rewards, rejected_rewards, kl = self.kto_loss(
                policy_chosen_logps,
                policy_rejected_logps,
                policy_KL_logps,
                reference_chosen_logps,
                reference_rejected_logps,
                reference_KL_logps,
            )

        metrics["kl"] = kl.item()

        all_num_chosen = self.accelerator.gather_for_metrics(num_chosen).sum().item()
        all_num_rejected = self.accelerator.gather_for_metrics(num_rejected).sum().item()

        if all_num_chosen > 0:
            metrics["rewards/chosen_sum"] = (
                self.accelerator.gather_for_metrics(chosen_rewards.nansum()).nansum().item()
            )
            metrics["logps/chosen_sum"] = (
                self.accelerator.gather_for_metrics(policy_chosen_logps.nansum()).nansum().item()
            )
            metrics["logits/chosen_sum"] = (
                self.accelerator.gather_for_metrics(policy_chosen_logits.nansum()).nansum().item()
            )
            metrics["count/chosen"] = all_num_chosen

        if all_num_rejected > 0:
            metrics["rewards/rejected_sum"] = (
                self.accelerator.gather_for_metrics(rejected_rewards.nansum()).nansum().item()
            )
            metrics["logps/rejected_sum"] = (
                self.accelerator.gather_for_metrics(policy_rejected_logps.nansum()).nansum().item()
            )
            metrics["logits/rejected_sum"] = (
                self.accelerator.gather_for_metrics(policy_rejected_logits.nansum()).nansum().item()
            )
            metrics["count/rejected"] = all_num_rejected

        loss = losses.nanmean()
        if self.aux_loss_enabled:
            loss += self.aux_loss_coef * aux_loss

        return loss, metrics

    def compute_loss(
        self,
        model: PreTrainedModel | nn.Module,
        inputs: dict[str, torch.Tensor | Any],
        return_outputs=False,
        num_items_in_batch=None,
    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
        compute_loss_context_manager = (
            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
        )

        with compute_loss_context_manager:
            loss, metrics = self.get_batch_loss_metrics(model, inputs)

        # Move the loss back to the device of the accumulating loss in the parent `Trainer` class
        loss = loss.to(self.args.device)
        # force log the metrics
        if self.accelerator.is_main_process:
            self.store_metrics(metrics, train_eval="train")

        if return_outputs:
            return (loss, metrics)
        return loss

    def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None:
        for key, value in metrics.items():
            self._stored_metrics[train_eval][key].append(value)

    def _get_train_sampler(self, dataset: Dataset | None = None) -> torch.utils.data.Sampler | None:
        if dataset is None:
            dataset = self.train_dataset
        if dataset is None or not has_length(dataset):
            return None
        return SequentialSampler(dataset)
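
    # A sequential sampler is used rather than a random one because `_get_kl_dataset`
    # built each example's mismatched completion by rotating within fixed batches of
    # `per_device_train_batch_size`; sampling in order keeps those examples grouped, so
    # the mismatched outputs seen in a batch remain the same set as its matched outputs
    # (see the `_get_kl_dataset` docstring).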

    def generate_from_model_and_ref(self, model, batch: dict[str, torch.LongTensor]) -> tuple[str, str]:
        """Generate samples from the model and reference model for the given batch of inputs."""

        # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
        # the torch amp context manager, as some hidden states are silently cast to full precision.
        generate_context_manager = (
            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
        )

        with generate_context_manager:
            policy_output = model.generate(
                input_ids=batch["prompt_input_ids"],
                attention_mask=batch["prompt_attention_mask"],
                max_length=self.max_length,
                do_sample=True,
                pad_token_id=self.processing_class.pad_token_id,
            )

            # if a reference_output is in the batch, use it; otherwise, use the reference model
            if "reference_output" in batch:
                reference_output = batch["reference_output"]
            else:
                if self.ref_model is None:
                    with self.null_ref_context():
                        reference_output = self.model.generate(
                            input_ids=batch["prompt_input_ids"],
                            attention_mask=batch["prompt_attention_mask"],
                            max_length=self.max_length,
                            do_sample=True,
                            pad_token_id=self.processing_class.pad_token_id,
                        )
                else:
                    reference_output = self.ref_model.generate(
                        input_ids=batch["prompt_input_ids"],
                        attention_mask=batch["prompt_attention_mask"],
                        max_length=self.max_length,
                        do_sample=True,
                        pad_token_id=self.processing_class.pad_token_id,
                    )

        policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id)
        policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True)

        reference_output = pad_to_length(reference_output, self.max_length, self.processing_class.pad_token_id)
        reference_output_decoded = self.processing_class.batch_decode(reference_output, skip_special_tokens=True)

        return policy_output_decoded, reference_output_decoded

    def prediction_step(
        self,
        model: PreTrainedModel | nn.Module,
        inputs: dict[str, torch.Tensor | Any],
        prediction_loss_only: bool,
        ignore_keys: list[str] | None = None,
    ):
        if ignore_keys is None:
            if hasattr(model, "config"):
                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
            else:
                ignore_keys = []

        prediction_context_manager = (
            autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext()
        )
        with torch.no_grad(), prediction_context_manager:
            loss, metrics = self.get_batch_loss_metrics(model, inputs)

        # force log the metrics
        if self.accelerator.is_main_process:
            self.store_metrics(metrics, train_eval="eval")

        if prediction_loss_only:
            return (loss.detach(), None, None)

        # logits for the chosen and rejected samples from model
        logits_dict = {}
        if "logits/chosen_sum" in metrics:
            logits_dict["eval_logits/chosen"] = metrics["logits/chosen_sum"]
        if "logits/rejected_sum" in metrics:
            logits_dict["eval_logits/rejected"] = metrics["logits/rejected_sum"]
        logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
        logits = torch.tensor(logits, device=self.accelerator.device)
        labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
1406
+
1407
+ return (loss.detach(), logits, labels)
1408
+
1409
+ def evaluation_loop(
1410
+ self,
1411
+ dataloader: DataLoader,
1412
+ description: str,
1413
+ prediction_loss_only: bool | None = None,
1414
+ ignore_keys: list[str] | None = None,
1415
+ metric_key_prefix: str = "eval",
1416
+ ) -> EvalLoopOutput:
1417
+ """
1418
+ Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
1419
+ `Trainer.evaluate()` and `Trainer.predict()`.
1420
+
1421
+ Works both with or without labels.
1422
+ """
1423
+
1424
+ # Sample and save to game log if requested (for one batch to save time)
1425
+ if self.generate_during_eval:
1426
+ # Generate random indices within the range of the total number of samples
1427
+ num_samples = len(dataloader.dataset)
1428
+ random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size)
1429
+
1430
+ # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
1431
+ random_batch_dataset = dataloader.dataset.select(random_indices)
1432
+ random_batch = self.data_collator(random_batch_dataset)
1433
+ random_batch = self._prepare_inputs(random_batch)
1434
+
1435
+ target_labels = torch.tensor(random_batch["label"], dtype=torch.bool, device=self.accelerator.device)
1436
+ target_indices = torch.where(~target_labels)[0]
1437
+ target_batch = {
1438
+ "prompt_input_ids": random_batch["prompt_input_ids"][target_indices],
1439
+ "prompt_attention_mask": random_batch["prompt_attention_mask"][target_indices],
1440
+ "prompt": itemgetter(*target_indices)(random_batch["prompt"]),
1441
+ }
1442
+ policy_output_decoded, ref_output_decoded = self.generate_from_model_and_ref(self.model, target_batch)
1443
+
1444
+ table = pd.DataFrame(
1445
+ columns=["Prompt", "Policy", "Ref Model"],
1446
+ data=[
1447
+ [prompt, pol[len(prompt) :], ref[len(prompt) :]]
1448
+ for prompt, pol, ref in zip(
1449
+ target_batch["prompt"], policy_output_decoded, ref_output_decoded, strict=True
1450
+ )
1451
+ ],
1452
+ )
1453
+ if "wandb" in self.args.report_to:
1454
+ wandb.log({"game_log": wandb.Table(data=table)})
1455
+
1456
+ if "comet_ml" in self.args.report_to:
1457
+ log_table_to_comet_experiment(
1458
+ name="game_log.csv",
1459
+ table=table,
1460
+ )
1461
+
1462
+ # Base evaluation
1463
+ initial_output = super().evaluation_loop(
1464
+ dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix
1465
+ )
1466
+
1467
+ return initial_output
1468
+
1469
+ def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
1470
+ """
1471
+ Log `logs` on the various objects watching training, including stored metrics.
1472
+
1473
+ Args:
1474
+ logs (`dict[str, float]`):
1475
+ The values to log.
1476
+ start_time (`float`, *optional*):
1477
+ Start time of the training.
1478
+ """
1479
+ # logs either has 'loss' or 'eval_loss'
1480
+ train_eval = "train" if "loss" in logs else "eval"
1481
+ # train metrics should have no prefix, eval should have 'eval_'
1482
+ prefix = "eval_" if train_eval == "eval" else ""
1483
+ # accumulate average metrics from sums and lengths
1484
+ for split in ["chosen", "rejected"]:
1485
+ if f"count/{split}" in self._stored_metrics[train_eval]:
1486
+ count_sum = torch.Tensor(self._stored_metrics[train_eval][f"count/{split}"]).sum().item()
1487
+ for metric in ["rewards", "logps", "logits"]:
1488
+ logs[f"{prefix}{metric}/{split}"] = (
1489
+ torch.Tensor(self._stored_metrics[train_eval][f"{metric}/{split}_sum"]).sum().item()
1490
+ / count_sum
1491
+ )
1492
+ # delete obsolete metric
1493
+ del self._stored_metrics[train_eval][f"{metric}/{split}_sum"]
1494
+ del self._stored_metrics[train_eval][f"count/{split}"]
1495
+ # calculate reward margin
1496
+ if f"{prefix}rewards/chosen" in logs and f"{prefix}rewards/rejected" in logs:
1497
+ logs[f"{prefix}rewards/margins"] = logs[f"{prefix}rewards/chosen"] - logs[f"{prefix}rewards/rejected"]
1498
+ # Add averaged stored metrics to logs
1499
+ for key, metrics in self._stored_metrics[train_eval].items():
1500
+ logs[f"{prefix}{key}"] = torch.Tensor(metrics).mean().item()
1501
+ del self._stored_metrics[train_eval]
1502
+ return super().log(logs, start_time)
1503
+
1504
+ # Ensure the model card is saved along with the checkpoint
1505
+ def _save_checkpoint(self, model, trial):
1506
+ if self.args.hub_model_id is None:
1507
+ model_name = Path(self.args.output_dir).name
1508
+ else:
1509
+ model_name = self.args.hub_model_id.split("/")[-1]
1510
+ self.create_model_card(model_name=model_name)
1511
+ super()._save_checkpoint(model, trial)
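
The `log` override above is the subtle part of this hunk: `store_metrics` accumulates per-step *sums* plus example *counts* (`rewards/chosen_sum`, `count/chosen`, ...), and only at logging time are they divided, so the reported value is a mean weighted by how many chosen/rejected examples each step actually contained. A minimal sketch of that aggregation, with made-up numbers (not taken from the diff):

```python
import torch

# Illustrative values as `store_metrics` would accumulate them over two logging steps
stored = {
    "rewards/chosen_sum": [12.0, 30.0],  # summed chosen rewards per step
    "count/chosen": [4, 12],             # number of chosen examples per step
}

count_sum = torch.Tensor(stored["count/chosen"]).sum().item()
reward_mean = torch.Tensor(stored["rewards/chosen_sum"]).sum().item() / count_sum

print(reward_mean)                 # 2.625 -- mean weighted by example count
print((12.0 / 4 + 30.0 / 12) / 2)  # 2.75  -- naive mean of per-step means
```

Averaging per-step means instead would over-weight small steps, which matters in KTO because the chosen/rejected split per batch is uneven; this is also why the `*_sum` and `count/*` keys are deleted once consumed.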
ICL/RL/trl_source/trl/experimental/merge_model_callback.py ADDED
@@ -0,0 +1,352 @@
+# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+
+import torch
+from huggingface_hub import HfApi
+from transformers import TrainerCallback
+
+from ..import_utils import is_mergekit_available
+from ..trainer.utils import get_config_model_id
+
+
+if is_mergekit_available():
+    from mergekit.config import MergeConfiguration
+    from mergekit.merge import MergeOptions, run_merge
+
+
+# Logger for module-level logging
+logger = logging.getLogger(__name__)
+
+
+def upload_model_to_hf(folder_path: str, repo_id: str):
+    api = HfApi()
+    # Create the repository if it doesn't already exist
+    repo = api.create_repo(repo_id, repo_type="model", exist_ok=True)
+
+    # Upload the folder to the specified repository
+    api.upload_folder(
+        folder_path=folder_path,
+        repo_id=repo.repo_id,
+        repo_type=repo.repo_type,
+    )
+
+
+class MergeConfig:
+    r"""
+    Configuration class for merging two models using `mergekit`.
+
+    This class provides a structured way to configure and generate merge configurations for various merge methods,
+    such as `linear`, `ties`, `dare_ties`, and `slerp`.
+
+    Args:
+        method (`str`, *optional*, defaults to `"linear"`):
+            Merge method to use. Supported methods include:
+
+            - `"linear"`: Linearly combines two models with specified weights.
+            - `"ties"`: Combines two models using the TIES method with density parameters.
+            - `"dare_ties"`: A variant of TIES that applies DARE (drop-and-rescale) sparsification to the task
+              vectors before merging.
+            - `"slerp"`: Combines models using spherical linear interpolation.
+
+    Note:
+
+        For more details about the merge methods and how they are implemented, see the [MergeKit GitHub
+        repository](https://github.com/arcee-ai/mergekit?tab=readme-ov-file#merge-methods).
+
+    Attributes:
+        method (`str`): The merge method to use.
+        policy_model_path (`str` or `None`): Path to the policy model.
+        target_model_path (`str` or `None`): Path to the target model.
+        policy_model_weight (`float`): Weight for the policy model (for `linear` and `ties` methods).
+        target_model_weight (`float`): Weight for the target model (for `linear` and `ties` methods).
+        policy_model_density (`list[float]`): Density parameters for the policy model (for `ties` and `dare_ties`).
+        target_model_density (`list[float]`): Density parameters for the target model (for `ties` and `dare_ties`).
+        normalize (`float` or `None`): Normalization factor for the TIES method.
+        t_values (`float` or `None`): Interpolation factor for the SLERP method.
+        dtype (`str`): Data type to use for merging, e.g., `"float16"`.
+    """
+
+    def __init__(self, method: str = "linear"):
+        if not is_mergekit_available():
+            raise ImportError("MergeConfig requires the `mergekit` extra. To install, run `pip install mergekit`.")
+        self.method = method
+        self.policy_model_path = None
+        self.target_model_path = None
+
+        # Initialize relevant parameters based on the method
+        if method == "linear":
+            self.policy_model_weight = 0.5
+            self.target_model_weight = 0.5
+            self.dtype = "float16"
+        elif method == "ties":
+            self.policy_model_weight = 1.0
+            self.policy_model_density = [1.0, 0.7, 0.1]
+            self.target_model_weight = 1.0
+            self.target_model_density = [1.0]
+            self.normalize = 1.0
+            self.dtype = "float16"
+        elif method == "dare_ties":
+            self.policy_model_weight = 1.0
+            self.policy_model_density = [1.0, 0.7, 0.1]
+            self.target_model_weight = 1.0
+            self.target_model_density = [1.0]
+            self.normalize = 1.0
+            self.dtype = "float16"
+        elif method == "slerp":
+            self.t_values = 0.5
+            self.dtype = "float16"
+        else:
+            raise ValueError(f"Unsupported merge method: {method}")
+
+    def create_merge_config_linear(self) -> "MergeConfiguration":
+        """
+        Creates a merge configuration for a linear merge of two models with specified weights.
+        """
+        # Create the merge configuration dictionary
+        merge_config_dict = {
+            "dtype": self.dtype,
+            "merge_method": "linear",
+            "models": [
+                {"model": self.policy_model_path, "parameters": {"weight": self.policy_model_weight}},
+                {"model": self.target_model_path, "parameters": {"weight": self.target_model_weight}},
+            ],
+        }
+
+        # Create the MergeConfiguration from the dictionary
+        merge_config = MergeConfiguration.model_validate(merge_config_dict)
+
+        return merge_config
+
+    def create_merge_config_ties(self) -> "MergeConfiguration":
+        """
+        Creates a merge configuration for a TIES merge of two models, with specified weights and densities.
+        """
+        # Create the TIES merge configuration dictionary
+        merge_config_dict = {
+            "merge_method": "ties",
+            "slices": None,  # Optional slices if needed
+            "models": [
+                {
+                    "model": {
+                        "model": {"path": self.target_model_path, "revision": None},
+                        "lora": None,
+                        "override_architecture": None,
+                    },
+                    "parameters": {"density": self.target_model_density, "weight": self.target_model_weight},
+                },
+                {
+                    "model": {
+                        "model": {"path": self.policy_model_path, "revision": None},
+                        "lora": None,
+                        "override_architecture": None,
+                    },
+                    "parameters": {"density": self.policy_model_density, "weight": self.policy_model_weight},
+                },
+            ],
+            "parameters": {"normalize": self.normalize},
+            "base_model": {
+                "model": {"path": self.policy_model_path, "revision": None},
+                "lora": None,
+                "override_architecture": None,
+            },
+            "dtype": self.dtype,
+            "tokenizer_source": None,
+            "tokenizer": None,
+            "chat_template": None,
+            "out_dtype": None,
+        }
+
+        # Create the MergeConfiguration from the dictionary
+        merge_config = MergeConfiguration.model_validate(merge_config_dict)
+
+        return merge_config
+
+    def create_merge_config_dare_ties(self) -> "MergeConfiguration":
+        """
+        Creates a merge configuration for a DARE TIES merge of two models, with specified weights and densities.
+        """
+        # Create the DARE TIES merge configuration dictionary
+        merge_config_dict = {
+            "merge_method": "dare_ties",
+            "slices": None,  # Optional slices if needed
+            "models": [
+                {
+                    "model": {
+                        "model": {"path": self.target_model_path, "revision": None},
+                        "lora": None,
+                        "override_architecture": None,
+                    },
+                    "parameters": {"density": self.target_model_density, "weight": self.target_model_weight},
+                },
+                {
+                    "model": {
+                        "model": {"path": self.policy_model_path, "revision": None},
+                        "lora": None,
+                        "override_architecture": None,
+                    },
+                    "parameters": {"density": self.policy_model_density, "weight": self.policy_model_weight},
+                },
+            ],
+            "parameters": {"normalize": self.normalize},
+            "base_model": {
+                "model": {"path": self.policy_model_path, "revision": None},
+                "lora": None,
+                "override_architecture": None,
+            },
+            "dtype": self.dtype,
+            "tokenizer_source": None,
+            "tokenizer": None,
+            "chat_template": None,
+            "out_dtype": None,
+        }
+
+        # Create the MergeConfiguration from the dictionary
+        merge_config = MergeConfiguration.model_validate(merge_config_dict)
+
+        return merge_config
+
+    def create_merge_config_slerp(self) -> "MergeConfiguration":
+        """
+        Creates a merge configuration for a SLERP merge of a model with a base model.
+        """
+
+        # Create the SLERP merge configuration dictionary
+        merge_config_dict = {
+            "merge_method": "slerp",
+            "slices": None,  # Optional slices if needed
+            "models": [
+                {
+                    "model": {
+                        "model": {"path": self.target_model_path, "revision": None},
+                        "lora": None,
+                        "override_architecture": None,
+                    },
+                    "parameters": None,  # No specific parameters for the SLERP model
+                }
+            ],
+            "parameters": {
+                "t": self.t_values  # Set the t values for SLERP
+            },
+            "base_model": {
+                "model": {"path": self.policy_model_path, "revision": None},
+                "lora": None,
+                "override_architecture": None,
+            },
+            "dtype": self.dtype,
+            "tokenizer_source": None,
+            "tokenizer": None,
+            "chat_template": None,
+            "out_dtype": None,
+        }
+
+        # Create the MergeConfiguration from the dictionary
+        merge_config = MergeConfiguration.model_validate(merge_config_dict)
+
+        return merge_config
+
+    def create(self) -> "MergeConfiguration":
+        if self.method == "linear":
+            return self.create_merge_config_linear()
+        elif self.method == "ties":
+            return self.create_merge_config_ties()
+        elif self.method == "dare_ties":
+            return self.create_merge_config_dare_ties()
+        elif self.method == "slerp":
+            return self.create_merge_config_slerp()
+
+
+def merge_models(config: "MergeConfiguration", out_path: str):
+    """
+    Merge two models using `mergekit`.
+
+    Args:
+        config (`MergeConfiguration`): The merge configuration.
+        out_path (`str`): The output path for the merged model.
+    """
+    if not is_mergekit_available():
+        raise ImportError("merge_models requires the `mergekit` extra. To install, run `pip install mergekit`.")
+    run_merge(
+        config,
+        out_path=out_path,
+        options=MergeOptions(
+            device="auto",
+            cuda=torch.cuda.is_available(),
+            copy_tokenizer=True,
+            lazy_unpickle=False,
+            low_cpu_memory=False,
+        ),
+    )
+
+
+class MergeModelCallback(TrainerCallback):
+    r"""
+    A [`~transformers.TrainerCallback`] that merges the policy model (the model being trained) with another model
+    based on a merge configuration.
+
+    Args:
+        merge_config ([`experimental.merge_model_callback.MergeConfig`], *optional*):
+            Configuration used for the merging process. If not provided, the default
+            [`~experimental.merge_model_callback.MergeConfig`] is used.
+        merge_at_every_checkpoint (`bool`, *optional*, defaults to `False`):
+            Whether to merge the model at every checkpoint.
+        push_to_hub (`bool`, *optional*, defaults to `False`):
+            Whether to push the merged model to the Hub after merging.
+
+    Example:
+
+    ```python
+    from trl.experimental.merge_model_callback import MergeConfig, MergeModelCallback
+
+    config = MergeConfig()
+    merge_callback = MergeModelCallback(config)
+    trainer = DPOTrainer(..., callbacks=[merge_callback])
+    ```
+    """
+
+    def __init__(
+        self,
+        merge_config: "MergeConfig | None" = None,
+        merge_at_every_checkpoint: bool = False,
+        push_to_hub: bool = False,
+    ):
+        if not is_mergekit_available():
+            raise ImportError(
+                "MergeModelCallback requires the `mergekit` extra. To install, run `pip install mergekit`."
+            )
+        self.merge_config = merge_config or MergeConfig()
+        self.merge_at_every_checkpoint = merge_at_every_checkpoint
+        self.push_to_hub = push_to_hub
+
+    def _merge_and_maybe_push(self, output_dir, global_step, model):
+        checkpoint_path = os.path.join(output_dir, f"checkpoint-{global_step}")
+        self.merge_config.policy_model_path = checkpoint_path
+        if self.merge_config.target_model_path is None:
+            self.merge_config.target_model_path = get_config_model_id(model.config)
+        merge_path = os.path.join(checkpoint_path, "merged")
+
+        merge_models(self.merge_config.create(), merge_path)
+
+        if self.push_to_hub:
+            repo_name = f"{output_dir}_checkpoint-{global_step}_merged"
+            upload_model_to_hf(merge_path, repo_name)
+
+    def on_save(self, args, state, control, model=None, **kwargs):
+        if self.merge_at_every_checkpoint:
+            self._merge_and_maybe_push(args.output_dir, state.global_step, model)
+
+    def on_train_end(self, args, state, control, model=None, **kwargs):
+        if not self.merge_at_every_checkpoint:
+            self._merge_and_maybe_push(args.output_dir, state.global_step, model)
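
For readers who want the merge machinery outside the callback, here is a minimal usage sketch built only from the APIs added in this file; the checkpoint paths and the `Qwen/Qwen2-0.5B` target are placeholders, and `mergekit` must be installed (`pip install mergekit`):

```python
from trl.experimental.merge_model_callback import MergeConfig, merge_models

# Placeholder paths -- substitute real checkpoints
config = MergeConfig(method="ties")
config.policy_model_path = "output/checkpoint-500"  # the model being trained
config.target_model_path = "Qwen/Qwen2-0.5B"        # hypothetical merge target
config.policy_model_weight = 0.8
config.target_model_weight = 0.2

# `create()` dispatches to the method-specific builder (here `create_merge_config_ties`)
# and `merge_models` hands the validated config to mergekit's `run_merge`.
merge_models(config.create(), out_path="output/checkpoint-500/merged")
```

This is the same path the callback takes in `_merge_and_maybe_push`, except that there the policy path is filled in from the latest `checkpoint-{global_step}` directory automatically.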
ICL/RL/trl_source/trl/experimental/minillm/__init__.py ADDED
@@ -0,0 +1,19 @@
+# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .minillm_config import MiniLLMConfig
+from .minillm_trainer import MiniLLMTrainer
+
+
+__all__ = ["MiniLLMConfig", "MiniLLMTrainer"]
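
This `__init__.py` only re-exports the MiniLLM API. Assuming the `minillm_config` and `minillm_trainer` modules elsewhere in this commit define the usual config/trainer pair, usage reduces to:

```python
from trl.experimental.minillm import MiniLLMConfig, MiniLLMTrainer
```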