#!/bin/bash
# ╔══════════════════════════════════════════════════════════════════════════════╗
# ║            AIPF 灵活评估脚本使用指南 (配合 run_eval_flex.sh)                 ║
# ╚══════════════════════════════════════════════════════════════════════════════╝
#
# 本文件是 run_eval_flex.sh 的说明文档，仅供阅读，请勿直接执行。
# 实际运行请用: bash run_eval_flex.sh [参数]
#
#
# ┌─────────────────────────────────────────────────────────────────────────────┐
# │  1. 背景与思路                                                              │
# └─────────────────────────────────────────────────────────────────────────────┘
#
#   AIPF (AI Policy Framework) 的核心评测任务是：给定一条待评内容（golden set），
#   在一把已排好序的"尺子"（ruler，约 200 条标注样本）上找到它应该插入的位置，
#   从而得到一个严重程度分数。找位置的过程依赖 LLM 做 pairwise comparison（两两比较）。
#
#   问题在于：尺子有 200 条，不可能逐条比较（太贵太慢），所以需要搜索策略来决定
#   "跟尺子上的哪几条去比"。目前有两个关键的可调维度：
#
#   【维度一：比较轮次 (num_rounds)】
#     - 8 轮：每条样本跟尺子做 8 次 LLM pairwise comparison，精度高但成本翻倍
#     - 4 轮：只比 4 次，省一半 LLM 调用，但可能不够精确
#
#   【维度二：是否用 embedding 做 warm-start】
#     - 无 warm-start：第一轮用均匀分段采样，从尺子里等间距选候选去比较
#     - 有 warm-start：先用 embedding 模型 (Qwen3-Embedding-8B) 把待评内容和尺子
#       全部编码成向量，算 cosine 相似度找到最近邻，用最近邻的 rank 作为搜索起点。
#       这样第一轮就能从一个"大概对"的位置开始比较，后续几轮用来微调，
#       理论上可以用更少的轮次达到同样的精度。
#
#   核心假设：如果 warm-start 能给出一个足够好的初始位置，那么 4 轮微调就能
#   逼近 8 轮的精度，同时 LLM 调用量减半 → 成本减半、速度翻倍。
#
#
# ┌─────────────────────────────────────────────────────────────────────────────┐
# │  2. 四种实验配置                                                            │
# └─────────────────────────────────────────────────────────────────────────────┘
#
#   ┌─────────────────┬──────────────┬───────────────────────────────────────┐
#   │ 配置             │ 命令         │ 含义                                  │
#   ├─────────────────┼──────────────┼───────────────────────────────────────┤
#   │ baseline         │ --rounds 8   │ 纯 LLM 8轮，无辅助，精度天花板        │
#   │ 轮次减半         │ --rounds 4   │ 纯 LLM 4轮，看精度掉多少              │
#   │ warm+4轮(top5)   │ --rounds 4   │ embedding top5 预估起点 + LLM 4轮     │
#   │                  │ --warmstart  │ 用最少的邻居做粗估，省 GPU 时间        │
#   │                  │   top5       │                                       │
#   │ warm+4轮(top100) │ --rounds 4   │ embedding top100 预估起点 + LLM 4轮   │
#   │                  │ --warmstart  │ 用更多邻居做 rank 均值，位置估计更稳   │
#   │                  │   top100     │                                       │
#   └─────────────────┴──────────────┴───────────────────────────────────────┘
#
#
# ┌─────────────────────────────────────────────────────────────────────────────┐
# │  3. 完整流程图                                                              │
# └─────────────────────────────────────────────────────────────────────────────┘
#
#   [golden_set.csv]
#         │
#         ├── (如果 --warmstart top5/top100) ──────────────────────────┐
#         │   Step 0a: batch_top{5,100}_match.py                      │
#         │     → 加载 Qwen3-Embedding-8B                             │
#         │     → 对 golden_set 和 ruler 分别做 embedding             │
#         │     → 算 cosine 相似度，取 top-K 最近邻                   │
#         │     → 输出 emb_top{5,100}.jsonl                           │
#         │                                                           │
#         │   Step 0b: add_estimated_position.py                      │
#         │     → 读 jsonl 里 top-K 邻居的 rank，算均值               │
#         │     → 写入 csv 的 estimated_position 列                   │
#         │     → 输出 golden_with_warmstart.csv（不污染原始文件）     │
#         │                                                           │
#         ├────────────────────────────────────────────────────────────┘
#         │
#         ▼
#   Step 1: 生成运行时 pipeline.yaml
#     → 从基准 pipeline.yaml 复制一份到 workspace
#     → 覆盖 num_rounds 和 search_method
#     → 保证实验参数与基准配置隔离，不改原文件
#         │
#         ▼
#   Step 2: prepare_local_eval_data.py
#     → 把 csv 转成 find_positions 需要的 jsonl 格式
#     → 如果 csv 有 estimated_position 列，会透传到 jsonl
#         │
#         ▼
#   Step 3: gen_find_positions_cfg.py
#     → 读 pipeline.yaml + 场景参数
#     → 生成 find_positions.py 的完整配置 yaml
#     → 包含：LLM 地址、ruler 路径、搜索参数等
#         │
#         ▼
#   Step 4: find_positions.py（核心步骤）
#     → 逐条读待评样本
#     → 如果有 estimated_position → warm-start，第一轮直接比该位置的 ruler item
#     → 如果没有 → 均匀分段采样选候选
#     → 每轮比完后，按 score 差异选下一轮候选（heuristic_search 策略）
#     → 共比 num_rounds 轮，输出最终插入位置和分数
#         │
#         ▼
#   Step 5: evaluate_local_ruler_results.py
#     → 把 find_positions 的结果和 golden_set 的 ground truth 对比
#     → 输出逐条 case_results.jsonl + 汇总 metrics.json (precision/recall/f1)
#
#
# ┌─────────────────────────────────────────────────────────────────────────────┐
# │  4. 用法示例                                                                │
# └─────────────────────────────────────────────────────────────────────────────┘

# Usage examples for run_eval_flex.sh -- copy the command you need into a
# terminal.
#
# This file is a guide and is not meant to be executed. The examples are kept
# inside a quoted no-op heredoc (`: <<'USAGE_EXAMPLES'`) so that running or
# sourcing this file by accident does not launch all seven evaluation runs
# back to back. The quoted delimiter also prevents any expansion of the text.
: <<'USAGE_EXAMPLES'

# Experiment 1: baseline -- pure heuristic search, 8 rounds;
#   most expensive, but the most accurate
bash run_eval_flex.sh --rounds 8

# Experiment 2: half the rounds -- pure heuristic search, 4 rounds;
#   shows how much accuracy is lost
bash run_eval_flex.sh --rounds 4

# Experiment 3: embedding top5 warm-start + 4 rounds
#   Find the 5 nearest neighbours with the Qwen3 embedding, use their mean
#   rank as the starting point, then let the LLM refine it over 4 rounds.
#   Expectation: a coarse estimate is enough, and it is light on GPU
#   (only 5 neighbours are used).
bash run_eval_flex.sh --rounds 4 --warmstart top5

# Experiment 4: embedding top100 warm-start + 4 rounds
#   Aggregate the ranks of 100 neighbours for a more stable starting point.
#   NOTE(review): this example originally said "weighted average", while the
#   parameter reference in this guide describes a plain rank mean -- confirm
#   against add_estimated_position.py.
#   GPU cost is slightly higher but still far below the LLM call cost.
bash run_eval_flex.sh --rounds 4 --warmstart top100

# Run only the nsa scenario
bash run_eval_flex.sh --rounds 4 --warmstart top5 --scenario nsa

# Run every scenario (yss + nsa)
bash run_eval_flex.sh --rounds 4 --warmstart top100 --scenario all

# Pin the date and process only the first 50 samples as a smoke test
# (checks that the pipeline runs end to end)
DATE=20260514 bash run_eval_flex.sh --rounds 4 --warmstart top100 --limit 50

USAGE_EXAMPLES


# ┌─────────────────────────────────────────────────────────────────────────────┐
# │  5. 参数详解                                                                │
# └─────────────────────────────────────────────────────────────────────────────┘
#
#   --rounds N          LLM pairwise 比较的轮次（默认 8）
#                       每轮选 1 个候选跟待评样本做比较，所以 rounds=N 意味着
#                       每条样本消耗 N 次 LLM 调用。典型值: 4 或 8
#
#   --warmstart MODE    embedding 预匹配模式（默认 none，即不做）
#                       - none:   不做 embedding，第一轮用均匀分段采样
#                       - top5:   用 batch_top5_match.py，取 5 个最近邻的
#                                 rank 均值作为搜索起点。快，但估计可能粗
#                       - top100: 用 batch_top100_match.py，取 100 个最近邻的
#                                 rank 均值作为搜索起点。稍慢，估计更稳
#                       注意: warm-start 需要 GPU 跑 Qwen3-Embedding-8B，
#                       但这是一次性开销，跑完后有 cache_emb/ 缓存，重跑秒出
#
#   --scenario NAME     评估场景（默认 yss）
#                       - yss: youth_sexual_and_physical_abuse（青少年相关）
#                       - nsa: ansa（另一个安全类别）
#                       - all: 两个都跑
#
#   --limit N           只处理前 N 条样本（仅 warmstart 的 embedding 阶段生效）
#                       用于快速验证流程是否跑通，不影响正式评估
#
#   --emb-batch-size N  embedding 编码的 batch size（默认 4）
#                       GPU 显存不够时可以调小（如 2 或 1）
#
#   环境变量:
#     DATE=YYYYMMDD     指定评估日期标签（默认当天），影响输出路径
#     RUN_ID=xxx        指定运行 ID（默认时间戳），影响输出路径
#
#
# ┌─────────────────────────────────────────────────────────────────────────────┐
# │  6. 输出目录结构                                                            │
# └─────────────────────────────────────────────────────────────────────────────┘
#
#   aipf_example/{场景}/runs/{DATE}/{RUN_ID}_{TAG}/
#   ├── configs/
#   │   ├── pipeline_runtime.yaml          # 本次运行的实际参数（含覆盖后的 rounds）
#   │   └── pos_config/
#   │       └── find_positions_*.yaml      # find_positions.py 的完整配置
#   ├── intermediate/
#   │   ├── evr_*_local_eval_input.jsonl   # 转换后的评估输入
#   │   ├── emb_top{5,100}.jsonl           # (仅 warmstart) embedding 匹配结果
#   │   ├── golden_with_warmstart.csv      # (仅 warmstart) 带 estimated_position 的 csv
#   │   └── pairwise/                      # LLM 比较的中间缓存
#   └── outputs/
#       ├── find_positions/                # find_positions 的原始输出
#       ├── *_case_results_*.jsonl         # 逐条评估结果（每条样本的位置和判定）
#       └── *_metrics_*.json              # 汇总指标 (precision / recall / f1)
#
#   TAG 格式: heuristic_r{N}[_warm_{top5|top100}]
#   例如: heuristic_r4_warm_top5, heuristic_r8
#   → 不同实验的输出目录自动隔离，方便横向对比
#
#
# ┌─────────────────────────────────────────────────────────────────────────────┐
# │  7. warm-start 的数据流详解                                                 │
# └─────────────────────────────────────────────────────────────────────────────┘
#
#   warm-start 的核心思路是"先粗后精"：
#
#   a) batch_top{5,100}_match.py
#      - 用 Qwen3-Embedding-8B 把待评样本和 ruler 200 条都编码成向量
#      - 算 cosine 相似度矩阵 (N_sample x 200)
#      - 每条样本取 top-K 个最相似的 ruler item
#      - 输出 jsonl，每行包含 top-K 邻居的 rank/score/sim
#      - embedding 有缓存（cache_emb/ 目录），第二次跑同样的数据秒出
#
#   b) add_estimated_position.py
#      - 读 jsonl，取每条样本 top-K 邻居的 rank 做均值
#      - 例如 top5 邻居 rank = [45, 50, 52, 48, 55] → estimated_position = 50
#      - 把这个值写入 csv 的 estimated_position 列
#
#   c) prepare_local_eval_data.py
#      - 读 csv，如果有 estimated_position 列就透传到 jsonl
#
#   d) find_positions.py → _heuristic_search()
#      - 读到 estimated_position=50 后，第一轮直接跟 ruler[50] 做 LLM 比较
#      - 而不是从均匀分段采样开始（可能选到 ruler[0], ruler[50], ruler[100]...）
#      - 后续轮次正常按 score 差异选候选微调
#      - 效果：起点更准 → 更少轮次就能收敛 → 省 LLM 调用
#
#
# ┌─────────────────────────────────────────────────────────────────────────────┐
# │  8. 相关文件索引                                                            │
# └─────────────────────────────────────────────────────────────────────────────┘
#
#   脚本:
#     run_eval_flex.sh               ← 本说明对应的主脚本
#     run_eval.sh                    ← 原版评估脚本（固定参数，不支持 warm-start）
#     batch_top5_match.py            ← embedding top5 匹配
#     batch_top100_match.py          ← embedding top100 匹配
#     add_estimated_position.py      ← 把 embedding 估计的位置写回 csv
#
#   流水线:
#     pipeline/prepare_local_eval_data.py    ← csv → jsonl
#     pipeline/gen_find_positions_cfg.py     ← 生成 find_positions 配置
#     pipeline/evaluate_local_ruler_results.py ← 结果评估
#
#   核心引擎:
#     vendor/ranking_moderation/scripts/find_positions.py        ← 入口
#     vendor/ranking_moderation/src/ranking_moderation/
#       true_skill_ranking.py        ← _heuristic_search() / warm-start 逻辑
#       pairwise_comparison.py       ← LLM 调用做两两比较
#
#   配置:
#     aipf_example/yss_ruler_eval/pipeline.yaml   ← yss 场景基准配置
#     aipf_example/nsa_ruler_eval/pipeline.yaml   ← nsa 场景基准配置