DECADE / scripts /slurm /example_dense_retrieval.slurm
anonymous-penguin's picture
Initial code release
9c60174 verified
#!/bin/bash
# Example SLURM array script for flat dense retrieval (GTE 7B) with top-k=20.
# Each array task processes one dataset shard, selected by SLURM_ARRAY_TASK_ID.
#
# Usage:
# 1. Edit the account (-A) and partition (-p) directives below, or override
#    them at submit time:
#      sbatch -A <account> -p <partition> scripts/slurm/example_dense_retrieval.slurm
#    (the SBATCH_ACCOUNT / SBATCH_PARTITION environment variables also work).
#    sbatch does NOT shell-expand #SBATCH lines, so ${VAR:-default} syntax
#    cannot be used in them.
# 2. Optionally export PROJECT_DIR, MODEL_NAME, TOP_K, or SHARD_ROOT before
#    submitting; --export=ALL forwards the submission environment to the job.
# 3. Make sure shards exist under $SHARD_ROOT/dataset/ and $SHARD_ROOT/ret_cache/
#    (see scripts/make_v5_shards.py and scripts/build_retrieval_cache.py).
# 4. Create the log directory first (mkdir -p logs); the --output path below
#    is relative to the submission directory and sbatch does not create it.
# 5. sbatch scripts/slurm/example_dense_retrieval.slurm
#
# Note: the job name and log file name hard-code "topk20"; they do not follow
# a TOP_K override.
#
#SBATCH -J dense_gte_topk20
#SBATCH -A your-account
#SBATCH -p cpu
#SBATCH --nodes=1
#SBATCH --time=04:00:00
#SBATCH --array=0-7
#SBATCH --output=logs/dense_gte_topk20_%A_%a.log
#SBATCH --export=ALL,NV_API_KEY
set -euo pipefail

# Run from the project root so the relative paths below resolve.
PROJECT_DIR="${PROJECT_DIR:-$(pwd)}"
cd "$PROJECT_DIR"

MODEL_NAME="${MODEL_NAME:-gpt-5.5}" # any key from model_zoo.py
TOP_K="${TOP_K:-20}"
# Dots in the model name are replaced by underscores in the default shard root.
SHARD_ROOT="${SHARD_ROOT:-output/shards/v5_${MODEL_NAME//./_}_nchunks10}"

# Fail with a readable message when the script is run outside an array job.
: "${SLURM_ARRAY_TASK_ID:?must be set; submit this script with sbatch so --array applies}"
# Array task N maps to the zero-padded shard suffix NN (0 -> 00, 7 -> 07).
shard_id=$(printf '%02d' "$SLURM_ARRAY_TASK_ID")

in_file="$SHARD_ROOT/dataset/shard_${shard_id}.json"
out_file="$SHARD_ROOT/dense_gte_topk${TOP_K}/part_${shard_id}.jsonl"
# Exported under this exact lower-case name; presumably read by main.py from
# the environment -- TODO confirm.
export ret_cache="$SHARD_ROOT/ret_cache/shard_${shard_id}.jsonl"

# Fail fast on a missing input shard instead of after Python start-up.
if [[ ! -f "$in_file" ]]; then
  printf 'error: input shard not found: %s\n' "$in_file" >&2
  exit 1
fi

# Ensure the per-run output directory exists (harmless if main.py also
# creates it).
mkdir -p -- "$(dirname -- "$out_file")"

python main.py \
  --in_file "$in_file" \
  --out_file "$out_file" \
  --model_name "$MODEL_NAME" \
  --top_k "$TOP_K" \
  --n_chunks 10 \
  --nvidia \
  --all_sessions_file dataset/all_sessions.json \
  --no_semantic \
  --mode embed