#!/bin/bash
# Run training on a cloud instance, then auto-stop on success.
#
# Required environment variables:
#   WANDB_API_KEY - Weights & Biases API key
#   HF_TOKEN      - Hugging Face token (used by huggingface_hub for uploads)
# Optional:
#   NUM_GPUS      - GPU count for accelerate launch (default: 1)
#   DATASET_DIR   - dataset location (default: /ephemeral/community_dataset_v3)

# Deliberately NOT 'set -e': even if training fails we must still reach the
# log-upload step and the shutdown decision at the bottom of the script.
set +e

LOG_FILE="training_$(date +%Y%m%d_%H%M%S).log"
NUM_GPUS="${NUM_GPUS:-1}"
DATASET_DIR="${DATASET_DIR:-/ephemeral/community_dataset_v3}"
# Single interpreter for both training and log upload, so they can't drift
# into different environments (bare 'python' may not even exist).
PYTHON=python3.12

# Fail fast on missing prerequisites; diagnostics go to stderr.
if [ -z "$WANDB_API_KEY" ]; then echo "ERROR: WANDB_API_KEY not set" >&2; exit 1; fi
if [ -z "$HF_TOKEN" ]; then echo "ERROR: HF_TOKEN not set" >&2; exit 1; fi
if [ ! -d "$DATASET_DIR" ]; then echo "ERROR: Dataset not at $DATASET_DIR" >&2; exit 1; fi

echo "=== Starting Training ===" | tee "$LOG_FILE"

# Activate conda env if available (bare metal), otherwise assume deps are global (Docker).
if command -v conda &> /dev/null; then
  eval "$(conda shell.bash hook)"
  conda activate lerobot
fi

# Optional args as arrays (not space-joined strings) so they expand safely
# quoted and an empty set contributes zero words to the command line.
ACCEL_FLAGS=()
if [ "$NUM_GPUS" -gt 1 ]; then
  ACCEL_FLAGS=(--multi_gpu --num_processes "$NUM_GPUS")
fi

# Resume from the last checkpoint if one exists; otherwise start from the base policy.
RESUME_ARGS=()
LAST_CKPT="/ephemeral/production_run/checkpoints/last/pretrained_model/train_config.json"
if [ -f "$LAST_CKPT" ]; then
  echo "Resuming from checkpoint: $LAST_CKPT" | tee -a "$LOG_FILE"
  RESUME_ARGS=(--resume=true "--config_path=$LAST_CKPT")
else
  echo "Starting fresh training" | tee -a "$LOG_FILE"
  RESUME_ARGS=(--policy.path=lerobot/pi05_base)
fi

"$PYTHON" -m accelerate.commands.launch ${ACCEL_FLAGS[@]+"${ACCEL_FLAGS[@]}"} \
  -m lerobot.scripts.lerobot_train \
  "${RESUME_ARGS[@]}" \
  --dataset.repo_id="so100:$DATASET_DIR:/workspace/pi05-so100-diverse/filtered_index.json:/workspace/pi05-so100-diverse/norm_stats.json" \
  --policy.train_expert_only=true \
  --policy.dtype=bfloat16 \
  --policy.gradient_checkpointing=false \
  --policy.push_to_hub=true \
  --policy.repo_id=StrongRoboticsLab/pi05-so100-diverse \
  --policy.normalization_mapping='{"VISUAL": "IDENTITY", "STATE": "MEAN_STD", "ACTION": "MEAN_STD"}' \
  --policy.scheduler_warmup_steps=1000 \
  --policy.scheduler_decay_steps=340000 \
  --rename_map='{"observation.images.image": "observation.images.base_0_rgb", "observation.images.image2": "observation.images.left_wrist_0_rgb"}' \
  --batch_size=16 \
  --steps=340000 \
  --save_freq=5000 \
  --log_freq=50 \
  --num_workers=4 \
  --wandb.enable=true \
  --wandb.project=pi05-so100-diverse \
  --output_dir=/ephemeral/production_run \
  2>&1 | tee -a "$LOG_FILE"

# tee masks the launch command's status; PIPESTATUS[0] is the real exit code.
TRAIN_EXIT=${PIPESTATUS[0]}

echo "=== Training Complete (exit: $TRAIN_EXIT) ===" | tee -a "$LOG_FILE"

# Upload the log regardless of training outcome (diagnostics for failed runs).
# LOG_FILE is passed via the environment — never interpolate shell strings
# into Python source. Quoted heredoc delimiter keeps the script literal.
LOG_FILE="$LOG_FILE" "$PYTHON" - <<'PYEOF' 2>&1 | tee -a "$LOG_FILE"
import os
from huggingface_hub import HfApi

log = os.environ["LOG_FILE"]
HfApi().upload_file(
    path_or_fileobj=log,
    path_in_repo=f"logs/{log}",
    repo_id="StrongRoboticsLab/pi05-so100-diverse",
    repo_type="model",
)
print("Log uploaded")
PYEOF

# Only auto-shutdown if training succeeded (exit 0 = weights uploaded).
if [ "$TRAIN_EXIT" -eq 0 ]; then
  if command -v sudo &> /dev/null; then
    sudo shutdown -h now
  else
    # No sudo (e.g. running as root inside a container image) — try directly.
    shutdown -h now
  fi
else
  echo "=== NOT shutting down: training exited with code $TRAIN_EXIT ===" | tee -a "$LOG_FILE"
  echo "=== Weights may still be on disk at /ephemeral/production_run ===" | tee -a "$LOG_FILE"
fi